LKML Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke()
  2019-10-07  9:02 [RESEND] everything text-poke: ftrace, modules, static_call and jump_label Peter Zijlstra
@ 2019-10-07  8:17 ` Peter Zijlstra
  2019-10-07  8:17   ` [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions Peter Zijlstra
                     ` (6 more replies)
  2019-10-07  8:25 ` [PATCH v2 0/4] Propagate module notifier errors Peter Zijlstra
                   ` (2 subsequent siblings)
  3 siblings, 7 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:17 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Ftrace was one of the last W^X violators; these patches move it over to the
generic text_poke() interface and thereby get rid of this oddity.



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
@ 2019-10-07  8:17   ` Peter Zijlstra
  2019-10-08 14:29     ` Borislav Petkov
  2019-10-09 12:03     ` Daniel Bristot de Oliveira
  2019-10-07  8:17   ` [PATCH v3 2/6] x86/alternatives: Update int3_emulate_push() comment Peter Zijlstra
                     ` (5 subsequent siblings)
  6 siblings, 2 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:17 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

In preparation for static_call and variable size jump_label support,
teach text_poke_bp() to emulate instructions, namely:

  JMP32, JMP8, CALL, NOP2, NOP_ATOMIC5, INT3

The current text_poke_bp() takes a @handler argument which is used as
a jump target when the temporary INT3 is hit by a different CPU.

When patching CALL instructions, this doesn't work because we'd miss
the PUSH of the return address. Instead, teach poke_int3_handler() to
emulate an instruction, typically the instruction we're patching in.

This fits almost all text_poke_bp() users, except
arch_unoptimize_kprobe() which restores random text, and for that site
we have to build an explicit emulate instruction.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
---
 arch/x86/include/asm/text-patching.h |   24 ++++--
 arch/x86/kernel/alternative.c        |  132 ++++++++++++++++++++++++++---------
 arch/x86/kernel/jump_label.c         |    9 --
 arch/x86/kernel/kprobes/opt.c        |   11 ++
 4 files changed, 130 insertions(+), 46 deletions(-)

--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -26,10 +26,11 @@ static inline void apply_paravirt(struct
 #define POKE_MAX_OPCODE_SIZE	5
 
 struct text_poke_loc {
-	void *detour;
 	void *addr;
-	size_t len;
-	const char opcode[POKE_MAX_OPCODE_SIZE];
+	int len;
+	s32 rel32;
+	u8 opcode;
+	const char text[POKE_MAX_OPCODE_SIZE];
 };
 
 extern void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -51,8 +52,10 @@ extern void text_poke_early(void *addr,
 extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
-extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
+extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
 extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries);
+extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
+			       const void *opcode, size_t len, const void *emulate);
 extern int after_bootmem;
 extern __ro_after_init struct mm_struct *poking_mm;
 extern __ro_after_init unsigned long poking_addr;
@@ -63,8 +66,17 @@ static inline void int3_emulate_jmp(stru
 	regs->ip = ip;
 }
 
-#define INT3_INSN_SIZE 1
-#define CALL_INSN_SIZE 5
+#define INT3_INSN_SIZE		1
+#define INT3_INSN_OPCODE	0xCC
+
+#define CALL_INSN_SIZE		5
+#define CALL_INSN_OPCODE	0xE8
+
+#define JMP32_INSN_SIZE		5
+#define JMP32_INSN_OPCODE	0xE9
+
+#define JMP8_INSN_SIZE		2
+#define JMP8_INSN_OPCODE	0xEB
 
 static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
 {
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -956,16 +956,15 @@ NOKPROBE_SYMBOL(patch_cmp);
 int poke_int3_handler(struct pt_regs *regs)
 {
 	struct text_poke_loc *tp;
-	unsigned char int3 = 0xcc;
 	void *ip;
 
 	/*
 	 * Having observed our INT3 instruction, we now must observe
 	 * bp_patching.nr_entries.
 	 *
-	 * 	nr_entries != 0			INT3
-	 * 	WMB				RMB
-	 * 	write INT3			if (nr_entries)
+	 *	nr_entries != 0			INT3
+	 *	WMB				RMB
+	 *	write INT3			if (nr_entries)
 	 *
 	 * Idem for other elements in bp_patching.
 	 */
@@ -978,9 +977,9 @@ int poke_int3_handler(struct pt_regs *re
 		return 0;
 
 	/*
-	 * Discount the sizeof(int3). See text_poke_bp_batch().
+	 * Discount the INT3. See text_poke_bp_batch().
 	 */
-	ip = (void *) regs->ip - sizeof(int3);
+	ip = (void *) regs->ip - INT3_INSN_SIZE;
 
 	/*
 	 * Skip the binary search if there is a single member in the vector.
@@ -997,8 +996,28 @@ int poke_int3_handler(struct pt_regs *re
 			return 0;
 	}
 
-	/* set up the specified breakpoint detour */
-	regs->ip = (unsigned long) tp->detour;
+	ip += tp->len;
+
+	switch (tp->opcode) {
+	case INT3_INSN_OPCODE:
+		/*
+		 * Someone poked an explicit INT3, they'll want to handle it,
+		 * do not consume.
+		 */
+		return 0;
+
+	case CALL_INSN_OPCODE:
+		int3_emulate_call(regs, (long)ip + tp->rel32);
+		break;
+
+	case JMP32_INSN_OPCODE:
+	case JMP8_INSN_OPCODE:
+		int3_emulate_jmp(regs, (long)ip + tp->rel32);
+		break;
+
+	default:
+		BUG();
+	}
 
 	return 1;
 }
@@ -1014,7 +1033,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
  * synchronization using int3 breakpoint.
  *
  * The way it is done:
- * 	- For each entry in the vector:
+ *	- For each entry in the vector:
  *		- add a int3 trap to the address that will be patched
  *	- sync cores
  *	- For each entry in the vector:
@@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler);
  */
 void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 {
-	int patched_all_but_first = 0;
-	unsigned char int3 = 0xcc;
+	unsigned char int3 = INT3_INSN_OPCODE;
 	unsigned int i;
+	int do_sync;
 
 	lockdep_assert_held(&text_mutex);
 
@@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke
 	/*
 	 * Second step: update all but the first byte of the patched range.
 	 */
-	for (i = 0; i < nr_entries; i++) {
+	for (do_sync = 0, i = 0; i < nr_entries; i++) {
 		if (tp[i].len - sizeof(int3) > 0) {
 			text_poke((char *)tp[i].addr + sizeof(int3),
-				  (const char *)tp[i].opcode + sizeof(int3),
+				  (const char *)tp[i].text + sizeof(int3),
 				  tp[i].len - sizeof(int3));
-			patched_all_but_first++;
+			do_sync++;
 		}
 	}
 
-	if (patched_all_but_first) {
+	if (do_sync) {
 		/*
 		 * According to Intel, this core syncing is very likely
 		 * not necessary and we'd be safe even without it. But
@@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke
 	 * Third step: replace the first byte (int3) by the first byte of
 	 * replacing opcode.
 	 */
-	for (i = 0; i < nr_entries; i++)
-		text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
+	for (do_sync = 0, i = 0; i < nr_entries; i++) {
+		if (tp[i].text[0] == INT3_INSN_OPCODE)
+			continue;
+
+		text_poke(tp[i].addr, tp[i].text, sizeof(int3));
+		do_sync++;
+	}
+
+	if (do_sync)
+		on_each_cpu(do_sync_core, NULL, 1);
 
-	on_each_cpu(do_sync_core, NULL, 1);
 	/*
 	 * sync_core() implies an smp_mb() and orders this store against
 	 * the writing of the new instruction.
@@ -1087,6 +1113,60 @@ void text_poke_bp_batch(struct text_poke
 	bp_patching.nr_entries = 0;
 }
 
+void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
+			const void *opcode, size_t len, const void *emulate)
+{
+	struct insn insn;
+
+	if (!opcode)
+		opcode = (void *)tp->text;
+	else
+		memcpy((void *)tp->text, opcode, len);
+
+	if (!emulate)
+		emulate = opcode;
+
+	kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
+	insn_get_length(&insn);
+
+	BUG_ON(!insn_complete(&insn));
+	BUG_ON(len != insn.length);
+
+	tp->addr = addr;
+	tp->len = len;
+	tp->opcode = insn.opcode.bytes[0];
+
+	switch (tp->opcode) {
+	case INT3_INSN_OPCODE:
+		break;
+
+	case CALL_INSN_OPCODE:
+	case JMP32_INSN_OPCODE:
+	case JMP8_INSN_OPCODE:
+		tp->rel32 = insn.immediate.value;
+		break;
+
+	default: /* assume NOP */
+		switch (len) {
+		case 2: /* NOP2 -- emulate as JMP8+0 */
+			BUG_ON(memcmp(emulate, ideal_nops[len], len));
+			tp->opcode = JMP8_INSN_OPCODE;
+			tp->rel32 = 0;
+			break;
+
+		case 5: /* NOP5 -- emulate as JMP32+0 */
+			BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
+			tp->opcode = JMP32_INSN_OPCODE;
+			tp->rel32 = 0;
+			break;
+
+		default: /* unknown instruction */
+			BUG();
+		}
+		break;
+	}
+}
+
 /**
  * text_poke_bp() -- update instructions on live kernel on SMP
  * @addr:	address to patch
@@ -1098,20 +1178,10 @@ void text_poke_bp_batch(struct text_poke
  * dynamically allocated memory. This function should be used when it is
  * not possible to allocate memory.
  */
-void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
 {
-	struct text_poke_loc tp = {
-		.detour = handler,
-		.addr = addr,
-		.len = len,
-	};
-
-	if (len > POKE_MAX_OPCODE_SIZE) {
-		WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
-		return;
-	}
-
-	memcpy((void *)tp.opcode, opcode, len);
+	struct text_poke_loc tp;
 
+	text_poke_loc_init(&tp, addr, opcode, len, emulate);
 	text_poke_bp_batch(&tp, 1);
 }
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -89,8 +89,7 @@ static void __ref __jump_label_transform
 		return;
 	}
 
-	text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE,
-		     (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
+	text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL);
 }
 
 void arch_jump_label_transform(struct jump_entry *entry,
@@ -147,11 +146,9 @@ bool arch_jump_label_transform_queue(str
 	}
 
 	__jump_label_set_jump_code(entry, type,
-				   (union jump_code_union *) &tp->opcode, 0);
+				   (union jump_code_union *)&tp->text, 0);
 
-	tp->addr = entry_code;
-	tp->detour = entry_code + JUMP_LABEL_NOP_SIZE;
-	tp->len = JUMP_LABEL_NOP_SIZE;
+	text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL);
 
 	tp_vec_nr++;
 
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -437,8 +437,7 @@ void arch_optimize_kprobes(struct list_h
 		insn_buff[0] = RELATIVEJUMP_OPCODE;
 		*(s32 *)(&insn_buff[1]) = rel;
 
-		text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
-			     op->optinsn.insn);
+		text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL);
 
 		list_del_init(&op->list);
 	}
@@ -448,12 +447,18 @@ void arch_optimize_kprobes(struct list_h
 void arch_unoptimize_kprobe(struct optimized_kprobe *op)
 {
 	u8 insn_buff[RELATIVEJUMP_SIZE];
+	u8 emulate_buff[RELATIVEJUMP_SIZE];
 
 	/* Set int3 to first byte for kprobes */
 	insn_buff[0] = BREAKPOINT_INSTRUCTION;
 	memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+	emulate_buff[0] = RELATIVEJUMP_OPCODE;
+	*(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn -
+			((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
 	text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
-		     op->optinsn.insn);
+		     emulate_buff);
 }
 
 /*



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v3 2/6] x86/alternatives: Update int3_emulate_push() comment
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
  2019-10-07  8:17   ` [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions Peter Zijlstra
@ 2019-10-07  8:17   ` Peter Zijlstra
  2019-10-07  8:17   ` [PATCH v3 3/6] x86/alternatives,jump_label: Provide better text_poke() batching interface Peter Zijlstra
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:17 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Update the comment now that we've merged x86_32 support.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/text-patching.h |    3 +++
 1 file changed, 3 insertions(+)

--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -85,6 +85,9 @@ static inline void int3_emulate_push(str
 	 * stack where the break point happened, and the saving of
 	 * pt_regs. We can extend the original stack because of
 	 * this gap. See the idtentry macro's create_gap option.
+	 *
+	 * Similarly entry_32.S will have a gap on the stack for (any) hardware
+	 * exception and pt_regs; see FIXUP_FRAME.
 	 */
 	regs->sp -= sizeof(unsigned long);
 	*(unsigned long *)regs->sp = val;



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v3 3/6] x86/alternatives,jump_label: Provide better text_poke() batching interface
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
  2019-10-07  8:17   ` [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions Peter Zijlstra
  2019-10-07  8:17   ` [PATCH v3 2/6] x86/alternatives: Update int3_emulate_push() comment Peter Zijlstra
@ 2019-10-07  8:17   ` Peter Zijlstra
  2019-10-09 12:04     ` Daniel Bristot de Oliveira
  2019-10-07  8:17   ` [PATCH v3 4/6] x86/alternatives: Add and use text_gen_insn() helper Peter Zijlstra
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:17 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Adding another text_poke_bp_batch() user made me realize the interface
is all sorts of wrong. The text poke vector should be internal to the
implementation.

This then results in a trivial interface:

  text_poke_queue()  - which has the 'normal' text_poke_bp() interface
  text_poke_finish() - which takes no arguments and flushes any
                       pending text_poke()s.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
---
 arch/x86/include/asm/text-patching.h |   15 +----
 arch/x86/kernel/alternative.c        |   64 ++++++++++++++++++++--
 arch/x86/kernel/jump_label.c         |   99 ++++++++++++-----------------------
 3 files changed, 96 insertions(+), 82 deletions(-)

--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -25,14 +25,6 @@ static inline void apply_paravirt(struct
  */
 #define POKE_MAX_OPCODE_SIZE	5
 
-struct text_poke_loc {
-	void *addr;
-	int len;
-	s32 rel32;
-	u8 opcode;
-	const char text[POKE_MAX_OPCODE_SIZE];
-};
-
 extern void text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
@@ -53,9 +45,10 @@ extern void *text_poke(void *addr, const
 extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
 extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
-extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries);
-extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
-			       const void *opcode, size_t len, const void *emulate);
+
+extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate);
+extern void text_poke_finish(void);
+
 extern int after_bootmem;
 extern __ro_after_init struct mm_struct *poking_mm;
 extern __ro_after_init unsigned long poking_addr;
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -936,6 +936,14 @@ static void do_sync_core(void *info)
 	sync_core();
 }
 
+struct text_poke_loc {
+	void *addr;
+	int len;
+	s32 rel32;
+	u8 opcode;
+	const char text[POKE_MAX_OPCODE_SIZE];
+};
+
 static struct bp_patching_desc {
 	struct text_poke_loc *vec;
 	int nr_entries;
@@ -1023,6 +1031,10 @@ int poke_int3_handler(struct pt_regs *re
 }
 NOKPROBE_SYMBOL(poke_int3_handler);
 
+#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
+static struct text_poke_loc tp_vec[TP_VEC_MAX];
+static int tp_vec_nr;
+
 /**
  * text_poke_bp_batch() -- update instructions on live kernel on SMP
  * @tp:			vector of instructions to patch
@@ -1044,7 +1056,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
  *		  replacing opcode
  *	- sync cores
  */
-void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
+static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 {
 	unsigned char int3 = INT3_INSN_OPCODE;
 	unsigned int i;
@@ -1118,11 +1130,7 @@ void text_poke_loc_init(struct text_poke
 {
 	struct insn insn;
 
-	if (!opcode)
-		opcode = (void *)tp->text;
-	else
-		memcpy((void *)tp->text, opcode, len);
-
+	memcpy((void *)tp->text, opcode, len);
 	if (!emulate)
 		emulate = opcode;
 
@@ -1167,6 +1175,50 @@ void text_poke_loc_init(struct text_poke
 	}
 }
 
+/*
+ * We hard rely on the tp_vec being ordered; ensure this is so by flushing
+ * early if needed.
+ */
+static bool tp_order_fail(void *addr)
+{
+	struct text_poke_loc *tp;
+
+	if (!tp_vec_nr)
+		return false;
+
+	if (!addr) /* force */
+		return true;
+
+	tp = &tp_vec[tp_vec_nr - 1];
+	if ((unsigned long)tp->addr > (unsigned long)addr)
+		return true;
+
+	return false;
+}
+
+static void text_poke_flush(void *addr)
+{
+	if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
+		text_poke_bp_batch(tp_vec, tp_vec_nr);
+		tp_vec_nr = 0;
+	}
+}
+
+void text_poke_finish(void)
+{
+	text_poke_flush(NULL);
+}
+
+void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+	struct text_poke_loc *tp;
+
+	text_poke_flush(addr);
+
+	tp = &tp_vec[tp_vec_nr++];
+	text_poke_loc_init(tp, addr, opcode, len, emulate);
+}
+
 /**
  * text_poke_bp() -- update instructions on live kernel on SMP
  * @addr:	address to patch
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -35,18 +35,19 @@ static void bug_at(unsigned char *ip, in
 	BUG();
 }
 
-static void __jump_label_set_jump_code(struct jump_entry *entry,
-				       enum jump_label_type type,
-				       union jump_code_union *code,
-				       int init)
+static const void *
+__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
 {
+	static union jump_code_union code; /* relies on text_mutex */
 	const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
 	const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
 	const void *expect;
 	int line;
 
-	code->jump = 0xe9;
-	code->offset = jump_entry_target(entry) -
+	lockdep_assert_held(&text_mutex);
+
+	code.jump = JMP32_INSN_OPCODE;
+	code.offset = jump_entry_target(entry) -
 		       (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
 
 	if (init) {
@@ -54,23 +55,23 @@ static void __jump_label_set_jump_code(s
 	} else if (type == JUMP_LABEL_JMP) {
 		expect = ideal_nop; line = __LINE__;
 	} else {
-		expect = code->code; line = __LINE__;
+		expect = code.code; line = __LINE__;
 	}
 
 	if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
 		bug_at((void *)jump_entry_code(entry), line);
 
 	if (type == JUMP_LABEL_NOP)
-		memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE);
+		memcpy(&code, ideal_nop, JUMP_LABEL_NOP_SIZE);
+
+	return &code;
 }
 
-static void __ref __jump_label_transform(struct jump_entry *entry,
-					 enum jump_label_type type,
-					 int init)
+static void inline __jump_label_transform(struct jump_entry *entry,
+					  enum jump_label_type type,
+					  int init)
 {
-	union jump_code_union code;
-
-	__jump_label_set_jump_code(entry, type, &code, init);
+	const void *opcode = __jump_label_set_jump_code(entry, type, init);
 
 	/*
 	 * As long as only a single processor is running and the code is still
@@ -84,31 +85,33 @@ static void __ref __jump_label_transform
 	 * always nop being the 'currently valid' instruction
 	 */
 	if (init || system_state == SYSTEM_BOOTING) {
-		text_poke_early((void *)jump_entry_code(entry), &code,
+		text_poke_early((void *)jump_entry_code(entry), opcode,
 				JUMP_LABEL_NOP_SIZE);
 		return;
 	}
 
-	text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL);
+	text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL);
 }
 
-void arch_jump_label_transform(struct jump_entry *entry,
-			       enum jump_label_type type)
+static void __ref jump_label_transform(struct jump_entry *entry,
+				       enum jump_label_type type,
+				       int init)
 {
 	mutex_lock(&text_mutex);
-	__jump_label_transform(entry, type, 0);
+	__jump_label_transform(entry, type, init);
 	mutex_unlock(&text_mutex);
 }
 
-#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
-static struct text_poke_loc tp_vec[TP_VEC_MAX];
-static int tp_vec_nr;
+void arch_jump_label_transform(struct jump_entry *entry,
+			       enum jump_label_type type)
+{
+	jump_label_transform(entry, type, 0);
+}
 
 bool arch_jump_label_transform_queue(struct jump_entry *entry,
 				     enum jump_label_type type)
 {
-	struct text_poke_loc *tp;
-	void *entry_code;
+	const void *opcode;
 
 	if (system_state == SYSTEM_BOOTING) {
 		/*
@@ -118,53 +121,19 @@ bool arch_jump_label_transform_queue(str
 		return true;
 	}
 
-	/*
-	 * No more space in the vector, tell upper layer to apply
-	 * the queue before continuing.
-	 */
-	if (tp_vec_nr == TP_VEC_MAX)
-		return false;
-
-	tp = &tp_vec[tp_vec_nr];
-
-	entry_code = (void *)jump_entry_code(entry);
-
-	/*
-	 * The INT3 handler will do a bsearch in the queue, so we need entries
-	 * to be sorted. We can survive an unsorted list by rejecting the entry,
-	 * forcing the generic jump_label code to apply the queue. Warning once,
-	 * to raise the attention to the case of an unsorted entry that is
-	 * better not happen, because, in the worst case we will perform in the
-	 * same way as we do without batching - with some more overhead.
-	 */
-	if (tp_vec_nr > 0) {
-		int prev = tp_vec_nr - 1;
-		struct text_poke_loc *prev_tp = &tp_vec[prev];
-
-		if (WARN_ON_ONCE(prev_tp->addr > entry_code))
-			return false;
-	}
-
-	__jump_label_set_jump_code(entry, type,
-				   (union jump_code_union *)&tp->text, 0);
-
-	text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL);
-
-	tp_vec_nr++;
-
+	mutex_lock(&text_mutex);
+	opcode = __jump_label_set_jump_code(entry, type, 0);
+	text_poke_queue((void *)jump_entry_code(entry),
+			opcode, JUMP_LABEL_NOP_SIZE, NULL);
+	mutex_unlock(&text_mutex);
 	return true;
 }
 
 void arch_jump_label_transform_apply(void)
 {
-	if (!tp_vec_nr)
-		return;
-
 	mutex_lock(&text_mutex);
-	text_poke_bp_batch(tp_vec, tp_vec_nr);
+	text_poke_finish();
 	mutex_unlock(&text_mutex);
-
-	tp_vec_nr = 0;
 }
 
 static enum {
@@ -193,5 +162,5 @@ __init_or_module void arch_jump_label_tr
 			jlstate = JL_STATE_NO_UPDATE;
 	}
 	if (jlstate == JL_STATE_UPDATE)
-		__jump_label_transform(entry, type, 1);
+		jump_label_transform(entry, type, 1);
 }



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v3 4/6] x86/alternatives: Add and use text_gen_insn() helper
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
                     ` (2 preceding siblings ...)
  2019-10-07  8:17   ` [PATCH v3 3/6] x86/alternatives,jump_label: Provide better text_poke() batching interface Peter Zijlstra
@ 2019-10-07  8:17   ` Peter Zijlstra
  2019-10-08  6:23     ` Masami Hiramatsu
  2019-10-07  8:17   ` [PATCH v3 5/6] x86/ftrace: Use text_poke() Peter Zijlstra
                     ` (2 subsequent siblings)
  6 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:17 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Provide a simple helper function to create common instruction
encodings.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
---
 arch/x86/include/asm/text-patching.h |    2 +
 arch/x86/kernel/alternative.c        |   36 +++++++++++++++++++++++++++++++++++
 arch/x86/kernel/jump_label.c         |   31 ++++++++++--------------------
 arch/x86/kernel/kprobes/opt.c        |    7 ------
 4 files changed, 50 insertions(+), 26 deletions(-)

--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -49,6 +49,8 @@ extern void text_poke_bp(void *addr, con
 extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate);
 extern void text_poke_finish(void);
 
+extern void *text_gen_insn(u8 opcode, const void *addr, const void *dest);
+
 extern int after_bootmem;
 extern __ro_after_init struct mm_struct *poking_mm;
 extern __ro_after_init unsigned long poking_addr;
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1237,3 +1237,39 @@ void text_poke_bp(void *addr, const void
 	text_poke_loc_init(&tp, addr, opcode, len, emulate);
 	text_poke_bp_batch(&tp, 1);
 }
+
+union text_poke_insn {
+	u8 text[POKE_MAX_OPCODE_SIZE];
+	struct {
+		u8 opcode;
+		s32 disp;
+	} __attribute__((packed));
+};
+
+void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
+{
+	static union text_poke_insn insn; /* text_mutex */
+	int size = 0;
+
+	lockdep_assert_held(&text_mutex);
+
+	insn.opcode = opcode;
+
+#define __CASE(insn)	\
+	case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break
+
+	switch(opcode) {
+	__CASE(INT3);
+	__CASE(CALL);
+	__CASE(JMP32);
+	__CASE(JMP8);
+	}
+
+	if (size > 1) {
+		insn.disp = (long)dest - (long)(addr + size);
+		if (size == 2)
+			BUG_ON((insn.disp >> 31) != (insn.disp >> 7));
+	}
+
+	return &insn.text;
+}
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -16,15 +16,7 @@
 #include <asm/alternative.h>
 #include <asm/text-patching.h>
 
-union jump_code_union {
-	char code[JUMP_LABEL_NOP_SIZE];
-	struct {
-		char jump;
-		int offset;
-	} __attribute__((packed));
-};
-
-static void bug_at(unsigned char *ip, int line)
+static void bug_at(const void *ip, int line)
 {
 	/*
 	 * The location is not an op that we were expecting.
@@ -38,33 +30,32 @@ static void bug_at(unsigned char *ip, in
 static const void *
 __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
 {
-	static union jump_code_union code; /* relies on text_mutex */
 	const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
 	const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
-	const void *expect;
+	const void *expect, *code;
+	const void *addr, *dest;
 	int line;
 
-	lockdep_assert_held(&text_mutex);
+	addr = (void *)jump_entry_code(entry);
+	dest = (void *)jump_entry_target(entry);
 
-	code.jump = JMP32_INSN_OPCODE;
-	code.offset = jump_entry_target(entry) -
-		       (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
+	code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
 
 	if (init) {
 		expect = default_nop; line = __LINE__;
 	} else if (type == JUMP_LABEL_JMP) {
 		expect = ideal_nop; line = __LINE__;
 	} else {
-		expect = code.code; line = __LINE__;
+		expect = code; line = __LINE__;
 	}
 
-	if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
-		bug_at((void *)jump_entry_code(entry), line);
+	if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE))
+		bug_at(addr, line);
 
 	if (type == JUMP_LABEL_NOP)
-		memcpy(&code, ideal_nop, JUMP_LABEL_NOP_SIZE);
+		code = ideal_nop;
 
-	return &code;
+	return code;
 }
 
 static void inline __jump_label_transform(struct jump_entry *entry,
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -447,18 +447,13 @@ void arch_optimize_kprobes(struct list_h
 void arch_unoptimize_kprobe(struct optimized_kprobe *op)
 {
 	u8 insn_buff[RELATIVEJUMP_SIZE];
-	u8 emulate_buff[RELATIVEJUMP_SIZE];
 
 	/* Set int3 to first byte for kprobes */
 	insn_buff[0] = BREAKPOINT_INSTRUCTION;
 	memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
 
-	emulate_buff[0] = RELATIVEJUMP_OPCODE;
-	*(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn -
-			((long)op->kp.addr + RELATIVEJUMP_SIZE));
-
 	text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
-		     emulate_buff);
+		     text_gen_insn(JMP32_INSN_OPCODE, op->kp.addr, op->optinsn.insn));
 }
 
 /*



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
                     ` (3 preceding siblings ...)
  2019-10-07  8:17   ` [PATCH v3 4/6] x86/alternatives: Add and use text_gen_insn() helper Peter Zijlstra
@ 2019-10-07  8:17   ` Peter Zijlstra
  2019-10-08 14:43     ` Steven Rostedt
  2019-10-07  8:17   ` [PATCH v3 6/6] x86/mm: Remove set_kernel_text_r[ow]() Peter Zijlstra
  2019-10-08 15:07   ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Steven Rostedt
  6 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:17 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Move ftrace over to using the generic x86 text_poke functions; this
avoids having a second/different copy of that code around.

This also avoids ftrace violating the (new) W^X rule and avoids
fragmenting the kernel text page-tables, due to no longer having to
toggle them RW.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/ftrace.h |    2 
 arch/x86/kernel/alternative.c |    4 
 arch/x86/kernel/ftrace.c      |  630 ++++++------------------------------------
 arch/x86/kernel/traps.c       |    9 
 4 files changed, 93 insertions(+), 552 deletions(-)

--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -35,8 +35,6 @@ struct dyn_arch_ftrace {
 	/* No extra data needed for x86 */
 };
 
-int ftrace_int3_handler(struct pt_regs *regs);
-
 #define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
 
 #endif /*  CONFIG_DYNAMIC_FTRACE */
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -941,7 +941,7 @@ static struct bp_patching_desc {
 	int nr_entries;
 } bp_patching;
 
-static int patch_cmp(const void *key, const void *elt)
+static int notrace patch_cmp(const void *key, const void *elt)
 {
 	struct text_poke_loc *tp = (struct text_poke_loc *) elt;
 
@@ -953,7 +953,7 @@ static int patch_cmp(const void *key, co
 }
 NOKPROBE_SYMBOL(patch_cmp);
 
-int poke_int3_handler(struct pt_regs *regs)
+int notrace poke_int3_handler(struct pt_regs *regs)
 {
 	struct text_poke_loc *tp;
 	void *ip;
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -43,16 +43,12 @@ int ftrace_arch_code_modify_prepare(void
 	 * ftrace has it set to "read/write".
 	 */
 	mutex_lock(&text_mutex);
-	set_kernel_text_rw();
-	set_all_modules_text_rw();
 	return 0;
 }
 
 int ftrace_arch_code_modify_post_process(void)
     __releases(&text_mutex)
 {
-	set_all_modules_text_ro();
-	set_kernel_text_ro();
 	mutex_unlock(&text_mutex);
 	return 0;
 }
@@ -60,67 +56,34 @@ int ftrace_arch_code_modify_post_process
 union ftrace_code_union {
 	char code[MCOUNT_INSN_SIZE];
 	struct {
-		unsigned char op;
+		char op;
 		int offset;
 	} __attribute__((packed));
 };
 
-static int ftrace_calc_offset(long ip, long addr)
-{
-	return (int)(addr - ip);
-}
-
-static unsigned char *
-ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr)
+static const char *ftrace_text_replace(char op, unsigned long ip, unsigned long addr)
 {
 	static union ftrace_code_union calc;
 
-	calc.op		= op;
-	calc.offset	= ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
+	calc.op = op;
+	calc.offset = (int)(addr - (ip + MCOUNT_INSN_SIZE));
 
 	return calc.code;
 }
 
-static unsigned char *
-ftrace_call_replace(unsigned long ip, unsigned long addr)
-{
-	return ftrace_text_replace(0xe8, ip, addr);
-}
-
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
-{
-	return addr >= start && addr < end;
-}
-
-static unsigned long text_ip_addr(unsigned long ip)
+static const char *ftrace_nop_replace(void)
 {
-	/*
-	 * On x86_64, kernel text mappings are mapped read-only, so we use
-	 * the kernel identity mapping instead of the kernel text mapping
-	 * to modify the kernel text.
-	 *
-	 * For 32bit kernels, these mappings are same and we can use
-	 * kernel identity mapping to modify code.
-	 */
-	if (within(ip, (unsigned long)_text, (unsigned long)_etext))
-		ip = (unsigned long)__va(__pa_symbol(ip));
-
-	return ip;
+	return ideal_nops[NOP_ATOMIC5];
 }
 
-static const unsigned char *ftrace_nop_replace(void)
+static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
-	return ideal_nops[NOP_ATOMIC5];
+	return ftrace_text_replace(CALL_INSN_OPCODE, ip, addr);
 }
 
-static int
-ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
-		   unsigned const char *new_code)
+static int ftrace_verify_code(unsigned long ip, const char *old_code)
 {
-	unsigned char replaced[MCOUNT_INSN_SIZE];
-
-	ftrace_expected = old_code;
+	char cur_code[MCOUNT_INSN_SIZE];
 
 	/*
 	 * Note:
@@ -129,31 +92,38 @@ ftrace_modify_code_direct(unsigned long
 	 * Carefully read and modify the code with probe_kernel_*(), and make
 	 * sure what we read is what we expected it to be before modifying it.
 	 */
-
 	/* read the text we want to modify */
-	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+	if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
+		WARN_ON(1);
 		return -EFAULT;
+	}
 
 	/* Make sure it is what we expect it to be */
-	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+	if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) {
+		WARN_ON(1);
 		return -EINVAL;
+	}
 
-	ip = text_ip_addr(ip);
-
-	/* replace the text with the new text */
-	if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
-		return -EPERM;
+	return 0;
+}
 
-	sync_core();
+static int
+ftrace_modify_code_direct(unsigned long ip, const char *old_code,
+			  const char *new_code)
+{
+	int ret = ftrace_verify_code(ip, old_code);
+	if (ret)
+		return ret;
 
+	/* replace the text with the new text */
+	text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
 	return 0;
 }
 
-int ftrace_make_nop(struct module *mod,
-		    struct dyn_ftrace *rec, unsigned long addr)
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned const char *new, *old;
 	unsigned long ip = rec->ip;
+	const char *new, *old;
 
 	old = ftrace_call_replace(ip, addr);
 	new = ftrace_nop_replace();
@@ -167,19 +137,20 @@ int ftrace_make_nop(struct module *mod,
 	 * just modify the code directly.
 	 */
 	if (addr == MCOUNT_ADDR)
-		return ftrace_modify_code_direct(rec->ip, old, new);
-
-	ftrace_expected = NULL;
+		return ftrace_modify_code_direct(ip, old, new);
 
-	/* Normal cases use add_brk_on_nop */
+	/*
+	 * x86 overrides ftrace_replace_code -- this function will never be used
+	 * in this case.
+	 */
 	WARN_ONCE(1, "invalid use of ftrace_make_nop");
 	return -EINVAL;
 }
 
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned const char *new, *old;
 	unsigned long ip = rec->ip;
+	const char *new, *old;
 
 	old = ftrace_nop_replace();
 	new = ftrace_call_replace(ip, addr);
@@ -189,43 +160,6 @@ int ftrace_make_call(struct dyn_ftrace *
 }
 
 /*
- * The modifying_ftrace_code is used to tell the breakpoint
- * handler to call ftrace_int3_handler(). If it fails to
- * call this handler for a breakpoint added by ftrace, then
- * the kernel may crash.
- *
- * As atomic_writes on x86 do not need a barrier, we do not
- * need to add smp_mb()s for this to work. It is also considered
- * that we can not read the modifying_ftrace_code before
- * executing the breakpoint. That would be quite remarkable if
- * it could do that. Here's the flow that is required:
- *
- *   CPU-0                          CPU-1
- *
- * atomic_inc(mfc);
- * write int3s
- *				<trap-int3> // implicit (r)mb
- *				if (atomic_read(mfc))
- *					call ftrace_int3_handler()
- *
- * Then when we are finished:
- *
- * atomic_dec(mfc);
- *
- * If we hit a breakpoint that was not set by ftrace, it does not
- * matter if ftrace_int3_handler() is called or not. It will
- * simply be ignored. But it is crucial that a ftrace nop/caller
- * breakpoint is handled. No other user should ever place a
- * breakpoint on an ftrace nop/caller location. It must only
- * be done by this code.
- */
-atomic_t modifying_ftrace_code __read_mostly;
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
-		   unsigned const char *new_code);
-
-/*
  * Should never be called:
  *  As it is only called by __ftrace_replace_code() which is called by
  *  ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
@@ -237,452 +171,84 @@ int ftrace_modify_call(struct dyn_ftrace
 				 unsigned long addr)
 {
 	WARN_ON(1);
-	ftrace_expected = NULL;
 	return -EINVAL;
 }
 
-static unsigned long ftrace_update_func;
-static unsigned long ftrace_update_func_call;
-
-static int update_ftrace_func(unsigned long ip, void *new)
-{
-	unsigned char old[MCOUNT_INSN_SIZE];
-	int ret;
-
-	memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
-
-	ftrace_update_func = ip;
-	/* Make sure the breakpoints see the ftrace_update_func update */
-	smp_wmb();
-
-	/* See comment above by declaration of modifying_ftrace_code */
-	atomic_inc(&modifying_ftrace_code);
-
-	ret = ftrace_modify_code(ip, old, new);
-
-	atomic_dec(&modifying_ftrace_code);
-
-	return ret;
-}
-
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
-	unsigned long ip = (unsigned long)(&ftrace_call);
-	unsigned char *new;
-	int ret;
-
-	ftrace_update_func_call = (unsigned long)func;
-
-	new = ftrace_call_replace(ip, (unsigned long)func);
-	ret = update_ftrace_func(ip, new);
-
-	/* Also update the regs callback function */
-	if (!ret) {
-		ip = (unsigned long)(&ftrace_regs_call);
-		new = ftrace_call_replace(ip, (unsigned long)func);
-		ret = update_ftrace_func(ip, new);
-	}
-
-	return ret;
-}
-
-static nokprobe_inline int is_ftrace_caller(unsigned long ip)
-{
-	if (ip == ftrace_update_func)
-		return 1;
-
-	return 0;
-}
-
-/*
- * A breakpoint was added to the code address we are about to
- * modify, and this is the handle that will just skip over it.
- * We are either changing a nop into a trace call, or a trace
- * call to a nop. While the change is taking place, we treat
- * it just like it was a nop.
- */
-int ftrace_int3_handler(struct pt_regs *regs)
-{
 	unsigned long ip;
+	const char *new;
 
-	if (WARN_ON_ONCE(!regs))
-		return 0;
-
-	ip = regs->ip - INT3_INSN_SIZE;
-
-	if (ftrace_location(ip)) {
-		int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
-		return 1;
-	} else if (is_ftrace_caller(ip)) {
-		if (!ftrace_update_func_call) {
-			int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
-			return 1;
-		}
-		int3_emulate_call(regs, ftrace_update_func_call);
-		return 1;
-	}
-
-	return 0;
-}
-NOKPROBE_SYMBOL(ftrace_int3_handler);
-
-static int ftrace_write(unsigned long ip, const char *val, int size)
-{
-	ip = text_ip_addr(ip);
-
-	if (probe_kernel_write((void *)ip, val, size))
-		return -EPERM;
-
-	return 0;
-}
-
-static int add_break(unsigned long ip, const char *old)
-{
-	unsigned char replaced[MCOUNT_INSN_SIZE];
-	unsigned char brk = BREAKPOINT_INSTRUCTION;
-
-	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	ftrace_expected = old;
-
-	/* Make sure it is what we expect it to be */
-	if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
-		return -EINVAL;
-
-	return ftrace_write(ip, &brk, 1);
-}
-
-static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned const char *old;
-	unsigned long ip = rec->ip;
-
-	old = ftrace_call_replace(ip, addr);
-
-	return add_break(rec->ip, old);
-}
-
-
-static int add_brk_on_nop(struct dyn_ftrace *rec)
-{
-	unsigned const char *old;
-
-	old = ftrace_nop_replace();
-
-	return add_break(rec->ip, old);
-}
-
-static int add_breakpoints(struct dyn_ftrace *rec, bool enable)
-{
-	unsigned long ftrace_addr;
-	int ret;
-
-	ftrace_addr = ftrace_get_addr_curr(rec);
-
-	ret = ftrace_test_record(rec, enable);
-
-	switch (ret) {
-	case FTRACE_UPDATE_IGNORE:
-		return 0;
-
-	case FTRACE_UPDATE_MAKE_CALL:
-		/* converting nop to call */
-		return add_brk_on_nop(rec);
-
-	case FTRACE_UPDATE_MODIFY_CALL:
-	case FTRACE_UPDATE_MAKE_NOP:
-		/* converting a call to a nop */
-		return add_brk_on_call(rec, ftrace_addr);
-	}
-	return 0;
-}
-
-/*
- * On error, we need to remove breakpoints. This needs to
- * be done caefully. If the address does not currently have a
- * breakpoint, we know we are done. Otherwise, we look at the
- * remaining 4 bytes of the instruction. If it matches a nop
- * we replace the breakpoint with the nop. Otherwise we replace
- * it with the call instruction.
- */
-static int remove_breakpoint(struct dyn_ftrace *rec)
-{
-	unsigned char ins[MCOUNT_INSN_SIZE];
-	unsigned char brk = BREAKPOINT_INSTRUCTION;
-	const unsigned char *nop;
-	unsigned long ftrace_addr;
-	unsigned long ip = rec->ip;
-
-	/* If we fail the read, just give up */
-	if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	/* If this does not have a breakpoint, we are done */
-	if (ins[0] != brk)
-		return 0;
-
-	nop = ftrace_nop_replace();
-
-	/*
-	 * If the last 4 bytes of the instruction do not match
-	 * a nop, then we assume that this is a call to ftrace_addr.
-	 */
-	if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
-		/*
-		 * For extra paranoidism, we check if the breakpoint is on
-		 * a call that would actually jump to the ftrace_addr.
-		 * If not, don't touch the breakpoint, we make just create
-		 * a disaster.
-		 */
-		ftrace_addr = ftrace_get_addr_new(rec);
-		nop = ftrace_call_replace(ip, ftrace_addr);
-
-		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
-			goto update;
-
-		/* Check both ftrace_addr and ftrace_old_addr */
-		ftrace_addr = ftrace_get_addr_curr(rec);
-		nop = ftrace_call_replace(ip, ftrace_addr);
-
-		ftrace_expected = nop;
-
-		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
-			return -EINVAL;
-	}
-
- update:
-	return ftrace_write(ip, nop, 1);
-}
-
-static int add_update_code(unsigned long ip, unsigned const char *new)
-{
-	/* skip breakpoint */
-	ip++;
-	new++;
-	return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
-}
-
-static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned long ip = rec->ip;
-	unsigned const char *new;
-
-	new = ftrace_call_replace(ip, addr);
-	return add_update_code(ip, new);
-}
-
-static int add_update_nop(struct dyn_ftrace *rec)
-{
-	unsigned long ip = rec->ip;
-	unsigned const char *new;
-
-	new = ftrace_nop_replace();
-	return add_update_code(ip, new);
-}
-
-static int add_update(struct dyn_ftrace *rec, bool enable)
-{
-	unsigned long ftrace_addr;
-	int ret;
-
-	ret = ftrace_test_record(rec, enable);
-
-	ftrace_addr  = ftrace_get_addr_new(rec);
-
-	switch (ret) {
-	case FTRACE_UPDATE_IGNORE:
-		return 0;
-
-	case FTRACE_UPDATE_MODIFY_CALL:
-	case FTRACE_UPDATE_MAKE_CALL:
-		/* converting nop to call */
-		return add_update_call(rec, ftrace_addr);
-
-	case FTRACE_UPDATE_MAKE_NOP:
-		/* converting a call to a nop */
-		return add_update_nop(rec);
-	}
-
-	return 0;
-}
-
-static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned long ip = rec->ip;
-	unsigned const char *new;
-
-	new = ftrace_call_replace(ip, addr);
-
-	return ftrace_write(ip, new, 1);
-}
-
-static int finish_update_nop(struct dyn_ftrace *rec)
-{
-	unsigned long ip = rec->ip;
-	unsigned const char *new;
-
-	new = ftrace_nop_replace();
-
-	return ftrace_write(ip, new, 1);
-}
-
-static int finish_update(struct dyn_ftrace *rec, bool enable)
-{
-	unsigned long ftrace_addr;
-	int ret;
-
-	ret = ftrace_update_record(rec, enable);
-
-	ftrace_addr = ftrace_get_addr_new(rec);
-
-	switch (ret) {
-	case FTRACE_UPDATE_IGNORE:
-		return 0;
+	ip = (unsigned long)(&ftrace_call);
+	new = ftrace_call_replace(ip, (unsigned long)func);
+	text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
 
-	case FTRACE_UPDATE_MODIFY_CALL:
-	case FTRACE_UPDATE_MAKE_CALL:
-		/* converting nop to call */
-		return finish_update_call(rec, ftrace_addr);
-
-	case FTRACE_UPDATE_MAKE_NOP:
-		/* converting a call to a nop */
-		return finish_update_nop(rec);
-	}
+	ip = (unsigned long)(&ftrace_regs_call);
+	new = ftrace_call_replace(ip, (unsigned long)func);
+	text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
 
 	return 0;
 }
 
-static void do_sync_core(void *data)
-{
-	sync_core();
-}
-
-static void run_sync(void)
-{
-	int enable_irqs;
-
-	/* No need to sync if there's only one CPU */
-	if (num_online_cpus() == 1)
-		return;
-
-	enable_irqs = irqs_disabled();
-
-	/* We may be called with interrupts disabled (on bootup). */
-	if (enable_irqs)
-		local_irq_enable();
-	on_each_cpu(do_sync_core, NULL, 1);
-	if (enable_irqs)
-		local_irq_disable();
-}
-
 void ftrace_replace_code(int enable)
 {
 	struct ftrace_rec_iter *iter;
 	struct dyn_ftrace *rec;
-	const char *report = "adding breakpoints";
-	int count = 0;
+	const char *new, *old;
 	int ret;
 
 	for_ftrace_rec_iter(iter) {
 		rec = ftrace_rec_iter_record(iter);
 
-		ret = add_breakpoints(rec, enable);
-		if (ret)
-			goto remove_breakpoints;
-		count++;
-	}
-
-	run_sync();
-
-	report = "updating code";
-	count = 0;
-
-	for_ftrace_rec_iter(iter) {
-		rec = ftrace_rec_iter_record(iter);
-
-		ret = add_update(rec, enable);
-		if (ret)
-			goto remove_breakpoints;
-		count++;
+		switch (ftrace_test_record(rec, enable)) {
+		case FTRACE_UPDATE_IGNORE:
+		default:
+			continue;
+
+		case FTRACE_UPDATE_MAKE_CALL:
+			old = ftrace_nop_replace();
+			break;
+
+		case FTRACE_UPDATE_MODIFY_CALL:
+		case FTRACE_UPDATE_MAKE_NOP:
+			old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec));
+			break;
+		}
+
+		ret = ftrace_verify_code(rec->ip, old);
+		if (ret) {
+			ftrace_bug(ret, rec);
+			return;
+		}
 	}
 
-	run_sync();
-
-	report = "removing breakpoints";
-	count = 0;
-
 	for_ftrace_rec_iter(iter) {
 		rec = ftrace_rec_iter_record(iter);
 
-		ret = finish_update(rec, enable);
-		if (ret)
-			goto remove_breakpoints;
-		count++;
-	}
-
-	run_sync();
-
-	return;
+		switch (ftrace_test_record(rec, enable)) {
+		case FTRACE_UPDATE_IGNORE:
+		default:
+			continue;
+
+		case FTRACE_UPDATE_MAKE_CALL:
+		case FTRACE_UPDATE_MODIFY_CALL:
+			new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec));
+			break;
+
+		case FTRACE_UPDATE_MAKE_NOP:
+			new = ftrace_nop_replace();
+			break;
+		}
 
- remove_breakpoints:
-	pr_warn("Failed on %s (%d):\n", report, count);
-	ftrace_bug(ret, rec);
-	for_ftrace_rec_iter(iter) {
-		rec = ftrace_rec_iter_record(iter);
-		/*
-		 * Breakpoints are handled only when this function is in
-		 * progress. The system could not work with them.
-		 */
-		if (remove_breakpoint(rec))
-			BUG();
+		text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+		ftrace_update_record(rec, enable);
 	}
-	run_sync();
-}
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
-		   unsigned const char *new_code)
-{
-	int ret;
-
-	ret = add_break(ip, old_code);
-	if (ret)
-		goto out;
-
-	run_sync();
-
-	ret = add_update_code(ip, new_code);
-	if (ret)
-		goto fail_update;
-
-	run_sync();
-
-	ret = ftrace_write(ip, new_code, 1);
-	/*
-	 * The breakpoint is handled only when this function is in progress.
-	 * The system could not work if we could not remove it.
-	 */
-	BUG_ON(ret);
- out:
-	run_sync();
-	return ret;
-
- fail_update:
-	/* Also here the system could not work with the breakpoint */
-	if (ftrace_write(ip, old_code, 1))
-		BUG();
-	goto out;
+	text_poke_finish();
 }
 
 void arch_ftrace_update_code(int command)
 {
-	/* See comment above by declaration of modifying_ftrace_code */
-	atomic_inc(&modifying_ftrace_code);
-
 	ftrace_modify_all_code(command);
-
-	atomic_dec(&modifying_ftrace_code);
 }
 
 int __init ftrace_dyn_arch_init(void)
@@ -828,11 +394,7 @@ create_trampoline(struct ftrace_ops *ops
 
 	set_vm_flush_reset_perms(trampoline);
 
-	/*
-	 * Module allocation needs to be completed by making the page
-	 * executable. The page is still writable, which is a security hazard,
-	 * but anyhow ftrace breaks W^X completely.
-	 */
+	set_memory_ro((unsigned long)trampoline, npages);
 	set_memory_x((unsigned long)trampoline, npages);
 	return (unsigned long)trampoline;
 fail:
@@ -859,11 +421,10 @@ static unsigned long calc_trampoline_cal
 void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 {
 	ftrace_func_t func;
-	unsigned char *new;
 	unsigned long offset;
 	unsigned long ip;
 	unsigned int size;
-	int ret, npages;
+	const char *new;
 
 	if (ops->trampoline) {
 		/*
@@ -872,14 +433,11 @@ void arch_ftrace_update_trampoline(struc
 		 */
 		if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
 			return;
-		npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
-		set_memory_rw(ops->trampoline, npages);
 	} else {
 		ops->trampoline = create_trampoline(ops, &size);
 		if (!ops->trampoline)
 			return;
 		ops->trampoline_size = size;
-		npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	}
 
 	offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
@@ -887,15 +445,11 @@ void arch_ftrace_update_trampoline(struc
 
 	func = ftrace_ops_get_func(ops);
 
-	ftrace_update_func_call = (unsigned long)func;
-
+	mutex_lock(&text_mutex);
 	/* Do a safe modify in case the trampoline is executing */
 	new = ftrace_call_replace(ip, (unsigned long)func);
-	ret = update_ftrace_func(ip, new);
-	set_memory_ro(ops->trampoline, npages);
-
-	/* The update should never fail */
-	WARN_ON(ret);
+	text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+	mutex_unlock(&text_mutex);
 }
 
 /* Return the address of the function the trampoline calls */
@@ -981,19 +535,18 @@ void arch_ftrace_trampoline_free(struct
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern void ftrace_graph_call(void);
 
-static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
 {
-	return ftrace_text_replace(0xe9, ip, addr);
+	return ftrace_text_replace(JMP32_INSN_OPCODE, ip, addr);
 }
 
 static int ftrace_mod_jmp(unsigned long ip, void *func)
 {
-	unsigned char *new;
+	const char *new;
 
-	ftrace_update_func_call = 0UL;
 	new = ftrace_jmp_replace(ip, (unsigned long)func);
-
-	return update_ftrace_func(ip, new);
+	text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); // BATCH
+	return 0;
 }
 
 int ftrace_enable_ftrace_graph_caller(void)
@@ -1019,10 +572,9 @@ int ftrace_disable_ftrace_graph_caller(v
 void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
 			   unsigned long frame_pointer)
 {
+	unsigned long return_hooker = (unsigned long)&return_to_handler;
 	unsigned long old;
 	int faulted;
-	unsigned long return_hooker = (unsigned long)
-				&return_to_handler;
 
 	/*
 	 * When resuming from suspend-to-ram, this function can be indirectly
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -568,15 +568,6 @@ NOKPROBE_SYMBOL(do_general_protection);
 
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
-#ifdef CONFIG_DYNAMIC_FTRACE
-	/*
-	 * ftrace must be first, everything else may cause a recursive crash.
-	 * See note by declaration of modifying_ftrace_code in ftrace.c
-	 */
-	if (unlikely(atomic_read(&modifying_ftrace_code)) &&
-	    ftrace_int3_handler(regs))
-		return;
-#endif
 	if (poke_int3_handler(regs))
 		return;
 



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v3 6/6] x86/mm: Remove set_kernel_text_r[ow]()
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
                     ` (4 preceding siblings ...)
  2019-10-07  8:17   ` [PATCH v3 5/6] x86/ftrace: Use text_poke() Peter Zijlstra
@ 2019-10-07  8:17   ` Peter Zijlstra
  2019-10-08 15:07   ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Steven Rostedt
  6 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:17 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

With the last and only user of these functions gone (ftrace) remove
them as well to avoid ever growing new users.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/set_memory.h |    2 --
 arch/x86/mm/init_32.c             |   28 ----------------------------
 arch/x86/mm/init_64.c             |   36 ------------------------------------
 3 files changed, 66 deletions(-)

--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -80,8 +80,6 @@ int set_direct_map_invalid_noflush(struc
 int set_direct_map_default_noflush(struct page *page);
 
 extern int kernel_set_to_readonly;
-void set_kernel_text_rw(void);
-void set_kernel_text_ro(void);
 
 #ifdef CONFIG_X86_64
 static inline int set_mce_nospec(unsigned long pfn)
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -874,34 +874,6 @@ void arch_remove_memory(int nid, u64 sta
 
 int kernel_set_to_readonly __read_mostly;
 
-void set_kernel_text_rw(void)
-{
-	unsigned long start = PFN_ALIGN(_text);
-	unsigned long size = PFN_ALIGN(_etext) - start;
-
-	if (!kernel_set_to_readonly)
-		return;
-
-	pr_debug("Set kernel text: %lx - %lx for read write\n",
-		 start, start+size);
-
-	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
-}
-
-void set_kernel_text_ro(void)
-{
-	unsigned long start = PFN_ALIGN(_text);
-	unsigned long size = PFN_ALIGN(_etext) - start;
-
-	if (!kernel_set_to_readonly)
-		return;
-
-	pr_debug("Set kernel text: %lx - %lx for read only\n",
-		 start, start+size);
-
-	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-}
-
 static void mark_nxdata_nx(void)
 {
 	/*
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1260,42 +1260,6 @@ void __init mem_init(void)
 
 int kernel_set_to_readonly;
 
-void set_kernel_text_rw(void)
-{
-	unsigned long start = PFN_ALIGN(_text);
-	unsigned long end = PFN_ALIGN(__stop___ex_table);
-
-	if (!kernel_set_to_readonly)
-		return;
-
-	pr_debug("Set kernel text: %lx - %lx for read write\n",
-		 start, end);
-
-	/*
-	 * Make the kernel identity mapping for text RW. Kernel text
-	 * mapping will always be RO. Refer to the comment in
-	 * static_protections() in pageattr.c
-	 */
-	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
-}
-
-void set_kernel_text_ro(void)
-{
-	unsigned long start = PFN_ALIGN(_text);
-	unsigned long end = PFN_ALIGN(__stop___ex_table);
-
-	if (!kernel_set_to_readonly)
-		return;
-
-	pr_debug("Set kernel text: %lx - %lx for read only\n",
-		 start, end);
-
-	/*
-	 * Set the kernel identity mapping for text RO.
-	 */
-	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
-}
-
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 0/4] Propagate module notifier errors
  2019-10-07  9:02 [RESEND] everything text-poke: ftrace, modules, static_call and jump_label Peter Zijlstra
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
@ 2019-10-07  8:25 ` Peter Zijlstra
  2019-10-07  8:25   ` [PATCH v2 1/4] notifier: Fix broken error handling pattern Peter Zijlstra
                     ` (3 more replies)
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
  3 siblings, 4 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:25 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

These patches came from the desire to propagate MODULE_STATE_COMING errors.
While looking at that I spotted fail with a number of module notifiers
themselves and with the whole notification business.

I will probably add a notifier for MODULE_STATE_UNFORMED, but that is later.

Fixing this error propagation fixes a fairly serious (but really uncommon) bug
when memory allocation fails for jump-labels while loading a module. And since
we're going to be introducing more of that (static_call), fix it now.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 1/4] notifier: Fix broken error handling pattern
  2019-10-07  8:25 ` [PATCH v2 0/4] Propagate module notifier errors Peter Zijlstra
@ 2019-10-07  8:25   ` Peter Zijlstra
  2019-10-10 22:01     ` Rafael J. Wysocki
  2019-10-07  8:25   ` [PATCH v2 2/4] module: Fix up module_notifier return values Peter Zijlstra
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:25 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	Pavel Machek, Alexios Zavras, Allison Randal, Sam Protsenko,
	Rafael J. Wysocki, Andrew Morton, Todd Brandt, Vasily Averin,
	Len Brown, Greg Kroah-Hartman

The current notifiers have the following error handling pattern all
over the place:

	int err, nr;

	err = __foo_notifier_call_chain(&chain, val_up, v, -1, &nr);
	if (err & NOTIFIER_STOP_MASK)
		__foo_notifier_call_chain(&chain, val_down, v, nr-1, NULL)

And aside from the endless repetition thereof, it is broken. Consider
blocking notifiers; both calls take and drop the rwsem, this means
that the notifier list can change in between the two calls, making @nr
meaningless.

Fix this by replacing all the __foo_notifier_call_chain() functions
with foo_notifier_call_chain_robust() that embeds the above pattern,
but ensures it is inside a single lock region.

Note: I switched atomic_notifier_call_chain_robust() to use
      the spinlock, since RCU cannot provide the guarantee
      required for the recovery.

Note: software_resume() error handling was broken afaict.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Alexios Zavras <alexios.zavras@intel.com>
Cc: Allison Randal <allison@lohutok.net>
Cc: Sam Protsenko <semen.protsenko@linaro.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Todd Brandt <todd.e.brandt@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Averin <vvs@virtuozzo.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/notifier.h           |   15 +--
 kernel/cpu_pm.c                    |   46 ++++-------
 kernel/notifier.c                  |  144 ++++++++++++++++++++++---------------
 kernel/power/hibernate.c           |   26 +++---
 kernel/power/main.c                |    8 +-
 kernel/power/power.h               |    3 
 kernel/power/suspend.c             |   14 +--
 kernel/power/user.c                |   14 +--
 tools/power/pm-graph/sleepgraph.py |    2 
 9 files changed, 139 insertions(+), 133 deletions(-)

--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -165,20 +165,19 @@ extern int srcu_notifier_chain_unregiste
 
 extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
 		unsigned long val, void *v);
-extern int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
-	unsigned long val, void *v, int nr_to_call, int *nr_calls);
 extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
 		unsigned long val, void *v);
-extern int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
-	unsigned long val, void *v, int nr_to_call, int *nr_calls);
 extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
 		unsigned long val, void *v);
-extern int __raw_notifier_call_chain(struct raw_notifier_head *nh,
-	unsigned long val, void *v, int nr_to_call, int *nr_calls);
 extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 		unsigned long val, void *v);
-extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
-	unsigned long val, void *v, int nr_to_call, int *nr_calls);
+
+extern int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
+		unsigned long val_up, unsigned long val_down, void *v);
+extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
+		unsigned long val_up, unsigned long val_down, void *v);
+extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
+		unsigned long val_up, unsigned long val_down, void *v);
 
 #define NOTIFY_DONE		0x0000		/* Don't care */
 #define NOTIFY_OK		0x0001		/* Suits me */
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -15,23 +15,31 @@
 
 static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
 
-static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+static int cpu_pm_notify(enum cpu_pm_event event)
 {
 	int ret;
 
 	/*
-	 * __atomic_notifier_call_chain has a RCU read critical section, which
+	 * atomic_notifier_call_chain has a RCU read critical section, which
 	 * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
 	 * RCU know this.
 	 */
 	rcu_irq_enter_irqson();
-	ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
-		nr_to_call, nr_calls);
+	ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
 	rcu_irq_exit_irqson();
 
 	return notifier_to_errno(ret);
 }
 
+static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
+{
+	int ret;
+
+	ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
+
+	return notifier_to_errno(ret);
+}
+
 /**
  * cpu_pm_register_notifier - register a driver with cpu_pm
  * @nb: notifier block to register
@@ -80,18 +88,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_noti
  */
 int cpu_pm_enter(void)
 {
-	int nr_calls;
-	int ret = 0;
-
-	ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
-	if (ret)
-		/*
-		 * Inform listeners (nr_calls - 1) about failure of CPU PM
-		 * PM entry who are notified earlier to prepare for it.
-		 */
-		cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
-
-	return ret;
+	return cpu_pm_notify_robust(CPU_PM_ENTER, CPU_PM_ENTER_FAILED);
 }
 EXPORT_SYMBOL_GPL(cpu_pm_enter);
 
@@ -109,7 +106,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
  */
 int cpu_pm_exit(void)
 {
-	return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
+	return cpu_pm_notify(CPU_PM_EXIT);
 }
 EXPORT_SYMBOL_GPL(cpu_pm_exit);
 
@@ -131,18 +128,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
  */
 int cpu_cluster_pm_enter(void)
 {
-	int nr_calls;
-	int ret = 0;
-
-	ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
-	if (ret)
-		/*
-		 * Inform listeners (nr_calls - 1) about failure of CPU cluster
-		 * PM entry who are notified earlier to prepare for it.
-		 */
-		cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
-
-	return ret;
+	return cpu_pm_notify_robust(CPU_CLUSTER_PM_ENTER, CPU_CLUSTER_PM_ENTER_FAILED);
 }
 EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
 
@@ -163,7 +149,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
  */
 int cpu_cluster_pm_exit(void)
 {
-	return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
+	return cpu_pm_notify(CPU_CLUSTER_PM_EXIT);
 }
 EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
 
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -106,6 +106,34 @@ static int notifier_call_chain(struct no
 }
 NOKPROBE_SYMBOL(notifier_call_chain);
 
+/**
+ * notifier_call_chain_robust - Inform the registered notifiers about an event
+ *                              and rollback on error.
+ * @nl:		Pointer to head of the blocking notifier chain
+ * @val_up:	Value passed unmodified to the notifier function
+ * @val_down:	Value passed unmodified to the notifier function when recovering
+ *              from an error on @val_up
+ * @v		Pointer passed unmodified to the notifier function
+ *
+ * NOTE:	It is important the @nl chain doesn't change between the two
+ *		invocations of notifier_call_chain() such that we visit the
+ *		exact same notifier callbacks; this rules out any RCU usage.
+ *
+ * Returns:	the return value of the @val_up call.
+ */
+static int notifier_call_chain_robust(struct notifier_block **nl,
+				     unsigned long val_up, unsigned long val_down,
+				     void *v)
+{
+	int ret, nr = 0;
+
+	ret = notifier_call_chain(nl, val_up, v, -1, &nr);
+	if (ret & NOTIFY_STOP_MASK)
+		notifier_call_chain(nl, val_down, v, nr-1, NULL);
+
+	return ret;
+}
+
 /*
  *	Atomic notifier chain routines.  Registration and unregistration
  *	use a spinlock, and call_chain is synchronized by RCU (no locks).
@@ -156,13 +184,30 @@ int atomic_notifier_chain_unregister(str
 }
 EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
 
+int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
+		unsigned long val_up, unsigned long val_down, void *v)
+{
+	unsigned long flags;
+	int ret;
+
+	/*
+	 * Musn't use RCU; because then the notifier list can
+	 * change between the up and down traversal.
+	 */
+	spin_lock_irqsave(&nh->lock, flags);
+	ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+	spin_unlock_irqrestore(&nh->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust);
+NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust);
+
 /**
- *	__atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ *	atomic_notifier_call_chain - Call functions in an atomic notifier chain
  *	@nh: Pointer to head of the atomic notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
- *	@nr_to_call: See the comment for notifier_call_chain.
- *	@nr_calls: See the comment for notifier_call_chain.
  *
  *	Calls each function in a notifier chain in turn.  The functions
  *	run in an atomic context, so they must not block.
@@ -175,24 +220,16 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_
  *	Otherwise the return value is the return value
  *	of the last notifier function called.
  */
-int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
-				 unsigned long val, void *v,
-				 int nr_to_call, int *nr_calls)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+			       unsigned long val, void *v)
 {
 	int ret;
 
 	rcu_read_lock();
-	ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+	ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
 	rcu_read_unlock();
-	return ret;
-}
-EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
-NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
 
-int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
-			       unsigned long val, void *v)
-{
-	return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
 NOKPROBE_SYMBOL(atomic_notifier_call_chain);
@@ -285,13 +322,30 @@ int blocking_notifier_chain_unregister(s
 }
 EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
 
+int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
+		unsigned long val_up, unsigned long val_down, void *v)
+{
+	int ret = NOTIFY_DONE;
+
+	/*
+	 * We check the head outside the lock, but if this access is
+	 * racy then it does not matter what the result of the test
+	 * is, we re-check the list after having taken the lock anyway:
+	 */
+	if (rcu_access_pointer(nh->head)) {
+		down_read(&nh->rwsem);
+		ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+		up_read(&nh->rwsem);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);
+
 /**
- *	__blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ *	blocking_notifier_call_chain - Call functions in a blocking notifier chain
  *	@nh: Pointer to head of the blocking notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
- *	@nr_to_call: See comment for notifier_call_chain.
- *	@nr_calls: See comment for notifier_call_chain.
  *
  *	Calls each function in a notifier chain in turn.  The functions
  *	run in a process context, so they are allowed to block.
@@ -303,9 +357,8 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chai
  *	Otherwise the return value is the return value
  *	of the last notifier function called.
  */
-int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
-				   unsigned long val, void *v,
-				   int nr_to_call, int *nr_calls)
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+		unsigned long val, void *v)
 {
 	int ret = NOTIFY_DONE;
 
@@ -316,19 +369,11 @@ int __blocking_notifier_call_chain(struc
 	 */
 	if (rcu_access_pointer(nh->head)) {
 		down_read(&nh->rwsem);
-		ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
-					nr_calls);
+		ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
 		up_read(&nh->rwsem);
 	}
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
-
-int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
-		unsigned long val, void *v)
-{
-	return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
-}
 EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
 
 /*
@@ -370,13 +415,18 @@ int raw_notifier_chain_unregister(struct
 }
 EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
 
+int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
+		unsigned long val_up, unsigned long val_down, void *v)
+{
+	return notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);
+
 /**
- *	__raw_notifier_call_chain - Call functions in a raw notifier chain
+ *	raw_notifier_call_chain - Call functions in a raw notifier chain
  *	@nh: Pointer to head of the raw notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
- *	@nr_to_call: See comment for notifier_call_chain.
- *	@nr_calls: See comment for notifier_call_chain
  *
  *	Calls each function in a notifier chain in turn.  The functions
  *	run in an undefined context.
@@ -389,18 +439,10 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unr
  *	Otherwise the return value is the return value
  *	of the last notifier function called.
  */
-int __raw_notifier_call_chain(struct raw_notifier_head *nh,
-			      unsigned long val, void *v,
-			      int nr_to_call, int *nr_calls)
-{
-	return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
-}
-EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
-
 int raw_notifier_call_chain(struct raw_notifier_head *nh,
 		unsigned long val, void *v)
 {
-	return __raw_notifier_call_chain(nh, val, v, -1, NULL);
+	return notifier_call_chain(&nh->head, val, v, -1, NULL);
 }
 EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
 
@@ -472,12 +514,10 @@ int srcu_notifier_chain_unregister(struc
 EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
 
 /**
- *	__srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ *	srcu_notifier_call_chain - Call functions in an SRCU notifier chain
  *	@nh: Pointer to head of the SRCU notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
- *	@nr_to_call: See comment for notifier_call_chain.
- *	@nr_calls: See comment for notifier_call_chain
  *
  *	Calls each function in a notifier chain in turn.  The functions
  *	run in a process context, so they are allowed to block.
@@ -489,25 +529,17 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_un
  *	Otherwise the return value is the return value
  *	of the last notifier function called.
  */
-int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
-			       unsigned long val, void *v,
-			       int nr_to_call, int *nr_calls)
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+		unsigned long val, void *v)
 {
 	int ret;
 	int idx;
 
 	idx = srcu_read_lock(&nh->srcu);
-	ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+	ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
 	srcu_read_unlock(&nh->srcu, idx);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
-
-int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
-		unsigned long val, void *v)
-{
-	return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
-}
 EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
 
 /**
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -693,8 +693,8 @@ static int load_image_and_restore(void)
  */
 int hibernate(void)
 {
-	int error, nr_calls = 0;
 	bool snapshot_test = false;
+	int error;
 
 	if (!hibernation_available()) {
 		pm_pr_dbg("Hibernation not available.\n");
@@ -710,11 +710,9 @@ int hibernate(void)
 
 	pr_info("hibernation entry\n");
 	pm_prepare_console();
-	error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
-	if (error) {
-		nr_calls--;
-		goto Exit;
-	}
+	error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
+	if (error)
+		goto Restore;
 
 	ksys_sync_helper();
 
@@ -772,7 +770,8 @@ int hibernate(void)
 	/* Don't bother checking whether freezer_test_done is true */
 	freezer_test_done = false;
  Exit:
-	__pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
+	pm_notifier_call_chain(PM_POST_HIBERNATION);
+ Restore:
 	pm_restore_console();
 	atomic_inc(&snapshot_device_available);
  Unlock:
@@ -800,7 +799,7 @@ int hibernate(void)
  */
 static int software_resume(void)
 {
-	int error, nr_calls = 0;
+	int error;
 
 	/*
 	 * If the user said "noresume".. bail out early.
@@ -887,11 +886,9 @@ static int software_resume(void)
 
 	pr_info("resume from hibernation\n");
 	pm_prepare_console();
-	error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
-	if (error) {
-		nr_calls--;
-		goto Close_Finish;
-	}
+	error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
+	if (error)
+		goto Restore;
 
 	pm_pr_dbg("Preparing processes for restore.\n");
 	error = freeze_processes();
@@ -900,7 +897,8 @@ static int software_resume(void)
 	error = load_image_and_restore();
 	thaw_processes();
  Finish:
-	__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
+	pm_notifier_call_chain(PM_POST_RESTORE);
+ Restore:
 	pm_restore_console();
 	pr_info("resume from hibernation failed (%d)\n", error);
 	atomic_inc(&snapshot_device_available);
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -79,18 +79,18 @@ int unregister_pm_notifier(struct notifi
 }
 EXPORT_SYMBOL_GPL(unregister_pm_notifier);
 
-int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
+int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
 {
 	int ret;
 
-	ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
-						nr_to_call, nr_calls);
+	ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL);
 
 	return notifier_to_errno(ret);
 }
+
 int pm_notifier_call_chain(unsigned long val)
 {
-	return __pm_notifier_call_chain(val, -1, NULL);
+	return blocking_notifier_call_chain(&pm_chain_head, val, NULL);
 }
 
 /* If set, devices may be suspended and resumed asynchronously. */
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -210,8 +210,7 @@ static inline void suspend_test_finish(c
 
 #ifdef CONFIG_PM_SLEEP
 /* kernel/power/main.c */
-extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
-				    int *nr_calls);
+extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
 extern int pm_notifier_call_chain(unsigned long val);
 #endif
 
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -352,18 +352,16 @@ static int suspend_test(int level)
  */
 static int suspend_prepare(suspend_state_t state)
 {
-	int error, nr_calls = 0;
+	int error;
 
 	if (!sleep_state_supported(state))
 		return -EPERM;
 
 	pm_prepare_console();
 
-	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
-	if (error) {
-		nr_calls--;
-		goto Finish;
-	}
+	error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND);
+	if (error)
+		goto Restore;
 
 	trace_suspend_resume(TPS("freeze_processes"), 0, true);
 	error = suspend_freeze_processes();
@@ -373,8 +371,8 @@ static int suspend_prepare(suspend_state
 
 	suspend_stats.failed_freeze++;
 	dpm_save_failed_step(SUSPEND_FREEZE);
- Finish:
-	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
+	pm_notifier_call_chain(PM_POST_SUSPEND);
+ Restore:
 	pm_restore_console();
 	return error;
 }
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -44,7 +44,7 @@ atomic_t snapshot_device_available = ATO
 static int snapshot_open(struct inode *inode, struct file *filp)
 {
 	struct snapshot_data *data;
-	int error, nr_calls = 0;
+	int error;
 
 	if (!hibernation_available())
 		return -EPERM;
@@ -71,9 +71,7 @@ static int snapshot_open(struct inode *i
 			swap_type_of(swsusp_resume_device, 0, NULL) : -1;
 		data->mode = O_RDONLY;
 		data->free_bitmaps = false;
-		error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
-		if (error)
-			__pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
+		error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
 	} else {
 		/*
 		 * Resuming.  We may need to wait for the image device to
@@ -83,15 +81,11 @@ static int snapshot_open(struct inode *i
 
 		data->swap = -1;
 		data->mode = O_WRONLY;
-		error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+		error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
 		if (!error) {
 			error = create_basic_memory_bitmaps();
 			data->free_bitmaps = !error;
-		} else
-			nr_calls--;
-
-		if (error)
-			__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
+		}
 	}
 	if (error)
 		atomic_inc(&snapshot_device_available);
--- a/tools/power/pm-graph/sleepgraph.py
+++ b/tools/power/pm-graph/sleepgraph.py
@@ -153,7 +153,7 @@ import base64
 	tracefuncs = {
 		'sys_sync': {},
 		'ksys_sync': {},
-		'__pm_notifier_call_chain': {},
+		'pm_notifier_call_chain_robust': {},
 		'pm_prepare_console': {},
 		'pm_notifier_call_chain': {},
 		'freeze_processes': {},



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 2/4] module: Fix up module_notifier return values.
  2019-10-07  8:25 ` [PATCH v2 0/4] Propagate module notifier errors Peter Zijlstra
  2019-10-07  8:25   ` [PATCH v2 1/4] notifier: Fix broken error handling pattern Peter Zijlstra
@ 2019-10-07  8:25   ` Peter Zijlstra
  2019-10-07  8:25   ` [PATCH v2 3/4] module: Properly propagate MODULE_STATE_COMING failure Peter Zijlstra
  2019-10-07  8:25   ` [PATCH v2 4/4] jump_label,module: Fix module lifetime for __jump_label_mod_text_reserved Peter Zijlstra
  3 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:25 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	Mathieu Desnoyers, Joel Fernandes (Google),
	Robert Richter, Paul E. McKenney, Yonghong Song,
	Alexei Starovoitov, Ingo Molnar, oprofile-list, Daniel Borkmann,
	Song Liu, Martin KaFai Lau

While auditing all module notifiers I noticed a whole bunch of fail
wrt the return value. Notifiers have a 'special' return semantics.

As is; NOTIFY_DONE vs NOTIFY_OK is a bit vague; but
notifier_from_errno(0) results in NOTIFY_OK and NOTIFY_DONE has a
comment that says "Don't care".

>From this I've used NOTIFY_DONE when the function completely ignores
the callback and notifier_to_error() isn't used.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Robert Richter <rric@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: "Paul E. McKenney" <paulmck@linux.ibm.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: oprofile-list@lists.sf.net
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Song Liu <songliubraving@fb.com>
Cc: Martin KaFai Lau <kafai@fb.com>
---
 drivers/oprofile/buffer_sync.c |    4 ++--
 kernel/trace/bpf_trace.c       |    8 ++++++--
 kernel/trace/trace.c           |    2 +-
 kernel/trace/trace_events.c    |    2 +-
 kernel/trace/trace_printk.c    |    4 ++--
 kernel/tracepoint.c            |    2 +-
 6 files changed, 13 insertions(+), 9 deletions(-)

--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -116,7 +116,7 @@ module_load_notify(struct notifier_block
 {
 #ifdef CONFIG_MODULES
 	if (val != MODULE_STATE_COMING)
-		return 0;
+		return NOTIFY_DONE;
 
 	/* FIXME: should we process all CPU buffers ? */
 	mutex_lock(&buffer_mutex);
@@ -124,7 +124,7 @@ module_load_notify(struct notifier_block
 	add_event_entry(MODULE_LOADED_CODE);
 	mutex_unlock(&buffer_mutex);
 #endif
-	return 0;
+	return NOTIFY_OK;
 }
 
 
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1451,10 +1451,11 @@ static int bpf_event_notify(struct notif
 {
 	struct bpf_trace_module *btm, *tmp;
 	struct module *mod = module;
+	int ret = 0;
 
 	if (mod->num_bpf_raw_events == 0 ||
 	    (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
-		return 0;
+		goto out;
 
 	mutex_lock(&bpf_module_mutex);
 
@@ -1464,6 +1465,8 @@ static int bpf_event_notify(struct notif
 		if (btm) {
 			btm->module = module;
 			list_add(&btm->list, &bpf_trace_modules);
+		} else {
+			ret = -ENOMEM;
 		}
 		break;
 	case MODULE_STATE_GOING:
@@ -1479,7 +1482,8 @@ static int bpf_event_notify(struct notif
 
 	mutex_unlock(&bpf_module_mutex);
 
-	return 0;
+out:
+	return notifier_from_errno(ret);
 }
 
 static struct notifier_block bpf_module_nb = {
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8696,7 +8696,7 @@ static int trace_module_notify(struct no
 		break;
 	}
 
-	return 0;
+	return NOTIFY_OK;
 }
 
 static struct notifier_block trace_module_nb = {
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2442,7 +2442,7 @@ static int trace_module_notify(struct no
 	mutex_unlock(&trace_types_lock);
 	mutex_unlock(&event_mutex);
 
-	return 0;
+	return NOTIFY_OK;
 }
 
 static struct notifier_block trace_module_nb = {
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -95,7 +95,7 @@ static int module_trace_bprintk_format_n
 		if (val == MODULE_STATE_COMING)
 			hold_module_trace_bprintk_format(start, end);
 	}
-	return 0;
+	return NOTIFY_OK;
 }
 
 /*
@@ -173,7 +173,7 @@ __init static int
 module_trace_bprintk_format_notify(struct notifier_block *self,
 		unsigned long val, void *data)
 {
-	return 0;
+	return NOTIFY_OK;
 }
 static inline const char **
 find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -521,7 +521,7 @@ static int tracepoint_module_notify(stru
 	case MODULE_STATE_UNFORMED:
 		break;
 	}
-	return ret;
+	return notifier_from_errno(ret);
 }
 
 static struct notifier_block tracepoint_module_nb = {



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 3/4] module: Properly propagate MODULE_STATE_COMING failure
  2019-10-07  8:25 ` [PATCH v2 0/4] Propagate module notifier errors Peter Zijlstra
  2019-10-07  8:25   ` [PATCH v2 1/4] notifier: Fix broken error handling pattern Peter Zijlstra
  2019-10-07  8:25   ` [PATCH v2 2/4] module: Fix up module_notifier return values Peter Zijlstra
@ 2019-10-07  8:25   ` Peter Zijlstra
  2019-10-08 13:08     ` Miroslav Benes
  2019-10-07  8:25   ` [PATCH v2 4/4] jump_label,module: Fix module lifetime for __jump_label_mod_text_reserved Peter Zijlstra
  3 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:25 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	Yonghong Song, Alexei Starovoitov, Daniel Borkmann, Song Liu,
	Jessica Yu, Martin KaFai Lau

Now that notifiers got unbroken; use the proper interface to handle
notifier errors and propagate them.

There were already MODULE_STATE_COMING notifiers that failed; notably:

 - jump_label_module_notifier()
 - tracepoint_module_notify()
 - bpf_event_notify()

By propagating this error, we fix those users.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Song Liu <songliubraving@fb.com>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
---
 kernel/module.c |   10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3751,9 +3751,13 @@ static int prepare_coming_module(struct
 	if (err)
 		return err;
 
-	blocking_notifier_call_chain(&module_notify_list,
-				     MODULE_STATE_COMING, mod);
-	return 0;
+	err = blocking_notifier_call_chain_robust(&module_notify_list,
+			MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
+	err = notifier_to_errno(err);
+	if (err)
+		klp_module_going(mod);
+
+	return err;
 }
 
 static int unknown_module_param_cb(char *param, char *val, const char *modname,



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 4/4] jump_label,module: Fix module lifetime for __jump_label_mod_text_reserved
  2019-10-07  8:25 ` [PATCH v2 0/4] Propagate module notifier errors Peter Zijlstra
                     ` (2 preceding siblings ...)
  2019-10-07  8:25   ` [PATCH v2 3/4] module: Properly propagate MODULE_STATE_COMING failure Peter Zijlstra
@ 2019-10-07  8:25   ` Peter Zijlstra
  3 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:25 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Nothing ensures the module exists while we're iterating
mod->jump_entries in __jump_label_mod_text_reserved(), take a module
reference to ensure the module sticks around.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/jump_label.c |   10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -539,19 +539,25 @@ static void static_key_set_mod(struct st
 static int __jump_label_mod_text_reserved(void *start, void *end)
 {
 	struct module *mod;
+	int ret;
 
 	preempt_disable();
 	mod = __module_text_address((unsigned long)start);
 	WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+	if (!try_module_get(mod))
+		mod = NULL;
 	preempt_enable();
 
 	if (!mod)
 		return 0;
 
-
-	return __jump_label_text_reserved(mod->jump_entries,
+	ret = __jump_label_text_reserved(mod->jump_entries,
 				mod->jump_entries + mod->num_jump_entries,
 				start, end);
+
+	module_put(mod);
+
+	return ret;
 }
 
 static void __jump_label_mod_update(struct static_key *key)



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 00/13] Add static_call()
  2019-10-07  9:02 [RESEND] everything text-poke: ftrace, modules, static_call and jump_label Peter Zijlstra
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
  2019-10-07  8:25 ` [PATCH v2 0/4] Propagate module notifier errors Peter Zijlstra
@ 2019-10-07  8:27 ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 01/13] compiler.h: Make __ADDRESSABLE() symbol truly unique Peter Zijlstra
                     ` (13 more replies)
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
  3 siblings, 14 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

This series, which depends on the previous two, introduces static_call().

static_call(), is the idea of static_branch() applied to indirect function
calls. Remove a data load (indirection) by modifying the text.

These patches are still based on the work Josh did earlier, but incorporated
feedback from the last posting and have a lot of extra patches which resulted
from me trying to actually use static_call().

This still relies on objtool to generate the .static_call_sites section, mostly
because this is a natural place for x86_64 and works for both GCC and LLVM.
Other architectures can pick other means if/when they implement the inline
patching. The out-of-line (aka. trampoline) variant doesn't require this.




^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 01/13] compiler.h: Make __ADDRESSABLE() symbol truly unique
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 02/13] static_call: Add basic static call infrastructure Peter Zijlstra
                     ` (12 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

From: Josh Poimboeuf <jpoimboe@redhat.com>

The __ADDRESSABLE() macro uses the __LINE__ macro to create a temporary
symbol which has a unique name.  However, if the macro is used multiple
times from within another macro, the line number will always be the
same, resulting in duplicate symbols.

Make the temporary symbols truly unique by using __UNIQUE_ID instead of
__LINE__.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 include/linux/compiler.h |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -299,7 +299,7 @@ unsigned long read_word_at_a_time(const
  */
 #define __ADDRESSABLE(sym) \
 	static void * __section(.discard.addressable) __used \
-		__PASTE(__addressable_##sym, __LINE__) = (void *)&sym;
+		__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym;
 
 /**
  * offset_to_ptr - convert a relative memory offset to an absolute pointer



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 02/13] static_call: Add basic static call infrastructure
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 01/13] compiler.h: Make __ADDRESSABLE() symbol truly unique Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07 11:33     ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 03/13] static_call: Add inline " Peter Zijlstra
                     ` (11 subsequent siblings)
  13 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

From: Josh Poimboeuf <jpoimboe@redhat.com>

Static calls are a replacement for global function pointers.  They use
code patching to allow direct calls to be used instead of indirect
calls.  They give the flexibility of function pointers, but with
improved performance.  This is especially important for cases where
retpolines would otherwise be used, as retpolines can significantly
impact performance.

The concept and code are an extension of previous work done by Ard
Biesheuvel and Steven Rostedt:

  https://lkml.kernel.org/r/20181005081333.15018-1-ard.biesheuvel@linaro.org, jpoimboe@redhat.com
  https://lkml.kernel.org/r/20181006015110.653946300@goodmis.org

There are two implementations, depending on arch support:

 1) out-of-line: patched trampolines (CONFIG_HAVE_STATIC_CALL)
 2) basic function pointers

For more details, see the comments in include/linux/static_call.h.

[peterz: simplified interface]
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/Kconfig                      |    3 
 include/linux/static_call.h       |  134 ++++++++++++++++++++++++++++++++++++++
 include/linux/static_call_types.h |   15 ++++
 3 files changed, 152 insertions(+)
 create mode 100644 include/linux/static_call.h
 create mode 100644 include/linux/static_call_types.h

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -960,6 +960,9 @@ config RELR
 config ARCH_HAS_MEM_ENCRYPT
 	bool
 
+config HAVE_STATIC_CALL
+	bool
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
--- /dev/null
+++ b/include/linux/static_call.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_STATIC_CALL_H
+#define _LINUX_STATIC_CALL_H
+
+/*
+ * Static call support
+ *
+ * Static calls use code patching to hard-code function pointers into direct
+ * branch instructions.  They give the flexibility of function pointers, but
+ * with improved performance.  This is especially important for cases where
+ * retpolines would otherwise be used, as retpolines can significantly impact
+ * performance.
+ *
+ *
+ * API overview:
+ *
+ *   DECLARE_STATIC_CALL(name, func);
+ *   DEFINE_STATIC_CALL(name, func);
+ *   static_call(name)(args...);
+ *   static_call_update(name, func);
+ *
+ * Usage example:
+ *
+ *   # Start with the following functions (with identical prototypes):
+ *   int func_a(int arg1, int arg2);
+ *   int func_b(int arg1, int arg2);
+ *
+ *   # Define a 'my_name' reference, associated with func_a() by default
+ *   DEFINE_STATIC_CALL(my_name, func_a);
+ *
+ *   # Call func_a()
+ *   static_call(my_name)(arg1, arg2);
+ *
+ *   # Update 'my_name' to point to func_b()
+ *   static_call_update(my_name, &func_b);
+ *
+ *   # Call func_b()
+ *   static_call(my_name)(arg1, arg2);
+ *
+ *
+ * Implementation details:
+ *
+ *    This requires some arch-specific code (CONFIG_HAVE_STATIC_CALL).
+ *    Otherwise basic indirect calls are used (with function pointers).
+ *
+ *    Each static_call() site calls into a trampoline associated with the name.
+ *    The trampoline has a direct branch to the default function.  Updates to a
+ *    name will modify the trampoline's branch destination.
+ *
+ *    If the arch has CONFIG_HAVE_STATIC_CALL_INLINE, then the call sites
+ *    themselves will be patched at runtime to call the functions directly,
+ *    rather than calling through the trampoline.  This requires objtool or a
+ *    compiler plugin to detect all the static_call() sites and annotate them
+ *    in the .static_call_sites section.
+ */
+
+#include <linux/types.h>
+#include <linux/cpu.h>
+#include <linux/static_call_types.h>
+
+#ifdef CONFIG_HAVE_STATIC_CALL
+#include <asm/static_call.h>
+/*
+ * Either @site or @tramp can be NULL.
+ */
+extern void arch_static_call_transform(void *site, void *tramp, void *func);
+#endif
+
+
+#define DECLARE_STATIC_CALL(name, func)					\
+	extern struct static_call_key STATIC_CALL_NAME(name);		\
+	extern typeof(func) STATIC_CALL_TRAMP(name)
+
+#define static_call_update(name, func)					\
+({									\
+	BUILD_BUG_ON(!__same_type(*(func), STATIC_CALL_TRAMP(name)));	\
+	__static_call_update(&STATIC_CALL_NAME(name),			\
+			     &STATIC_CALL_TRAMP(name), func);		\
+})
+
+#if defined(CONFIG_HAVE_STATIC_CALL)
+
+struct static_call_key {
+	void *func;
+};
+
+#define DEFINE_STATIC_CALL(name, _func)					\
+	DECLARE_STATIC_CALL(name, _func);				\
+	struct static_call_key STATIC_CALL_NAME(name) = {		\
+		.func = _func,						\
+	};								\
+	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
+
+#define static_call(name)	STATIC_CALL_TRAMP(name)
+
+static inline
+void __static_call_update(struct static_call_key *key, void *tramp, void *func)
+{
+	cpus_read_lock();
+	WRITE_ONCE(key->func, func);
+	arch_static_call_transform(NULL, tramp, func);
+	cpus_read_unlock();
+}
+
+#define EXPORT_STATIC_CALL(name)	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
+#define EXPORT_STATIC_CALL_GPL(name)	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
+
+#else /* Generic implementation */
+
+struct static_call_key {
+	void *func;
+};
+
+#define DEFINE_STATIC_CALL(name, _func)					\
+	DECLARE_STATIC_CALL(name, _func);				\
+	struct static_call_key STATIC_CALL_NAME(name) = {		\
+		.func = _func,						\
+	}
+
+#define static_call(name)						\
+	((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
+
+static inline
+void __static_call_update(struct static_call_key *key, void *tramp, void *func)
+{
+	WRITE_ONCE(key->func, func);
+}
+
+#define EXPORT_STATIC_CALL(name)	EXPORT_SYMBOL(STATIC_CALL_NAME(name))
+#define EXPORT_STATIC_CALL_GPL(key)	EXPORT_SYMBOL_GPL(STATIC_CALL_NAME(name))
+
+#endif /* CONFIG_HAVE_STATIC_CALL */
+
+#endif /* _LINUX_STATIC_CALL_H */
--- /dev/null
+++ b/include/linux/static_call_types.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _STATIC_CALL_TYPES_H
+#define _STATIC_CALL_TYPES_H
+
+#include <linux/stringify.h>
+
+#define STATIC_CALL_PREFIX	____static_call_
+#define STATIC_CALL_PREFIX_STR	__stringify(STATIC_CALL_PREFIX)
+
+#define STATIC_CALL_NAME(name)	__PASTE(STATIC_CALL_PREFIX, name)
+
+#define STATIC_CALL_TRAMP(name)	    STATIC_CALL_NAME(name##_tramp)
+#define STATIC_CALL_TRAMP_STR(name) __stringify(STATIC_CALL_TRAMP(name))
+
+#endif /* _STATIC_CALL_TYPES_H */



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 03/13] static_call: Add inline static call infrastructure
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 01/13] compiler.h: Make __ADDRESSABLE() symbol truly unique Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 02/13] static_call: Add basic static call infrastructure Peter Zijlstra
@ 2019-10-07  8:27   ` " Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 04/13] static_call: Avoid kprobes on inline static_call()s Peter Zijlstra
                     ` (10 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

From: Josh Poimboeuf <jpoimboe@redhat.com>

Add infrastructure for an arch-specific CONFIG_HAVE_STATIC_CALL_INLINE
option, which is a faster version of CONFIG_HAVE_STATIC_CALL.  At
runtime, the static call sites are patched directly, rather than using
the out-of-line trampolines.

Compared to out-of-line static calls, the performance benefits are more
modest, but still measurable.  Steven Rostedt did some tracepoint
measurements:

  https://lkml.kernel.org/r/20181126155405.72b4f718@gandalf.local.home

This code is heavily inspired by the jump label code (aka "static
jumps"), as some of the concepts are very similar.

For more details, see the comments in include/linux/static_call.h.

[peterz: simplified interface; merged trampolines]
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/Kconfig                      |    4 
 include/asm-generic/vmlinux.lds.h |    7 
 include/linux/module.h            |   10 +
 include/linux/static_call.h       |   37 ++++
 include/linux/static_call_types.h |    9 +
 kernel/Makefile                   |    1 
 kernel/module.c                   |    5 
 kernel/static_call.c              |  302 ++++++++++++++++++++++++++++++++++++++
 8 files changed, 374 insertions(+), 1 deletion(-)
 create mode 100644 kernel/static_call.c

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -963,6 +963,10 @@ config ARCH_HAS_MEM_ENCRYPT
 config HAVE_STATIC_CALL
 	bool
 
+config HAVE_STATIC_CALL_INLINE
+	bool
+	depends on HAVE_STATIC_CALL
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -333,6 +333,12 @@
 	KEEP(*(__jump_table))						\
 	__stop___jump_table = .;
 
+#define STATIC_CALL_DATA						\
+	. = ALIGN(8);							\
+	__start_static_call_sites = .;					\
+	KEEP(*(.static_call_sites))					\
+	__stop_static_call_sites = .;
+
 /*
  * Allow architectures to handle ro_after_init data on their
  * own by defining an empty RO_AFTER_INIT_DATA.
@@ -342,6 +348,7 @@
 	__start_ro_after_init = .;					\
 	*(.data..ro_after_init)						\
 	JUMP_TABLE_DATA							\
+	STATIC_CALL_DATA						\
 	__end_ro_after_init = .;
 #endif
 
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -22,6 +22,7 @@
 #include <linux/error-injection.h>
 #include <linux/tracepoint-defs.h>
 #include <linux/srcu.h>
+#include <linux/static_call_types.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -476,6 +477,10 @@ struct module {
 	unsigned int num_ftrace_callsites;
 	unsigned long *ftrace_callsites;
 #endif
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+	int num_static_call_sites;
+	struct static_call_site *static_call_sites;
+#endif
 
 #ifdef CONFIG_LIVEPATCH
 	bool klp; /* Is this a livepatch module? */
@@ -732,6 +737,11 @@ static inline bool within_module(unsigne
 {
 	return false;
 }
+
+static inline bool within_module_init(unsigned long addr, const struct module *mod)
+{
+	return false;
+}
 
 /* Get/put a kernel symbol (calls should be symmetric) */
 #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak)); &(x); })
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -78,7 +78,42 @@ extern void arch_static_call_transform(v
 			     &STATIC_CALL_TRAMP(name), func);		\
 })
 
-#if defined(CONFIG_HAVE_STATIC_CALL)
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+
+struct static_call_mod {
+	struct static_call_mod *next;
+	struct module *mod; /* for vmlinux, mod == NULL */
+	struct static_call_site *sites;
+};
+
+struct static_call_key {
+	void *func;
+	struct static_call_mod *next;
+};
+
+extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
+extern int static_call_mod_init(struct module *mod);
+
+#define DEFINE_STATIC_CALL(name, _func)					\
+	DECLARE_STATIC_CALL(name, _func);				\
+	struct static_call_key STATIC_CALL_NAME(name) = {		\
+		.func = _func,						\
+		.next = NULL,						\
+	};								\
+	__ADDRESSABLE(STATIC_CALL_NAME(name));				\
+	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
+
+#define static_call(name)	STATIC_CALL_TRAMP(name)
+
+#define EXPORT_STATIC_CALL(name)					\
+	EXPORT_SYMBOL(STATIC_CALL_NAME(name));				\
+	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
+
+#define EXPORT_STATIC_CALL_GPL(name)					\
+	EXPORT_SYMBOL_GPL(STATIC_CALL_NAME(name));			\
+	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
+
+#elif defined(CONFIG_HAVE_STATIC_CALL)
 
 struct static_call_key {
 	void *func;
--- a/include/linux/static_call_types.h
+++ b/include/linux/static_call_types.h
@@ -12,4 +12,13 @@
 #define STATIC_CALL_TRAMP(name)	    STATIC_CALL_NAME(name##_tramp)
 #define STATIC_CALL_TRAMP_STR(name) __stringify(STATIC_CALL_TRAMP(name))
 
+/*
+ * The static call site table needs to be created by external tooling (objtool
+ * or a compiler plugin).
+ */
+struct static_call_site {
+	s32 addr;
+	s32 key;
+};
+
 #endif /* _STATIC_CALL_TYPES_H */
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -103,6 +103,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
 obj-$(CONFIG_BPF) += bpf/
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3231,6 +3231,11 @@ static int find_module_sections(struct m
 					    sizeof(*mod->ei_funcs),
 					    &mod->num_ei_funcs);
 #endif
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+	mod->static_call_sites = section_objs(info, ".static_call_sites",
+					      sizeof(*mod->static_call_sites),
+					      &mod->num_static_call_sites);
+#endif
 	mod->extable = section_objs(info, "__ex_table",
 				    sizeof(*mod->extable), &mod->num_exentries);
 
--- /dev/null
+++ b/kernel/static_call.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/static_call.h>
+#include <linux/bug.h>
+#include <linux/smp.h>
+#include <linux/sort.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/processor.h>
+#include <asm/sections.h>
+
+extern struct static_call_site __start_static_call_sites[],
+			       __stop_static_call_sites[];
+
+static bool static_call_initialized;
+
+#define STATIC_CALL_INIT 1UL
+
+/* mutex to protect key modules/sites */
+static DEFINE_MUTEX(static_call_mutex);
+
+static void static_call_lock(void)
+{
+	mutex_lock(&static_call_mutex);
+}
+
+static void static_call_unlock(void)
+{
+	mutex_unlock(&static_call_mutex);
+}
+
+static inline void *static_call_addr(struct static_call_site *site)
+{
+	return (void *)((long)site->addr + (long)&site->addr);
+}
+
+
+static inline struct static_call_key *static_call_key(const struct static_call_site *site)
+{
+	return (struct static_call_key *)
+		(((long)site->key + (long)&site->key) & ~STATIC_CALL_INIT);
+}
+
+/* These assume the key is word-aligned. */
+static inline bool static_call_is_init(struct static_call_site *site)
+{
+	return ((long)site->key + (long)&site->key) & STATIC_CALL_INIT;
+}
+
+static inline void static_call_set_init(struct static_call_site *site)
+{
+	site->key = ((long)static_call_key(site) | STATIC_CALL_INIT) -
+		    (long)&site->key;
+}
+
+static int static_call_site_cmp(const void *_a, const void *_b)
+{
+	const struct static_call_site *a = _a;
+	const struct static_call_site *b = _b;
+	const struct static_call_key *key_a = static_call_key(a);
+	const struct static_call_key *key_b = static_call_key(b);
+
+	if (key_a < key_b)
+		return -1;
+
+	if (key_a > key_b)
+		return 1;
+
+	return 0;
+}
+
+static void static_call_site_swap(void *_a, void *_b, int size)
+{
+	long delta = (unsigned long)_a - (unsigned long)_b;
+	struct static_call_site *a = _a;
+	struct static_call_site *b = _b;
+	struct static_call_site tmp = *a;
+
+	a->addr = b->addr  - delta;
+	a->key  = b->key   - delta;
+
+	b->addr = tmp.addr + delta;
+	b->key  = tmp.key  + delta;
+}
+
+static inline void static_call_sort_entries(struct static_call_site *start,
+					    struct static_call_site *stop)
+{
+	sort(start, stop - start, sizeof(struct static_call_site),
+	     static_call_site_cmp, static_call_site_swap);
+}
+
+void __static_call_update(struct static_call_key *key, void *tramp, void *func)
+{
+	struct static_call_site *site, *stop;
+	struct static_call_mod *site_mod;
+
+	cpus_read_lock();
+	static_call_lock();
+
+	if (key->func == func)
+		goto done;
+
+	key->func = func;
+
+	arch_static_call_transform(NULL, tramp, func);
+
+	/*
+	 * If uninitialized, we'll not update the callsites, but they still
+	 * point to the trampoline and we just patched that.
+	 */
+	if (WARN_ON_ONCE(!static_call_initialized))
+		goto done;
+
+	for (site_mod = key->next; site_mod; site_mod = site_mod->next) {
+		if (!site_mod->sites) {
+			/*
+			 * This can happen if the static call key is defined in
+			 * a module which doesn't use it.
+			 */
+			continue;
+		}
+
+		stop = __stop_static_call_sites;
+
+#ifdef CONFIG_MODULES
+		if (site_mod->mod) {
+			stop = site_mod->mod->static_call_sites +
+			       site_mod->mod->num_static_call_sites;
+		}
+#endif
+
+		for (site = site_mod->sites;
+		     site < stop && static_call_key(site) == key; site++) {
+			void *site_addr = static_call_addr(site);
+			struct module *mod = site_mod->mod;
+
+			if (static_call_is_init(site)) {
+				/*
+				 * Don't write to call sites which were in
+				 * initmem and have since been freed.
+				 */
+				if (!mod && system_state >= SYSTEM_RUNNING)
+					continue;
+				if (mod && !within_module_init((unsigned long)site_addr, mod))
+					continue;
+			}
+
+			if (!kernel_text_address((unsigned long)site_addr)) {
+				WARN_ONCE(1, "can't patch static call site at %pS",
+					  site_addr);
+				continue;
+			}
+
+			arch_static_call_transform(site_addr, NULL, func);
+		}
+	}
+
+done:
+	static_call_unlock();
+	cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__static_call_update);
+
+static int __static_call_init(struct module *mod,
+			      struct static_call_site *start,
+			      struct static_call_site *stop)
+{
+	struct static_call_site *site;
+	struct static_call_key *key, *prev_key = NULL;
+	struct static_call_mod *site_mod;
+
+	if (start == stop)
+		return 0;
+
+	static_call_sort_entries(start, stop);
+
+	for (site = start; site < stop; site++) {
+		void *site_addr = static_call_addr(site);
+
+		if ((mod && within_module_init((unsigned long)site_addr, mod)) ||
+		    (!mod && init_section_contains(site_addr, 1)))
+			static_call_set_init(site);
+
+		key = static_call_key(site);
+		if (key != prev_key) {
+			prev_key = key;
+
+			site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+			if (!site_mod)
+				return -ENOMEM;
+
+			site_mod->mod = mod;
+			site_mod->sites = site;
+			site_mod->next = key->next;
+			key->next = site_mod;
+		}
+
+		arch_static_call_transform(site_addr, NULL, key->func);
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_MODULES
+
+static int static_call_add_module(struct module *mod)
+{
+	return __static_call_init(mod, mod->static_call_sites,
+				  mod->static_call_sites + mod->num_static_call_sites);
+}
+
+static void static_call_del_module(struct module *mod)
+{
+	struct static_call_site *start = mod->static_call_sites;
+	struct static_call_site *stop = mod->static_call_sites +
+					mod->num_static_call_sites;
+	struct static_call_key *key, *prev_key = NULL;
+	struct static_call_mod *site_mod, **prev;
+	struct static_call_site *site;
+
+	for (site = start; site < stop; site++) {
+		key = static_call_key(site);
+		if (key == prev_key)
+			continue;
+
+		prev_key = key;
+
+		for (prev = &key->next, site_mod = key->next;
+		     site_mod && site_mod->mod != mod;
+		     prev = &site_mod->next, site_mod = site_mod->next)
+			;
+
+		if (!site_mod)
+			continue;
+
+		*prev = site_mod->next;
+		kfree(site_mod);
+	}
+}
+
+static int static_call_module_notify(struct notifier_block *nb,
+				     unsigned long val, void *data)
+{
+	struct module *mod = data;
+	int ret = 0;
+
+	cpus_read_lock();
+	static_call_lock();
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		ret = static_call_add_module(mod);
+		if (ret) {
+			WARN(1, "Failed to allocate memory for static calls");
+			static_call_del_module(mod);
+		}
+		break;
+	case MODULE_STATE_GOING:
+		static_call_del_module(mod);
+		break;
+	}
+
+	static_call_unlock();
+	cpus_read_unlock();
+
+	return notifier_from_errno(ret);
+}
+
+static struct notifier_block static_call_module_nb = {
+	.notifier_call = static_call_module_notify,
+};
+
+#endif /* CONFIG_MODULES */
+
+static void __init static_call_init(void)
+{
+	int ret;
+
+	if (static_call_initialized)
+		return;
+
+	cpus_read_lock();
+	static_call_lock();
+	ret = __static_call_init(NULL, __start_static_call_sites,
+				 __stop_static_call_sites);
+	static_call_unlock();
+	cpus_read_unlock();
+
+	if (ret) {
+		pr_err("Failed to allocate memory for static_call!\n");
+		BUG();
+	}
+
+	static_call_initialized = true;
+
+#ifdef CONFIG_MODULES
+	register_module_notifier(&static_call_module_nb);
+#endif
+}
+early_initcall(static_call_init);



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 04/13] static_call: Avoid kprobes on inline static_call()s
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (2 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 03/13] static_call: Add inline " Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 05/13] x86/static_call: Add out-of-line static call implementation Peter Zijlstra
                     ` (9 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Similar to how we disallow kprobes on any other dynamic text
(ftrace/jump_label) also disallow kprobes on inline static_call()s.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/kprobes/opt.c |    4 +-
 include/linux/static_call.h   |   11 +++++++
 kernel/kprobes.c              |    2 +
 kernel/static_call.c          |   64 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 80 insertions(+), 1 deletion(-)

--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -16,6 +16,7 @@
 #include <linux/kallsyms.h>
 #include <linux/ftrace.h>
 #include <linux/frame.h>
+#include <linux/static_call.h>
 
 #include <asm/text-patching.h>
 #include <asm/cacheflush.h>
@@ -188,7 +189,8 @@ static int copy_optimized_instructions(u
 	/* Check whether the address range is reserved */
 	if (ftrace_text_reserved(src, src + len - 1) ||
 	    alternatives_text_reserved(src, src + len - 1) ||
-	    jump_label_text_reserved(src, src + len - 1))
+	    jump_label_text_reserved(src, src + len - 1) ||
+	    static_call_text_reserved(src, src + len - 1))
 		return -EBUSY;
 
 	return len;
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -93,6 +93,7 @@ struct static_call_key {
 
 extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
 extern int static_call_mod_init(struct module *mod);
+extern int static_call_text_reserved(void *start, void *end);
 
 #define DEFINE_STATIC_CALL(name, _func)					\
 	DECLARE_STATIC_CALL(name, _func);				\
@@ -137,6 +138,11 @@ void __static_call_update(struct static_
 	cpus_read_unlock();
 }
 
+static inline int static_call_text_reserved(void *start, void *end)
+{
+	return 0;
+}
+
 #define EXPORT_STATIC_CALL(name)	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
 #define EXPORT_STATIC_CALL_GPL(name)	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
 
@@ -161,6 +167,11 @@ void __static_call_update(struct static_
 	WRITE_ONCE(key->func, func);
 }
 
+static inline int static_call_text_reserved(void *start, void *end)
+{
+	return 0;
+}
+
 #define EXPORT_STATIC_CALL(name)	EXPORT_SYMBOL(STATIC_CALL_NAME(name))
 #define EXPORT_STATIC_CALL_GPL(key)	EXPORT_SYMBOL_GPL(STATIC_CALL_NAME(name))
 
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
 #include <linux/jump_label.h>
+#include <linux/static_call.h>
 
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
@@ -1539,6 +1540,7 @@ static int check_kprobe_address_safe(str
 	if (!kernel_text_address((unsigned long) p->addr) ||
 	    within_kprobe_blacklist((unsigned long) p->addr) ||
 	    jump_label_text_reserved(p->addr, p->addr) ||
+	    static_call_text_reserved(p->addr, p->addr) ||
 	    find_bug((unsigned long)p->addr)) {
 		ret = -EINVAL;
 		goto out;
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -204,8 +204,58 @@ static int __static_call_init(struct mod
 	return 0;
 }
 
+static int addr_conflict(struct static_call_site *site, void *start, void *end)
+{
+	unsigned long addr = (unsigned long)static_call_addr(site);
+
+	if (addr <= (unsigned long)end &&
+	    addr + CALL_INSN_SIZE > (unsigned long)start)
+		return 1;
+
+	return 0;
+}
+
+static int __static_call_text_reserved(struct static_call_site *iter_start,
+				       struct static_call_site *iter_stop,
+				       void *start, void *end)
+{
+	struct static_call_site *iter = iter_start;
+
+	while (iter < iter_stop) {
+		if (addr_conflict(iter, start, end))
+			return 1;
+		iter++;
+	}
+
+	return 0;
+}
+
 #ifdef CONFIG_MODULES
 
+static int __static_call_mod_text_reserved(void *start, void *end)
+{
+	struct module *mod;
+	int ret;
+
+	preempt_disable();
+	mod = __module_text_address((unsigned long)start);
+	WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+	if (!try_module_get(mod))
+		mod = NULL;
+	preempt_enable();
+
+	if (!mod)
+		return 0;
+
+	ret = __static_call_text_reserved(mod->static_call_sites,
+			mod->static_call_sites + mod->num_static_call_sites,
+			start, end);
+
+	module_put(mod);
+
+	return ret;
+}
+
 static int static_call_add_module(struct module *mod)
 {
 	return __static_call_init(mod, mod->static_call_sites,
@@ -275,6 +325,20 @@ static struct notifier_block static_call
 
 #endif /* CONFIG_MODULES */
 
+int static_call_text_reserved(void *start, void *end)
+{
+	int ret = __static_call_text_reserved(__start_static_call_sites,
+			__stop_static_call_sites, start, end);
+
+	if (ret)
+		return ret;
+
+#ifdef CONFIG_MODULES
+	ret = __static_call_mod_text_reserved(start, end);
+#endif
+	return ret;
+}
+
 static void __init static_call_init(void)
 {
 	int ret;



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 05/13] x86/static_call: Add out-of-line static call implementation
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (3 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 04/13] static_call: Avoid kprobes on inline static_call()s Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 06/13] x86/static_call: Add inline static call implementation for x86-64 Peter Zijlstra
                     ` (8 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

From: Josh Poimboeuf <jpoimboe@redhat.com>

Add the x86 out-of-line static call implementation.  For each key, a
permanent trampoline is created which is the destination for all static
calls for the given key.  The trampoline has a direct jump which gets
patched by static_call_update() when the destination function changes.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
[peterz: fixed trampoline, rewrote patching code]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/Kconfig                   |    1 +
 arch/x86/include/asm/static_call.h |   22 ++++++++++++++++++++++
 arch/x86/kernel/Makefile           |    1 +
 arch/x86/kernel/static_call.c      |   31 +++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+)
 create mode 100644 arch/x86/include/asm/static_call.h
 create mode 100644 arch/x86/kernel/static_call.c

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -205,6 +205,7 @@ config X86
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_STACKPROTECTOR		if CC_HAS_SANE_STACKPROTECTOR
 	select HAVE_STACK_VALIDATION		if X86_64
+	select HAVE_STATIC_CALL
 	select HAVE_RSEQ
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK
--- /dev/null
+++ b/arch/x86/include/asm/static_call.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_STATIC_CALL_H
+#define _ASM_STATIC_CALL_H
+
+#include <asm/text-patching.h>
+
+/*
+ * For CONFIG_HAVE_STATIC_CALL, this is a permanent trampoline which
+ * does a direct jump to the function.  The direct jump gets patched by
+ * static_call_update().
+ */
+#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)			\
+	asm(".pushsection .text, \"ax\"				\n"	\
+	    ".align 4						\n"	\
+	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
+	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
+	    "	jmp.d32 " #func "				\n"	\
+	    ".type " STATIC_CALL_TRAMP_STR(name) ", @function	\n"	\
+	    ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
+	    ".popsection					\n")
+
+#endif /* _ASM_STATIC_CALL_H */
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -63,6 +63,7 @@ obj-y			+= tsc.o tsc_msr.o io_delay.o rt
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 obj-y			+= irqflags.o
+obj-y			+= static_call.o
 
 obj-y				+= process.o
 obj-y				+= fpu/
--- /dev/null
+++ b/arch/x86/kernel/static_call.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/static_call.h>
+#include <linux/memory.h>
+#include <linux/bug.h>
+#include <asm/text-patching.h>
+
+static void __static_call_transform(void *insn, u8 opcode, void *func)
+{
+	const void *code = text_gen_insn(opcode, (long)insn, (long)func);
+
+	if (WARN_ONCE(*(u8 *)insn != opcode,
+		      "unexpected static call insn opcode 0x%x at %pS\n",
+		      opcode, insn))
+		return;
+
+	if (memcmp(insn, code, CALL_INSN_SIZE) == 0)
+		return;
+
+	text_poke_bp(insn, code, CALL_INSN_SIZE, NULL);
+}
+
+void arch_static_call_transform(void *site, void *tramp, void *func)
+{
+	mutex_lock(&text_mutex);
+
+	if (tramp)
+		__static_call_transform(tramp, JMP32_INSN_OPCODE, func);
+
+	mutex_unlock(&text_mutex);
+}
+EXPORT_SYMBOL_GPL(arch_static_call_transform);



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 06/13] x86/static_call: Add inline static call implementation for x86-64
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (4 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 05/13] x86/static_call: Add out-of-line static call implementation Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 07/13] static_call: Simple self-test Peter Zijlstra
                     ` (7 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

From: Josh Poimboeuf <jpoimboe@redhat.com>

Add the inline static call implementation for x86-64.  For each key, a
temporary trampoline is created, named __static_call_tramp_<key>.  The
trampoline has an indirect jump to the destination function.

Objtool uses the trampoline naming convention to detect all the call
sites.  It then annotates those call sites in the .static_call_sites
section.

During boot (and module init), the call sites are patched to call
directly into the destination function.  The temporary trampoline is
then no longer used.

[peterz: merged trampolines, put trampoline in section]
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/Kconfig                        |    3 
 arch/x86/include/asm/static_call.h      |   15 +++
 arch/x86/kernel/static_call.c           |    3 
 arch/x86/kernel/vmlinux.lds.S           |    1 
 include/asm-generic/vmlinux.lds.h       |    6 +
 tools/include/linux/static_call_types.h |   24 ++++++
 tools/objtool/check.c                   |  127 +++++++++++++++++++++++++++++++-
 tools/objtool/check.h                   |    2 
 tools/objtool/elf.h                     |    1 
 tools/objtool/sync-check.sh             |    1 
 10 files changed, 180 insertions(+), 3 deletions(-)
 create mode 100644 tools/objtool/include/linux/static_call_types.h

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -206,6 +206,7 @@ config X86
 	select HAVE_STACKPROTECTOR		if CC_HAS_SANE_STACKPROTECTOR
 	select HAVE_STACK_VALIDATION		if X86_64
 	select HAVE_STATIC_CALL
+	select HAVE_STATIC_CALL_INLINE		if HAVE_STACK_VALIDATION
 	select HAVE_RSEQ
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK
@@ -221,6 +222,7 @@ config X86
 	select RTC_MC146818_LIB
 	select SPARSE_IRQ
 	select SRCU
+	select STACK_VALIDATION			if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE)
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
 	select USER_STACKTRACE_SUPPORT
@@ -445,7 +447,6 @@ config GOLDFISH
 config RETPOLINE
 	bool "Avoid speculative indirect branches in kernel"
 	default y
-	select STACK_VALIDATION if HAVE_STACK_VALIDATION
 	help
 	  Compile kernel with the retpoline compiler options to guard against
 	  kernel-to-user data leaks by avoiding speculative indirect
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -5,12 +5,25 @@
 #include <asm/text-patching.h>
 
 /*
+ * For CONFIG_HAVE_STATIC_CALL_INLINE, this is a temporary trampoline which
+ * uses the current value of the key->func pointer to do an indirect jump to
+ * the function.  This trampoline is only used during boot, before the call
+ * sites get patched by static_call_update().  The name of this trampoline has
+ * a magical aspect: objtool uses it to find static call sites so it can create
+ * the .static_call_sites section.
+ *
  * For CONFIG_HAVE_STATIC_CALL, this is a permanent trampoline which
  * does a direct jump to the function.  The direct jump gets patched by
  * static_call_update().
+ *
+ * Having the trampoline in a special section has two benefits:
+ *  - it makes it 'easy' for objtool to find all the call-sites;
+ *  - it forces GCC to emit a JMP.d32 when it does tail-call optimization on
+ *    the call; since you cannot compute the relative displacement across
+ *    sections.
  */
 #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)			\
-	asm(".pushsection .text, \"ax\"				\n"	\
+	asm(".pushsection .static_call.text, \"ax\"		\n"	\
 	    ".align 4						\n"	\
 	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
 	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -26,6 +26,9 @@ void arch_static_call_transform(void *si
 	if (tramp)
 		__static_call_transform(tramp, JMP32_INSN_OPCODE, func);
 
+	if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site)
+		__static_call_transform(site, CALL_INSN_OPCODE, func);
+
 	mutex_unlock(&text_mutex);
 }
 EXPORT_SYMBOL_GPL(arch_static_call_transform);
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -133,6 +133,7 @@ SECTIONS
 		IRQENTRY_TEXT
 		ALIGN_ENTRY_TEXT_END
 		SOFTIRQENTRY_TEXT
+		STATIC_CALL_TEXT
 		*(.fixup)
 		*(.gnu.warning)
 
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -576,6 +576,12 @@
 		*(.softirqentry.text)					\
 		__softirqentry_text_end = .;
 
+#define STATIC_CALL_TEXT						\
+		ALIGN_FUNCTION();					\
+		__static_call_text_start = .;				\
+		*(.static_call.text)					\
+		__static_call_text_end = .;
+
 /* Section used for early init (in .S files) */
 #define HEAD_TEXT  KEEP(*(.head.text))
 
--- /dev/null
+++ b/tools/include/linux/static_call_types.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _STATIC_CALL_TYPES_H
+#define _STATIC_CALL_TYPES_H
+
+#include <linux/stringify.h>
+
+#define STATIC_CALL_PREFIX	____static_call_
+#define STATIC_CALL_PREFIX_STR	__stringify(STATIC_CALL_PREFIX)
+
+#define STATIC_CALL_NAME(name)	__PASTE(STATIC_CALL_PREFIX, name)
+
+#define STATIC_CALL_TRAMP(name)	    STATIC_CALL_NAME(name##_tramp)
+#define STATIC_CALL_TRAMP_STR(name) __stringify(STATIC_CALL_TRAMP(name))
+
+/*
+ * The static call site table needs to be created by external tooling (objtool
+ * or a compiler plugin).
+ */
+struct static_call_site {
+	s32 addr;
+	s32 key;
+};
+
+#endif /* _STATIC_CALL_TYPES_H */
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -15,6 +15,7 @@
 
 #include <linux/hashtable.h>
 #include <linux/kernel.h>
+#include <linux/static_call_types.h>
 
 #define FAKE_JUMP_OFFSET -1
 
@@ -1285,6 +1286,21 @@ static int read_retpoline_hints(struct o
 	return 0;
 }
 
+static int read_static_call_tramps(struct objtool_file *file)
+{
+	struct section *sec, *sc_sec = find_section_by_name(file->elf, ".static_call.text");
+	struct symbol *func;
+
+	for_each_sec(file, sec) {
+		list_for_each_entry(func, &sec->symbol_list, list) {
+			if (func->sec == sc_sec)
+				func->static_call_tramp = true;
+		}
+	}
+
+	return 0;
+}
+
 static void mark_rodata(struct objtool_file *file)
 {
 	struct section *sec;
@@ -1356,6 +1372,10 @@ static int decode_sections(struct objtoo
 	if (ret)
 		return ret;
 
+	ret = read_static_call_tramps(file);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 
@@ -2083,6 +2103,12 @@ static int validate_branch(struct objtoo
 			if (ret)
 				return ret;
 
+			if (insn->type == INSN_CALL && insn->call_dest->static_call_tramp) {
+				list_add_tail(&insn->static_call_node,
+					      &file->static_call_list);
+			}
+
+
 			if (!no_fp && func && !is_fentry_call(insn) &&
 			    !has_valid_stack_frame(&state)) {
 				WARN_FUNC("call without frame pointer save/setup",
@@ -2403,6 +2429,97 @@ static int validate_reachable_instructio
 	return 0;
 }
 
+static int create_static_call_sections(struct objtool_file *file)
+{
+	struct section *sec, *rela_sec;
+	struct rela *rela;
+	struct static_call_site *site;
+	struct instruction *insn;
+	char *key_name, *tmp;
+	struct symbol *key_sym;
+	int idx;
+
+	sec = find_section_by_name(file->elf, ".static_call_sites");
+	if (sec) {
+		WARN("file already has .static_call_sites section, skipping");
+		return 0;
+	}
+
+	if (list_empty(&file->static_call_list))
+		return 0;
+
+	idx = 0;
+	list_for_each_entry(insn, &file->static_call_list, static_call_node)
+		idx++;
+
+	sec = elf_create_section(file->elf, ".static_call_sites",
+				 sizeof(struct static_call_site), idx);
+	if (!sec)
+		return -1;
+
+	rela_sec = elf_create_rela_section(file->elf, sec);
+	if (!rela_sec)
+		return -1;
+
+	idx = 0;
+	list_for_each_entry(insn, &file->static_call_list, static_call_node) {
+
+		site = (struct static_call_site *)sec->data->d_buf + idx;
+		memset(site, 0, sizeof(struct static_call_site));
+
+		/* populate rela for 'addr' */
+		rela = malloc(sizeof(*rela));
+		if (!rela) {
+			perror("malloc");
+			return -1;
+		}
+		memset(rela, 0, sizeof(*rela));
+		rela->sym = insn->sec->sym;
+		rela->addend = insn->offset;
+		rela->type = R_X86_64_PC32;
+		rela->offset = idx * sizeof(struct static_call_site);
+		list_add_tail(&rela->list, &rela_sec->rela_list);
+		hash_add(rela_sec->rela_hash, &rela->hash, rela->offset);
+
+		/* find key symbol */
+		key_name = strdup(insn->call_dest->name);
+		tmp = strstr(key_name, "_tramp");
+		if (!tmp) {
+			WARN("static_call: trampoline name malformed: %s", key_name);
+			return -1;
+		}
+		*tmp = 0;
+
+		key_sym = find_symbol_by_name(file->elf, key_name);
+		if (!key_sym) {
+			WARN("static_call: can't find static_call_key symbol: %s", key_name);
+			return -1;
+		}
+		free(key_name);
+
+		/* populate rela for 'key' */
+		rela = malloc(sizeof(*rela));
+		if (!rela) {
+			perror("malloc");
+			return -1;
+		}
+		memset(rela, 0, sizeof(*rela));
+		rela->sym = key_sym;
+		rela->addend = 0;
+		rela->type = R_X86_64_PC32;
+		rela->offset = idx * sizeof(struct static_call_site) + 4;
+		list_add_tail(&rela->list, &rela_sec->rela_list);
+		hash_add(rela_sec->rela_hash, &rela->hash, rela->offset);
+
+		idx++;
+	}
+
+	if (elf_rebuild_rela_section(rela_sec))
+		return -1;
+
+	return 0;
+}
+
 static void cleanup(struct objtool_file *file)
 {
 	struct instruction *insn, *tmpinsn;
@@ -2428,12 +2545,13 @@ int check(const char *_objname, bool orc
 
 	objname = _objname;
 
-	file.elf = elf_read(objname, orc ? O_RDWR : O_RDONLY);
+	file.elf = elf_read(objname, O_RDWR);
 	if (!file.elf)
 		return 1;
 
 	INIT_LIST_HEAD(&file.insn_list);
 	hash_init(file.insn_hash);
+	INIT_LIST_HEAD(&file.static_call_list);
 	file.c_file = find_section_by_name(file.elf, ".comment");
 	file.ignore_unreachables = no_unreachable;
 	file.hints = false;
@@ -2472,6 +2590,11 @@ int check(const char *_objname, bool orc
 		warnings += ret;
 	}
 
+	ret = create_static_call_sections(&file);
+	if (ret < 0)
+		goto out;
+	warnings += ret;
+
 	if (orc) {
 		ret = create_orc(&file);
 		if (ret < 0)
@@ -2480,7 +2603,9 @@ int check(const char *_objname, bool orc
 		ret = create_orc_sections(&file);
 		if (ret < 0)
 			goto out;
+	}
 
+	if (orc || !list_empty(&file.static_call_list)) {
 		ret = elf_write(file.elf);
 		if (ret < 0)
 			goto out;
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -28,6 +28,7 @@ struct insn_state {
 struct instruction {
 	struct list_head list;
 	struct hlist_node hash;
+	struct list_head static_call_node;
 	struct section *sec;
 	unsigned long offset;
 	unsigned int len;
@@ -51,6 +52,7 @@ struct objtool_file {
 	struct elf *elf;
 	struct list_head insn_list;
 	DECLARE_HASHTABLE(insn_hash, 16);
+	struct list_head static_call_list;
 	bool ignore_unreachables, c_file, hints, rodata;
 };
 
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -51,6 +51,7 @@ struct symbol {
 	unsigned int len;
 	struct symbol *pfunc, *cfunc, *alias;
 	bool uaccess_safe;
+	bool static_call_tramp;
 };
 
 struct rela {
--- a/tools/objtool/sync-check.sh
+++ b/tools/objtool/sync-check.sh
@@ -6,6 +6,7 @@ arch/x86/include/asm/inat_types.h
 arch/x86/include/asm/orc_types.h
 arch/x86/lib/x86-opcode-map.txt
 arch/x86/tools/gen-insn-attr-x86.awk
+include/linux/static_call_types.h
 '
 
 check_2 () {



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 07/13] static_call: Simple self-test
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (5 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 06/13] x86/static_call: Add inline static call implementation for x86-64 Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 08/13] tracepoints: Use static_call Peter Zijlstra
                     ` (6 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe


Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/Kconfig         |    6 ++++++
 kernel/static_call.c |   28 ++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -103,6 +103,12 @@ config STATIC_KEYS_SELFTEST
 	help
 	  Boot time self-test of the branch patching code.
 
+config STATIC_CALL_SELFTEST
+	bool "Static call selftest"
+	depends on HAVE_STATIC_CALL
+	help
+	  Boot time self-test of the call patching code.
+
 config OPTPROBES
 	def_bool y
 	depends on KPROBES && HAVE_OPTPROBES
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -364,3 +364,31 @@ static void __init static_call_init(void
 #endif
 }
 early_initcall(static_call_init);
+
+#ifdef CONFIG_STATIC_CALL_SELFTEST
+
+static int func_a(int x)
+{
+	return x+1;
+}
+
+static int func_b(int x)
+{
+	return x+2;
+}
+
+DEFINE_STATIC_CALL(sc_selftest, func_a);
+
+static int __init test_static_call_init(void)
+{
+	WARN_ON(static_call(sc_selftest)(2) != 3);
+	static_call_update(sc_selftest, &func_b);
+	WARN_ON(static_call(sc_selftest)(2) != 4);
+	static_call_update(sc_selftest, &func_a);
+	WARN_ON(static_call(sc_selftest)(2) != 3);
+
+	return 0;
+}
+early_initcall(test_static_call_init);
+
+#endif /* CONFIG_STATIC_CALL_SELFTEST */



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 08/13] tracepoints: Use static_call
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (6 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 07/13] static_call: Simple self-test Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 09/13] x86/alternatives: Teach text_poke_bp() to emulate RET Peter Zijlstra
                     ` (5 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

From: Steven Rostedt  <rostedt@goodmis.org>

[peterz: updated to new interface]
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/tracepoint-defs.h |    5 ++
 include/linux/tracepoint.h      |   75 ++++++++++++++++++++++++++--------------
 include/trace/define_trace.h    |   14 +++----
 kernel/tracepoint.c             |   19 ++++++++--
 4 files changed, 77 insertions(+), 36 deletions(-)

--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -11,6 +11,8 @@
 #include <linux/atomic.h>
 #include <linux/static_key.h>
 
+struct static_call_key;
+
 struct trace_print_flags {
 	unsigned long		mask;
 	const char		*name;
@@ -30,6 +32,9 @@ struct tracepoint_func {
 struct tracepoint {
 	const char *name;		/* Tracepoint name */
 	struct static_key key;
+	struct static_call_key *static_call_key;
+	void *static_call_tramp;
+	void *iterator;
 	int (*regfunc)(void);
 	void (*unregfunc)(void);
 	struct tracepoint_func __rcu *funcs;
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -19,6 +19,7 @@
 #include <linux/cpumask.h>
 #include <linux/rcupdate.h>
 #include <linux/tracepoint-defs.h>
+#include <linux/static_call.h>
 
 struct module;
 struct tracepoint;
@@ -92,7 +93,9 @@ extern int syscall_regfunc(void);
 extern void syscall_unregfunc(void);
 #endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */
 
+#ifndef PARAMS
 #define PARAMS(args...) args
+#endif
 
 #define TRACE_DEFINE_ENUM(x)
 #define TRACE_DEFINE_SIZEOF(x)
@@ -159,12 +162,11 @@ static inline struct tracepoint *tracepo
  * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just
  * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto".
  */
-#define __DO_TRACE(tp, proto, args, cond, rcuidle)			\
+#define __DO_TRACE(name, proto, args, cond, rcuidle)			\
 	do {								\
 		struct tracepoint_func *it_func_ptr;			\
-		void *it_func;						\
-		void *__data;						\
 		int __maybe_unused __idx = 0;				\
+		void *__data;						\
 									\
 		if (!(cond))						\
 			return;						\
@@ -184,14 +186,11 @@ static inline struct tracepoint *tracepo
 			rcu_irq_enter_irqson();				\
 		}							\
 									\
-		it_func_ptr = rcu_dereference_raw((tp)->funcs);		\
-									\
+		it_func_ptr =						\
+			rcu_dereference_raw((&__tracepoint_##name)->funcs); \
 		if (it_func_ptr) {					\
-			do {						\
-				it_func = (it_func_ptr)->func;		\
-				__data = (it_func_ptr)->data;		\
-				((void(*)(proto))(it_func))(args);	\
-			} while ((++it_func_ptr)->func);		\
+			__data = (it_func_ptr)->data;			\
+			static_call(tp_func_##name)(args);		\
 		}							\
 									\
 		if (rcuidle) {						\
@@ -207,7 +206,7 @@ static inline struct tracepoint *tracepo
 	static inline void trace_##name##_rcuidle(proto)		\
 	{								\
 		if (static_key_false(&__tracepoint_##name.key))		\
-			__DO_TRACE(&__tracepoint_##name,		\
+			__DO_TRACE(name,				\
 				TP_PROTO(data_proto),			\
 				TP_ARGS(data_args),			\
 				TP_CONDITION(cond), 1);			\
@@ -229,11 +228,13 @@ static inline struct tracepoint *tracepo
  * poking RCU a bit.
  */
 #define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
+	extern int __tracepoint_iter_##name(data_proto);		\
+	DECLARE_STATIC_CALL(tp_func_##name, __tracepoint_iter_##name); \
 	extern struct tracepoint __tracepoint_##name;			\
 	static inline void trace_##name(proto)				\
 	{								\
 		if (static_key_false(&__tracepoint_##name.key))		\
-			__DO_TRACE(&__tracepoint_##name,		\
+			__DO_TRACE(name,				\
 				TP_PROTO(data_proto),			\
 				TP_ARGS(data_args),			\
 				TP_CONDITION(cond), 0);			\
@@ -279,21 +280,45 @@ static inline struct tracepoint *tracepo
  * structures, so we create an array of pointers that will be used for iteration
  * on the tracepoints.
  */
-#define DEFINE_TRACE_FN(name, reg, unreg)				 \
-	static const char __tpstrtab_##name[]				 \
-	__attribute__((section("__tracepoints_strings"))) = #name;	 \
-	struct tracepoint __tracepoint_##name				 \
-	__attribute__((section("__tracepoints"), used)) =		 \
-		{ __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\
-	__TRACEPOINT_ENTRY(name);
+#define DEFINE_TRACE_FN(name, reg, unreg, proto, args)			\
+	static const char __tpstrtab_##name[]				\
+	__attribute__((section("__tracepoints_strings"))) = #name;	\
+	extern struct static_call_key tp_func_##name;			\
+	int __tracepoint_iter_##name(void *__data, proto);		\
+	struct tracepoint __tracepoint_##name				\
+	__attribute__((section("__tracepoints"), used)) =		\
+		{ __tpstrtab_##name, STATIC_KEY_INIT_FALSE,		\
+		  &STATIC_CALL_NAME(tp_func_##name),			\
+		  &STATIC_CALL_TRAMP(tp_func_##name),			\
+		  &__tracepoint_iter_##name,				\
+		  reg, unreg, NULL };					\
+	__TRACEPOINT_ENTRY(name);					\
+	int __tracepoint_iter_##name(void *__data, proto)		\
+	{								\
+		struct tracepoint_func *it_func_ptr;			\
+		void *it_func;						\
+									\
+		it_func_ptr =						\
+			rcu_dereference_raw((&__tracepoint_##name)->funcs); \
+		do {							\
+			it_func = (it_func_ptr)->func;			\
+			__data = (it_func_ptr)->data;			\
+			((void(*)(void *, proto))(it_func))(__data, args); \
+		} while ((++it_func_ptr)->func);			\
+		return 0;						\
+	}								\
+	DEFINE_STATIC_CALL(tp_func_##name, __tracepoint_iter_##name);
 
-#define DEFINE_TRACE(name)						\
-	DEFINE_TRACE_FN(name, NULL, NULL);
+#define DEFINE_TRACE(name, proto, args)		\
+	DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args));
 
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)				\
-	EXPORT_SYMBOL_GPL(__tracepoint_##name)
+	EXPORT_SYMBOL_GPL(__tracepoint_##name);				\
+	EXPORT_STATIC_CALL_GPL(tp_func_##name)
 #define EXPORT_TRACEPOINT_SYMBOL(name)					\
-	EXPORT_SYMBOL(__tracepoint_##name)
+	EXPORT_SYMBOL(__tracepoint_##name);				\
+	EXPORT_STATIC_CALL(tp_func_##name)
+
 
 #else /* !TRACEPOINTS_ENABLED */
 #define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
@@ -322,8 +347,8 @@ static inline struct tracepoint *tracepo
 		return false;						\
 	}
 
-#define DEFINE_TRACE_FN(name, reg, unreg)
-#define DEFINE_TRACE(name)
+#define DEFINE_TRACE_FN(name, reg, unreg, proto, args)
+#define DEFINE_TRACE(name, proto, args)
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
 #define EXPORT_TRACEPOINT_SYMBOL(name)
 
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -25,7 +25,7 @@
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
-	DEFINE_TRACE(name)
+	DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
 
 #undef TRACE_EVENT_CONDITION
 #define TRACE_EVENT_CONDITION(name, proto, args, cond, tstruct, assign, print) \
@@ -39,12 +39,12 @@
 #undef TRACE_EVENT_FN
 #define TRACE_EVENT_FN(name, proto, args, tstruct,		\
 		assign, print, reg, unreg)			\
-	DEFINE_TRACE_FN(name, reg, unreg)
+	DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args))
 
 #undef TRACE_EVENT_FN_COND
 #define TRACE_EVENT_FN_COND(name, proto, args, cond, tstruct,		\
 		assign, print, reg, unreg)			\
-	DEFINE_TRACE_FN(name, reg, unreg)
+	DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args))
 
 #undef TRACE_EVENT_NOP
 #define TRACE_EVENT_NOP(name, proto, args, struct, assign, print)
@@ -54,15 +54,15 @@
 
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args) \
-	DEFINE_TRACE(name)
+	DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
 
 #undef DEFINE_EVENT_FN
 #define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg) \
-	DEFINE_TRACE_FN(name, reg, unreg)
+	DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args))
 
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
-	DEFINE_TRACE(name)
+	DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
 
 #undef DEFINE_EVENT_CONDITION
 #define DEFINE_EVENT_CONDITION(template, name, proto, args, cond) \
@@ -70,7 +70,7 @@
 
 #undef DECLARE_TRACE
 #define DECLARE_TRACE(name, proto, args)	\
-	DEFINE_TRACE(name)
+	DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
 
 #undef TRACE_INCLUDE
 #undef __TRACE_INCLUDE
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -127,7 +127,7 @@ static void debug_print_probes(struct tr
 
 static struct tracepoint_func *
 func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
-	 int prio)
+	 int prio, int *tot_probes)
 {
 	struct tracepoint_func *old, *new;
 	int nr_probes = 0;
@@ -170,11 +170,12 @@ func_add(struct tracepoint_func **funcs,
 	new[nr_probes + 1].func = NULL;
 	*funcs = new;
 	debug_print_probes(*funcs);
+	*tot_probes = nr_probes + 1;
 	return old;
 }
 
 static void *func_remove(struct tracepoint_func **funcs,
-		struct tracepoint_func *tp_func)
+		struct tracepoint_func *tp_func, int *left)
 {
 	int nr_probes = 0, nr_del = 0, i;
 	struct tracepoint_func *old, *new;
@@ -228,6 +229,7 @@ static int tracepoint_add_func(struct tr
 			       struct tracepoint_func *func, int prio)
 {
 	struct tracepoint_func *old, *tp_funcs;
+	int probes = 0;
 	int ret;
 
 	if (tp->regfunc && !static_key_enabled(&tp->key)) {
@@ -238,7 +240,7 @@ static int tracepoint_add_func(struct tr
 
 	tp_funcs = rcu_dereference_protected(tp->funcs,
 			lockdep_is_held(&tracepoints_mutex));
-	old = func_add(&tp_funcs, func, prio);
+	old = func_add(&tp_funcs, func, prio, &probes);
 	if (IS_ERR(old)) {
 		WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
 		return PTR_ERR(old);
@@ -253,6 +255,10 @@ static int tracepoint_add_func(struct tr
 	rcu_assign_pointer(tp->funcs, tp_funcs);
 	if (!static_key_enabled(&tp->key))
 		static_key_slow_inc(&tp->key);
+
+	__static_call_update(tp->static_call_key, tp->static_call_tramp,
+			     probes == 1 ? tp_funcs->func : tp->iterator);
+
 	release_probes(old);
 	return 0;
 }
@@ -267,10 +273,11 @@ static int tracepoint_remove_func(struct
 		struct tracepoint_func *func)
 {
 	struct tracepoint_func *old, *tp_funcs;
+	int probes_left = 0;
 
 	tp_funcs = rcu_dereference_protected(tp->funcs,
 			lockdep_is_held(&tracepoints_mutex));
-	old = func_remove(&tp_funcs, func);
+	old = func_remove(&tp_funcs, func, &probes_left);
 	if (IS_ERR(old)) {
 		WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
 		return PTR_ERR(old);
@@ -284,6 +291,10 @@ static int tracepoint_remove_func(struct
 		if (static_key_enabled(&tp->key))
 			static_key_slow_dec(&tp->key);
 	}
+
+	__static_call_update(tp->static_call_key, tp->static_call_tramp,
+			     probes_left == 1 ? tp_funcs->func : tp->iterator);
+
 	rcu_assign_pointer(tp->funcs, tp_funcs);
 	release_probes(old);
 	return 0;



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 09/13] x86/alternatives: Teach text_poke_bp() to emulate RET
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (7 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 08/13] tracepoints: Use static_call Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 10/13] static_call: Add static_cond_call() Peter Zijlstra
                     ` (4 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Future patches will need to poke a RET instruction, provide the
infrastructure required for this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/text-patching.h |   16 ++++++++++++++++
 arch/x86/kernel/alternative.c        |    6 ++++++
 2 files changed, 22 insertions(+)

--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -64,6 +64,9 @@ static inline void int3_emulate_jmp(stru
 #define INT3_INSN_SIZE		1
 #define INT3_INSN_OPCODE	0xCC
 
+#define RET_INSN_SIZE		1
+#define RET_INSN_OPCODE		0xC3
+
 #define CALL_INSN_SIZE		5
 #define CALL_INSN_OPCODE	0xE8
 
@@ -88,11 +91,24 @@ static inline void int3_emulate_push(str
 	*(unsigned long *)regs->sp = val;
 }
 
+static inline unsigned long int3_emulate_pop(struct pt_regs *regs)
+{
+	unsigned long val = *(unsigned long *)regs->sp;
+	regs->sp += sizeof(unsigned long);
+	return val;
+}
+
 static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
 {
 	int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
 	int3_emulate_jmp(regs, func);
 }
+
+static inline void int3_emulate_ret(struct pt_regs *regs)
+{
+	unsigned long ip = int3_emulate_pop(regs);
+	int3_emulate_jmp(regs, ip);
+}
 #endif /* !CONFIG_UML_X86 */
 
 #endif /* _ASM_X86_TEXT_PATCHING_H */
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1014,6 +1014,10 @@ int notrace poke_int3_handler(struct pt_
 		 */
 		return 0;
 
+	case RET_INSN_OPCODE:
+		int3_emulate_ret(regs);
+		break;
+
 	case CALL_INSN_OPCODE:
 		int3_emulate_call(regs, (long)ip + tp->rel32);
 		break;
@@ -1146,6 +1150,7 @@ void text_poke_loc_init(struct text_poke
 
 	switch (tp->opcode) {
 	case INT3_INSN_OPCODE:
+	case RET_INSN_OPCODE:
 		break;
 
 	case CALL_INSN_OPCODE:
@@ -1260,6 +1265,7 @@ void *text_gen_insn(u8 opcode, unsigned
 
 	switch(opcode) {
 	__CASE(INT3);
+	__CASE(RET);
 	__CASE(CALL);
 	__CASE(JMP32);
 	__CASE(JMP8);



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 10/13] static_call: Add static_cond_call()
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (8 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 09/13] x86/alternatives: Teach text_poke_bp() to emulate RET Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 11/13] static_call: Handle tail-calls Peter Zijlstra
                     ` (3 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Extend the static_call infrastructure to optimize the following common
pattern:

	if (func_ptr)
		func_ptr(args...)

For the trampoline (which is in effect a tail-call), we patch the
JMP.d32 into a RET, which then directly consumes the trampoline call.

For the in-line sites we replace the CALL with a NOP5.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/static_call.c |   42 ++++++++++++++++++++++++++++++++----------
 include/linux/static_call.h   |    7 +++++++
 2 files changed, 39 insertions(+), 10 deletions(-)

--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -4,19 +4,41 @@
 #include <linux/bug.h>
 #include <asm/text-patching.h>
 
-static void __static_call_transform(void *insn, u8 opcode, void *func)
+enum insn_type {
+	call = 0, /* site call */
+	nop = 1,  /* site cond-call */
+	jmp = 2,  /* tramp / site tail-call */
+	ret = 3,  /* tramp / site cond-tail-call */
+};
+
+static void __static_call_transform(void *insn, enum insn_type type, void *func)
 {
-	const void *code = text_gen_insn(opcode, (long)insn, (long)func);
+	int size = CALL_INSN_SIZE;
+	const void *code;
 
-	if (WARN_ONCE(*(u8 *)insn != opcode,
-		      "unexpected static call insn opcode 0x%x at %pS\n",
-		      opcode, insn))
-		return;
+	switch (type) {
+	case call:
+		code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+		break;
+
+	case nop:
+		code = ideal_nops[NOP_ATOMIC5];
+		break;
+
+	case jmp:
+		code = text_gen_insn(JMP32_INSN_OPCODE, insn, func);
+		break;
+
+	case ret:
+		code = text_gen_insn(RET_INSN_OPCODE, insn, func);
+		size = RET_INSN_SIZE;
+		break;
+	}
 
-	if (memcmp(insn, code, CALL_INSN_SIZE) == 0)
+	if (memcmp(insn, code, size) == 0)
 		return;
 
-	text_poke_bp(insn, code, CALL_INSN_SIZE, NULL);
+	text_poke_bp(insn, code, size, NULL);
 }
 
 void arch_static_call_transform(void *site, void *tramp, void *func)
@@ -24,10 +46,10 @@ void arch_static_call_transform(void *si
 	mutex_lock(&text_mutex);
 
 	if (tramp)
-		__static_call_transform(tramp, JMP32_INSN_OPCODE, func);
+		__static_call_transform(tramp, jmp + !func, func);
 
 	if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site)
-		__static_call_transform(site, CALL_INSN_OPCODE, func);
+		__static_call_transform(site, !func, func);
 
 	mutex_unlock(&text_mutex);
 }
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -17,6 +17,7 @@
  *   DECLARE_STATIC_CALL(name, func);
  *   DEFINE_STATIC_CALL(name, func);
  *   static_call(name)(args...);
+ *   static_cond_call(name)(args...)
  *   static_call_update(name, func);
  *
  * Usage example:
@@ -105,6 +106,7 @@ extern int static_call_text_reserved(voi
 	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
 
 #define static_call(name)	STATIC_CALL_TRAMP(name)
+#define static_cond_call(name)	STATIC_CALL_TRAMP(name)
 
 #define EXPORT_STATIC_CALL(name)					\
 	EXPORT_SYMBOL(STATIC_CALL_NAME(name));				\
@@ -128,6 +130,7 @@ struct static_call_key {
 	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
 
 #define static_call(name)	STATIC_CALL_TRAMP(name)
+#define static_cond_call(name)	STATIC_CALL_TRAMP(name)
 
 static inline
 void __static_call_update(struct static_call_key *key, void *tramp, void *func)
@@ -161,6 +164,10 @@ struct static_call_key {
 #define static_call(name)						\
 	((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
 
+#define static_cond_call(name)						\
+	if (STATIC_CALL_NAME(name).func)				\
+		((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
+
 static inline
 void __static_call_update(struct static_call_key *key, void *tramp, void *func)
 {



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 11/13] static_call: Handle tail-calls
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (9 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 10/13] static_call: Add static_cond_call() Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [PATCH v2 12/13] static_call: Allow early init Peter Zijlstra
                     ` (2 subsequent siblings)
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

GCC can turn our static_call(name)(args...) into a tail call, in which
case we get a JMP.d32 into the trampoline (which then does a further
tail-call).

Teach objtool to recognise and mark these in .static_call_sites and
adjust the code patching to deal with this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/static_call.c           |    4 ++--
 include/linux/static_call.h             |    4 ++--
 include/linux/static_call_types.h       |    7 +++++++
 kernel/static_call.c                    |   21 +++++++++++++--------
 tools/include/linux/static_call_types.h |    7 +++++++
 tools/objtool/check.c                   |   18 +++++++++++++-----
 6 files changed, 44 insertions(+), 17 deletions(-)

--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -41,7 +41,7 @@ static void __static_call_transform(void
 	text_poke_bp(insn, code, size, NULL);
 }
 
-void arch_static_call_transform(void *site, void *tramp, void *func)
+void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
 {
 	mutex_lock(&text_mutex);
 
@@ -49,7 +49,7 @@ void arch_static_call_transform(void *si
 		__static_call_transform(tramp, jmp + !func, func);
 
 	if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site)
-		__static_call_transform(site, !func, func);
+		__static_call_transform(site, 2*tail + !func, func);
 
 	mutex_unlock(&text_mutex);
 }
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -64,7 +64,7 @@
 /*
  * Either @site or @tramp can be NULL.
  */
-extern void arch_static_call_transform(void *site, void *tramp, void *func);
+extern void arch_static_call_transform(void *site, void *tramp, void *func, bool tail);
 #endif
 
 
@@ -137,7 +137,7 @@ void __static_call_update(struct static_
 {
 	cpus_read_lock();
 	WRITE_ONCE(key->func, func);
-	arch_static_call_transform(NULL, tramp, func);
+	arch_static_call_transform(NULL, tramp, func, false);
 	cpus_read_unlock();
 }
 
--- a/include/linux/static_call_types.h
+++ b/include/linux/static_call_types.h
@@ -13,6 +13,13 @@
 #define STATIC_CALL_TRAMP_STR(name) __stringify(STATIC_CALL_TRAMP(name))
 
 /*
+ * Flags in the low bits of static_call_site::key.
+ */
+#define STATIC_CALL_SITE_TAIL 1UL	/* tail call */
+#define STATIC_CALL_SITE_INIT 2UL	/* init section */
+#define STATIC_CALL_SITE_FLAGS 3UL
+
+/*
  * The static call site table needs to be created by external tooling (objtool
  * or a compiler plugin).
  */
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -15,8 +15,6 @@ extern struct static_call_site __start_s
 
 static bool static_call_initialized;
 
-#define STATIC_CALL_INIT 1UL
-
 /* mutex to protect key modules/sites */
 static DEFINE_MUTEX(static_call_mutex);
 
@@ -39,18 +37,23 @@ static inline void *static_call_addr(str
 static inline struct static_call_key *static_call_key(const struct static_call_site *site)
 {
 	return (struct static_call_key *)
-		(((long)site->key + (long)&site->key) & ~STATIC_CALL_INIT);
+		(((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS);
 }
 
 /* These assume the key is word-aligned. */
 static inline bool static_call_is_init(struct static_call_site *site)
 {
-	return ((long)site->key + (long)&site->key) & STATIC_CALL_INIT;
+	return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT;
+}
+
+static inline bool static_call_is_tail(struct static_call_site *site)
+{
+	return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL;
 }
 
 static inline void static_call_set_init(struct static_call_site *site)
 {
-	site->key = ((long)static_call_key(site) | STATIC_CALL_INIT) -
+	site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) -
 		    (long)&site->key;
 }
 
@@ -104,7 +107,7 @@ void __static_call_update(struct static_
 
 	key->func = func;
 
-	arch_static_call_transform(NULL, tramp, func);
+	arch_static_call_transform(NULL, tramp, func, false);
 
 	/*
 	 * If uninitialized, we'll not update the callsites, but they still
@@ -153,7 +156,8 @@ void __static_call_update(struct static_
 				continue;
 			}
 
-			arch_static_call_transform(site_addr, NULL, func);
+			arch_static_call_transform(site_addr, NULL, func,
+				static_call_is_tail(site));
 		}
 	}
 
@@ -197,7 +201,8 @@ static int __static_call_init(struct mod
 			key->next = site_mod;
 		}
 
-		arch_static_call_transform(site_addr, NULL, key->func);
+		arch_static_call_transform(site_addr, NULL, key->func,
+				static_call_is_tail(site));
 	}
 
 	return 0;
--- a/tools/include/linux/static_call_types.h
+++ b/tools/include/linux/static_call_types.h
@@ -13,6 +13,13 @@
 #define STATIC_CALL_TRAMP_STR(name) __stringify(STATIC_CALL_TRAMP(name))
 
 /*
+ * Flags in the low bits of static_call_site::key.
+ */
+#define STATIC_CALL_SITE_TAIL 1UL	/* tail call */
+#define STATIC_CALL_SITE_INIT 2UL	/* init section */
+#define STATIC_CALL_SITE_FLAGS 3UL
+
+/*
  * The static call site table needs to be created by external tooling (objtool
  * or a compiler plugin).
  */
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -585,6 +585,10 @@ static int add_jump_destinations(struct
 		} else {
 			/* external sibling call */
 			insn->call_dest = rela->sym;
+			if (insn->call_dest->static_call_tramp) {
+				list_add_tail(&insn->static_call_node,
+					      &file->static_call_list);
+			}
 			continue;
 		}
 
@@ -636,6 +640,10 @@ static int add_jump_destinations(struct
 
 				/* internal sibling call */
 				insn->call_dest = insn->jump_dest->func;
+				if (insn->call_dest->static_call_tramp) {
+					list_add_tail(&insn->static_call_node,
+						      &file->static_call_list);
+				}
 			}
 		}
 	}
@@ -1348,6 +1356,10 @@ static int decode_sections(struct objtoo
 	if (ret)
 		return ret;
 
+	ret = read_static_call_tramps(file);
+	if (ret)
+		return ret;
+
 	ret = add_jump_destinations(file);
 	if (ret)
 		return ret;
@@ -1372,10 +1384,6 @@ static int decode_sections(struct objtoo
 	if (ret)
 		return ret;
 
-	ret = read_static_call_tramps(file);
-	if (ret)
-		return ret;
-
 	return 0;
 }
 
@@ -2505,7 +2513,7 @@ static int create_static_call_sections(s
 		}
 		memset(rela, 0, sizeof(*rela));
 		rela->sym = key_sym;
-		rela->addend = 0;
+		rela->addend = is_sibling_call(insn) ? STATIC_CALL_SITE_TAIL : 0;
 		rela->type = R_X86_64_PC32;
 		rela->offset = idx * sizeof(struct static_call_site) + 4;
 		list_add_tail(&rela->list, &rela_sec->rela_list);



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [PATCH v2 12/13] static_call: Allow early init
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (10 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 11/13] static_call: Handle tail-calls Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07  8:27   ` [RFC][PATCH v2 13/13] x86/perf, static_call: Optimize x86_pmu methods Peter Zijlstra
  2019-10-07 11:33   ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

In order to use static_call() to wire up x86_pmu, we need to
initialize earlier; copy some of the tricks from jump_label to enable
this.

Primarily we overload key->next to store a sites pointer when there
are no modules, this avoids having to use kmalloc() to initialize the
sites and allows us to run much earlier.

(arguably, this is much much earlier than needed for perf, but it
might allow other uses.)

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/setup.c       |    2 +
 arch/x86/kernel/static_call.c |    3 ++
 include/linux/static_call.h   |   15 ++++++++++--
 kernel/static_call.c          |   52 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 67 insertions(+), 5 deletions(-)

--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -73,6 +73,7 @@
 #include <linux/jiffies.h>
 #include <linux/mem_encrypt.h>
 #include <linux/sizes.h>
+#include <linux/static_call.h>
 
 #include <linux/usb/xhci-dbgp.h>
 #include <video/edid.h>
@@ -896,6 +897,7 @@ void __init setup_arch(char **cmdline_p)
 	early_cpu_init();
 	arch_init_ideal_nops();
 	jump_label_init();
+	static_call_init();
 	early_ioremap_init();
 
 	setup_olpc_ofw_pgd();
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -38,6 +38,9 @@ static void __static_call_transform(void
 	if (memcmp(insn, code, size) == 0)
 		return;
 
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return text_poke_early(insn, code, size);
+
 	text_poke_bp(insn, code, size, NULL);
 }
 
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -81,6 +81,8 @@ extern void arch_static_call_transform(v
 
 #ifdef CONFIG_HAVE_STATIC_CALL_INLINE
 
+extern void __init static_call_init(void);
+
 struct static_call_mod {
 	struct static_call_mod *next;
 	struct module *mod; /* for vmlinux, mod == NULL */
@@ -89,7 +91,12 @@ struct static_call_mod {
 
 struct static_call_key {
 	void *func;
-	struct static_call_mod *next;
+	union {
+		/* bit0 => 0 - next, 1 - sites */
+		unsigned long type;
+		struct static_call_mod *next;
+		struct static_call_site *sites;
+	};
 };
 
 extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
@@ -100,7 +107,7 @@ extern int static_call_text_reserved(voi
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_NAME(name) = {		\
 		.func = _func,						\
-		.next = NULL,						\
+		.type = 1,						\
 	};								\
 	__ADDRESSABLE(STATIC_CALL_NAME(name));				\
 	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
@@ -118,6 +125,8 @@ extern int static_call_text_reserved(voi
 
 #elif defined(CONFIG_HAVE_STATIC_CALL)
 
+static inline void static_call_init(void) { }
+
 struct static_call_key {
 	void *func;
 };
@@ -151,6 +160,8 @@ static inline int static_call_text_reser
 
 #else /* Generic implementation */
 
+static inline void static_call_init(void) { }
+
 struct static_call_key {
 	void *func;
 };
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -94,10 +94,31 @@ static inline void static_call_sort_entr
 	     static_call_site_cmp, static_call_site_swap);
 }
 
+static inline bool static_call_key_has_next(struct static_call_key *key)
+{
+	return !(key->type & 1);
+}
+
+static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
+{
+	if (static_call_key_has_next(key))
+		return key->next->next;
+
+	return NULL;
+}
+
+static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
+{
+	if (static_call_key_has_next(key))
+		return key->next->sites;
+
+	return (struct static_call_site *)(key->type & ~1);
+}
+
 void __static_call_update(struct static_call_key *key, void *tramp, void *func)
 {
 	struct static_call_site *site, *stop;
-	struct static_call_mod *site_mod;
+	struct static_call_mod *site_mod, first;
 
 	cpus_read_lock();
 	static_call_lock();
@@ -116,7 +137,13 @@ void __static_call_update(struct static_
 	if (WARN_ON_ONCE(!static_call_initialized))
 		goto done;
 
-	for (site_mod = key->next; site_mod; site_mod = site_mod->next) {
+	first = (struct static_call_mod){
+		.next = static_call_key_next(key),
+		.mod = NULL,
+		.sites = static_call_key_sites(key),
+	};
+
+	for (site_mod = &first; site_mod; site_mod = site_mod->next) {
 		if (!site_mod->sites) {
 			/*
 			 * This can happen if the static call key is defined in
@@ -191,16 +218,35 @@ static int __static_call_init(struct mod
 		if (key != prev_key) {
 			prev_key = key;
 
+			if (!mod) {
+				key->sites = site;
+				key->type |= 1;
+				goto do_transform;
+			}
+
 			site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
 			if (!site_mod)
 				return -ENOMEM;
 
+			if (!static_call_key_has_next(key)) {
+				site_mod->mod = NULL;
+				site_mod->next = NULL;
+				site_mod->sites = static_call_key_sites(key);
+
+				key->next = site_mod;
+
+				site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+				if (!site_mod)
+					return -ENOMEM;
+			}
+
 			site_mod->mod = mod;
 			site_mod->sites = site;
 			site_mod->next = key->next;
 			key->next = site_mod;
 		}
 
+do_transform:
 		arch_static_call_transform(site_addr, NULL, key->func,
 				static_call_is_tail(site));
 	}
@@ -343,7 +389,7 @@ int static_call_text_reserved(void *star
 	return ret;
 }
 
-static void __init static_call_init(void)
+void __init static_call_init(void)
 {
 	int ret;
 



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH v2 13/13] x86/perf, static_call: Optimize x86_pmu methods
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (11 preceding siblings ...)
  2019-10-07  8:27   ` [PATCH v2 12/13] static_call: Allow early init Peter Zijlstra
@ 2019-10-07  8:27   ` Peter Zijlstra
  2019-10-07 11:33   ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:27 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

Replace many of the indirect calls with static_call().

XXX run performance numbers

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/events/core.c |  136 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 98 insertions(+), 38 deletions(-)

--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -28,6 +28,7 @@
 #include <linux/bitops.h>
 #include <linux/device.h>
 #include <linux/nospec.h>
+#include <linux/static_call.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -51,6 +52,45 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu
 
 DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
 
+static void _x86_pmu_add(struct perf_event *event) { }
+static void _x86_pmu_del(struct perf_event *event) { }
+static void _x86_pmu_read(struct perf_event *event) { x86_perf_event_update(event); }
+static void _x86_pmu_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { }
+static void _x86_pmu_drain_pebs(struct pt_regs *regs) { }
+static void _x86_pmu_pebs_aliases(struct perf_event *event) { }
+static void _x86_pmu_start_scheduling(struct cpu_hw_events *cpuc) { }
+static void _x86_pmu_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) { }
+static void _x86_pmu_stop_scheduling(struct cpu_hw_events *cpuc) { }
+static void _x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) { }
+
+DEFINE_STATIC_CALL(x86_pmu_handle_irq, x86_pmu_handle_irq);
+DEFINE_STATIC_CALL(x86_pmu_disable_all, x86_pmu_disable_all);
+DEFINE_STATIC_CALL(x86_pmu_enable_all, x86_pmu_enable_all);
+DEFINE_STATIC_CALL(x86_pmu_enable, x86_pmu_enable_event);
+DEFINE_STATIC_CALL(x86_pmu_disable, x86_pmu_disable_event);
+
+DEFINE_STATIC_CALL(x86_pmu_add, _x86_pmu_add);
+DEFINE_STATIC_CALL(x86_pmu_del, _x86_pmu_del);
+DEFINE_STATIC_CALL(x86_pmu_read, _x86_pmu_read);
+
+DEFINE_STATIC_CALL(x86_pmu_schedule_events, x86_schedule_events);
+
+// addr_offset
+// rdpmc_index
+// event_map
+
+DEFINE_STATIC_CALL(x86_pmu_get_event_constraints, x86_get_event_constraints);
+DEFINE_STATIC_CALL(x86_pmu_put_event_constraints, _x86_pmu_put_event_constraints);
+
+DEFINE_STATIC_CALL(x86_pmu_drain_pebs, _x86_pmu_drain_pebs);
+DEFINE_STATIC_CALL(x86_pmu_pebs_aliases, _x86_pmu_pebs_aliases);
+
+DEFINE_STATIC_CALL(x86_pmu_start_scheduling, _x86_pmu_start_scheduling);
+DEFINE_STATIC_CALL(x86_pmu_commit_scheduling, _x86_pmu_commit_scheduling);
+DEFINE_STATIC_CALL(x86_pmu_stop_scheduling, _x86_pmu_stop_scheduling);
+
+DEFINE_STATIC_CALL(x86_pmu_sched_task, _x86_pmu_sched_task);
+
 u64 __read_mostly hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -651,7 +691,7 @@ static void x86_pmu_disable(struct pmu *
 	cpuc->enabled = 0;
 	barrier();
 
-	x86_pmu.disable_all();
+	static_call(x86_pmu_disable_all)();
 }
 
 void x86_pmu_enable_all(int added)
@@ -884,8 +924,7 @@ int x86_schedule_events(struct cpu_hw_ev
 	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
 		n0 -= cpuc->n_txn;
 
-	if (x86_pmu.start_scheduling)
-		x86_pmu.start_scheduling(cpuc);
+	static_cond_call(x86_pmu_start_scheduling)(cpuc);
 
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 		c = cpuc->event_constraint[i];
@@ -902,7 +941,7 @@ int x86_schedule_events(struct cpu_hw_ev
 		 * change due to external factors (sibling state, allow_tfa).
 		 */
 		if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
-			c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
+			c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
 			cpuc->event_constraint[i] = c;
 		}
 
@@ -969,8 +1008,7 @@ int x86_schedule_events(struct cpu_hw_ev
 	if (!unsched && assign) {
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
-			if (x86_pmu.commit_scheduling)
-				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
+			static_cond_call(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
 		}
 	} else {
 		for (i = n0; i < n; i++) {
@@ -979,15 +1017,13 @@ int x86_schedule_events(struct cpu_hw_ev
 			/*
 			 * release events that failed scheduling
 			 */
-			if (x86_pmu.put_event_constraints)
-				x86_pmu.put_event_constraints(cpuc, e);
+			static_cond_call(x86_pmu_put_event_constraints)(cpuc, e);
 
 			cpuc->event_constraint[i] = NULL;
 		}
 	}
 
-	if (x86_pmu.stop_scheduling)
-		x86_pmu.stop_scheduling(cpuc);
+	static_cond_call(x86_pmu_stop_scheduling)(cpuc);
 
 	return unsched ? -EINVAL : 0;
 }
@@ -1174,7 +1210,7 @@ static void x86_pmu_enable(struct pmu *p
 	cpuc->enabled = 1;
 	barrier();
 
-	x86_pmu.enable_all(added);
+	static_call(x86_pmu_enable_all)(added);
 }
 
 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1288,7 +1324,7 @@ static int x86_pmu_add(struct perf_event
 	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
 		goto done_collect;
 
-	ret = x86_pmu.schedule_events(cpuc, n, assign);
+	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
 	if (ret)
 		goto out;
 	/*
@@ -1306,13 +1342,11 @@ static int x86_pmu_add(struct perf_event
 	cpuc->n_added += n - n0;
 	cpuc->n_txn += n - n0;
 
-	if (x86_pmu.add) {
-		/*
-		 * This is before x86_pmu_enable() will call x86_pmu_start(),
-		 * so we enable LBRs before an event needs them etc..
-		 */
-		x86_pmu.add(event);
-	}
+	/*
+	 * This is before x86_pmu_enable() will call x86_pmu_start(),
+	 * so we enable LBRs before an event needs them etc..
+	 */
+	static_cond_call(x86_pmu_add)(event);
 
 	ret = 0;
 out:
@@ -1340,7 +1374,7 @@ static void x86_pmu_start(struct perf_ev
 	cpuc->events[idx] = event;
 	__set_bit(idx, cpuc->active_mask);
 	__set_bit(idx, cpuc->running);
-	x86_pmu.enable(event);
+	static_call(x86_pmu_enable)(event);
 	perf_event_update_userpage(event);
 }
 
@@ -1410,7 +1444,7 @@ void x86_pmu_stop(struct perf_event *eve
 	struct hw_perf_event *hwc = &event->hw;
 
 	if (test_bit(hwc->idx, cpuc->active_mask)) {
-		x86_pmu.disable(event);
+		static_call(x86_pmu_disable)(event);
 		__clear_bit(hwc->idx, cpuc->active_mask);
 		cpuc->events[hwc->idx] = NULL;
 		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
@@ -1460,8 +1494,7 @@ static void x86_pmu_del(struct perf_even
 	if (i >= cpuc->n_events - cpuc->n_added)
 		--cpuc->n_added;
 
-	if (x86_pmu.put_event_constraints)
-		x86_pmu.put_event_constraints(cpuc, event);
+	static_cond_call(x86_pmu_put_event_constraints)(cpuc, event);
 
 	/* Delete the array entry. */
 	while (++i < cpuc->n_events) {
@@ -1474,13 +1507,12 @@ static void x86_pmu_del(struct perf_even
 	perf_event_update_userpage(event);
 
 do_del:
-	if (x86_pmu.del) {
-		/*
-		 * This is after x86_pmu_stop(); so we disable LBRs after any
-		 * event can need them etc..
-		 */
-		x86_pmu.del(event);
-	}
+
+	/*
+	 * This is after x86_pmu_stop(); so we disable LBRs after any
+	 * event can need them etc..
+	 */
+	static_cond_call(x86_pmu_del)(event);
 }
 
 int x86_pmu_handle_irq(struct pt_regs *regs)
@@ -1558,7 +1590,7 @@ perf_event_nmi_handler(unsigned int cmd,
 		return NMI_DONE;
 
 	start_clock = sched_clock();
-	ret = x86_pmu.handle_irq(regs);
+	ret = static_call(x86_pmu_handle_irq)(regs);
 	finish_clock = sched_clock();
 
 	perf_sample_event_took(finish_clock - start_clock);
@@ -1765,6 +1797,32 @@ ssize_t x86_event_sysfs_show(char *page,
 static struct attribute_group x86_pmu_attr_group;
 static struct attribute_group x86_pmu_caps_group;
 
+static void x86_pmu_static_call_update(void)
+{
+	static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
+	static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
+	static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
+	static_call_update(x86_pmu_enable, x86_pmu.enable);
+	static_call_update(x86_pmu_disable, x86_pmu.disable);
+
+	static_call_update(x86_pmu_add, x86_pmu.add);
+	static_call_update(x86_pmu_del, x86_pmu.del);
+	static_call_update(x86_pmu_read, x86_pmu.read);
+
+	static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
+	static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
+	static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
+
+	static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
+	static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
+
+	static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
+	static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
+	static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
+
+	static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
+}
+
 static int __init init_hw_perf_events(void)
 {
 	struct x86_pmu_quirk *quirk;
@@ -1829,6 +1887,11 @@ static int __init init_hw_perf_events(vo
 	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
 	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
+	if (!x86_pmu.read)
+		x86_pmu.read = _x86_pmu_read;
+
+	x86_pmu_static_call_update();
+
 	/*
 	 * Install callbacks. Core will call them for each online
 	 * cpu.
@@ -1865,11 +1928,9 @@ static int __init init_hw_perf_events(vo
 }
 early_initcall(init_hw_perf_events);
 
-static inline void x86_pmu_read(struct perf_event *event)
+static void x86_pmu_read(struct perf_event *event)
 {
-	if (x86_pmu.read)
-		return x86_pmu.read(event);
-	x86_perf_event_update(event);
+	static_call(x86_pmu_read)(event);
 }
 
 /*
@@ -1946,7 +2007,7 @@ static int x86_pmu_commit_txn(struct pmu
 	if (!x86_pmu_initialized())
 		return -EAGAIN;
 
-	ret = x86_pmu.schedule_events(cpuc, n, assign);
+	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
 	if (ret)
 		return ret;
 
@@ -2239,8 +2300,7 @@ static const struct attribute_group *x86
 
 static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
-	if (x86_pmu.sched_task)
-		x86_pmu.sched_task(ctx, sched_in);
+	static_cond_call(x86_pmu_sched_task)(ctx, sched_in);
 }
 
 void perf_check_microcode(void)



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 0/9] Variable size jump_label support
  2019-10-07  9:02 [RESEND] everything text-poke: ftrace, modules, static_call and jump_label Peter Zijlstra
                   ` (2 preceding siblings ...)
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
@ 2019-10-07  8:44 ` Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 1/9] jump_label, x86: Strip ASM " Peter Zijlstra
                     ` (9 more replies)
  3 siblings, 10 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

These here patches are something I've been poking at for a while, enabling
jump_label to use 2 byte jumps/nops.

It _almost_ works :-/

That is, you can build some kernels with it (x86_64-defconfig for example works
just fine).

The problem comes when GCC generates a branch into another section, mostly
.text.unlikely. At that point GAS just gives up and throws a fit (more details
in the last patch).

Aside from anyone coming up with a really clever GAS trick, I don't see how we
can do this other than:

 - get binutils/gas 'fixed', for this there's two possible approaches:

  * add some extra operators such that something like:

      .set disp %[l_yes] ~ 1b ? (%[l_yes] - (1b + 2)) : 128

    works, where the new '~' infix operator would indicate left and right hand
    operands are from the same section, and the ?: conditional operator would be
    added to make it all work.

  * add a 'fake' mnemonic for x86 that directly generates the right NOP:

      nojmp{.d8,d32} %target

    which would completely mirror how the existing 'jmp{.d8,d32} %target'
    works, except it would emit single instruction NOP2/NOP5.

 - use 'jmp' and get objtool to rewrite the text. Steven has earlier proposed
   something like that (using recordmcount) and Linus hated that.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 1/9] jump_label, x86: Strip ASM jump_label support
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
@ 2019-10-07  8:44   ` " Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 2/9] jump_label, x86: Factor out the __jump_table generation Peter Zijlstra
                     ` (8 subsequent siblings)
  9 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

In prepration for variable size jump_label support; remove all ASM
bits that are not used.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/calling.h          |    2 +-
 arch/x86/include/asm/jump_label.h |   28 ++++------------------------
 2 files changed, 5 insertions(+), 25 deletions(-)

--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -337,7 +337,7 @@ For 32-bit we have the following convent
 .macro CALL_enter_from_user_mode
 #ifdef CONFIG_CONTEXT_TRACKING
 #ifdef CONFIG_JUMP_LABEL
-	STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+	STATIC_BRANCH_FALSE_LIKELY .Lafter_call_\@, context_tracking_enabled
 #endif
 	call enter_from_user_mode
 .Lafter_call_\@:
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -55,36 +55,16 @@ static __always_inline bool arch_static_
 
 #else	/* __ASSEMBLY__ */
 
-.macro STATIC_JUMP_IF_TRUE target, key, def
+.macro STATIC_BRANCH_FALSE_LIKELY target, key
 .Lstatic_jump_\@:
-	.if \def
 	/* Equivalent to "jmp.d32 \target" */
 	.byte		0xe9
-	.long		\target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
-	.else
-	.byte		STATIC_KEY_INIT_NOP
-	.endif
-	.pushsection __jump_table, "aw"
-	_ASM_ALIGN
-	.long		.Lstatic_jump_\@ - ., \target - .
-	_ASM_PTR	\key - .
-	.popsection
-.endm
+	.long		\target - (. + 4)
 
-.macro STATIC_JUMP_IF_FALSE target, key, def
-.Lstatic_jump_\@:
-	.if \def
-	.byte		STATIC_KEY_INIT_NOP
-	.else
-	/* Equivalent to "jmp.d32 \target" */
-	.byte		0xe9
-	.long		\target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
-	.endif
 	.pushsection __jump_table, "aw"
 	_ASM_ALIGN
-	.long		.Lstatic_jump_\@ - ., \target - .
+	.long		.Lstatic_jump_\@ - .
+	.long		\target - .
 	_ASM_PTR	\key + 1 - .
 	.popsection
 .endm



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 2/9] jump_label, x86: Factor out the __jump_table generation
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 1/9] jump_label, x86: Strip ASM " Peter Zijlstra
@ 2019-10-07  8:44   ` Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 3/9] jump_label, x86: Remove init NOP optimization Peter Zijlstra
                     ` (7 subsequent siblings)
  9 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

Both arch_static_branch() and arch_static_branch_jump() have the same
blurb to generate the __jump_table entry, share it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/jump_label.h |   24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -20,15 +20,19 @@
 #include <linux/stringify.h>
 #include <linux/types.h>
 
+#define JUMP_TABLE_ENTRY				\
+	".pushsection __jump_table,  \"aw\" \n\t"	\
+	_ASM_ALIGN "\n\t"				\
+	".long 1b - . \n\t"				\
+	".long %l[l_yes] - . \n\t"			\
+	_ASM_PTR "%c0 + %c1 - .\n\t"			\
+	".popsection \n\t"
+
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:"
 		".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
-		".pushsection __jump_table,  \"aw\" \n\t"
-		_ASM_ALIGN "\n\t"
-		".long 1b - ., %l[l_yes] - . \n\t"
-		_ASM_PTR "%c0 + %c1 - .\n\t"
-		".popsection \n\t"
+		JUMP_TABLE_ENTRY
 		: :  "i" (key), "i" (branch) : : l_yes);
 
 	return false;
@@ -39,13 +43,9 @@ static __always_inline bool arch_static_
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:"
-		".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
-		"2:\n\t"
-		".pushsection __jump_table,  \"aw\" \n\t"
-		_ASM_ALIGN "\n\t"
-		".long 1b - ., %l[l_yes] - . \n\t"
-		_ASM_PTR "%c0 + %c1 - .\n\t"
-		".popsection \n\t"
+		".byte 0xe9 \n\t"
+		".long %l[l_yes] - (. + 4) \n\t"
+		JUMP_TABLE_ENTRY
 		: :  "i" (key), "i" (branch) : : l_yes);
 
 	return false;



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 3/9] jump_label, x86: Remove init NOP optimization
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 1/9] jump_label, x86: Strip ASM " Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 2/9] jump_label, x86: Factor out the __jump_table generation Peter Zijlstra
@ 2019-10-07  8:44   ` Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 4/9] jump_label, x86: Improve error when we fail expected text Peter Zijlstra
                     ` (6 subsequent siblings)
  9 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

Instead of checking if the emitted (default) NOP is the ideal NOP, and
conditionally rewrite the NOP, just rewrite the NOP.

This shouldn't be a problem because init / module_load uses
text_poke_early() which is cheap and this saves us from having to go
figure out which NOP to compare against when we go variable size.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/jump_label.c |   19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -137,21 +137,10 @@ __init_or_module void arch_jump_label_tr
 				      enum jump_label_type type)
 {
 	/*
-	 * This function is called at boot up and when modules are
-	 * first loaded. Check if the default nop, the one that is
-	 * inserted at compile time, is the ideal nop. If it is, then
-	 * we do not need to update the nop, and we can leave it as is.
-	 * If it is not, then we need to update the nop to the ideal nop.
+	 * Rewrite the NOP on init / module-load to ensure we got the ideal
+	 * nop.  Don't bother with trying to figure out what size and what nop
+	 * it should be for now, simply do an unconditional rewrite.
 	 */
-	if (jlstate == JL_STATE_START) {
-		const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
-		const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
-
-		if (memcmp(ideal_nop, default_nop, 5) != 0)
-			jlstate = JL_STATE_UPDATE;
-		else
-			jlstate = JL_STATE_NO_UPDATE;
-	}
-	if (jlstate == JL_STATE_UPDATE)
+	if (jlstate == JL_STATE_UPDATE || jlstate == JL_STATE_START)
 		jump_label_transform(entry, type, 1);
 }



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 4/9] jump_label, x86: Improve error when we fail expected text
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
                     ` (2 preceding siblings ...)
  2019-10-07  8:44   ` [RFC][PATCH 3/9] jump_label, x86: Remove init NOP optimization Peter Zijlstra
@ 2019-10-07  8:44   ` Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 5/9] jump_label, x86: Introduce jump_entry_size() Peter Zijlstra
                     ` (5 subsequent siblings)
  9 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

There is only a single usage site left, remove the function and extend
the print to include more information, like the expected text and the
patch type.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/jump_label.c |   23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -16,17 +16,6 @@
 #include <asm/alternative.h>
 #include <asm/text-patching.h>
 
-static void bug_at(const void *ip, int line)
-{
-	/*
-	 * The location is not an op that we were expecting.
-	 * Something went wrong. Crash the box, as something could be
-	 * corrupting the kernel.
-	 */
-	pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line);
-	BUG();
-}
-
 static const void *
 __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
 {
@@ -49,8 +38,16 @@ __jump_label_set_jump_code(struct jump_e
 		expect = code; line = __LINE__;
 	}
 
-	if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE))
-		bug_at(addr, line);
+	if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) {
+		/*
+		 * The location is not an op that we were expecting.
+		 * Something went wrong. Crash the box, as something could be
+		 * corrupting the kernel.
+		 */
+		pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) line:%d init:%d type:%d\n",
+				addr, addr, addr, expect, line, init, type);
+		BUG();
+	}
 
 	if (type == JUMP_LABEL_NOP)
 		code = ideal_nop;



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 5/9] jump_label, x86: Introduce jump_entry_size()
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
                     ` (3 preceding siblings ...)
  2019-10-07  8:44   ` [RFC][PATCH 4/9] jump_label, x86: Improve error when we fail expected text Peter Zijlstra
@ 2019-10-07  8:44   ` Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 6/9] jump_label, x86: Add variable length patching support Peter Zijlstra
                     ` (4 subsequent siblings)
  9 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

This allows architectures to have variable sized jumps.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/jump_label.h |    4 ++--
 arch/x86/kernel/jump_label.c      |    5 +++++
 include/linux/jump_label.h        |    9 +++++++++
 kernel/jump_label.c               |    2 +-
 4 files changed, 17 insertions(+), 3 deletions(-)

--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -4,8 +4,6 @@
 
 #define HAVE_JUMP_LABEL_BATCH
 
-#define JUMP_LABEL_NOP_SIZE 5
-
 #ifdef CONFIG_X86_64
 # define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
 #else
@@ -53,6 +51,8 @@ static __always_inline bool arch_static_
 	return true;
 }
 
+extern int arch_jump_entry_size(struct jump_entry *entry);
+
 #else	/* __ASSEMBLY__ */
 
 .macro STATIC_BRANCH_FALSE_LIKELY target, key
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -16,6 +16,11 @@
 #include <asm/alternative.h>
 #include <asm/text-patching.h>
 
+int arch_jump_entry_size(struct jump_entry *entry)
+{
+	return JMP32_INSN_SIZE;
+}
+
 static const void *
 __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
 {
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -176,6 +176,15 @@ static inline void jump_entry_set_init(s
 	entry->key |= 2;
 }
 
+static inline int jump_entry_size(struct jump_entry *entry)
+{
+#ifdef JUMP_LABEL_NOP_SIZE
+	return JUMP_LABEL_NOP_SIZE;
+#else
+	return arch_jump_entry_size(entry);
+#endif
+}
+
 #endif
 #endif
 
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -309,7 +309,7 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit)
 static int addr_conflict(struct jump_entry *entry, void *start, void *end)
 {
 	if (jump_entry_code(entry) <= (unsigned long)end &&
-	    jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+	    jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start)
 		return 1;
 
 	return 0;



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 6/9] jump_label, x86: Add variable length patching support
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
                     ` (4 preceding siblings ...)
  2019-10-07  8:44   ` [RFC][PATCH 5/9] jump_label, x86: Introduce jump_entry_size() Peter Zijlstra
@ 2019-10-07  8:44   ` Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 7/9] jump_label,objtool: Validate variable size jump labels Peter Zijlstra
                     ` (3 subsequent siblings)
  9 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

This allows the patching to to emit 2 byte JMP/NOP instruction in
addition to the 5 byte JMP/NOP we already did. This allows for more
compact code.

This code is not yet used, as we don't emit shorter code at compile
time yet.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/jump_label.h |    8 +++--
 arch/x86/include/asm/nops.h       |    1 
 arch/x86/kernel/jump_label.c      |   60 ++++++++++++++++++++++++--------------
 3 files changed, 45 insertions(+), 24 deletions(-)

--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -5,9 +5,11 @@
 #define HAVE_JUMP_LABEL_BATCH
 
 #ifdef CONFIG_X86_64
-# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
+# define STATIC_KEY_NOP2 P6_NOP2
+# define STATIC_KEY_NOP5 P6_NOP5_ATOMIC
 #else
-# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
+# define STATIC_KEY_NOP2 GENERIC_NOP2
+# define STATIC_KEY_NOP5 GENERIC_NOP5_ATOMIC
 #endif
 
 #include <asm/asm.h>
@@ -29,7 +31,7 @@
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:"
-		".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
+		".byte " __stringify(STATIC_KEY_NOP5) "\n\t"
 		JUMP_TABLE_ENTRY
 		: :  "i" (key), "i" (branch) : : l_yes);
 
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -5,6 +5,7 @@
 /*
  * Define nops for use with alternative() and for tracing.
  *
+ * *_NOP2 must be a single instruction
  * *_NOP5_ATOMIC must be a single instruction.
  */
 
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -21,50 +21,70 @@ int arch_jump_entry_size(struct jump_ent
 	return JMP32_INSN_SIZE;
 }
 
-static const void *
-__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
+struct jump_label_patch {
+	const void *code;
+	int size;
+};
+
+static struct jump_label_patch
+__jump_label_patch(struct jump_entry *entry, enum jump_label_type type, int init)
 {
-	const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
-	const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
-	const void *expect, *code;
+	const unsigned char default_nop2[] = { STATIC_KEY_NOP2 };
+	const unsigned char default_nop5[] = { STATIC_KEY_NOP5 };
+	const void *expect, *code, *nop, *default_nop;
 	const void *addr, *dest;
-	int line;
+	int line, size;
 
 	addr = (void *)jump_entry_code(entry);
 	dest = (void *)jump_entry_target(entry);
 
-	code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+	size = arch_jump_entry_size(entry);
+	switch (size) {
+	case JMP8_INSN_SIZE:
+		code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest);
+		default_nop = default_nop2;
+		nop = ideal_nops[2];
+		break;
+
+	case JMP32_INSN_SIZE:
+		code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+		default_nop = default_nop5;
+		nop = ideal_nops[NOP_ATOMIC5];
+		break;
+
+	default: BUG();
+	}
 
 	if (init) {
 		expect = default_nop; line = __LINE__;
 	} else if (type == JUMP_LABEL_JMP) {
-		expect = ideal_nop; line = __LINE__;
+		expect = nop; line = __LINE__;
 	} else {
 		expect = code; line = __LINE__;
 	}
 
-	if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) {
+	if (memcmp(addr, expect, size)) {
 		/*
 		 * The location is not an op that we were expecting.
 		 * Something went wrong. Crash the box, as something could be
 		 * corrupting the kernel.
 		 */
-		pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) line:%d init:%d type:%d\n",
-				addr, addr, addr, expect, line, init, type);
+		pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) line:%d init:%d size:%d type:%d\n",
+				addr, addr, addr, expect, line, init, size, type);
 		BUG();
 	}
 
 	if (type == JUMP_LABEL_NOP)
-		code = ideal_nop;
+		code = nop;
 
-	return code;
+	return (struct jump_label_patch){.code = code, .size = size};
 }
 
 static void inline __jump_label_transform(struct jump_entry *entry,
 					  enum jump_label_type type,
 					  int init)
 {
-	const void *opcode = __jump_label_set_jump_code(entry, type, init);
+	const struct jump_label_patch jlp = __jump_label_patch(entry, type, init);
 
 	/*
 	 * As long as only a single processor is running and the code is still
@@ -78,12 +98,11 @@ static void inline __jump_label_transfor
 	 * always nop being the 'currently valid' instruction
 	 */
 	if (init || system_state == SYSTEM_BOOTING) {
-		text_poke_early((void *)jump_entry_code(entry), opcode,
-				JUMP_LABEL_NOP_SIZE);
+		text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size);
 		return;
 	}
 
-	text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL);
+	text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
 }
 
 static void __ref jump_label_transform(struct jump_entry *entry,
@@ -104,7 +123,7 @@ void arch_jump_label_transform(struct ju
 bool arch_jump_label_transform_queue(struct jump_entry *entry,
 				     enum jump_label_type type)
 {
-	const void *opcode;
+	struct jump_label_patch jlp;
 
 	if (system_state == SYSTEM_BOOTING) {
 		/*
@@ -115,9 +134,8 @@ bool arch_jump_label_transform_queue(str
 	}
 
 	mutex_lock(&text_mutex);
-	opcode = __jump_label_set_jump_code(entry, type, 0);
-	text_poke_queue((void *)jump_entry_code(entry),
-			opcode, JUMP_LABEL_NOP_SIZE, NULL);
+	jlp = __jump_label_patch(entry, type, 0);
+	text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
 	mutex_unlock(&text_mutex);
 	return true;
 }



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 7/9] jump_label,objtool: Validate variable size jump labels
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
                     ` (5 preceding siblings ...)
  2019-10-07  8:44   ` [RFC][PATCH 6/9] jump_label, x86: Add variable length patching support Peter Zijlstra
@ 2019-10-07  8:44   ` Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 8/9] jump_label,objtool: Generate possible statistics Peter Zijlstra
                     ` (2 subsequent siblings)
  9 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

Since variable sized jump_label support is somewhat 'inspired', ensure
objtool validates the tricky bits. Specifically it is important that
the displacement for the 2 byte jump is limited to a single byte --
since this is the hardest part of the whole scheme and relies on
somewhat dodgy GAS tricks.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: "H.J. Lu" <hjl.tools@gmail.com>
---
 tools/objtool/check.c |   26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -817,8 +817,32 @@ static int handle_jump_alt(struct objtoo
 			   struct instruction *orig_insn,
 			   struct instruction **new_insn)
 {
-	if (orig_insn->type == INSN_NOP)
+	if ((orig_insn->len != 2) && (orig_insn->len != 5)) {
+		WARN_FUNC("jump_label: unsupported INSN length: %d",
+				orig_insn->sec, orig_insn->offset, orig_insn->len);
+		return -1;
+	}
+
+	if (orig_insn->type == INSN_NOP) {
+		long disp;
+
+		if (orig_insn->len == 2) {
+			if (special_alt->orig_sec != special_alt->new_sec) {
+				WARN_FUNC("jump_label: JMP8 crossing sections",
+						orig_insn->sec, orig_insn->offset);
+				return -1;
+			}
+
+			disp = special_alt->new_off - (special_alt->orig_off + 2);
+			if ((disp >> 31) != (disp >> 7)) {
+				WARN_FUNC("jump_label: JMP8 displacement not a byte",
+						orig_insn->sec, orig_insn->offset);
+				return -1;
+			}
+		}
+
 		return 0;
+	}
 
 	if (orig_insn->type != INSN_JUMP_UNCONDITIONAL) {
 		WARN_FUNC("unsupported instruction at jump label",



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 8/9] jump_label,objtool: Generate possible statistics
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
                     ` (6 preceding siblings ...)
  2019-10-07  8:44   ` [RFC][PATCH 7/9] jump_label,objtool: Validate variable size jump labels Peter Zijlstra
@ 2019-10-07  8:44   ` Peter Zijlstra
  2019-10-07  8:44   ` [RFC][PATCH 9/9] jump_label, x86: Enable JMP8/NOP2 support Peter Zijlstra
  2019-10-07 12:07   ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
  9 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

	x86_64-defconfig	x86_64-allmodconfig-UBSAN-KASAN

NOP2	1641			21796
JMP8	48			114

NOP5	1010			31042
JMP32	29			91

Which results in a possible 3*(1641+48) ~ 5k saving for defconfig
and 3*(21796+114) ~ 64k saving for allmodconfig.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: "H.J. Lu" <hjl.tools@gmail.com>
---
 tools/objtool/check.c |   43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -817,6 +817,8 @@ static int handle_jump_alt(struct objtoo
 			   struct instruction *orig_insn,
 			   struct instruction **new_insn)
 {
+	long disp;
+
 	if ((orig_insn->len != 2) && (orig_insn->len != 5)) {
 		WARN_FUNC("jump_label: unsupported INSN length: %d",
 				orig_insn->sec, orig_insn->offset, orig_insn->len);
@@ -824,9 +826,12 @@ static int handle_jump_alt(struct objtoo
 	}
 
 	if (orig_insn->type == INSN_NOP) {
-		long disp;
 
-		if (orig_insn->len == 2) {
+		switch (orig_insn->len) {
+		case 2:
+			WARN_FUNC("jump_label: NOP2 present",
+					orig_insn->sec, orig_insn->offset);
+
 			if (special_alt->orig_sec != special_alt->new_sec) {
 				WARN_FUNC("jump_label: JMP8 crossing sections",
 						orig_insn->sec, orig_insn->offset);
@@ -839,6 +844,20 @@ static int handle_jump_alt(struct objtoo
 						orig_insn->sec, orig_insn->offset);
 				return -1;
 			}
+			break;
+
+		case 5:
+			if (special_alt->orig_sec == special_alt->new_sec) {
+				disp = special_alt->new_off - (special_alt->orig_off + 2);
+				if ((disp>>31) == (disp>>7)) {
+					WARN_FUNC("jump_label: NOP2 possible",
+							orig_insn->sec, orig_insn->offset);
+					break;
+				}
+			}
+			WARN_FUNC("jump_label: NOP5 present",
+					orig_insn->sec, orig_insn->offset);
+			break;
 		}
 
 		return 0;
@@ -850,6 +869,26 @@ static int handle_jump_alt(struct objtoo
 		return -1;
 	}
 
+	switch (orig_insn->len) {
+	case 2:
+		WARN_FUNC("jump_label: JMP8 present",
+				orig_insn->sec, orig_insn->offset);
+		break;
+
+	case 5:
+		if (special_alt->orig_sec == special_alt->new_sec) {
+			disp = special_alt->new_off - (special_alt->orig_off + 2);
+			if ((disp>>31) == (disp>>7)) {
+				WARN_FUNC("jump_label: JMP8 possible",
+						orig_insn->sec, orig_insn->offset);
+				break;
+			}
+		}
+		WARN_FUNC("jump_label: JMP32 present",
+				orig_insn->sec, orig_insn->offset);
+		break;
+	}
+
 	*new_insn = list_next_entry(orig_insn, list);
 	return 0;
 }



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RFC][PATCH 9/9] jump_label, x86: Enable JMP8/NOP2 support
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
                     ` (7 preceding siblings ...)
  2019-10-07  8:44   ` [RFC][PATCH 8/9] jump_label,objtool: Generate possible statistics Peter Zijlstra
@ 2019-10-07  8:44   ` Peter Zijlstra
  2019-10-07 12:07   ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
  9 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  8:44 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, hjl.tools

Enable and emit short JMP/NOP jump_label entries.

A lot of the jumps are in fact short, like around tracepoints:

0000 0000000000000920 <native_read_msr>:                                   | 0000 0000000000000920 <native_read_msr>:
0000      920:  53                      push   %rbx                        | 0000      920:  53                      push   %rbx
0001      921:  89 f9                   mov    %edi,%ecx                   | 0001      921:  89 f9                   mov    %edi,%ecx
0003      923:  0f 32                   rdmsr                              | 0003      923:  0f 32                   rdmsr
0005      925:  48 c1 e2 20             shl    $0x20,%rdx                  | 0005      925:  48 c1 e2 20             shl    $0x20,%rdx
0009      929:  48 89 d3                mov    %rdx,%rbx                   | 0009      929:  48 89 d3                mov    %rdx,%rbx
000c      92c:  48 09 c3                or     %rax,%rbx                   | 000c      92c:  48 09 c3                or     %rax,%rbx
000f      92f:  0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)            \ 000f      92f:  66 90                   xchg   %ax,%ax
0014      934:  48 89 d8                mov    %rbx,%rax                   \ 0011      931:  48 89 d8                mov    %rbx,%rax
0017      937:  5b                      pop    %rbx                        \ 0014      934:  5b                      pop    %rbx
0018      938:  c3                      retq                               \ 0015      935:  c3                      retq
0019      939:  48 89 de                mov    %rbx,%rsi                   \ 0016      936:  48 89 de                mov    %rbx,%rsi
001c      93c:  31 d2                   xor    %edx,%edx                   \ 0019      939:  31 d2                   xor    %edx,%edx
001e      93e:  e8 00 00 00 00          callq  943 <native_read_msr+0x23>  \ 001b      93b:  e8 00 00 00 00          callq  940 <native_read_msr+0x20>
001f                    93f: R_X86_64_PLT32     do_trace_read_msr-0x4      \ 001c                    93c: R_X86_64_PLT32     do_trace_read_msr-0x4
0023      943:  48 89 d8                mov    %rbx,%rax                   \ 0020      940:  48 89 d8                mov    %rbx,%rax
0026      946:  5b                      pop    %rbx                        \ 0023      943:  5b                      pop    %rbx
0027      947:  c3                      retq                               \ 0024      944:  c3                      retq

.rela__jump_table
  000000000010  000200000002 R_X86_64_PC32     0000000000000000 .text + 92f
  000000000014  000200000002 R_X86_64_PC32     0000000000000000 .text + 939 (or 936)
  000000000018  014500000018 R_X86_64_PC64     0000000000000000 __tracepoint_read_msr + 8

The below patch works as long as the jump doesn't cross sections; the
moment GCC generates a branch crossing sections and feeds it into our
asm-goto things come apart like:

  /tmp/ccM70dCh.s: Assembler messages:
  /tmp/ccM70dCh.s: Error: invalid operands (.text.unlikely and .text sections) for `-' when setting `disp'
  ../arch/x86/include/asm/jump_label.h:39: Error: invalid operands (.text.unlikely and *ABS* sections) for `>>'
  ../arch/x86/include/asm/jump_label.h:39: Error: invalid operands (.text.unlikely and *ABS* sections) for `>>'

Which is really unfortunate since it is a completely sane thing to
happen. We really need a GAS extention to handle this :-/

All we really need is to detect the two offsets are from different
sections and punt to the 5 byte nop. But AFAICT there is nothing that
can do that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: "H.J. Lu" <hjl.tools@gmail.com>
---
 arch/x86/Kconfig                  |   10 ++++++++++
 arch/x86/include/asm/jump_label.h |   36 +++++++++++++++++++++++++++++++++++-
 arch/x86/kernel/jump_label.c      |   17 +++++++++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -230,6 +230,16 @@ config X86
 	select X86_FEATURE_NAMES		if PROC_FS
 	select PROC_PID_ARCH_STATUS		if PROC_FS
 
+#
+# This mostly depends on the asm ".nops 5" directive existing and emitting a
+# single instruction nop, this is true for x86_64, but not for i386, which
+# violates the single instruction constraint.
+#
+config CC_HAS_ASM_NOPS
+	def_bool y
+	depends on X86_64
+	depends on $(success,echo 'void foo(void) { asm inline (".nops 5"); }' | $(CC) -x c - -c -o /dev/null)
+
 config INSTRUCTION_DECODER
 	def_bool y
 	depends on KPROBES || PERF_EVENTS || UPROBES
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -4,6 +4,10 @@
 
 #define HAVE_JUMP_LABEL_BATCH
 
+#ifdef CONFIG_CC_HAS_ASM_NOPS
+#define HAVE_JUMP_LABEL_VARIABLE
+#endif
+
 #ifdef CONFIG_X86_64
 # define STATIC_KEY_NOP2 P6_NOP2
 # define STATIC_KEY_NOP5 P6_NOP5_ATOMIC
@@ -31,7 +35,29 @@
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:"
+#ifdef HAVE_JUMP_LABEL_VARIABLE
+		/*
+		 * This comes apart mightily when %[l_yes] and 1b are in
+		 * different sections; like for instance .text and
+		 * .text.unlikely. Sadly there is nothing to actually detect
+		 * and handle this case explicitly.
+		 *
+		 * GAS sucks!!
+		 */
+		".set disp, (%l[l_yes]) - (1b + 2) \n\t"
+		".set res, (disp >> 31) == (disp >> 7) \n\t"
+		".set is_byte, -res \n\t"
+		".set is_long, -(~res) \n\t"
+
+		/*
+		 * This relies on .nops:
+		 *  - matching the above STATIC_KEY_NOP* bytes
+		 *  - emitting a single instruction nop for 2 and 5 bytes.
+		 */
+		".nops (2*is_byte) + (5*is_long)\n\t"
+#else
 		".byte " __stringify(STATIC_KEY_NOP5) "\n\t"
+#endif
 		JUMP_TABLE_ENTRY
 		: :  "i" (key), "i" (branch) : : l_yes);
 
@@ -43,8 +69,13 @@ static __always_inline bool arch_static_
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
 	asm_volatile_goto("1:"
+#ifdef HAVE_JUMP_LABEL_VARIABLE
+		"jmp %l[l_yes] \n\t"
+#else
+		/* Equivalent to "jmp.d32 \target" */
 		".byte 0xe9 \n\t"
 		".long %l[l_yes] - (. + 4) \n\t"
+#endif
 		JUMP_TABLE_ENTRY
 		: :  "i" (key), "i" (branch) : : l_yes);
 
@@ -59,9 +90,12 @@ extern int arch_jump_entry_size(struct j
 
 .macro STATIC_BRANCH_FALSE_LIKELY target, key
 .Lstatic_jump_\@:
-	/* Equivalent to "jmp.d32 \target" */
+#ifdef HAVE_JUMP_LABEL_VARIABLE
+	jmp \target
+#else
 	.byte		0xe9
 	.long		\target - (. + 4)
+#endif
 
 	.pushsection __jump_table, "aw"
 	_ASM_ALIGN
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -18,7 +18,24 @@
 
 int arch_jump_entry_size(struct jump_entry *entry)
 {
+#ifdef HAVE_JUMP_LABEL_VARIABLE
+	struct insn insn;
+
+	/*
+	 * Because the instruction size heuristic doesn't purely rely on
+	 * displacement, but also on section, and we're hindered by GNU as UB
+	 * to emit the assemble time choice, we have to discover the size at
+	 * runtime.
+	 */
+	kernel_insn_init(&insn, (void *)jump_entry_code(entry), MAX_INSN_SIZE);
+	insn_get_length(&insn);
+	BUG_ON(!insn_complete(&insn));
+	BUG_ON(insn.length != 2 && insn.length != 5);
+
+	return insn.length;
+#else
 	return JMP32_INSN_SIZE;
+#endif
 }
 
 struct jump_label_patch {



^ permalink raw reply	[flat|nested] 96+ messages in thread

* [RESEND] everything text-poke: ftrace, modules, static_call and jump_label
@ 2019-10-07  9:02 Peter Zijlstra
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
                   ` (3 more replies)
  0 siblings, 4 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07  9:02 UTC (permalink / raw)
  To: x86
  Cc: peterz, linux-kernel, rostedt, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe


Because typing on a Monday is hard, here's a resend with the LKML address
corrected.

---

Hi All,

Here are 4 series that have accumulated in my queue for a while and I
figured I'd 'finish' and send out again.

The first series is rewriting x86/ftrace to use the normal text_poke()
interfaces we have, which rids us of the last major W^X violation and a
whole second implementation of the INT3 breakpoint code patching
infrastructure.

The second series cleans up various module stuff.

The third series is a refresh of the static_call() implementation.  I've
been meaning to use that to get rid of a number of indirect calls in the
x86 pmu implementation (RETPOLINE made them so much more painful), but
also other people have been asking for it. This series has a lot of new
patches since last time, most all a result of me trying to actually use
it.

The fourth series is something that has been languishing in my queue for
a long while and I figure I'd send it out to see if anybody has any
clever ideas on how to proceed with it.

They're all related, and the 3rd and 4rd series depend on the first two.
But instead of sending a giant 30+ patch series, I figured I'd send you 4
smaller series instead ;-)

Enjoy!


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v2 00/13] Add static_call()
  2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
                     ` (12 preceding siblings ...)
  2019-10-07  8:27   ` [RFC][PATCH v2 13/13] x86/perf, static_call: Optimize x86_pmu methods Peter Zijlstra
@ 2019-10-07 11:33   ` Peter Zijlstra
  13 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07 11:33 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Mon, Oct 07, 2019 at 10:27:08AM +0200, Peter Zijlstra wrote:
> This series, which depends on the previous two, introduces static_call().
> 
> static_call(), is the idea of static_branch() applied to indirect function
> calls. Remove a data load (indirection) by modifying the text.
> 
> These patches are still based on the work Josh did earlier, but incorporated
> feedback from the last posting and have a lot of extra patches which resulted
> from me trying to actually use static_call().
> 
> This still relies on objtool to generate the .static_call_sites section, mostly
> because this is a natural place for x86_64 and works for both GCC and LLVM.
> Other architectures can pick other means if/when they implement the inline
> patching. The out-of-line (aka. trampoline) variant doesn't require this.
> 

FWIW, Steve, if we were to do something like:

CFLAGS += -mfentry -mfentry_name=____static_call_fentry_tramp

And have:

struct static_call_key ____static_call_fentry = {
	.func = NULL,
	.next = 1,
};
asm(".pushsection .static_call.text, \"ax\" \n"
    ".align 4 \n"
    ".globl ____static_call_fentry_tramp \n"
    "____static_call_fentry_tramp: \n"
    "  ret \n"
    ".type ____static_call_fentry_tramp, @function \n"
    ".size ____static_call_fentry_tramp, . - ____static_call_fentry_tramp \n"
    ".popsection \n");

Then the whole function entry thing would automagicaly turn into
something like static_cond_call(fentry)(...);

Not saying we should do that, but we could ;-)

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v2 02/13] static_call: Add basic static call infrastructure
  2019-10-07  8:27   ` [PATCH v2 02/13] static_call: Add basic static call infrastructure Peter Zijlstra
@ 2019-10-07 11:33     ` Peter Zijlstra
  0 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07 11:33 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Mon, Oct 07, 2019 at 10:27:10AM +0200, Peter Zijlstra wrote:

> +#define STATIC_CALL_PREFIX	____static_call_

Yesterday I got an allmodconfig build complaining about symbols being
too long, in part due to this prefix. Should we change it to something
like: "__SC__" ?


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [RFC][PATCH 0/9] Variable size jump_label support
  2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
                     ` (8 preceding siblings ...)
  2019-10-07  8:44   ` [RFC][PATCH 9/9] jump_label, x86: Enable JMP8/NOP2 support Peter Zijlstra
@ 2019-10-07 12:07   ` Peter Zijlstra
  2019-10-07 12:55     ` Ingo Molnar
  9 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-07 12:07 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe, hjl.tools


In the failed thread Ingo posted:

>On Mon, Oct 07, 2019 at 01:26:06PM +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > These here patches are something I've been poking at for a while, 
> > enabling jump_label to use 2 byte jumps/nops.
> > 
> > It _almost_ works :-/
> > 
> > That is, you can build some kernels with it (x86_64-defconfig for 
> > example works just fine).
> > 
> > The problem comes when GCC generates a branch into another section, 
> > mostly .text.unlikely. At that point GAS just gives up and throws a fit 
> > (more details in the last patch).
> > 
> > Aside from anyone coming up with a really clever GAS trick, I don't see 
> > how we can do this other than:
> 
> >  - use 'jmp' and get objtool to rewrite the text. Steven has earlier proposed
> >    something like that (using recordmcount) and Linus hated that.
> 
> As long as GCC+GAS correctly generates a 2-byte or 5-byte JMP depending 
> on the target distance, the objtool solution should work fine, shouldn't 
> it?
> 
> I can see the recordmcount solution sucking, it would depend on early 
> kernel patchery. But build time patchery is something we already depend 
> on, so assuming some objtool catastrophy it's a more robust solution, 
> isn't it?

IIRC the recordmcount variant from Steve was also rewriting JMP8 to NOP2
at build time.

I dug this here link out of my IRC logs:

  https://lore.kernel.org/lkml/1318007374.4729.58.camel@gandalf.stny.rr.com/

Looking at that, part of the reason might've been running yet another
tool, instead of having one tool do everything.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [RFC][PATCH 0/9] Variable size jump_label support
  2019-10-07 12:07   ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
@ 2019-10-07 12:55     ` Ingo Molnar
  2019-10-07 15:08       ` Steven Rostedt
  0 siblings, 1 reply; 96+ messages in thread
From: Ingo Molnar @ 2019-10-07 12:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds,
	tglx, namit, hpa, luto, ard.biesheuvel, jpoimboe, hjl.tools


* Peter Zijlstra <peterz@infradead.org> wrote:

> IIRC the recordmcount variant from Steve was also rewriting JMP8 to NOP2
> at build time.
>
> I dug this here link out of my IRC logs:
> 
>   https://lore.kernel.org/lkml/1318007374.4729.58.camel@gandalf.stny.rr.com/

Ancient indeed ...

> Looking at that, part of the reason might've been running yet another 
> tool, instead of having one tool do everything.

Yeah - that too wouldn't be a problem with objtool, as we are running it 
anyway, right?

So I can see about 2 valid technical reasons why Linus would have 
objected to that old approach from Steve while finding the objtool 
approach more acceptable.

Basically the main assumption is that we better never run out of 
competent objtool experts... :-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [RFC][PATCH 0/9] Variable size jump_label support
  2019-10-07 12:55     ` Ingo Molnar
@ 2019-10-07 15:08       ` Steven Rostedt
  0 siblings, 0 replies; 96+ messages in thread
From: Steven Rostedt @ 2019-10-07 15:08 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, namit, hpa, luto, ard.biesheuvel, jpoimboe,
	hjl.tools

On Mon, 7 Oct 2019 14:55:19 +0200
Ingo Molnar <mingo@kernel.org> wrote:

> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > IIRC the recordmcount variant from Steve was also rewriting JMP8 to NOP2
> > at build time.
> >
> > I dug this here link out of my IRC logs:
> > 
> >   https://lore.kernel.org/lkml/1318007374.4729.58.camel@gandalf.stny.rr.com/  
> 
> Ancient indeed ...
> 
> > Looking at that, part of the reason might've been running yet another 
> > tool, instead of having one tool do everything.  
> 
> Yeah - that too wouldn't be a problem with objtool, as we are running it 
> anyway, right?
> 
> So I can see about 2 valid technical reasons why Linus would have 
> objected to that old approach from Steve while finding the objtool 
> approach more acceptable.
> 
> Basically the main assumption is that we better never run out of 
> competent objtool experts... :-)

Actually, even back then I said that it would be best to merge all the
tools into one (I just didn't have the time to implement it), and then
we could pull this off. I have one of my developers working to merge
record-mcount into objtool now (there's been some patches floating
around).

Then with a single tool, it shouldn't be controversial.

-- Steve

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 4/6] x86/alternatives: Add and use text_gen_insn() helper
  2019-10-07  8:17   ` [PATCH v3 4/6] x86/alternatives: Add and use text_gen_insn() helper Peter Zijlstra
@ 2019-10-08  6:23     ` Masami Hiramatsu
  2019-10-08  8:15       ` Peter Zijlstra
  0 siblings, 1 reply; 96+ messages in thread
From: Masami Hiramatsu @ 2019-10-08  6:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds,
	tglx, mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Mon, 07 Oct 2019 10:17:20 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Provide a simple helper function to create common instruction
> encodings.

Thanks for using correct INSN_OPCODE:)
This looks good to me.

Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>

Thanks,

> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
> Cc: Masami Hiramatsu <mhiramat@kernel.org>
> ---
>  arch/x86/include/asm/text-patching.h |    2 +
>  arch/x86/kernel/alternative.c        |   36 +++++++++++++++++++++++++++++++++++
>  arch/x86/kernel/jump_label.c         |   31 ++++++++++--------------------
>  arch/x86/kernel/kprobes/opt.c        |    7 ------
>  4 files changed, 50 insertions(+), 26 deletions(-)
> 
> --- a/arch/x86/include/asm/text-patching.h
> +++ b/arch/x86/include/asm/text-patching.h
> @@ -49,6 +49,8 @@ extern void text_poke_bp(void *addr, con
>  extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate);
>  extern void text_poke_finish(void);
>  
> +extern void *text_gen_insn(u8 opcode, const void *addr, const void *dest);
> +
>  extern int after_bootmem;
>  extern __ro_after_init struct mm_struct *poking_mm;
>  extern __ro_after_init unsigned long poking_addr;
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -1237,3 +1237,39 @@ void text_poke_bp(void *addr, const void
>  	text_poke_loc_init(&tp, addr, opcode, len, emulate);
>  	text_poke_bp_batch(&tp, 1);
>  }
> +
> +union text_poke_insn {
> +	u8 text[POKE_MAX_OPCODE_SIZE];
> +	struct {
> +		u8 opcode;
> +		s32 disp;
> +	} __attribute__((packed));
> +};
> +
> +void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
> +{
> +	static union text_poke_insn insn; /* text_mutex */
> +	int size = 0;
> +
> +	lockdep_assert_held(&text_mutex);
> +
> +	insn.opcode = opcode;
> +
> +#define __CASE(insn)	\
> +	case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break
> +
> +	switch(opcode) {
> +	__CASE(INT3);
> +	__CASE(CALL);
> +	__CASE(JMP32);
> +	__CASE(JMP8);
> +	}
> +
> +	if (size > 1) {
> +		insn.disp = (long)dest - (long)(addr + size);
> +		if (size == 2)
> +			BUG_ON((insn.disp >> 31) != (insn.disp >> 7));
> +	}
> +
> +	return &insn.text;
> +}
> --- a/arch/x86/kernel/jump_label.c
> +++ b/arch/x86/kernel/jump_label.c
> @@ -16,15 +16,7 @@
>  #include <asm/alternative.h>
>  #include <asm/text-patching.h>
>  
> -union jump_code_union {
> -	char code[JUMP_LABEL_NOP_SIZE];
> -	struct {
> -		char jump;
> -		int offset;
> -	} __attribute__((packed));
> -};
> -
> -static void bug_at(unsigned char *ip, int line)
> +static void bug_at(const void *ip, int line)
>  {
>  	/*
>  	 * The location is not an op that we were expecting.
> @@ -38,33 +30,32 @@ static void bug_at(unsigned char *ip, in
>  static const void *
>  __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
>  {
> -	static union jump_code_union code; /* relies on text_mutex */
>  	const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
>  	const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
> -	const void *expect;
> +	const void *expect, *code;
> +	const void *addr, *dest;
>  	int line;
>  
> -	lockdep_assert_held(&text_mutex);
> +	addr = (void *)jump_entry_code(entry);
> +	dest = (void *)jump_entry_target(entry);
>  
> -	code.jump = JMP32_INSN_OPCODE;
> -	code.offset = jump_entry_target(entry) -
> -		       (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
> +	code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
>  
>  	if (init) {
>  		expect = default_nop; line = __LINE__;
>  	} else if (type == JUMP_LABEL_JMP) {
>  		expect = ideal_nop; line = __LINE__;
>  	} else {
> -		expect = code.code; line = __LINE__;
> +		expect = code; line = __LINE__;
>  	}
>  
> -	if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
> -		bug_at((void *)jump_entry_code(entry), line);
> +	if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE))
> +		bug_at(addr, line);
>  
>  	if (type == JUMP_LABEL_NOP)
> -		memcpy(&code, ideal_nop, JUMP_LABEL_NOP_SIZE);
> +		code = ideal_nop;
>  
> -	return &code;
> +	return code;
>  }
>  
>  static void inline __jump_label_transform(struct jump_entry *entry,
> --- a/arch/x86/kernel/kprobes/opt.c
> +++ b/arch/x86/kernel/kprobes/opt.c
> @@ -447,18 +447,13 @@ void arch_optimize_kprobes(struct list_h
>  void arch_unoptimize_kprobe(struct optimized_kprobe *op)
>  {
>  	u8 insn_buff[RELATIVEJUMP_SIZE];
> -	u8 emulate_buff[RELATIVEJUMP_SIZE];
>  
>  	/* Set int3 to first byte for kprobes */
>  	insn_buff[0] = BREAKPOINT_INSTRUCTION;
>  	memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
>  
> -	emulate_buff[0] = RELATIVEJUMP_OPCODE;
> -	*(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn -
> -			((long)op->kp.addr + RELATIVEJUMP_SIZE));
> -
>  	text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
> -		     emulate_buff);
> +		     text_gen_insn(JMP32_INSN_OPCODE, op->kp.addr, op->optinsn.insn));
>  }
>  
>  /*
> 
> 


-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 4/6] x86/alternatives: Add and use text_gen_insn() helper
  2019-10-08  6:23     ` Masami Hiramatsu
@ 2019-10-08  8:15       ` Peter Zijlstra
  0 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-08  8:15 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: x86, linux-kernel, rostedt, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Tue, Oct 08, 2019 at 03:23:49PM +0900, Masami Hiramatsu wrote:
> On Mon, 07 Oct 2019 10:17:20 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > Provide a simple helper function to create common instruction
> > encodings.
> 
> Thanks for using correct INSN_OPCODE:)
> This looks good to me.
> 

Right, I have it on my todo list to convert all of kprobes over. But
feel tree to beat me to it :-)

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v2 3/4] module: Properly propagate MODULE_STATE_COMING failure
  2019-10-07  8:25   ` [PATCH v2 3/4] module: Properly propagate MODULE_STATE_COMING failure Peter Zijlstra
@ 2019-10-08 13:08     ` Miroslav Benes
  0 siblings, 0 replies; 96+ messages in thread
From: Miroslav Benes @ 2019-10-08 13:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds,
	tglx, mingo, namit, hpa, luto, ard.biesheuvel, Yonghong Song,
	Alexei Starovoitov, Daniel Borkmann, Song Liu, Jessica Yu,
	Martin KaFai Lau

On Mon, 7 Oct 2019, Peter Zijlstra wrote:

> Now that notifiers got unbroken; use the proper interface to handle
> notifier errors and propagate them.
> 
> There were already MODULE_STATE_COMING notifiers that failed; notably:
> 
>  - jump_label_module_notifier()
>  - tracepoint_module_notify()
>  - bpf_event_notify()
> 
> By propagating this error, we fix those users.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Cc: Yonghong Song <yhs@fb.com>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Song Liu <songliubraving@fb.com>
> Cc: Jessica Yu <jeyu@kernel.org>
> Cc: Martin KaFai Lau <kafai@fb.com>
> ---
>  kernel/module.c |   10 +++++++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
> 
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -3751,9 +3751,13 @@ static int prepare_coming_module(struct
>  	if (err)
>  		return err;
>  
> -	blocking_notifier_call_chain(&module_notify_list,
> -				     MODULE_STATE_COMING, mod);
> -	return 0;
> +	err = blocking_notifier_call_chain_robust(&module_notify_list,
> +			MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
> +	err = notifier_to_errno(err);
> +	if (err)
> +		klp_module_going(mod);
> +
> +	return err;
>  }

It looks almost ok. At least klp_ error handling is correct now. I see
only one possible problem. If there is an error in a MODULE_STATE_COMING 
notifier, all MODULE_STATE_GOING notifiers will be called with mod->state 
set to MODULE_STATE_COMING. Not nice. I don't think it is actually a 
problem, because all notifiers, that I checked, only use the correct value 
(MODULE_STATE_COMING or MODULE_STATE_GOING, coming from the function 
parameter) and not mod->state. Better to doublecheck though.

However, mod->state is not set to MODULE_STATE_GOING anywhere under 
bug_cleanup label in load_module(). That is a bug and it is there 
regardless of this patch.

Jessica?

Regards
Miroslav

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions
  2019-10-07  8:17   ` [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions Peter Zijlstra
@ 2019-10-08 14:29     ` Borislav Petkov
  2019-10-08 14:40       ` Steven Rostedt
  2019-10-08 14:48       ` Peter Zijlstra
  2019-10-09 12:03     ` Daniel Bristot de Oliveira
  1 sibling, 2 replies; 96+ messages in thread
From: Borislav Petkov @ 2019-10-08 14:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds,
	tglx, mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Mon, Oct 07, 2019 at 10:17:17AM +0200, Peter Zijlstra wrote:
> In preparation for static_call and variable size jump_label support,
> teach text_poke_bp() to emulate instructions, namely:
> 
>   JMP32, JMP8, CALL, NOP2, NOP_ATOMIC5, INT3
> 
> The current text_poke_bp() takes a @handler argument which is used as
> a jump target when the temporary INT3 is hit by a different CPU.
> 
> When patching CALL instructions, this doesn't work because we'd miss
> the PUSH of the return address. Instead, teach poke_int3_handler() to
> emulate an instruction, typically the instruction we're patching in.
> 
> This fits almost all text_poke_bp() users, except
> arch_unoptimize_kprobe() which restores random text, and for that site
> we have to build an explicit emulate instruction.

...

> @@ -63,8 +66,17 @@ static inline void int3_emulate_jmp(stru
>  	regs->ip = ip;
>  }
>  
> -#define INT3_INSN_SIZE 1
> -#define CALL_INSN_SIZE 5
> +#define INT3_INSN_SIZE		1
> +#define INT3_INSN_OPCODE	0xCC
> +
> +#define CALL_INSN_SIZE		5
> +#define CALL_INSN_OPCODE	0xE8
> +
> +#define JMP32_INSN_SIZE		5
> +#define JMP32_INSN_OPCODE	0xE9
> +
> +#define JMP8_INSN_SIZE		2
> +#define JMP8_INSN_OPCODE	0xEB

You probably should switch those to have the name prefix come first and
make them even shorter:

OPCODE_CALL
INSN_SIZE_CALL
OPCODE_JMP32
INSN_SIZE_JMP32
OPCODE_JMP8
...

This way you have the opcodes prefixed with OPCODE_ and the insn sizes
with INSN_SIZE_. I.e., what they actually are.

> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c

...

> @@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler);
>   */
>  void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
>  {
> -	int patched_all_but_first = 0;
> -	unsigned char int3 = 0xcc;
> +	unsigned char int3 = INT3_INSN_OPCODE;
>  	unsigned int i;
> +	int do_sync;
>  
>  	lockdep_assert_held(&text_mutex);
>  
> @@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke
>  	/*
>  	 * Second step: update all but the first byte of the patched range.
>  	 */
> -	for (i = 0; i < nr_entries; i++) {
> +	for (do_sync = 0, i = 0; i < nr_entries; i++) {
>  		if (tp[i].len - sizeof(int3) > 0) {
>  			text_poke((char *)tp[i].addr + sizeof(int3),
> -				  (const char *)tp[i].opcode + sizeof(int3),
> +				  (const char *)tp[i].text + sizeof(int3),
>  				  tp[i].len - sizeof(int3));
> -			patched_all_but_first++;
> +			do_sync++;
>  		}
>  	}
>  
> -	if (patched_all_but_first) {
> +	if (do_sync) {
>  		/*
>  		 * According to Intel, this core syncing is very likely
>  		 * not necessary and we'd be safe even without it. But
> @@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke
>  	 * Third step: replace the first byte (int3) by the first byte of
>  	 * replacing opcode.
>  	 */
> -	for (i = 0; i < nr_entries; i++)
> -		text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
> +	for (do_sync = 0, i = 0; i < nr_entries; i++) {

Can we have the do_sync reset outside of the loop?

> +		if (tp[i].text[0] == INT3_INSN_OPCODE)
> +			continue;

I'm guessing we preset the 0th byte to 0xcc somewhere.... I just can't
seem to find it...

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions
  2019-10-08 14:29     ` Borislav Petkov
@ 2019-10-08 14:40       ` Steven Rostedt
  2019-10-08 14:50         ` Borislav Petkov
  2019-10-08 14:48       ` Peter Zijlstra
  1 sibling, 1 reply; 96+ messages in thread
From: Steven Rostedt @ 2019-10-08 14:40 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Peter Zijlstra, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

On Tue, 8 Oct 2019 16:29:24 +0200
Borislav Petkov <bp@alien8.de> wrote:

> On Mon, Oct 07, 2019 at 10:17:17AM +0200, Peter Zijlstra wrote:
> > In preparation for static_call and variable size jump_label support,
> > teach text_poke_bp() to emulate instructions, namely:
> > 
> >   JMP32, JMP8, CALL, NOP2, NOP_ATOMIC5, INT3
> > 
> > The current text_poke_bp() takes a @handler argument which is used as
> > a jump target when the temporary INT3 is hit by a different CPU.
> > 
> > When patching CALL instructions, this doesn't work because we'd miss
> > the PUSH of the return address. Instead, teach poke_int3_handler() to
> > emulate an instruction, typically the instruction we're patching in.
> > 
> > This fits almost all text_poke_bp() users, except
> > arch_unoptimize_kprobe() which restores random text, and for that site
> > we have to build an explicit emulate instruction.  
> 
> ...
> 
> > @@ -63,8 +66,17 @@ static inline void int3_emulate_jmp(stru
> >  	regs->ip = ip;
> >  }
> >  
> > -#define INT3_INSN_SIZE 1
> > -#define CALL_INSN_SIZE 5
> > +#define INT3_INSN_SIZE		1
> > +#define INT3_INSN_OPCODE	0xCC
> > +
> > +#define CALL_INSN_SIZE		5
> > +#define CALL_INSN_OPCODE	0xE8
> > +
> > +#define JMP32_INSN_SIZE		5
> > +#define JMP32_INSN_OPCODE	0xE9
> > +
> > +#define JMP8_INSN_SIZE		2
> > +#define JMP8_INSN_OPCODE	0xEB  
> 
> You probably should switch those to have the name prefix come first and
> make them even shorter:
> 
> OPCODE_CALL
> INSN_SIZE_CALL
> OPCODE_JMP32
> INSN_SIZE_JMP32
> OPCODE_JMP8
> ...
> 
> This way you have the opcodes prefixed with OPCODE_ and the insn sizes
> with INSN_SIZE_. I.e., what they actually are.

Honestly, I like the original way better.

Seeing OPCODE_JMP32 and INSN_SIZE_JMP32 doesn't look like they are
related to me.

-- Steve

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-07  8:17   ` [PATCH v3 5/6] x86/ftrace: Use text_poke() Peter Zijlstra
@ 2019-10-08 14:43     ` Steven Rostedt
  2019-10-08 17:11       ` Peter Zijlstra
  2019-10-10  2:41       ` Steven Rostedt
  0 siblings, 2 replies; 96+ messages in thread
From: Steven Rostedt @ 2019-10-08 14:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Mon, 07 Oct 2019 10:17:21 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Move ftrace over to using the generic x86 text_poke functions; this
> avoids having a second/different copy of that code around.
> 
> This also avoids ftrace violating the (new) W^X rule and avoids
> fragmenting the kernel text page-tables, due to no longer having to
> toggle them RW.
> 
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---


BTW, I'd really like to take this patch series through my tree. That
way I can really hammer it, as well as I have code that will be built
on top of it.

I'll review the other series in this thread, but I'm assuming they
don't rely on this series? Or do they?

-- Steve

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions
  2019-10-08 14:29     ` Borislav Petkov
  2019-10-08 14:40       ` Steven Rostedt
@ 2019-10-08 14:48       ` Peter Zijlstra
  2019-10-08 14:54         ` Borislav Petkov
  1 sibling, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-08 14:48 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: x86, linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds,
	tglx, mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Tue, Oct 08, 2019 at 04:29:24PM +0200, Borislav Petkov wrote:
> On Mon, Oct 07, 2019 at 10:17:17AM +0200, Peter Zijlstra wrote:

> > @@ -63,8 +66,17 @@ static inline void int3_emulate_jmp(stru
> >  	regs->ip = ip;
> >  }
> >  
> > -#define INT3_INSN_SIZE 1
> > -#define CALL_INSN_SIZE 5
> > +#define INT3_INSN_SIZE		1
> > +#define INT3_INSN_OPCODE	0xCC
> > +
> > +#define CALL_INSN_SIZE		5
> > +#define CALL_INSN_OPCODE	0xE8
> > +
> > +#define JMP32_INSN_SIZE		5
> > +#define JMP32_INSN_OPCODE	0xE9
> > +
> > +#define JMP8_INSN_SIZE		2
> > +#define JMP8_INSN_OPCODE	0xEB
> 
> You probably should switch those to have the name prefix come first and
> make them even shorter:
> 
> OPCODE_CALL
> INSN_SIZE_CALL
> OPCODE_JMP32
> INSN_SIZE_JMP32
> OPCODE_JMP8
> ...
> 
> This way you have the opcodes prefixed with OPCODE_ and the insn sizes
> with INSN_SIZE_. I.e., what they actually are.

I really don't like that; the important part is which instruction and
that really should come first. Also, your variant is horribly
inconsistent.

> > --- a/arch/x86/kernel/alternative.c
> > +++ b/arch/x86/kernel/alternative.c
> 
> ...
> 
> > @@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler);
> >   */
> >  void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
> >  {
> > -	int patched_all_but_first = 0;
> > -	unsigned char int3 = 0xcc;
> > +	unsigned char int3 = INT3_INSN_OPCODE;
> >  	unsigned int i;
> > +	int do_sync;
> >  
> >  	lockdep_assert_held(&text_mutex);
> >  
> > @@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke
> >  	/*
> >  	 * Second step: update all but the first byte of the patched range.
> >  	 */
> > -	for (i = 0; i < nr_entries; i++) {
> > +	for (do_sync = 0, i = 0; i < nr_entries; i++) {
> >  		if (tp[i].len - sizeof(int3) > 0) {
> >  			text_poke((char *)tp[i].addr + sizeof(int3),
> > -				  (const char *)tp[i].opcode + sizeof(int3),
> > +				  (const char *)tp[i].text + sizeof(int3),
> >  				  tp[i].len - sizeof(int3));
> > -			patched_all_but_first++;
> > +			do_sync++;
> >  		}
> >  	}
> >  
> > -	if (patched_all_but_first) {
> > +	if (do_sync) {
> >  		/*
> >  		 * According to Intel, this core syncing is very likely
> >  		 * not necessary and we'd be safe even without it. But
> > @@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke
> >  	 * Third step: replace the first byte (int3) by the first byte of
> >  	 * replacing opcode.
> >  	 */
> > -	for (i = 0; i < nr_entries; i++)
> > -		text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
> > +	for (do_sync = 0, i = 0; i < nr_entries; i++) {
> 
> Can we have the do_sync reset outside of the loop?

Can, but why? That's more lines for no raisin ;-)

> > +		if (tp[i].text[0] == INT3_INSN_OPCODE)
> > +			continue;
> 
> I'm guessing we preset the 0th byte to 0xcc somewhere.... I just can't
> seem to find it...

Very first pass, we write INT3's everywhere.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions
  2019-10-08 14:40       ` Steven Rostedt
@ 2019-10-08 14:50         ` Borislav Petkov
  0 siblings, 0 replies; 96+ messages in thread
From: Borislav Petkov @ 2019-10-08 14:50 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

On Tue, Oct 08, 2019 at 10:40:10AM -0400, Steven Rostedt wrote:
> Seeing OPCODE_JMP32 and INSN_SIZE_JMP32 doesn't look like they are
> related to me.

But if it starts with OPCODE_ you know what it is - an opcode.
JMP32_INSN_OPCODE can first be shortened to JMP32_OPCODE and then having
instruction mnemonic start the macro name doesn't show they belong to
the same type of things. Ditto for the insn size.

But I'm way too busy to bikeshed so whatever...

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions
  2019-10-08 14:48       ` Peter Zijlstra
@ 2019-10-08 14:54         ` Borislav Petkov
  2019-10-08 15:04           ` Steven Rostedt
  0 siblings, 1 reply; 96+ messages in thread
From: Borislav Petkov @ 2019-10-08 14:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds,
	tglx, mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Tue, Oct 08, 2019 at 04:48:34PM +0200, Peter Zijlstra wrote:
> Can, but why? That's more lines for no raisin ;-)

Here's a raisin: I was looking at this and then all of a sudden went:
"W00t, why is this do_sync part of the loop at all? Do they belong
together? Nope."

If we're going to save on lines, we've lost long ago.

> Very first pass, we write INT3's everywhere.

Doh.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions
  2019-10-08 14:54         ` Borislav Petkov
@ 2019-10-08 15:04           ` Steven Rostedt
  2019-10-08 15:24             ` Borislav Petkov
  0 siblings, 1 reply; 96+ messages in thread
From: Steven Rostedt @ 2019-10-08 15:04 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Peter Zijlstra, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

On Tue, 8 Oct 2019 16:54:24 +0200
Borislav Petkov <bp@alien8.de> wrote:

> On Tue, Oct 08, 2019 at 04:48:34PM +0200, Peter Zijlstra wrote:
> > Can, but why? That's more lines for no raisin ;-)  
> 
> Here's a raisin: I was looking at this and then all of a sudden went:
> "W00t, why is this do_sync part of the loop at all? Do they belong
> together? Nope."
> 

But it is part of the loop...


+	for (do_sync = 0, i = 0; i < nr_entries; i++) {
+		if (tp[i].text[0] == INT3_INSN_OPCODE)
+			continue;
+
+		text_poke(tp[i].addr, tp[i].text, sizeof(int3));
+		do_sync++;
+	}
+

The difference between do_sync and i is that i gets incremented at
every iteration, where do_sync gets incremented only when the first
conditional is false. But I still see do_sync as a loop variable.

-- Steve



^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke()
  2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
                     ` (5 preceding siblings ...)
  2019-10-07  8:17   ` [PATCH v3 6/6] x86/mm: Remove set_kernel_text_r[ow]() Peter Zijlstra
@ 2019-10-08 15:07   ` Steven Rostedt
  6 siblings, 0 replies; 96+ messages in thread
From: Steven Rostedt @ 2019-10-08 15:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Mon, 07 Oct 2019 10:17:16 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Ftrace was one of the last W^X violators; these patches move it over to the
> generic text_poke() interface and thereby get rid of this oddity.
> 

Ingo, Thomas or H.Peter,

If it's OK with you, can you ack this series so that I can pull it
through my tree? I have some code that will conflict with this that I
want to write on top of it. As well as hammer it out to make sure
ftrace works fine.

Thanks!

-- Steve

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions
  2019-10-08 15:04           ` Steven Rostedt
@ 2019-10-08 15:24             ` Borislav Petkov
  0 siblings, 0 replies; 96+ messages in thread
From: Borislav Petkov @ 2019-10-08 15:24 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

On Tue, Oct 08, 2019 at 11:04:12AM -0400, Steven Rostedt wrote:
> The difference between do_sync and i is that i gets incremented at
> every iteration, where do_sync gets incremented only when the first
> conditional is false. But I still see do_sync as a loop variable.

I'd prefer it this way:

	do_sync = 0;

	for (i = 0; i < nr_entries; i++) {
	        if (tp[i].text[0] == INT3_INSN_OPCODE)
	                continue;

	        text_poke(tp[i].addr, tp[i].text, sizeof(int3));
	        do_sync++;
	}

	if (do_sync)
	        on_each_cpu(do_sync_core, NULL, 1);

Clear and simple. We clear it, the loop runs and we check it after the
loop ends. Clear and simple pattern which you see everywhere in code.
All well known and uneventful.

Now if the do_sync clearing is in the for () brackets, you have to stop
and look for it and wonder, why is that thing there, is there anything
special about it?

And with the amount of code going through us every day, I'd prefer well
known and uneventful in any day of the week.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-08 14:43     ` Steven Rostedt
@ 2019-10-08 17:11       ` Peter Zijlstra
  2019-10-08 17:27         ` Steven Rostedt
  2019-10-10  2:41       ` Steven Rostedt
  1 sibling, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-08 17:11 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Tue, Oct 08, 2019 at 10:43:35AM -0400, Steven Rostedt wrote:

> BTW, I'd really like to take this patch series through my tree. That
> way I can really hammer it, as well as I have code that will be built
> on top of it.

Works for me; or we can have a topic branch in tip we both can use.
Ingo?

> I'll review the other series in this thread, but I'm assuming they
> don't rely on this series? Or do they?

Indeed, this series stands on it's own. The rest depends on this.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-08 17:11       ` Peter Zijlstra
@ 2019-10-08 17:27         ` Steven Rostedt
  0 siblings, 0 replies; 96+ messages in thread
From: Steven Rostedt @ 2019-10-08 17:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Tue, 8 Oct 2019 19:11:37 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Oct 08, 2019 at 10:43:35AM -0400, Steven Rostedt wrote:
> 
> > BTW, I'd really like to take this patch series through my tree. That
> > way I can really hammer it, as well as I have code that will be built
> > on top of it.  
> 
> Works for me; or we can have a topic branch in tip we both can use.
> Ingo?

In case you want to use this branch, I just pushed:

git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git

  ftrace/text-poke

That is these 6 patches applied to v5.4-rc2

I'm going to start running them through my tests. I can work on these
directly. And if Ingo wants to pull this into tip, then we can do that,
and apply the other patches on top of them.

Ingo, would that work for you?

-- Steve



> 
> > I'll review the other series in this thread, but I'm assuming they
> > don't rely on this series? Or do they?  
> 
> Indeed, this series stands on it's own. The rest depends on this.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions
  2019-10-07  8:17   ` [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions Peter Zijlstra
  2019-10-08 14:29     ` Borislav Petkov
@ 2019-10-09 12:03     ` Daniel Bristot de Oliveira
  1 sibling, 0 replies; 96+ messages in thread
From: Daniel Bristot de Oliveira @ 2019-10-09 12:03 UTC (permalink / raw)
  To: Peter Zijlstra, x86
  Cc: linux-kernel, rostedt, mhiramat, jbaron, torvalds, tglx, mingo,
	namit, hpa, luto, ard.biesheuvel, jpoimboe

On 07/10/2019 10:17, Peter Zijlstra wrote:
> In preparation for static_call and variable size jump_label support,
> teach text_poke_bp() to emulate instructions, namely:
> 
>   JMP32, JMP8, CALL, NOP2, NOP_ATOMIC5, INT3
> 
> The current text_poke_bp() takes a @handler argument which is used as
> a jump target when the temporary INT3 is hit by a different CPU.
> 
> When patching CALL instructions, this doesn't work because we'd miss
> the PUSH of the return address. Instead, teach poke_int3_handler() to
> emulate an instruction, typically the instruction we're patching in.
> 
> This fits almost all text_poke_bp() users, except
> arch_unoptimize_kprobe() which restores random text, and for that site
> we have to build an explicit emulate instruction.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Daniel Bristot de Oliveira <bristot@redhat.com>

Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>

Thanks!
-- Daniel

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 3/6] x86/alternatives,jump_label: Provide better text_poke() batching interface
  2019-10-07  8:17   ` [PATCH v3 3/6] x86/alternatives,jump_label: Provide better text_poke() batching interface Peter Zijlstra
@ 2019-10-09 12:04     ` Daniel Bristot de Oliveira
  0 siblings, 0 replies; 96+ messages in thread
From: Daniel Bristot de Oliveira @ 2019-10-09 12:04 UTC (permalink / raw)
  To: Peter Zijlstra, x86
  Cc: linux-kernel, rostedt, mhiramat, jbaron, torvalds, tglx, mingo,
	namit, hpa, luto, ard.biesheuvel, jpoimboe

On 07/10/2019 10:17, Peter Zijlstra wrote:
> Adding another text_poke_bp_batch() user made me realize the interface
> is all sorts of wrong. The text poke vector should be internal to the
> implementation.
> 
> This then results in a trivial interface:
> 
>   text_poke_queue()  - which has the 'normal' text_poke_bp() interface
>   text_poke_finish() - which takes no arguments and flushes any
>                        pending text_poke()s.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Daniel Bristot de Oliveira <bristot@redhat.com>

Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>

Thanks!
-- Daniel

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-08 14:43     ` Steven Rostedt
  2019-10-08 17:11       ` Peter Zijlstra
@ 2019-10-10  2:41       ` Steven Rostedt
  2019-10-10  9:20         ` Peter Zijlstra
  1 sibling, 1 reply; 96+ messages in thread
From: Steven Rostedt @ 2019-10-10  2:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Tue, 8 Oct 2019 10:43:35 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:


> BTW, I'd really like to take this patch series through my tree. That
> way I can really hammer it, as well as I have code that will be built
> on top of it.

I did a bit of hammering and found two bugs. One I sent a patch to fix
(adding a module when tracing is enabled), but the other bug I
triggered, I'm too tired to debug right now. But figured I'd mention it
anyway.

If you add on the kernel command line:

 ftrace=function ftrace_filter=schedule

You will get this (note, I had KASAN enabled, so it showed more info):



[    1.274356] ftrace: allocating 34274 entries in 134 pages
[    1.320059] Starting tracer 'function'
[    1.323724] ==================================================================
[    1.330798] BUG: KASAN: null-ptr-deref in __get_locked_pte+0x21/0x210
[    1.337186] Read of size 8 at addr 0000000000000050 by task swapper/0
[    1.343579]
[    1.345049] CPU: 0 PID: 0 Comm: swapper Not tainted 5.4.0-rc2-test+ #50
[    1.351613] Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
[    1.360510] Call Trace:
[    1.362932]  dump_stack+0x7c/0xc0
[    1.366213]  ? __get_locked_pte+0x21/0x210
[    1.370272]  ? __get_locked_pte+0x21/0x210
[    1.374331]  __kasan_report.cold.10+0x5/0x3e
[    1.378566]  ? __get_locked_pte+0x21/0x210
[    1.382625]  kasan_report+0xe/0x20
[    1.385993]  __get_locked_pte+0x21/0x210
[    1.389880]  ? 0xffffffffa000005c
[    1.393162]  __text_poke+0x1ca/0x5b0
[    1.396705]  ? optimize_nops.isra.8+0xd0/0xd0
[    1.401023]  ? insn_get_length+0x4f/0x70
[    1.404910]  ? text_poke_loc_init+0x186/0x220
[    1.409229]  ? text_poke_kgdb+0x10/0x10
[    1.413031]  ? 0xffffffffa0000000
[    1.416312]  text_poke_bp_batch+0xb4/0x1e0
[    1.420372]  ? __text_poke+0x5b0/0x5b0
[    1.424088]  ? do_raw_spin_lock+0x113/0x1d0
[    1.428233]  ? 0xffffffffa000005c
[    1.431515]  ? 0xffffffffa0000000
[    1.434797]  text_poke_bp+0x7a/0xa0
[    1.438253]  ? text_poke_queue+0xb0/0xb0
[    1.442139]  ? 0xffffffffa0000000
[    1.445421]  ? 0xffffffffa000005c
[    1.448706]  ? find_vmap_area+0x56/0x80
[    1.452505]  arch_ftrace_update_trampoline+0x114/0x380
[    1.457603]  ? ftrace_caller+0x4e/0x4e
[    1.461316]  ? 0xffffffffa0000091
[    1.464599]  ? arch_ftrace_update_code+0x10/0x10
[    1.469179]  ? mutex_lock+0x86/0xd0
[    1.472634]  ? __mutex_lock_slowpath+0x10/0x10
[    1.477039]  ? mutex_unlock+0x1d/0x40
[    1.480668]  ? arch_jump_label_transform_queue+0x7a/0x90
[    1.485937]  ? __jump_label_update+0x91/0x140
[    1.490256]  ? mutex_unlock+0x1d/0x40
[    1.493885]  ? tracing_start_sched_switch.cold.0+0x60/0x60
[    1.499327]  __register_ftrace_function+0xaf/0xf0
[    1.503991]  ftrace_startup+0x24/0x130
[    1.507706]  register_ftrace_function+0x2d/0x80
[    1.512198]  function_trace_init+0xc1/0x100
[    1.516346]  tracing_set_tracer+0x1fb/0x3c0
[    1.520492]  ? free_snapshot+0x50/0x50
[    1.524205]  ? __kasan_kmalloc.constprop.6+0xc1/0xd0
[    1.529131]  ? __list_add_valid+0x29/0xa0
[    1.533106]  register_tracer+0x235/0x26c
[    1.536993]  early_trace_init+0x29b/0x3a0
[    1.540967]  start_kernel+0x2a3/0x5f7
[    1.544595]  ? mem_encrypt_init+0x6/0x6
[    1.548396]  ? load_ucode_intel_bsp+0x5e/0xa3
[    1.552715]  ? init_intel_microcode+0xc3/0xc3
[    1.557036]  ? load_ucode_bsp+0xcc/0x154
[    1.560923]  secondary_startup_64+0xa4/0xb0
[    1.565070] ==================================================================


-- Steve

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-10  2:41       ` Steven Rostedt
@ 2019-10-10  9:20         ` Peter Zijlstra
  2019-10-10 13:19           ` Steven Rostedt
  0 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-10  9:20 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Wed, Oct 09, 2019 at 10:41:35PM -0400, Steven Rostedt wrote:
> On Tue, 8 Oct 2019 10:43:35 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> 
> > BTW, I'd really like to take this patch series through my tree. That
> > way I can really hammer it, as well as I have code that will be built
> > on top of it.
> 
> I did a bit of hammering and found two bugs. One I sent a patch to fix
> (adding a module when tracing is enabled), but the other bug I
> triggered, I'm too tired to debug right now. But figured I'd mention it
> anyway.

I'm thinking this should fix it... Just not sure this is the right plce,
then again, we're doing the same thing in jump_label and static_call, so
perhaps we should do it like this.

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1230,10 +1230,15 @@ void text_poke_queue(void *addr, const v
  * dynamically allocated memory. This function should be used when it is
  * not possible to allocate memory.
  */
-void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
 {
 	struct text_poke_loc tp;
 
+	if (unlikely(system_state == SYSTEM_BOOTING)) {
+		text_poke_early(addr, opcode, len);
+		return;
+	}
+
 	text_poke_loc_init(&tp, addr, opcode, len, emulate);
 	text_poke_bp_batch(&tp, 1);
 }

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-10  9:20         ` Peter Zijlstra
@ 2019-10-10 13:19           ` Steven Rostedt
  2019-10-10 14:05             ` Peter Zijlstra
  0 siblings, 1 reply; 96+ messages in thread
From: Steven Rostedt @ 2019-10-10 13:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Thu, 10 Oct 2019 11:20:54 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Oct 09, 2019 at 10:41:35PM -0400, Steven Rostedt wrote:
> > On Tue, 8 Oct 2019 10:43:35 -0400
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> >   
> > > BTW, I'd really like to take this patch series through my tree. That
> > > way I can really hammer it, as well as I have code that will be built
> > > on top of it.  
> > 
> > I did a bit of hammering and found two bugs. One I sent a patch to fix
> > (adding a module when tracing is enabled), but the other bug I
> > triggered, I'm too tired to debug right now. But figured I'd mention it
> > anyway.  
> 
> I'm thinking this should fix it... Just not sure this is the right plce,
> then again, we're doing the same thing in jump_label and static_call, so
> perhaps we should do it like this.
> 
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -1230,10 +1230,15 @@ void text_poke_queue(void *addr, const v
>   * dynamically allocated memory. This function should be used when it is
>   * not possible to allocate memory.
>   */
> -void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
> +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
>  {
>  	struct text_poke_loc tp;
>  
> +	if (unlikely(system_state == SYSTEM_BOOTING)) {
> +		text_poke_early(addr, opcode, len);
> +		return;
> +	}

We need a new system state. SYSTEM_UP ? (Arg, that name is confusing,
SYSTEM_BOOTING_SMP?) Or perhaps just test num_online_cpus()?

	if (unlikely(system_state == SYSTEM_BOOTING &&
		     num_online_cpus() == 1)

?

Because we can't do the above once we have more than one CPU running.

-- Steve

> +
>  	text_poke_loc_init(&tp, addr, opcode, len, emulate);
>  	text_poke_bp_batch(&tp, 1);
>  }


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-10 13:19           ` Steven Rostedt
@ 2019-10-10 14:05             ` Peter Zijlstra
  2019-10-10 15:54               ` Steven Rostedt
  0 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-10 14:05 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Thu, Oct 10, 2019 at 09:19:56AM -0400, Steven Rostedt wrote:
> On Thu, 10 Oct 2019 11:20:54 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Wed, Oct 09, 2019 at 10:41:35PM -0400, Steven Rostedt wrote:
> > > On Tue, 8 Oct 2019 10:43:35 -0400
> > > Steven Rostedt <rostedt@goodmis.org> wrote:
> > > 
> > >   
> > > > BTW, I'd really like to take this patch series through my tree. That
> > > > way I can really hammer it, as well as I have code that will be built
> > > > on top of it.  
> > > 
> > > I did a bit of hammering and found two bugs. One I sent a patch to fix
> > > (adding a module when tracing is enabled), but the other bug I
> > > triggered, I'm too tired to debug right now. But figured I'd mention it
> > > anyway.  
> > 
> > I'm thinking this should fix it... Just not sure this is the right plce,
> > then again, we're doing the same thing in jump_label and static_call, so
> > perhaps we should do it like this.
> > 
> > --- a/arch/x86/kernel/alternative.c
> > +++ b/arch/x86/kernel/alternative.c
> > @@ -1230,10 +1230,15 @@ void text_poke_queue(void *addr, const v
> >   * dynamically allocated memory. This function should be used when it is
> >   * not possible to allocate memory.
> >   */
> > -void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
> > +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
> >  {
> >  	struct text_poke_loc tp;
> >  
> > +	if (unlikely(system_state == SYSTEM_BOOTING)) {
> > +		text_poke_early(addr, opcode, len);
> > +		return;
> > +	}
> 
> We need a new system state. SYSTEM_UP ? (Arg, that name is confusing,
> SYSTEM_BOOTING_SMP?) Or perhaps just test num_online_cpus()?
> 
> 	if (unlikely(system_state == SYSTEM_BOOTING &&
> 		     num_online_cpus() == 1)
> 
> ?
> 
> Because we can't do the above once we have more than one CPU running.

We loose BOOTING _long_ before we gain SMP.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-10 14:05             ` Peter Zijlstra
@ 2019-10-10 15:54               ` Steven Rostedt
  2019-10-10 17:28                 ` Peter Zijlstra
  0 siblings, 1 reply; 96+ messages in thread
From: Steven Rostedt @ 2019-10-10 15:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Thu, 10 Oct 2019 16:05:13 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> > Because we can't do the above once we have more than one CPU running.  
> 
> We loose BOOTING _long_ before we gain SMP.

Ah, yep. But I finally got it working with the following patch:

diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 95beb85aef65..d7037d038005 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -25,7 +25,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
  */
 #define POKE_MAX_OPCODE_SIZE	5
 
-extern void text_poke_early(void *addr, const void *opcode, size_t len);
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
  * Clear and restore the kernel write-protection flag on the local CPU.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index fa5dfde9b09a..2ee644a20f46 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -267,7 +267,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
-void text_poke_early(void *addr, const void *opcode, size_t len);
+void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
  * Are we looking at a near JMP with a 1 or 4-byte displacement.
@@ -756,8 +756,8 @@ void __init alternative_instructions(void)
  * instructions. And on the local CPU you need to be protected against NMI or
  * MCE handlers seeing an inconsistent instruction while you patch.
  */
-void __init_or_module text_poke_early(void *addr, const void *opcode,
-				      size_t len)
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
+				       size_t len)
 {
 	unsigned long flags;
 
@@ -780,6 +780,7 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
 		 * that causes hangs on some VIA CPUs.
 		 */
 	}
+	return NULL;
 }
 
 __ro_after_init struct mm_struct *poking_mm;
@@ -1058,10 +1059,14 @@ static int tp_vec_nr;
  */
 static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 {
+	void *(*poke)(void *addr, const void *opcode, size_t len) = text_poke;
 	unsigned char int3 = INT3_INSN_OPCODE;
 	unsigned int i;
 	int do_sync;
 
+	if (system_state == SYSTEM_BOOTING)
+		poke = text_poke_early;
+
 	lockdep_assert_held(&text_mutex);
 
 	bp_patching.vec = tp;
@@ -1077,7 +1082,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 * First step: add a int3 trap to the address that will be patched.
 	 */
 	for (i = 0; i < nr_entries; i++)
-		text_poke(tp[i].addr, &int3, sizeof(int3));
+		poke(tp[i].addr, &int3, sizeof(int3));
 
 	on_each_cpu(do_sync_core, NULL, 1);
 
@@ -1086,8 +1091,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 */
 	for (do_sync = 0, i = 0; i < nr_entries; i++) {
 		if (tp[i].len - sizeof(int3) > 0) {
-			text_poke((char *)tp[i].addr + sizeof(int3),
-				  (const char *)tp[i].text + sizeof(int3),
+			poke((char *)tp[i].addr + sizeof(int3),
+			     (const char *)tp[i].text + sizeof(int3),
 				  tp[i].len - sizeof(int3));
 			do_sync++;
 		}
@@ -1110,7 +1115,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 		if (tp[i].text[0] == INT3_INSN_OPCODE)
 			continue;
 
-		text_poke(tp[i].addr, tp[i].text, sizeof(int3));
+		poke(tp[i].addr, tp[i].text, sizeof(int3));
 		do_sync++;
 	}
 
@@ -1234,6 +1239,10 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulat
 {
 	struct text_poke_loc tp;
 
+	if (unlikely(system_state == SYSTEM_BOOTING)) {
+		text_poke_early(addr, opcode, len);
+		return;
+	}
 	text_poke_loc_init(&tp, addr, opcode, len, emulate);
 	text_poke_bp_batch(&tp, 1);
 }
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index f2e59d858ca9..2dd462f86d1f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -308,7 +308,8 @@ union ftrace_op_code_union {
 #define RET_SIZE		1
 
 static unsigned long
-create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
+create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size,
+		  unsigned int *pages)
 {
 	unsigned long start_offset;
 	unsigned long end_offset;
@@ -394,8 +395,11 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 
 	set_vm_flush_reset_perms(trampoline);
 
-	set_memory_ro((unsigned long)trampoline, npages);
+	/* We can't use text_poke_bp() at start up */
+	if (system_state != SYSTEM_BOOTING)
+		set_memory_ro((unsigned long)trampoline, npages);
 	set_memory_x((unsigned long)trampoline, npages);
+	*pages = npages;
 	return (unsigned long)trampoline;
 fail:
 	tramp_free(trampoline);
@@ -423,7 +427,9 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 	ftrace_func_t func;
 	unsigned long offset;
 	unsigned long ip;
+	unsigned int pages;
 	unsigned int size;
+	bool set_ro = false;
 	const char *new;
 
 	if (ops->trampoline) {
@@ -434,7 +440,9 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 		if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
 			return;
 	} else {
-		ops->trampoline = create_trampoline(ops, &size);
+		if (system_state == SYSTEM_BOOTING)
+			set_ro = true;
+		ops->trampoline = create_trampoline(ops, &size, &pages);
 		if (!ops->trampoline)
 			return;
 		ops->trampoline_size = size;
@@ -450,6 +458,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 	new = ftrace_call_replace(ip, (unsigned long)func);
 	text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
 	mutex_unlock(&text_mutex);
+	if (set_ro)
+		set_memory_ro((unsigned long)ops->trampoline, pages);
 }
 
 /* Return the address of the function the trampoline calls */



Is it really important to use text_poke() on text that is coming live?
That is, I really hate the above "set_ro" hack. This is because you
moved the ro setting to create_trampoline() and then forcing the
text_poke() on text that has just been created. I prefer to just modify
it and then setting it to ro before it gets executed. Otherwise we need
to do all these dances.

The same is with the module code. My patch was turning text to
read-write that is not to be executed yet. Really, what's wrong with
that?

-- Steve

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-10 15:54               ` Steven Rostedt
@ 2019-10-10 17:28                 ` Peter Zijlstra
  2019-10-10 17:48                   ` Steven Rostedt
  2019-10-11 12:59                   ` Peter Zijlstra
  0 siblings, 2 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-10 17:28 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Thu, Oct 10, 2019 at 11:54:49AM -0400, Steven Rostedt wrote:
> On Thu, 10 Oct 2019 16:05:13 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > Because we can't do the above once we have more than one CPU running.  
> > 
> > We loose BOOTING _long_ before we gain SMP.
> 
> Ah, yep. But I finally got it working with the following patch:

That looks like it can be done simpler; but my head is exploding, I'll
have to look at this in the AM.

> Is it really important to use text_poke() on text that is coming live?

What is really important is that we never have memory that is both
writable and executable (W^X).

Moving as much poking to before marking it RO (or moving the marking RO
later if that is the same thing) is a sane approach.

But once it gains X, we must never again mark it W, without first
removing X.

> That is, I really hate the above "set_ro" hack. This is because you
> moved the ro setting to create_trampoline() and then forcing the
> text_poke() on text that has just been created. I prefer to just modify
> it and then setting it to ro before it gets executed. Otherwise we need
> to do all these dances.

I thought create_trampoline() finished the whole thing; if it does not,
either make create_trampoline() do everything, or add a
finish_trampoline() callback to mark it complete.

> The same is with the module code. My patch was turning text to
> read-write that is not to be executed yet. Really, what's wrong with
> that?

The fact that it is executable; also the fact that you do it right after
we mark it ro. Flipping the memory protections back and forth is just
really poor everything.

Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
executable (kernel) memory writable.

Really the best solution is to move all the poking into
ftrace_module_init(), before we mark it RO+X. That's what I'm going to
do for jump_label and static_call as well, I just need to add that extra
notifier callback.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-10 17:28                 ` Peter Zijlstra
@ 2019-10-10 17:48                   ` Steven Rostedt
  2019-10-11 10:45                     ` Peter Zijlstra
  2019-10-11 12:59                   ` Peter Zijlstra
  1 sibling, 1 reply; 96+ messages in thread
From: Steven Rostedt @ 2019-10-10 17:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Thu, 10 Oct 2019 19:28:19 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> > That is, I really hate the above "set_ro" hack. This is because you
> > moved the ro setting to create_trampoline() and then forcing the
> > text_poke() on text that has just been created. I prefer to just modify
> > it and then setting it to ro before it gets executed. Otherwise we need
> > to do all these dances.  
> 
> I thought create_trampoline() finished the whole thing; if it does not,
> either make create_trampoline() do everything, or add a
> finish_trampoline() callback to mark it complete.

I'm good with a finish_trampoline(). I can make a patch that does that.

> 
> > The same is with the module code. My patch was turning text to
> > read-write that is not to be executed yet. Really, what's wrong with
> > that?  
> 
> The fact that it is executable; also the fact that you do it right after
> we mark it ro. Flipping the memory protections back and forth is just
> really poor everything.

Hmm, I wonder if we can work with a way to make this work in the module
code as well. Moving the place it sets 'x' and 'ro' around :-/

> 
> Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
> executable (kernel) memory writable.

OK.

> 
> Really the best solution is to move all the poking into
> ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> do for jump_label and static_call as well, I just need to add that extra
> notifier callback.

I'll have to think about other ways to handle the other archs with the
all or nothing approach. Perhaps we can use my patch as an arch
dependent situation, I'll try another approach.

-- Steve

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v2 1/4] notifier: Fix broken error handling pattern
  2019-10-07  8:25   ` [PATCH v2 1/4] notifier: Fix broken error handling pattern Peter Zijlstra
@ 2019-10-10 22:01     ` Rafael J. Wysocki
  0 siblings, 0 replies; 96+ messages in thread
From: Rafael J. Wysocki @ 2019-10-10 22:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, rostedt, mhiramat, bristot, jbaron, torvalds,
	tglx, mingo, namit, hpa, luto, ard.biesheuvel, Pavel Machek,
	Alexios Zavras, Allison Randal, Sam Protsenko, Andrew Morton,
	Todd Brandt, Vasily Averin, Len Brown, Greg Kroah-Hartman

On Monday, October 7, 2019 10:25:42 AM CEST Peter Zijlstra wrote:
> The current notifiers have the following error handling pattern all
> over the place:
> 
> 	int err, nr;
> 
> 	err = __foo_notifier_call_chain(&chain, val_up, v, -1, &nr);
> 	if (err & NOTIFIER_STOP_MASK)
> 		__foo_notifier_call_chain(&chain, val_down, v, nr-1, NULL)
> 
> And aside from the endless repetition thereof, it is broken. Consider
> blocking notifiers; both calls take and drop the rwsem, this means
> that the notifier list can change in between the two calls, making @nr
> meaningless.
> 
> Fix this by replacing all the __foo_notifier_call_chain() functions
> with foo_notifier_call_chain_robust() that embeds the above pattern,
> but ensures it is inside a single lock region.
> 
> Note: I switched atomic_notifier_call_chain_robust() to use
>       the spinlock, since RCU cannot provide the guarantee
>       required for the recovery.
> 
> Note: software_resume() error handling was broken afaict.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Cc: Pavel Machek <pavel@ucw.cz>
> Cc: Alexios Zavras <alexios.zavras@intel.com>
> Cc: Allison Randal <allison@lohutok.net>
> Cc: Sam Protsenko <semen.protsenko@linaro.org>
> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Todd Brandt <todd.e.brandt@linux.intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Vasily Averin <vvs@virtuozzo.com>
> Cc: Len Brown <len.brown@intel.com>
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

for the bits I care about.

> ---
>  include/linux/notifier.h           |   15 +--
>  kernel/cpu_pm.c                    |   46 ++++-------
>  kernel/notifier.c                  |  144 ++++++++++++++++++++++---------------
>  kernel/power/hibernate.c           |   26 +++---
>  kernel/power/main.c                |    8 +-
>  kernel/power/power.h               |    3 
>  kernel/power/suspend.c             |   14 +--
>  kernel/power/user.c                |   14 +--
>  tools/power/pm-graph/sleepgraph.py |    2 
>  9 files changed, 139 insertions(+), 133 deletions(-)
> 
> --- a/include/linux/notifier.h
> +++ b/include/linux/notifier.h
> @@ -165,20 +165,19 @@ extern int srcu_notifier_chain_unregiste
>  
>  extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
>  		unsigned long val, void *v);
> -extern int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
> -	unsigned long val, void *v, int nr_to_call, int *nr_calls);
>  extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
>  		unsigned long val, void *v);
> -extern int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
> -	unsigned long val, void *v, int nr_to_call, int *nr_calls);
>  extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
>  		unsigned long val, void *v);
> -extern int __raw_notifier_call_chain(struct raw_notifier_head *nh,
> -	unsigned long val, void *v, int nr_to_call, int *nr_calls);
>  extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
>  		unsigned long val, void *v);
> -extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
> -	unsigned long val, void *v, int nr_to_call, int *nr_calls);
> +
> +extern int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
> +		unsigned long val_up, unsigned long val_down, void *v);
> +extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
> +		unsigned long val_up, unsigned long val_down, void *v);
> +extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
> +		unsigned long val_up, unsigned long val_down, void *v);
>  
>  #define NOTIFY_DONE		0x0000		/* Don't care */
>  #define NOTIFY_OK		0x0001		/* Suits me */
> --- a/kernel/cpu_pm.c
> +++ b/kernel/cpu_pm.c
> @@ -15,23 +15,31 @@
>  
>  static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
>  
> -static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
> +static int cpu_pm_notify(enum cpu_pm_event event)
>  {
>  	int ret;
>  
>  	/*
> -	 * __atomic_notifier_call_chain has a RCU read critical section, which
> +	 * atomic_notifier_call_chain has a RCU read critical section, which
>  	 * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
>  	 * RCU know this.
>  	 */
>  	rcu_irq_enter_irqson();
> -	ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
> -		nr_to_call, nr_calls);
> +	ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
>  	rcu_irq_exit_irqson();
>  
>  	return notifier_to_errno(ret);
>  }
>  
> +static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
> +{
> +	int ret;
> +
> +	ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
> +
> +	return notifier_to_errno(ret);
> +}
> +
>  /**
>   * cpu_pm_register_notifier - register a driver with cpu_pm
>   * @nb: notifier block to register
> @@ -80,18 +88,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_noti
>   */
>  int cpu_pm_enter(void)
>  {
> -	int nr_calls;
> -	int ret = 0;
> -
> -	ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
> -	if (ret)
> -		/*
> -		 * Inform listeners (nr_calls - 1) about failure of CPU PM
> -		 * PM entry who are notified earlier to prepare for it.
> -		 */
> -		cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
> -
> -	return ret;
> +	return cpu_pm_notify_robust(CPU_PM_ENTER, CPU_PM_ENTER_FAILED);
>  }
>  EXPORT_SYMBOL_GPL(cpu_pm_enter);
>  
> @@ -109,7 +106,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
>   */
>  int cpu_pm_exit(void)
>  {
> -	return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
> +	return cpu_pm_notify(CPU_PM_EXIT);
>  }
>  EXPORT_SYMBOL_GPL(cpu_pm_exit);
>  
> @@ -131,18 +128,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
>   */
>  int cpu_cluster_pm_enter(void)
>  {
> -	int nr_calls;
> -	int ret = 0;
> -
> -	ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
> -	if (ret)
> -		/*
> -		 * Inform listeners (nr_calls - 1) about failure of CPU cluster
> -		 * PM entry who are notified earlier to prepare for it.
> -		 */
> -		cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
> -
> -	return ret;
> +	return cpu_pm_notify_robust(CPU_CLUSTER_PM_ENTER, CPU_CLUSTER_PM_ENTER_FAILED);
>  }
>  EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
>  
> @@ -163,7 +149,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
>   */
>  int cpu_cluster_pm_exit(void)
>  {
> -	return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
> +	return cpu_pm_notify(CPU_CLUSTER_PM_EXIT);
>  }
>  EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
>  
> --- a/kernel/notifier.c
> +++ b/kernel/notifier.c
> @@ -106,6 +106,34 @@ static int notifier_call_chain(struct no
>  }
>  NOKPROBE_SYMBOL(notifier_call_chain);
>  
> +/**
> + * notifier_call_chain_robust - Inform the registered notifiers about an event
> + *                              and rollback on error.
> + * @nl:		Pointer to head of the blocking notifier chain
> + * @val_up:	Value passed unmodified to the notifier function
> + * @val_down:	Value passed unmodified to the notifier function when recovering
> + *              from an error on @val_up
> + * @v		Pointer passed unmodified to the notifier function
> + *
> + * NOTE:	It is important the @nl chain doesn't change between the two
> + *		invocations of notifier_call_chain() such that we visit the
> + *		exact same notifier callbacks; this rules out any RCU usage.
> + *
> + * Returns:	the return value of the @val_up call.
> + */
> +static int notifier_call_chain_robust(struct notifier_block **nl,
> +				     unsigned long val_up, unsigned long val_down,
> +				     void *v)
> +{
> +	int ret, nr = 0;
> +
> +	ret = notifier_call_chain(nl, val_up, v, -1, &nr);
> +	if (ret & NOTIFY_STOP_MASK)
> +		notifier_call_chain(nl, val_down, v, nr-1, NULL);
> +
> +	return ret;
> +}
> +
>  /*
>   *	Atomic notifier chain routines.  Registration and unregistration
>   *	use a spinlock, and call_chain is synchronized by RCU (no locks).
> @@ -156,13 +184,30 @@ int atomic_notifier_chain_unregister(str
>  }
>  EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
>  
> +int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
> +		unsigned long val_up, unsigned long val_down, void *v)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	/*
> +	 * Musn't use RCU; because then the notifier list can
> +	 * change between the up and down traversal.
> +	 */
> +	spin_lock_irqsave(&nh->lock, flags);
> +	ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
> +	spin_unlock_irqrestore(&nh->lock, flags);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust);
> +NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust);
> +
>  /**
> - *	__atomic_notifier_call_chain - Call functions in an atomic notifier chain
> + *	atomic_notifier_call_chain - Call functions in an atomic notifier chain
>   *	@nh: Pointer to head of the atomic notifier chain
>   *	@val: Value passed unmodified to notifier function
>   *	@v: Pointer passed unmodified to notifier function
> - *	@nr_to_call: See the comment for notifier_call_chain.
> - *	@nr_calls: See the comment for notifier_call_chain.
>   *
>   *	Calls each function in a notifier chain in turn.  The functions
>   *	run in an atomic context, so they must not block.
> @@ -175,24 +220,16 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_
>   *	Otherwise the return value is the return value
>   *	of the last notifier function called.
>   */
> -int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
> -				 unsigned long val, void *v,
> -				 int nr_to_call, int *nr_calls)
> +int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
> +			       unsigned long val, void *v)
>  {
>  	int ret;
>  
>  	rcu_read_lock();
> -	ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
> +	ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
>  	rcu_read_unlock();
> -	return ret;
> -}
> -EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
> -NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
>  
> -int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
> -			       unsigned long val, void *v)
> -{
> -	return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
>  NOKPROBE_SYMBOL(atomic_notifier_call_chain);
> @@ -285,13 +322,30 @@ int blocking_notifier_chain_unregister(s
>  }
>  EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
>  
> +int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
> +		unsigned long val_up, unsigned long val_down, void *v)
> +{
> +	int ret = NOTIFY_DONE;
> +
> +	/*
> +	 * We check the head outside the lock, but if this access is
> +	 * racy then it does not matter what the result of the test
> +	 * is, we re-check the list after having taken the lock anyway:
> +	 */
> +	if (rcu_access_pointer(nh->head)) {
> +		down_read(&nh->rwsem);
> +		ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
> +		up_read(&nh->rwsem);
> +	}
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);
> +
>  /**
> - *	__blocking_notifier_call_chain - Call functions in a blocking notifier chain
> + *	blocking_notifier_call_chain - Call functions in a blocking notifier chain
>   *	@nh: Pointer to head of the blocking notifier chain
>   *	@val: Value passed unmodified to notifier function
>   *	@v: Pointer passed unmodified to notifier function
> - *	@nr_to_call: See comment for notifier_call_chain.
> - *	@nr_calls: See comment for notifier_call_chain.
>   *
>   *	Calls each function in a notifier chain in turn.  The functions
>   *	run in a process context, so they are allowed to block.
> @@ -303,9 +357,8 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chai
>   *	Otherwise the return value is the return value
>   *	of the last notifier function called.
>   */
> -int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
> -				   unsigned long val, void *v,
> -				   int nr_to_call, int *nr_calls)
> +int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
> +		unsigned long val, void *v)
>  {
>  	int ret = NOTIFY_DONE;
>  
> @@ -316,19 +369,11 @@ int __blocking_notifier_call_chain(struc
>  	 */
>  	if (rcu_access_pointer(nh->head)) {
>  		down_read(&nh->rwsem);
> -		ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
> -					nr_calls);
> +		ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
>  		up_read(&nh->rwsem);
>  	}
>  	return ret;
>  }
> -EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
> -
> -int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
> -		unsigned long val, void *v)
> -{
> -	return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
> -}
>  EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
>  
>  /*
> @@ -370,13 +415,18 @@ int raw_notifier_chain_unregister(struct
>  }
>  EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
>  
> +int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
> +		unsigned long val_up, unsigned long val_down, void *v)
> +{
> +	return notifier_call_chain_robust(&nh->head, val_up, val_down, v);
> +}
> +EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);
> +
>  /**
> - *	__raw_notifier_call_chain - Call functions in a raw notifier chain
> + *	raw_notifier_call_chain - Call functions in a raw notifier chain
>   *	@nh: Pointer to head of the raw notifier chain
>   *	@val: Value passed unmodified to notifier function
>   *	@v: Pointer passed unmodified to notifier function
> - *	@nr_to_call: See comment for notifier_call_chain.
> - *	@nr_calls: See comment for notifier_call_chain
>   *
>   *	Calls each function in a notifier chain in turn.  The functions
>   *	run in an undefined context.
> @@ -389,18 +439,10 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unr
>   *	Otherwise the return value is the return value
>   *	of the last notifier function called.
>   */
> -int __raw_notifier_call_chain(struct raw_notifier_head *nh,
> -			      unsigned long val, void *v,
> -			      int nr_to_call, int *nr_calls)
> -{
> -	return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
> -}
> -EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
> -
>  int raw_notifier_call_chain(struct raw_notifier_head *nh,
>  		unsigned long val, void *v)
>  {
> -	return __raw_notifier_call_chain(nh, val, v, -1, NULL);
> +	return notifier_call_chain(&nh->head, val, v, -1, NULL);
>  }
>  EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
>  
> @@ -472,12 +514,10 @@ int srcu_notifier_chain_unregister(struc
>  EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
>  
>  /**
> - *	__srcu_notifier_call_chain - Call functions in an SRCU notifier chain
> + *	srcu_notifier_call_chain - Call functions in an SRCU notifier chain
>   *	@nh: Pointer to head of the SRCU notifier chain
>   *	@val: Value passed unmodified to notifier function
>   *	@v: Pointer passed unmodified to notifier function
> - *	@nr_to_call: See comment for notifier_call_chain.
> - *	@nr_calls: See comment for notifier_call_chain
>   *
>   *	Calls each function in a notifier chain in turn.  The functions
>   *	run in a process context, so they are allowed to block.
> @@ -489,25 +529,17 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_un
>   *	Otherwise the return value is the return value
>   *	of the last notifier function called.
>   */
> -int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
> -			       unsigned long val, void *v,
> -			       int nr_to_call, int *nr_calls)
> +int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
> +		unsigned long val, void *v)
>  {
>  	int ret;
>  	int idx;
>  
>  	idx = srcu_read_lock(&nh->srcu);
> -	ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
> +	ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
>  	srcu_read_unlock(&nh->srcu, idx);
>  	return ret;
>  }
> -EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
> -
> -int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
> -		unsigned long val, void *v)
> -{
> -	return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
> -}
>  EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
>  
>  /**
> --- a/kernel/power/hibernate.c
> +++ b/kernel/power/hibernate.c
> @@ -693,8 +693,8 @@ static int load_image_and_restore(void)
>   */
>  int hibernate(void)
>  {
> -	int error, nr_calls = 0;
>  	bool snapshot_test = false;
> +	int error;
>  
>  	if (!hibernation_available()) {
>  		pm_pr_dbg("Hibernation not available.\n");
> @@ -710,11 +710,9 @@ int hibernate(void)
>  
>  	pr_info("hibernation entry\n");
>  	pm_prepare_console();
> -	error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
> -	if (error) {
> -		nr_calls--;
> -		goto Exit;
> -	}
> +	error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
> +	if (error)
> +		goto Restore;
>  
>  	ksys_sync_helper();
>  
> @@ -772,7 +770,8 @@ int hibernate(void)
>  	/* Don't bother checking whether freezer_test_done is true */
>  	freezer_test_done = false;
>   Exit:
> -	__pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
> +	pm_notifier_call_chain(PM_POST_HIBERNATION);
> + Restore:
>  	pm_restore_console();
>  	atomic_inc(&snapshot_device_available);
>   Unlock:
> @@ -800,7 +799,7 @@ int hibernate(void)
>   */
>  static int software_resume(void)
>  {
> -	int error, nr_calls = 0;
> +	int error;
>  
>  	/*
>  	 * If the user said "noresume".. bail out early.
> @@ -887,11 +886,9 @@ static int software_resume(void)
>  
>  	pr_info("resume from hibernation\n");
>  	pm_prepare_console();
> -	error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
> -	if (error) {
> -		nr_calls--;
> -		goto Close_Finish;
> -	}
> +	error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
> +	if (error)
> +		goto Restore;
>  
>  	pm_pr_dbg("Preparing processes for restore.\n");
>  	error = freeze_processes();
> @@ -900,7 +897,8 @@ static int software_resume(void)
>  	error = load_image_and_restore();
>  	thaw_processes();
>   Finish:
> -	__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
> +	pm_notifier_call_chain(PM_POST_RESTORE);
> + Restore:
>  	pm_restore_console();
>  	pr_info("resume from hibernation failed (%d)\n", error);
>  	atomic_inc(&snapshot_device_available);
> --- a/kernel/power/main.c
> +++ b/kernel/power/main.c
> @@ -79,18 +79,18 @@ int unregister_pm_notifier(struct notifi
>  }
>  EXPORT_SYMBOL_GPL(unregister_pm_notifier);
>  
> -int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
> +int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
>  {
>  	int ret;
>  
> -	ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
> -						nr_to_call, nr_calls);
> +	ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL);
>  
>  	return notifier_to_errno(ret);
>  }
> +
>  int pm_notifier_call_chain(unsigned long val)
>  {
> -	return __pm_notifier_call_chain(val, -1, NULL);
> +	return blocking_notifier_call_chain(&pm_chain_head, val, NULL);
>  }
>  
>  /* If set, devices may be suspended and resumed asynchronously. */
> --- a/kernel/power/power.h
> +++ b/kernel/power/power.h
> @@ -210,8 +210,7 @@ static inline void suspend_test_finish(c
>  
>  #ifdef CONFIG_PM_SLEEP
>  /* kernel/power/main.c */
> -extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
> -				    int *nr_calls);
> +extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
>  extern int pm_notifier_call_chain(unsigned long val);
>  #endif
>  
> --- a/kernel/power/suspend.c
> +++ b/kernel/power/suspend.c
> @@ -352,18 +352,16 @@ static int suspend_test(int level)
>   */
>  static int suspend_prepare(suspend_state_t state)
>  {
> -	int error, nr_calls = 0;
> +	int error;
>  
>  	if (!sleep_state_supported(state))
>  		return -EPERM;
>  
>  	pm_prepare_console();
>  
> -	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
> -	if (error) {
> -		nr_calls--;
> -		goto Finish;
> -	}
> +	error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND);
> +	if (error)
> +		goto Restore;
>  
>  	trace_suspend_resume(TPS("freeze_processes"), 0, true);
>  	error = suspend_freeze_processes();
> @@ -373,8 +371,8 @@ static int suspend_prepare(suspend_state
>  
>  	suspend_stats.failed_freeze++;
>  	dpm_save_failed_step(SUSPEND_FREEZE);
> - Finish:
> -	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
> +	pm_notifier_call_chain(PM_POST_SUSPEND);
> + Restore:
>  	pm_restore_console();
>  	return error;
>  }
> --- a/kernel/power/user.c
> +++ b/kernel/power/user.c
> @@ -44,7 +44,7 @@ atomic_t snapshot_device_available = ATO
>  static int snapshot_open(struct inode *inode, struct file *filp)
>  {
>  	struct snapshot_data *data;
> -	int error, nr_calls = 0;
> +	int error;
>  
>  	if (!hibernation_available())
>  		return -EPERM;
> @@ -71,9 +71,7 @@ static int snapshot_open(struct inode *i
>  			swap_type_of(swsusp_resume_device, 0, NULL) : -1;
>  		data->mode = O_RDONLY;
>  		data->free_bitmaps = false;
> -		error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
> -		if (error)
> -			__pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
> +		error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
>  	} else {
>  		/*
>  		 * Resuming.  We may need to wait for the image device to
> @@ -83,15 +81,11 @@ static int snapshot_open(struct inode *i
>  
>  		data->swap = -1;
>  		data->mode = O_WRONLY;
> -		error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
> +		error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
>  		if (!error) {
>  			error = create_basic_memory_bitmaps();
>  			data->free_bitmaps = !error;
> -		} else
> -			nr_calls--;
> -
> -		if (error)
> -			__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
> +		}
>  	}
>  	if (error)
>  		atomic_inc(&snapshot_device_available);
> --- a/tools/power/pm-graph/sleepgraph.py
> +++ b/tools/power/pm-graph/sleepgraph.py
> @@ -153,7 +153,7 @@ import base64
>  	tracefuncs = {
>  		'sys_sync': {},
>  		'ksys_sync': {},
> -		'__pm_notifier_call_chain': {},
> +		'pm_notifier_call_chain_robust': {},
>  		'pm_prepare_console': {},
>  		'pm_notifier_call_chain': {},
>  		'freeze_processes': {},
> 
> 
> 





^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-10 17:48                   ` Steven Rostedt
@ 2019-10-11 10:45                     ` Peter Zijlstra
  2019-10-11 10:47                       ` Peter Zijlstra
  0 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-11 10:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Thu, Oct 10, 2019 at 01:48:30PM -0400, Steven Rostedt wrote:
> On Thu, 10 Oct 2019 19:28:19 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > That is, I really hate the above "set_ro" hack. This is because you
> > > moved the ro setting to create_trampoline() and then forcing the
> > > text_poke() on text that has just been created. I prefer to just modify
> > > it and then setting it to ro before it gets executed. Otherwise we need
> > > to do all these dances.  
> > 
> > I thought create_trampoline() finished the whole thing; if it does not,
> > either make create_trampoline() do everything, or add a
> > finish_trampoline() callback to mark it complete.
> 
> I'm good with a finish_trampoline(). I can make a patch that does that.

I found it easier to just make create_trampoline do it all. The below
patch seems to cure both issues for me.

---
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1213,6 +1213,11 @@ void text_poke_queue(void *addr, const v
 {
 	struct text_poke_loc *tp;
 
+	if (unlikely(system_state == SYSTEM_BOOTING)) {
+		text_poke_early(addr, opcode, len);
+		return;
+	}
+
 	text_poke_flush(addr);
 
 	tp = &tp_vec[tp_vec_nr++];
@@ -1230,10 +1235,15 @@ void text_poke_queue(void *addr, const v
  * dynamically allocated memory. This function should be used when it is
  * not possible to allocate memory.
  */
-void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
 {
 	struct text_poke_loc tp;
 
+	if (unlikely(system_state == SYSTEM_BOOTING)) {
+		text_poke_early(addr, opcode, len);
+		return;
+	}
+
 	text_poke_loc_init(&tp, addr, opcode, len, emulate);
 	text_poke_bp_batch(&tp, 1);
 }
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -34,6 +34,8 @@
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
+static int ftrace_poke_late = 0;
+
 int ftrace_arch_code_modify_prepare(void)
     __acquires(&text_mutex)
 {
@@ -43,12 +45,15 @@ int ftrace_arch_code_modify_prepare(void
 	 * ftrace has it set to "read/write".
 	 */
 	mutex_lock(&text_mutex);
+	ftrace_poke_late = 1;
 	return 0;
 }
 
 int ftrace_arch_code_modify_post_process(void)
     __releases(&text_mutex)
 {
+	text_poke_finish();
+	ftrace_poke_late = 0;
 	mutex_unlock(&text_mutex);
 	return 0;
 }
@@ -116,7 +121,10 @@ ftrace_modify_code_direct(unsigned long
 		return ret;
 
 	/* replace the text with the new text */
-	text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
+	if (ftrace_poke_late)
+		text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+	else
+		text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
 	return 0;
 }
 
@@ -308,11 +316,12 @@ union ftrace_op_code_union {
 #define RET_SIZE		1
 
 static unsigned long
-create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
+create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size, ftrace_func_t func)
 {
 	unsigned long start_offset;
 	unsigned long end_offset;
 	unsigned long op_offset;
+	unsigned long call_offset;
 	unsigned long offset;
 	unsigned long npages;
 	unsigned long size;
@@ -329,10 +338,12 @@ create_trampoline(struct ftrace_ops *ops
 		start_offset = (unsigned long)ftrace_regs_caller;
 		end_offset = (unsigned long)ftrace_regs_caller_end;
 		op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+		call_offset = (unsigned long)ftrace_regs_call;
 	} else {
 		start_offset = (unsigned long)ftrace_caller;
 		end_offset = (unsigned long)ftrace_epilogue;
 		op_offset = (unsigned long)ftrace_caller_op_ptr;
+		call_offset = (unsigned long)ftrace_call;
 	}
 
 	size = end_offset - start_offset;
@@ -389,6 +400,14 @@ create_trampoline(struct ftrace_ops *ops
 	/* put in the new offset to the ftrace_ops */
 	memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
 
+	/* put in the call to the function */
+	mutex_lock(&text_mutex);
+	call_offset -= start_offset;
+	memcpy(trampoline + call_offset,
+			text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, func),
+			CALL_INSN_SIZE);
+	mutex_unlock(&text_mutex);
+
 	/* ALLOC_TRAMP flags lets us know we created it */
 	ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
 
@@ -426,23 +445,23 @@ void arch_ftrace_update_trampoline(struc
 	unsigned int size;
 	const char *new;
 
-	if (ops->trampoline) {
-		/*
-		 * The ftrace_ops caller may set up its own trampoline.
-		 * In such a case, this code must not modify it.
-		 */
-		if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
-			return;
-	} else {
-		ops->trampoline = create_trampoline(ops, &size);
+	if (!ops->trampoline) {
+		ops->trampoline = create_trampoline(ops, &size, ftrace_ops_get_func(ops));
 		if (!ops->trampoline)
 			return;
 		ops->trampoline_size = size;
+		return;
 	}
 
+	/*
+	 * The ftrace_ops caller may set up its own trampoline.
+	 * In such a case, this code must not modify it.
+	 */
+	if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+		return;
+
 	offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
 	ip = ops->trampoline + offset;
-
 	func = ftrace_ops_get_func(ops);
 
 	mutex_lock(&text_mutex);

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-11 10:45                     ` Peter Zijlstra
@ 2019-10-11 10:47                       ` Peter Zijlstra
  2019-10-11 10:50                         ` Peter Zijlstra
  0 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-11 10:47 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Fri, Oct 11, 2019 at 12:45:52PM +0200, Peter Zijlstra wrote:
> +	if (!ops->trampoline) {
> +		ops->trampoline = create_trampoline(ops, &size, ftrace_ops_get_func(ops));

And now that I look at what I send, I see we already pass ops, so no
need to pass ftrace_ops_get_func().

Let me respin that.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-11 10:47                       ` Peter Zijlstra
@ 2019-10-11 10:50                         ` Peter Zijlstra
  0 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-11 10:50 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe

On Fri, Oct 11, 2019 at 12:47:12PM +0200, Peter Zijlstra wrote:
> On Fri, Oct 11, 2019 at 12:45:52PM +0200, Peter Zijlstra wrote:
> > +	if (!ops->trampoline) {
> > +		ops->trampoline = create_trampoline(ops, &size, ftrace_ops_get_func(ops));
> 
> And now that I look at what I send, I see we already pass ops, so no
> need to pass ftrace_ops_get_func().
> 
> Let me respin that.

---
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1209,10 +1209,15 @@ void text_poke_finish(void)
 	text_poke_flush(NULL);
 }
 
-void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
 {
 	struct text_poke_loc *tp;
 
+	if (unlikely(system_state == SYSTEM_BOOTING)) {
+		text_poke_early(addr, opcode, len);
+		return;
+	}
+
 	text_poke_flush(addr);
 
 	tp = &tp_vec[tp_vec_nr++];
@@ -1230,10 +1235,15 @@ void text_poke_queue(void *addr, const v
  * dynamically allocated memory. This function should be used when it is
  * not possible to allocate memory.
  */
-void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
 {
 	struct text_poke_loc tp;
 
+	if (unlikely(system_state == SYSTEM_BOOTING)) {
+		text_poke_early(addr, opcode, len);
+		return;
+	}
+
 	text_poke_loc_init(&tp, addr, opcode, len, emulate);
 	text_poke_bp_batch(&tp, 1);
 }
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -34,6 +34,8 @@
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
+static int ftrace_poke_late = 0;
+
 int ftrace_arch_code_modify_prepare(void)
     __acquires(&text_mutex)
 {
@@ -43,12 +45,15 @@ int ftrace_arch_code_modify_prepare(void
 	 * ftrace has it set to "read/write".
 	 */
 	mutex_lock(&text_mutex);
+	ftrace_poke_late = 1;
 	return 0;
 }
 
 int ftrace_arch_code_modify_post_process(void)
     __releases(&text_mutex)
 {
+	text_poke_finish();
+	ftrace_poke_late = 0;
 	mutex_unlock(&text_mutex);
 	return 0;
 }
@@ -116,7 +121,10 @@ ftrace_modify_code_direct(unsigned long
 		return ret;
 
 	/* replace the text with the new text */
-	text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
+	if (ftrace_poke_late)
+		text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+	else
+		text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
 	return 0;
 }
 
@@ -313,6 +321,7 @@ create_trampoline(struct ftrace_ops *ops
 	unsigned long start_offset;
 	unsigned long end_offset;
 	unsigned long op_offset;
+	unsigned long call_offset;
 	unsigned long offset;
 	unsigned long npages;
 	unsigned long size;
@@ -329,10 +338,12 @@ create_trampoline(struct ftrace_ops *ops
 		start_offset = (unsigned long)ftrace_regs_caller;
 		end_offset = (unsigned long)ftrace_regs_caller_end;
 		op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+		call_offset = (unsigned long)ftrace_regs_call;
 	} else {
 		start_offset = (unsigned long)ftrace_caller;
 		end_offset = (unsigned long)ftrace_epilogue;
 		op_offset = (unsigned long)ftrace_caller_op_ptr;
+		call_offset = (unsigned long)ftrace_call;
 	}
 
 	size = end_offset - start_offset;
@@ -389,6 +400,15 @@ create_trampoline(struct ftrace_ops *ops
 	/* put in the new offset to the ftrace_ops */
 	memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
 
+	/* put in the call to the function */
+	mutex_lock(&text_mutex);
+	call_offset -= start_offset;
+	memcpy(trampoline + call_offset,
+	       text_gen_insn(CALL_INSN_OPCODE,
+			     trampoline + call_offset,
+			     ftrace_ops_get_func(ops)), CALL_INSN_SIZE);
+	mutex_unlock(&text_mutex);
+
 	/* ALLOC_TRAMP flags lets us know we created it */
 	ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
 
@@ -426,23 +446,23 @@ void arch_ftrace_update_trampoline(struc
 	unsigned int size;
 	const char *new;
 
-	if (ops->trampoline) {
-		/*
-		 * The ftrace_ops caller may set up its own trampoline.
-		 * In such a case, this code must not modify it.
-		 */
-		if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
-			return;
-	} else {
+	if (!ops->trampoline) {
 		ops->trampoline = create_trampoline(ops, &size);
 		if (!ops->trampoline)
 			return;
 		ops->trampoline_size = size;
+		return;
 	}
 
+	/*
+	 * The ftrace_ops caller may set up its own trampoline.
+	 * In such a case, this code must not modify it.
+	 */
+	if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+		return;
+
 	offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
 	ip = ops->trampoline + offset;
-
 	func = ftrace_ops_get_func(ops);
 
 	mutex_lock(&text_mutex);

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-10 17:28                 ` Peter Zijlstra
  2019-10-10 17:48                   ` Steven Rostedt
@ 2019-10-11 12:59                   ` Peter Zijlstra
  2019-10-11 13:33                     ` Steven Rostedt
                                       ` (2 more replies)
  1 sibling, 3 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-11 12:59 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe, jeyu

On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:

> Really the best solution is to move all the poking into
> ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> do for jump_label and static_call as well, I just need to add that extra
> notifier callback.

OK, so I started writing that patch... or rather, I wrote the patch and
started on the Changelog when I ran into trouble describing why we need
it.

That is, I'm struggling to explain why we cannot flip
prepare_coming_module() and complete_formation().

Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
see if we can cure that.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-11 12:59                   ` Peter Zijlstra
@ 2019-10-11 13:33                     ` Steven Rostedt
  2019-10-11 13:45                       ` Peter Zijlstra
  2019-10-15 13:07                     ` Jessica Yu
  2019-10-15 13:28                     ` Steven Rostedt
  2 siblings, 1 reply; 96+ messages in thread
From: Steven Rostedt @ 2019-10-11 13:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe, jeyu

On Fri, 11 Oct 2019 14:59:03 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
> 
> > Really the best solution is to move all the poking into
> > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> > do for jump_label and static_call as well, I just need to add that extra
> > notifier callback.  
> 
> OK, so I started writing that patch... or rather, I wrote the patch and
> started on the Changelog when I ran into trouble describing why we need
> it.
> 
> That is, I'm struggling to explain why we cannot flip
> prepare_coming_module() and complete_formation().
> 
> Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
> see if we can cure that.

For someone that doesn't use modules, you are making me very nervous
with all the changes you are making to the module code! ;-)

-- Steve

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-11 13:33                     ` Steven Rostedt
@ 2019-10-11 13:45                       ` Peter Zijlstra
  0 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-11 13:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe, jeyu

On Fri, Oct 11, 2019 at 09:33:19AM -0400, Steven Rostedt wrote:
> On Fri, 11 Oct 2019 14:59:03 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
> > 
> > > Really the best solution is to move all the poking into
> > > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> > > do for jump_label and static_call as well, I just need to add that extra
> > > notifier callback.  
> > 
> > OK, so I started writing that patch... or rather, I wrote the patch and
> > started on the Changelog when I ran into trouble describing why we need
> > it.
> > 
> > That is, I'm struggling to explain why we cannot flip
> > prepare_coming_module() and complete_formation().
> > 
> > Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
> > see if we can cure that.
> 
> For someone that doesn't use modules, you are making me very nervous
> with all the changes you are making to the module code! ;-)

Hey, today I build a kernel with modules just for you :-)

And whatever it takes right, I just want to clean this crap up.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-11 12:59                   ` Peter Zijlstra
  2019-10-11 13:33                     ` Steven Rostedt
@ 2019-10-15 13:07                     ` Jessica Yu
  2019-10-15 13:56                       ` Peter Zijlstra
  2019-10-15 13:28                     ` Steven Rostedt
  2 siblings, 1 reply; 96+ messages in thread
From: Jessica Yu @ 2019-10-15 13:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

+++ Peter Zijlstra [11/10/19 14:59 +0200]:
>On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
>
>> Really the best solution is to move all the poking into
>> ftrace_module_init(), before we mark it RO+X. That's what I'm going to
>> do for jump_label and static_call as well, I just need to add that extra
>> notifier callback.
>
>OK, so I started writing that patch... or rather, I wrote the patch and
>started on the Changelog when I ran into trouble describing why we need
>it.
>
>That is, I'm struggling to explain why we cannot flip
>prepare_coming_module() and complete_formation().
>
>Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
>see if we can cure that.

I'm having trouble visualizing what changes you're planning on making.
I get that you want to squash ftrace_module_enable() into
ftrace_module_init(), before module_enable_ro(). I'm fine with that as
long as the races Steven described are addressed for affected arches.
And livepatch should be OK as long as klp_module_coming() is always
called *after* ftrace_module_enable(). But are you moving
klp_module_coming() before the module_enable_* calls as well?  And if
so, why?

>The fact that it is executable; also the fact that you do it right after
>we mark it ro. Flipping the memory protections back and forth is just
>really poor everything.
>
>Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
>executable (kernel) memory writable.

Not sure if relevant, but just thought I'd clarify: IIRC,
klp_module_coming() is not poking the coming module, but it calls
module_enable_ro() on itself (the livepatch module) so it can apply
relocations and such on the new code, which lives inside the livepatch
module, and it needs to possibly do this numerous times over the
lifetime of the patch module for any coming module it is responsible
for patching (i.e., call module_enable_ro() on the patch module, not
necessarily the coming module). So I am not be sure why
klp_module_coming() should be moved before complete_formation(). I
hope I'm remembering the details correctly, livepatch folks feel free
to chime in if I'm incorrect here.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-11 12:59                   ` Peter Zijlstra
  2019-10-11 13:33                     ` Steven Rostedt
  2019-10-15 13:07                     ` Jessica Yu
@ 2019-10-15 13:28                     ` Steven Rostedt
  2019-10-15 13:42                       ` Peter Zijlstra
  2019-10-15 16:09                       ` Jessica Yu
  2 siblings, 2 replies; 96+ messages in thread
From: Steven Rostedt @ 2019-10-15 13:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe, jeyu

On Fri, 11 Oct 2019 14:59:03 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
> 
> > Really the best solution is to move all the poking into
> > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> > do for jump_label and static_call as well, I just need to add that extra
> > notifier callback.  
> 
> OK, so I started writing that patch... or rather, I wrote the patch and
> started on the Changelog when I ran into trouble describing why we need
> it.
> 
> That is, I'm struggling to explain why we cannot flip
> prepare_coming_module() and complete_formation().
> 
> Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
> see if we can cure that.

You are mainly worried about making text that is executable into
read-write again. What if we kept my one patch that just changed the
module in ftrace_module_enable() from read-only to read-write, but
before we ever set it executable.

Jessica, would this patch break anything?

It moves the setting of the module execution to after calling both
ftrace_module_enable() and klp_module_coming().

This would make it possible to do the module code and still keep with
the no executable code becoming writable.

-- Steve

diff --git a/kernel/module.c b/kernel/module.c
index ff2d7359a418..6e2fd40a6ed9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info)
 
 	module_enable_ro(mod, false);
 	module_enable_nx(mod);
-	module_enable_x(mod);
 
 	/* Mark state as coming so strong_try_module_get() ignores us,
 	 * but kallsyms etc. can see us. */
@@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod)
 	if (err)
 		return err;
 
+	/* Make module executable after ftrace is enabled */
+	mutex_lock(&module_mutex);
+	module_enable_x(mod);
+	mutex_unlock(&module_mutex);
+
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_COMING, mod);
 	return 0;

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 13:28                     ` Steven Rostedt
@ 2019-10-15 13:42                       ` Peter Zijlstra
  2019-10-15 16:09                       ` Jessica Yu
  1 sibling, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-15 13:42 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: x86, linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx,
	mingo, namit, hpa, luto, ard.biesheuvel, jpoimboe, jeyu

On Tue, Oct 15, 2019 at 09:28:02AM -0400, Steven Rostedt wrote:
> On Fri, 11 Oct 2019 14:59:03 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
> > 
> > > Really the best solution is to move all the poking into
> > > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> > > do for jump_label and static_call as well, I just need to add that extra
> > > notifier callback.  
> > 
> > OK, so I started writing that patch... or rather, I wrote the patch and
> > started on the Changelog when I ran into trouble describing why we need
> > it.
> > 
> > That is, I'm struggling to explain why we cannot flip
> > prepare_coming_module() and complete_formation().
> > 
> > Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
> > see if we can cure that.
> 
> You are mainly worried about making text that is executable into
> read-write again. What if we kept my one patch that just changed the
> module in ftrace_module_enable() from read-only to read-write, but
> before we ever set it executable.

This still flips the protections back and forth, which is still really
ugly. And afaict the only reason this is required is that
set_all_modules_text_*() stuff.

So please, instead of tinkering around, lets just kill that horrible
interface and be done with it. There's only 2 users left, fixing those
can't be too hard.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 13:07                     ` Jessica Yu
@ 2019-10-15 13:56                       ` Peter Zijlstra
  2019-10-15 14:11                         ` Peter Zijlstra
                                           ` (3 more replies)
  0 siblings, 4 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-15 13:56 UTC (permalink / raw)
  To: Jessica Yu
  Cc: Steven Rostedt, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

On Tue, Oct 15, 2019 at 03:07:40PM +0200, Jessica Yu wrote:
> I'm having trouble visualizing what changes you're planning on making.

I want all the text poking (jump_label, ftrace, klp whatever) to happen
_before_ we do the protection changes.

I also want to avoid flipping the protection state around unnecessarily,
because that clearly is just wasted work.

> I get that you want to squash ftrace_module_enable() into
> ftrace_module_init(), before module_enable_ro(). I'm fine with that as
> long as the races Steven described are addressed for affected arches.

Right, the problem is set_all_modules_text_*(), that relies on COMING
having made the protection changes.

After the x86 changes, there's only 2 more architectures that use that
function. I'll work on getting those converted and then we can delete
that function and worry no more about it.

> And livepatch should be OK as long as klp_module_coming() is always
> called *after* ftrace_module_enable(). But are you moving
> klp_module_coming() before the module_enable_* calls as well?  And if
> so, why?

I wanted to move the COMING notifier callback before the protection
changes, because that is the easiest way to convert jump_label and
static_call and AFAICT nothing really cared aside from ftrace.

The alternative is introducing additional module states, which just adds
complexity we don't really need if we can just flip things around a
little.

> > The fact that it is executable; also the fact that you do it right after
> > we mark it ro. Flipping the memory protections back and forth is just
> > really poor everything.
> > 
> > Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
> > executable (kernel) memory writable.
> 
> Not sure if relevant, but just thought I'd clarify: IIRC,
> klp_module_coming() is not poking the coming module, but it calls
> module_enable_ro() on itself (the livepatch module) so it can apply
> relocations and such on the new code, which lives inside the livepatch
> module, and it needs to possibly do this numerous times over the
> lifetime of the patch module for any coming module it is responsible
> for patching (i.e., call module_enable_ro() on the patch module, not
> necessarily the coming module). So I am not be sure why
> klp_module_coming() should be moved before complete_formation(). I
> hope I'm remembering the details correctly, livepatch folks feel free
> to chime in if I'm incorrect here.

You mean it does module_disable_ro() ? That would be broken and it needs
to be fixed. Can some livepatch person explain what it does and why?

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 13:56                       ` Peter Zijlstra
@ 2019-10-15 14:11                         ` Peter Zijlstra
  2019-10-15 14:13                         ` Miroslav Benes
                                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-15 14:11 UTC (permalink / raw)
  To: Jessica Yu
  Cc: Steven Rostedt, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, mbenes

On Tue, Oct 15, 2019 at 03:56:34PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 15, 2019 at 03:07:40PM +0200, Jessica Yu wrote:

> > > Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
> > > executable (kernel) memory writable.
> > 
> > Not sure if relevant, but just thought I'd clarify: IIRC,
> > klp_module_coming() is not poking the coming module, but it calls
> > module_enable_ro() on itself (the livepatch module) so it can apply
> > relocations and such on the new code, which lives inside the livepatch
> > module, and it needs to possibly do this numerous times over the
> > lifetime of the patch module for any coming module it is responsible
> > for patching (i.e., call module_enable_ro() on the patch module, not
> > necessarily the coming module). So I am not be sure why
> > klp_module_coming() should be moved before complete_formation(). I
> > hope I'm remembering the details correctly, livepatch folks feel free
> > to chime in if I'm incorrect here.
> 
> You mean it does module_disable_ro() ? That would be broken and it needs
> to be fixed. Can some livepatch person explain what it does and why?

mbenes confirmed; what would be needed is for the live-patch module to
have all module dependent parts to be in their own section and have the
sections be page-aligned. Then we can do the protection on sections
instead of on the whole module.

Damn, and I thought I was so close to getting W^X sorted :/

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 13:56                       ` Peter Zijlstra
  2019-10-15 14:11                         ` Peter Zijlstra
@ 2019-10-15 14:13                         ` Miroslav Benes
  2019-10-15 15:06                           ` Joe Lawrence
  2019-10-15 14:42                         ` Peter Zijlstra
  2019-10-15 15:51                         ` Jessica Yu
  3 siblings, 1 reply; 96+ messages in thread
From: Miroslav Benes @ 2019-10-15 14:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jessica Yu, Steven Rostedt, x86, linux-kernel, mhiramat, bristot,
	jbaron, torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, live-patching

[ live-patching ML CCed ]

On Tue, 15 Oct 2019, Peter Zijlstra wrote:

> On Tue, Oct 15, 2019 at 03:07:40PM +0200, Jessica Yu wrote:
> 
> > > The fact that it is executable; also the fact that you do it right after
> > > we mark it ro. Flipping the memory protections back and forth is just
> > > really poor everything.
> > > 
> > > Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
> > > executable (kernel) memory writable.
> > 
> > Not sure if relevant, but just thought I'd clarify: IIRC,
> > klp_module_coming() is not poking the coming module, but it calls
> > module_enable_ro() on itself (the livepatch module) so it can apply
> > relocations and such on the new code, which lives inside the livepatch
> > module, and it needs to possibly do this numerous times over the
> > lifetime of the patch module for any coming module it is responsible
> > for patching (i.e., call module_enable_ro() on the patch module, not
> > necessarily the coming module). So I am not be sure why
> > klp_module_coming() should be moved before complete_formation(). I
> > hope I'm remembering the details correctly, livepatch folks feel free
> > to chime in if I'm incorrect here.
> 
> You mean it does module_disable_ro() ? That would be broken and it needs
> to be fixed. Can some livepatch person explain what it does and why?

Yes, it does. klp_module_coming() calls module_disable_ro() on all 
patching modules which patch the coming module in order to call 
apply_relocate_add(). New (patching) code for a module can be relocated 
only when the relevant module is loaded.

Miroslav

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 13:56                       ` Peter Zijlstra
  2019-10-15 14:11                         ` Peter Zijlstra
  2019-10-15 14:13                         ` Miroslav Benes
@ 2019-10-15 14:42                         ` Peter Zijlstra
  2019-10-15 18:31                           ` Peter Zijlstra
  2019-10-15 15:51                         ` Jessica Yu
  3 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-15 14:42 UTC (permalink / raw)
  To: Jessica Yu
  Cc: Steven Rostedt, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, rabin, Mark Rutland, Will Deacon, james.morse

On Tue, Oct 15, 2019 at 03:56:34PM +0200, Peter Zijlstra wrote:
> Right, the problem is set_all_modules_text_*(), that relies on COMING
> having made the protection changes.
> 
> After the x86 changes, there's only 2 more architectures that use that
> function. I'll work on getting those converted and then we can delete
> that function and worry no more about it.

Here's a possible patch for arch/arm, which only leaves arch/nds32/.

---
 arch/arm/kernel/ftrace.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index bda949fd84e8..c87e68e4ccf7 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -22,6 +22,7 @@
 #include <asm/ftrace.h>
 #include <asm/insn.h>
 #include <asm/set_memory.h>
+#include <asm/patch.h>
 
 #ifdef CONFIG_THUMB2_KERNEL
 #define	NOP		0xf85deb04	/* pop.w {lr} */
@@ -31,13 +32,15 @@
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
+static int patch_text_remap = 0;
+
 static int __ftrace_modify_code(void *data)
 {
 	int *command = data;
 
-	set_kernel_text_rw();
+	patch_text_remap = 1;
 	ftrace_modify_all_code(*command);
-	set_kernel_text_ro();
+	patch_text_remap = 0;
 
 	return 0;
 }
@@ -59,13 +62,13 @@ static unsigned long adjust_address(struct dyn_ftrace *rec, unsigned long addr)
 
 int ftrace_arch_code_modify_prepare(void)
 {
-	set_all_modules_text_rw();
+	patch_text_remap = 1;
 	return 0;
 }
 
 int ftrace_arch_code_modify_post_process(void)
 {
-	set_all_modules_text_ro();
+	patch_text_remap = 0;
 	/* Make sure any TLB misses during machine stop are cleared. */
 	flush_tlb_all();
 	return 0;
@@ -97,10 +100,7 @@ static int ftrace_modify_code(unsigned long pc, unsigned long old,
 			return -EINVAL;
 	}
 
-	if (probe_kernel_write((void *)pc, &new, MCOUNT_INSN_SIZE))
-		return -EPERM;
-
-	flush_icache_range(pc, pc + MCOUNT_INSN_SIZE);
+	__patch_text_real((void *)pc, new, patch_text_remap);
 
 	return 0;
 }

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 14:13                         ` Miroslav Benes
@ 2019-10-15 15:06                           ` Joe Lawrence
  2019-10-15 15:31                             ` Jessica Yu
  2019-10-16  6:51                             ` Miroslav Benes
  0 siblings, 2 replies; 96+ messages in thread
From: Joe Lawrence @ 2019-10-15 15:06 UTC (permalink / raw)
  To: Miroslav Benes, Peter Zijlstra
  Cc: Jessica Yu, Steven Rostedt, x86, linux-kernel, mhiramat, bristot,
	jbaron, torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, live-patching

On 10/15/19 10:13 AM, Miroslav Benes wrote:
> Yes, it does. klp_module_coming() calls module_disable_ro() on all
> patching modules which patch the coming module in order to call
> apply_relocate_add(). New (patching) code for a module can be relocated
> only when the relevant module is loaded.

FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's 
where livepatches only patch a single object and updates are kept on 
disk to handle coming module updates as they are loaded) eliminate those 
outstanding relocations and the need to perform this late permission 
flipping?

-- Joe

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 15:06                           ` Joe Lawrence
@ 2019-10-15 15:31                             ` Jessica Yu
  2019-10-15 22:17                               ` Joe Lawrence
  2019-10-16  6:51                             ` Miroslav Benes
  1 sibling, 1 reply; 96+ messages in thread
From: Jessica Yu @ 2019-10-15 15:31 UTC (permalink / raw)
  To: Joe Lawrence
  Cc: Miroslav Benes, Peter Zijlstra, Steven Rostedt, x86,
	linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx, mingo,
	namit, hpa, luto, ard.biesheuvel, jpoimboe, live-patching

+++ Joe Lawrence [15/10/19 11:06 -0400]:
>On 10/15/19 10:13 AM, Miroslav Benes wrote:
>>Yes, it does. klp_module_coming() calls module_disable_ro() on all
>>patching modules which patch the coming module in order to call
>>apply_relocate_add(). New (patching) code for a module can be relocated
>>only when the relevant module is loaded.
>
>FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ 
>plumber's where livepatches only patch a single object and updates are 
>kept on disk to handle coming module updates as they are loaded) 
>eliminate those outstanding relocations and the need to perform this 
>late permission flipping?

I wasn't at Plumbers sadly, was this idea documented/talked about in
detail somewhere? (recording, slides, etherpad, etc?). What do you
mean by updates are kept on disk? Maybe someone can explain it more
in detail? :)


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 13:56                       ` Peter Zijlstra
                                           ` (2 preceding siblings ...)
  2019-10-15 14:42                         ` Peter Zijlstra
@ 2019-10-15 15:51                         ` Jessica Yu
  3 siblings, 0 replies; 96+ messages in thread
From: Jessica Yu @ 2019-10-15 15:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

+++ Peter Zijlstra [15/10/19 15:56 +0200]:
>On Tue, Oct 15, 2019 at 03:07:40PM +0200, Jessica Yu wrote:
>> I'm having trouble visualizing what changes you're planning on making.
>
>I want all the text poking (jump_label, ftrace, klp whatever) to happen
>_before_ we do the protection changes.
>
>I also want to avoid flipping the protection state around unnecessarily,
>because that clearly is just wasted work.

OK, that makes sense, thanks for clarifying. 

>> I get that you want to squash ftrace_module_enable() into
>> ftrace_module_init(), before module_enable_ro(). I'm fine with that as
>> long as the races Steven described are addressed for affected arches.
>
>Right, the problem is set_all_modules_text_*(), that relies on COMING
>having made the protection changes.
>
>After the x86 changes, there's only 2 more architectures that use that
>function. I'll work on getting those converted and then we can delete
>that function and worry no more about it.
>
>> And livepatch should be OK as long as klp_module_coming() is always
>> called *after* ftrace_module_enable(). But are you moving
>> klp_module_coming() before the module_enable_* calls as well?  And if
>> so, why?
>
>I wanted to move the COMING notifier callback before the protection
>changes, because that is the easiest way to convert jump_label and
>static_call and AFAICT nothing really cared aside from ftrace.

I think I would be fine with this as long as no notifiers/users rely
on the assumption that COMING == module enabled protections already.
I've yet to audit all the module notifiers (but I trust you've done
this already), and I think ftrace was the only user that relied on
this. For livepatch it's probably not immediately fixable without some
serious overhaul.

>The alternative is introducing additional module states, which just adds
>complexity we don't really need if we can just flip things around a
>little.

Yeah, I would prefer not adding more states if possible :-)

>> > The fact that it is executable; also the fact that you do it right after
>> > we mark it ro. Flipping the memory protections back and forth is just
>> > really poor everything.
>> >
>> > Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
>> > executable (kernel) memory writable.
>>
>> Not sure if relevant, but just thought I'd clarify: IIRC,
>> klp_module_coming() is not poking the coming module, but it calls
>> module_enable_ro() on itself (the livepatch module) so it can apply
>> relocations and such on the new code, which lives inside the livepatch
>> module, and it needs to possibly do this numerous times over the
>> lifetime of the patch module for any coming module it is responsible
>> for patching (i.e., call module_enable_ro() on the patch module, not
>> necessarily the coming module). So I am not be sure why
>> klp_module_coming() should be moved before complete_formation(). I
>> hope I'm remembering the details correctly, livepatch folks feel free
>> to chime in if I'm incorrect here.
>
>You mean it does module_disable_ro() ? That would be broken and it needs
>to be fixed. Can some livepatch person explain what it does and why?

Gah, sorry, yes I meant module_disable_ro().

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 13:28                     ` Steven Rostedt
  2019-10-15 13:42                       ` Peter Zijlstra
@ 2019-10-15 16:09                       ` Jessica Yu
  1 sibling, 0 replies; 96+ messages in thread
From: Jessica Yu @ 2019-10-15 16:09 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe

+++ Steven Rostedt [15/10/19 09:28 -0400]:
>On Fri, 11 Oct 2019 14:59:03 +0200
>Peter Zijlstra <peterz@infradead.org> wrote:
>
>> On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
>>
>> > Really the best solution is to move all the poking into
>> > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
>> > do for jump_label and static_call as well, I just need to add that extra
>> > notifier callback.
>>
>> OK, so I started writing that patch... or rather, I wrote the patch and
>> started on the Changelog when I ran into trouble describing why we need
>> it.
>>
>> That is, I'm struggling to explain why we cannot flip
>> prepare_coming_module() and complete_formation().
>>
>> Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
>> see if we can cure that.
>
>You are mainly worried about making text that is executable into
>read-write again. What if we kept my one patch that just changed the
>module in ftrace_module_enable() from read-only to read-write, but
>before we ever set it executable.
>
>Jessica, would this patch break anything?
>
>It moves the setting of the module execution to after calling both
>ftrace_module_enable() and klp_module_coming().
>
>This would make it possible to do the module code and still keep with
>the no executable code becoming writable.
>
>-- Steve
>
>diff --git a/kernel/module.c b/kernel/module.c
>index ff2d7359a418..6e2fd40a6ed9 100644
>--- a/kernel/module.c
>+++ b/kernel/module.c
>@@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info)
>
> 	module_enable_ro(mod, false);
> 	module_enable_nx(mod);
>-	module_enable_x(mod);
>
> 	/* Mark state as coming so strong_try_module_get() ignores us,
> 	 * but kallsyms etc. can see us. */
>@@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod)
> 	if (err)
> 		return err;
>
>+	/* Make module executable after ftrace is enabled */
>+	mutex_lock(&module_mutex);
>+	module_enable_x(mod);
>+	mutex_unlock(&module_mutex);
>+
> 	blocking_notifier_call_chain(&module_notify_list,
> 				     MODULE_STATE_COMING, mod);
> 	return 0;

As long as we enable x before parse_args(), which this patch does, then
I don't think this change would break anything.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 14:42                         ` Peter Zijlstra
@ 2019-10-15 18:31                           ` Peter Zijlstra
  0 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-15 18:31 UTC (permalink / raw)
  To: Jessica Yu
  Cc: Steven Rostedt, x86, linux-kernel, mhiramat, bristot, jbaron,
	torvalds, tglx, mingo, namit, hpa, luto, ard.biesheuvel,
	jpoimboe, rabin, Mark Rutland, Will Deacon, james.morse

On Tue, Oct 15, 2019 at 04:42:58PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 15, 2019 at 03:56:34PM +0200, Peter Zijlstra wrote:
> > Right, the problem is set_all_modules_text_*(), that relies on COMING
> > having made the protection changes.
> > 
> > After the x86 changes, there's only 2 more architectures that use that
> > function. I'll work on getting those converted and then we can delete
> > that function and worry no more about it.
> 
> Here's a possible patch for arch/arm, which only leaves arch/nds32/.

*sigh*, so I'd written the patching code for nds32, but then discovered
it doesn't have STRICT_MODULE_RWX and therefore we can simply delete the
thing.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 15:31                             ` Jessica Yu
@ 2019-10-15 22:17                               ` Joe Lawrence
  2019-10-15 22:27                                 ` Steven Rostedt
  0 siblings, 1 reply; 96+ messages in thread
From: Joe Lawrence @ 2019-10-15 22:17 UTC (permalink / raw)
  To: Jessica Yu
  Cc: Miroslav Benes, Peter Zijlstra, Steven Rostedt, x86,
	linux-kernel, mhiramat, bristot, jbaron, torvalds, tglx, mingo,
	namit, hpa, luto, ard.biesheuvel, jpoimboe, live-patching

On 10/15/19 11:31 AM, Jessica Yu wrote:
> +++ Joe Lawrence [15/10/19 11:06 -0400]:
>> On 10/15/19 10:13 AM, Miroslav Benes wrote:
>>> Yes, it does. klp_module_coming() calls module_disable_ro() on all
>>> patching modules which patch the coming module in order to call
>>> apply_relocate_add(). New (patching) code for a module can be relocated
>>> only when the relevant module is loaded.
>>
>> FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @
>> plumber's where livepatches only patch a single object and updates are
>> kept on disk to handle coming module updates as they are loaded)
>> eliminate those outstanding relocations and the need to perform this
>> late permission flipping?
> 
> I wasn't at Plumbers sadly, was this idea documented/talked about in
> detail somewhere? (recording, slides, etherpad, etc?). What do you
> mean by updates are kept on disk? Maybe someone can explain it more
> in detail? :)
> 

Livepatching folks -- I don't have the LPC summary link (etherpad?) that 
Jiri put together.  Does someone have that handy for Jessica?

Thanks,

-- Joe

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 22:17                               ` Joe Lawrence
@ 2019-10-15 22:27                                 ` Steven Rostedt
  2019-10-16  7:42                                   ` Peter Zijlstra
  2019-10-16  7:49                                   ` Peter Zijlstra
  0 siblings, 2 replies; 96+ messages in thread
From: Steven Rostedt @ 2019-10-15 22:27 UTC (permalink / raw)
  To: Joe Lawrence
  Cc: Jessica Yu, Miroslav Benes, Peter Zijlstra, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching

On Tue, 15 Oct 2019 18:17:43 -0400
Joe Lawrence <joe.lawrence@redhat.com> wrote:

> 
> Livepatching folks -- I don't have the LPC summary link (etherpad?) that 
> Jiri put together.  Does someone have that handy for Jessica?

Yes, and I'll be posting this on the LPC web site soon. But here's the
write up that Jiri sent me (with the etherpad link at the end):

Live patching miniconference covered 8 topics overall.

(1) First session, titled "What happened in kernel live patching over the last
year" was led by Miroslav Benes.  It was quite a natural followup to where we
ended at the LPC 2018 miniconf, summarizing which of the points that have been
agreed on back then have already been fully implemented, where obstacles have
been enounctered, etc.

The most prominent feature that has been merged during past year was "atomic
replace", which allows for easier stacking of patches. This is especially
useful for distros, as it naturally aligns with the way patches are being
distributed by them.
Another big step forward since LPC 2018 miniconf was addition of livepatching
selftests, which already tremendously helped in various cases, as it e.g.
helped to track down quite a few issues during development of reliable
stacktraces on s390. Proposal has been made that all major KLP features in the
future should be accompanied by accompanying selftest, which the audience
agreed on.
One of the last year's discussion topics / pain points were GCC optimizations
which are not compatible with livepatching. GCC upstream now has
-flive-patching option, which disables all those interfering optimizations.

(2) Second session, titled "Rethinking late module patching" was led by Miroslav
Benes again.
The problem statement is: in case when there is a patch loaded for module that
is yet to be loaded, it has to be patched before it starts executing. The
current solution relies on hooks in the module loader, and module is patched
when its being linked.  It gets a bit nasty with the arch-specifics of the
module loader handling all the relocations, patching of alternatives, etc. One
of the issues is that all the paravirt / jump label patching has to be done
after relocations are resolved, this is getting a bit fragile and not well
maintainable.
Miroslav sketched out the possible solutions:

	- livepatch would immediately load all the modules for which it has
	  patch via dependency; half-loading modules (not promoting to final
	  LIVE state)
	- splitting the currently one big monolithic livepatch to a per-object
	  structure; might cause issues with consistency model
	- "blue sky" idea from Joe Lawrence: livepatch loaded modules,
	  binary-patch .ko on disk, blacklist vulnerable version

Miroslav proposed to actually stick to the current solution, and improve
selftests coverage for all the considered-fragile arch-specific module linking
code hooks. The discussion then mostly focused, based on proposals from several
attendees (most prominently Steven Rostedt and Amit Shah), on expanding on the
"blue sky" idea.
The final proposal converged to having a separate .ko for livepatches that's
installed on the disk along with the module.  This addresses the module
signature issue (as signature does not actually change), as well as module
removal case (the case where a module was previously loaded while a livepatch
is applied, and then later unloaded and reloaded).  The slight downside is that
this will require changes to the module loader to also look for livepatches
when loading a module.  When unloading the module, the livepatch module will
also need to be unloaded.  Steven approved of this approach over his previous
suggestion.

(3) Third session, titled "Source-based livepatch creation tooling", was led by
Nicolai Stange.
The primary objective of the session was basing on the source-based creation
of livepatches, while avoiding the tedious (and error-prone task) of copying
a lot of kernel code around (from the source tree to the livepatch). Nicolai
spent par of last year writing a klp-ccp (KLP Copy and Paste) utility, which
automates a big chunk of the process.
Nicolai then presented the still open issues with the tool and with the process
around it, most promonent ones being:

	- obtaining original GCC commandline that was used to build the
	  original kernel
	- externalizability of static functions; we need to know whether GCC
	  emitted static function into the patched object

Miroslav proposed to extend existing IPA dumping capabiity of GCC to emit also
the information about dead code elimination; DWARF information is guaranteed
not to be reliable when it comes to IPA optimizations.

(4) Fourth session, titled "Objtool on power -- update", was led by Kamalesh
Babulal.
Kamalesh reported that as a followup to last year's miniconference, the objtool
support for powerpc actually came to life. It hasn't yet been posted upstream,
but is currently available on github [1].
Kamalesh further reported, that decoder has basic functionality (stack
operations + validation, branches, unreachable code, switch table (through gcc
plugin), conditional branches, prologue sequences). It turns out that stack
validation on powerpc is easier than on x86, as the ABI is much more strict
there; which leaves the validation phase to mostly focus on hand-written
assembly.
The next steps are basing on arm64 objtool code which already abstracted out
the arch-specific bits, and further optimizations can be stacked on top of that
(switch table detection, more testing, different gcc versions).

(5) Fifth session, titled "Do we need a Livepatch Developers Guide?", was led
by Joe Lawrence.
Joe postulated, that Current in-kernel documentation provides very good
documentation for individual features the infrastructure provides to the
livepatch author, but Joe further suggested to also include something along the
lines of what they currently have for kpatch, which takes a more general look
from the point of view of livepatch developer.

Proposals that have been brought up for discussion:
    - FAQ
    - collecting already existing CVE fixes and ammend them with a lot of
      commentary
    - creating a livepatch blog on people.kernel.org

Mark Brown asked for documenting what architectures need to implement in order
to support livepatching.
Amit Shah asked if the 'kpatch' and 'kpatch-build' script/program be renamed to
'livepatch'-friendly names so that kernel sources can also reference them for
the user docs part of it.
Both Mark's and Amit's remarks have been considered very valid and useful, and
agreement was reached that they will be taken care of.

(6) Sixth session, titled "API for state changes made by callbacks", was led
by Petr Mladek.

Petr described his proposal for API for changing, updating and disabling
patches (by callbacks). Example where this was needed: L1TF fix, which needed
to change PTE semantics (particular bits). This can't be done before all the
code understands this new PTE format/semantics. Therefore pre-patch and
post-patch callbacks had to do the actual modifications to all the existing
PTEs. What is also currently missing is tracking compatibilities / dependencies
between individual livepatches.
Petr's proposal (v2) is already on ML.
struct klp_state is being introduced which tracks the actual states of the
patch. klp_is_patch_compatible() checks the compatibility of the current states
to the states that the new livepatch is going to bring.
No principal issues / objections have been raised, and it's appreciated by the
patch author(s), so v3 will be submitted and pre-merge bikeshedding will start.

(7) Seventh session, titled "klp-convert and livepatch relocations", was led
by Joe Lawrence.

Joe started the session with problem statement: accessing non exported / static
symbols from inside the patch module. One possible workardound is manually via
kallsyms. Second workaround is klp-convert, which actually creates proper
relocations inside the livepatch module from the symbol database during the
final .ko link.
Currently module loader looks for special livepatch relocations and resolves
those during runtime; kernel support for these relocations have so far been
added for x86 only. Special livepatch relocations are supported and processed
also on other architectures. Special quirks/sections are not yet supported.
Plus klp-convert would still be needed even with late module patching update.
vmlinux or modules could have ambiguous static symbols.

It turns out that the features / bugs below have to be resolved before we
can claim the klp-convert support for relocation complete:
    - handle all the corner cases (jump labels, static keys, ...) properly and
      have a good regression tests in place
    - one day we might (or might not) add support for out-of-tree modules which
      need klp-convert
    - BFD bug 24456 (multiple relocations to the same .text section)

(8) Eight sesstion, titled "Making livepatching infrastructure better", was led
by Kamalesh Babulal.


The primary goal of the discussion as presented by Kamalesh was simple: how to
improve our testing coverage.  Currently we have sample modules + kselftests.
We seem to be currently missing specific unit cases and tests for corner cases.
What Kamalesh would also like to see would be more stress testing oriented
tests for the infrastructure. We should make sure that projects like kernelCI
are running with CONFIG_LIVEPATCH=y.
Another thing Kamalesh currently sees as missing are failure test cases too. 
It should be checked with sosreport and supportconfig guys whether those
diagnostic tools do provide necessary coverage of (at least) livepatching sysfs
state. This is especially a task for distro people to figure out.
Nicolai proposed as one of the testcases identity patching, as that should
reveal issues directly in the infrastructure.

(9) Last, ninth session, titled "Open sourcing live patching services", was led
by Alice Ferrazzi.
This session followed up on previous suggestion of having public repository for
livepatches against LTS kernel.
Alice reported on improviement of elivepatch since last year as having moved
everything to docker.
Alice proposed to more share livepatch sources; SUSE does publish those [2][3],
but it's important to mention that livepatches are very closely tied to
particular kernel version.

[1] https://github.com/kamalesh-babulal/linux/tree/objtool-v1
[2] On https://kernel.suse.com/
[3] Example source-based SUSE's livepatch is at https://kernel.suse.com/cgit/kernel-livepatch/tree/uname_patch/kgr_patch_uname.c?id=d4e00de0b0a3f858fec4e83640f12e1f17298667

Eherpad: https://etherpad.net/p/LPC2019_Live_Patching

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 15:06                           ` Joe Lawrence
  2019-10-15 15:31                             ` Jessica Yu
@ 2019-10-16  6:51                             ` Miroslav Benes
  2019-10-16  9:23                               ` Peter Zijlstra
  2019-10-16 12:39                               ` Peter Zijlstra
  1 sibling, 2 replies; 96+ messages in thread
From: Miroslav Benes @ 2019-10-16  6:51 UTC (permalink / raw)
  To: Joe Lawrence
  Cc: Peter Zijlstra, Jessica Yu, Steven Rostedt, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching

On Tue, 15 Oct 2019, Joe Lawrence wrote:

> On 10/15/19 10:13 AM, Miroslav Benes wrote:
> > Yes, it does. klp_module_coming() calls module_disable_ro() on all
> > patching modules which patch the coming module in order to call
> > apply_relocate_add(). New (patching) code for a module can be relocated
> > only when the relevant module is loaded.
> 
> FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
> livepatches only patch a single object and updates are kept on disk to handle
> coming module updates as they are loaded) eliminate those outstanding
> relocations and the need to perform this late permission flipping?

Yes, it should, but we don't have to wait for it. PeterZ proposed a 
different solution to this specific issue in 
https://lore.kernel.org/lkml/20191015141111.GP2359@hirez.programming.kicks-ass.net/

It should not be a problem to create a live patch module like that and the 
code in kernel/livepatch/ is almost ready. Something like 
module_section_disable_ro(mod, section) (and similar for X protection) 
should be enough. Module reloads would still require juggling with the 
protections, but I think it is all feasible.

Miroslav

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 22:27                                 ` Steven Rostedt
@ 2019-10-16  7:42                                   ` Peter Zijlstra
  2019-10-16 10:15                                     ` Miroslav Benes
  2019-10-16  7:49                                   ` Peter Zijlstra
  1 sibling, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-16  7:42 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Joe Lawrence, Jessica Yu, Miroslav Benes, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching

> which are not compatible with livepatching. GCC upstream now has
> -flive-patching option, which disables all those interfering optimizations.

Which, IIRC, has a significant performance impact and should thus really
not be used...

If distros ship that crap, I'm going to laugh at them the next time they
want a single digit performance improvement because *important*.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-15 22:27                                 ` Steven Rostedt
  2019-10-16  7:42                                   ` Peter Zijlstra
@ 2019-10-16  7:49                                   ` Peter Zijlstra
  2019-10-16 10:20                                     ` Miroslav Benes
  1 sibling, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-16  7:49 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Joe Lawrence, Jessica Yu, Miroslav Benes, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching

On Tue, Oct 15, 2019 at 06:27:05PM -0400, Steven Rostedt wrote:

> (7) Seventh session, titled "klp-convert and livepatch relocations", was led
> by Joe Lawrence.
> 
> Joe started the session with problem statement: accessing non exported / static
> symbols from inside the patch module. One possible workardound is manually via
> kallsyms. Second workaround is klp-convert, which actually creates proper
> relocations inside the livepatch module from the symbol database during the
> final .ko link.
> Currently module loader looks for special livepatch relocations and resolves
> those during runtime; kernel support for these relocations have so far been
> added for x86 only. Special livepatch relocations are supported and processed
> also on other architectures. Special quirks/sections are not yet supported.
> Plus klp-convert would still be needed even with late module patching update.
> vmlinux or modules could have ambiguous static symbols.
> 
> It turns out that the features / bugs below have to be resolved before we
> can claim the klp-convert support for relocation complete:
>     - handle all the corner cases (jump labels, static keys, ...) properly and
>       have a good regression tests in place

I suppose all the patches in this series-of-series here will make life
harder for KLP, static_call() and 2 byte jumps etc..

>     - one day we might (or might not) add support for out-of-tree modules which
>       need klp-convert
>     - BFD bug 24456 (multiple relocations to the same .text section)



^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-16  6:51                             ` Miroslav Benes
@ 2019-10-16  9:23                               ` Peter Zijlstra
  2019-10-16  9:36                                 ` Jessica Yu
  2019-10-16 12:39                               ` Peter Zijlstra
  1 sibling, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-16  9:23 UTC (permalink / raw)
  To: Miroslav Benes
  Cc: Joe Lawrence, Jessica Yu, Steven Rostedt, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching

On Wed, Oct 16, 2019 at 08:51:27AM +0200, Miroslav Benes wrote:
> On Tue, 15 Oct 2019, Joe Lawrence wrote:
> 
> > On 10/15/19 10:13 AM, Miroslav Benes wrote:
> > > Yes, it does. klp_module_coming() calls module_disable_ro() on all
> > > patching modules which patch the coming module in order to call
> > > apply_relocate_add(). New (patching) code for a module can be relocated
> > > only when the relevant module is loaded.
> > 
> > FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
> > livepatches only patch a single object and updates are kept on disk to handle
> > coming module updates as they are loaded) eliminate those outstanding
> > relocations and the need to perform this late permission flipping?
> 
> Yes, it should, but we don't have to wait for it. PeterZ proposed a 
> different solution to this specific issue in 
> https://lore.kernel.org/lkml/20191015141111.GP2359@hirez.programming.kicks-ass.net/
> 
> It should not be a problem to create a live patch module like that and the 
> code in kernel/livepatch/ is almost ready. Something like 
> module_section_disable_ro(mod, section) (and similar for X protection) 
> should be enough. Module reloads would still require juggling with the 
> protections, but I think it is all feasible.

Just had a browse around the module code, and while the section info is
in load_info, it doesn't get retained for active modules.

So I suppose I'll go add that for KLP enabled things.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-16  9:23                               ` Peter Zijlstra
@ 2019-10-16  9:36                                 ` Jessica Yu
  2019-10-16  9:51                                   ` Peter Zijlstra
  0 siblings, 1 reply; 96+ messages in thread
From: Jessica Yu @ 2019-10-16  9:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Miroslav Benes, Joe Lawrence, Steven Rostedt, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching

+++ Peter Zijlstra [16/10/19 11:23 +0200]:
>On Wed, Oct 16, 2019 at 08:51:27AM +0200, Miroslav Benes wrote:
>> On Tue, 15 Oct 2019, Joe Lawrence wrote:
>>
>> > On 10/15/19 10:13 AM, Miroslav Benes wrote:
>> > > Yes, it does. klp_module_coming() calls module_disable_ro() on all
>> > > patching modules which patch the coming module in order to call
>> > > apply_relocate_add(). New (patching) code for a module can be relocated
>> > > only when the relevant module is loaded.
>> >
>> > FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
>> > livepatches only patch a single object and updates are kept on disk to handle
>> > coming module updates as they are loaded) eliminate those outstanding
>> > relocations and the need to perform this late permission flipping?
>>
>> Yes, it should, but we don't have to wait for it. PeterZ proposed a
>> different solution to this specific issue in
>> https://lore.kernel.org/lkml/20191015141111.GP2359@hirez.programming.kicks-ass.net/
>>
>> It should not be a problem to create a live patch module like that and the
>> code in kernel/livepatch/ is almost ready. Something like
>> module_section_disable_ro(mod, section) (and similar for X protection)
>> should be enough. Module reloads would still require juggling with the
>> protections, but I think it is all feasible.
>
>Just had a browse around the module code, and while the section info is
>in load_info, it doesn't get retained for active modules.
>
>So I suppose I'll go add that for KLP enabled things.

Elf section info does get saved for livepatch modules though, see
mod->klp_info. And wouldn't this only be needed for livepatch modules?

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-16  9:36                                 ` Jessica Yu
@ 2019-10-16  9:51                                   ` Peter Zijlstra
  0 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-16  9:51 UTC (permalink / raw)
  To: Jessica Yu
  Cc: Miroslav Benes, Joe Lawrence, Steven Rostedt, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching

On Wed, Oct 16, 2019 at 11:36:10AM +0200, Jessica Yu wrote:
> +++ Peter Zijlstra [16/10/19 11:23 +0200]:
> > On Wed, Oct 16, 2019 at 08:51:27AM +0200, Miroslav Benes wrote:
> > > On Tue, 15 Oct 2019, Joe Lawrence wrote:
> > > 
> > > > On 10/15/19 10:13 AM, Miroslav Benes wrote:
> > > > > Yes, it does. klp_module_coming() calls module_disable_ro() on all
> > > > > patching modules which patch the coming module in order to call
> > > > > apply_relocate_add(). New (patching) code for a module can be relocated
> > > > > only when the relevant module is loaded.
> > > >
> > > > FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
> > > > livepatches only patch a single object and updates are kept on disk to handle
> > > > coming module updates as they are loaded) eliminate those outstanding
> > > > relocations and the need to perform this late permission flipping?
> > > 
> > > Yes, it should, but we don't have to wait for it. PeterZ proposed a
> > > different solution to this specific issue in
> > > https://lore.kernel.org/lkml/20191015141111.GP2359@hirez.programming.kicks-ass.net/
> > > 
> > > It should not be a problem to create a live patch module like that and the
> > > code in kernel/livepatch/ is almost ready. Something like
> > > module_section_disable_ro(mod, section) (and similar for X protection)
> > > should be enough. Module reloads would still require juggling with the
> > > protections, but I think it is all feasible.
> > 
> > Just had a browse around the module code, and while the section info is
> > in load_info, it doesn't get retained for active modules.
> > 
> > So I suppose I'll go add that for KLP enabled things.
> 
> Elf section info does get saved for livepatch modules though, see
> mod->klp_info. And wouldn't this only be needed for livepatch modules?

Right, I just found that, but it is x86 only for some mysterious reason.
And yes, it's KLP only.

I was thikning of adding a KLP only list of {name,addr,size} sections
that start with ".klp" in layout_sections(). Would that not work across
architectures?

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-16  7:42                                   ` Peter Zijlstra
@ 2019-10-16 10:15                                     ` Miroslav Benes
  0 siblings, 0 replies; 96+ messages in thread
From: Miroslav Benes @ 2019-10-16 10:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, Joe Lawrence, Jessica Yu, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching

On Wed, 16 Oct 2019, Peter Zijlstra wrote:

> > which are not compatible with livepatching. GCC upstream now has
> > -flive-patching option, which disables all those interfering optimizations.
> 
> Which, IIRC, has a significant performance impact and should thus really
> not be used...

Fortunately, the impact is negligible.

Miroslav

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-16  7:49                                   ` Peter Zijlstra
@ 2019-10-16 10:20                                     ` Miroslav Benes
  2019-10-16 13:29                                       ` Miroslav Benes
  0 siblings, 1 reply; 96+ messages in thread
From: Miroslav Benes @ 2019-10-16 10:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, Joe Lawrence, Jessica Yu, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching, pmladek

On Wed, 16 Oct 2019, Peter Zijlstra wrote:

> On Tue, Oct 15, 2019 at 06:27:05PM -0400, Steven Rostedt wrote:
> 
> > (7) Seventh session, titled "klp-convert and livepatch relocations", was led
> > by Joe Lawrence.
> > 
> > Joe started the session with problem statement: accessing non exported / static
> > symbols from inside the patch module. One possible workardound is manually via
> > kallsyms. Second workaround is klp-convert, which actually creates proper
> > relocations inside the livepatch module from the symbol database during the
> > final .ko link.
> > Currently module loader looks for special livepatch relocations and resolves
> > those during runtime; kernel support for these relocations have so far been
> > added for x86 only. Special livepatch relocations are supported and processed
> > also on other architectures. Special quirks/sections are not yet supported.
> > Plus klp-convert would still be needed even with late module patching update.
> > vmlinux or modules could have ambiguous static symbols.
> > 
> > It turns out that the features / bugs below have to be resolved before we
> > can claim the klp-convert support for relocation complete:
> >     - handle all the corner cases (jump labels, static keys, ...) properly and
> >       have a good regression tests in place
> 
> I suppose all the patches in this series-of-series here will make life
> harder for KLP, static_call() and 2 byte jumps etc..

Yes, I think so. We'll have to deal with that once it lands. That is why 
we want to get rid of all this arch-specific code in livepatch and 
reinvent the late module patching. So it is perhaps better to start 
working on it sooner than later. Adding Petr, who hesitantly signed up for 
the task...

Miroslav

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-16  6:51                             ` Miroslav Benes
  2019-10-16  9:23                               ` Peter Zijlstra
@ 2019-10-16 12:39                               ` Peter Zijlstra
  1 sibling, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2019-10-16 12:39 UTC (permalink / raw)
  To: Miroslav Benes
  Cc: Joe Lawrence, Jessica Yu, Steven Rostedt, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching

On Wed, Oct 16, 2019 at 08:51:27AM +0200, Miroslav Benes wrote:
> On Tue, 15 Oct 2019, Joe Lawrence wrote:
> 
> > On 10/15/19 10:13 AM, Miroslav Benes wrote:
> > > Yes, it does. klp_module_coming() calls module_disable_ro() on all
> > > patching modules which patch the coming module in order to call
> > > apply_relocate_add(). New (patching) code for a module can be relocated
> > > only when the relevant module is loaded.
> > 
> > FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
> > livepatches only patch a single object and updates are kept on disk to handle
> > coming module updates as they are loaded) eliminate those outstanding
> > relocations and the need to perform this late permission flipping?
> 
> Yes, it should, but we don't have to wait for it. PeterZ proposed a 
> different solution to this specific issue in 
> https://lore.kernel.org/lkml/20191015141111.GP2359@hirez.programming.kicks-ass.net/
> 
> It should not be a problem to create a live patch module like that and the 
> code in kernel/livepatch/ is almost ready. Something like 
> module_section_disable_ro(mod, section) (and similar for X protection) 
> should be enough. Module reloads would still require juggling with the 
> protections, but I think it is all feasible.

Something a little like so.. completely fresh of the keyboard.

---
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -853,6 +853,18 @@ static inline void module_enable_ro(cons
 static inline void module_disable_ro(const struct module *mod) { }
 #endif
 
+#if defined(CONFIG_STRICT_MODULE_RWX) && defined(CONFIG_LIVEPATCH)
+extern void module_section_disable_ro(struct module *mod, const char *sec);
+extern void module_section_enable_ro(struct module *mod, const char *sec);
+extern void module_section_disable_x(struct module *mod, const char *sec);
+extern void module_section_enable_x(struct module *mod, const char *sec);
+#else
+static inline void module_section_disable_ro(struct module *mod, const char *sec) { }
+static inline void module_section_enable_ro(struct module *mod, const char *sec) { }
+static inline void module_section_disable_x(struct module *mod, const char *sec) { }
+static inline void module_section_enable_x(struct module *mod, const char *sec) { }
+#endif
+
 #ifdef CONFIG_GENERIC_BUG
 void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
 			 struct module *);
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2107,6 +2107,54 @@ static void free_module_elf(struct modul
 	kfree(mod->klp_info->secstrings);
 	kfree(mod->klp_info);
 }
+
+#ifdef CONFIG_STRICT_MODULE_RWX
+
+static void __frob_section(struct Elf_Shdr *sec, int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON((unsigned long)sec->sh_addr & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)sec->sh_size & (PAGE_SIZE-1));
+	set_memory((unsigned long)sec->sh_addr, sec->sh_size >> PAGE_SHIFT);
+}
+
+static void frob_section(struct module *mod, const char *section,
+			 int (*set_memory)(unsigned long start, int num_pages))
+{
+	struct klp_modinfo *info = mod->klp_info;
+	const char *secname;
+	Elf_Shdr *s;
+
+	for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
+		secname = mod->klp_info->secstrings + s->sh_name;
+		if (strcmp(secname, section))
+			continue;
+
+		__frob_section(s, set_memory);
+	}
+}
+
+void module_section_disable_ro(struct module *mod, const char *section)
+{
+	frob_section(mod, section, set_memory_rw);
+}
+
+void module_section_enable_ro(struct module *mod, const char *section)
+{
+	frob_section(mod, section, set_memory_ro);
+}
+
+void module_section_disable_x(struct module *mod, const char *section)
+{
+	frob_section(mod, section, set_memory_nx);
+}
+
+void module_section_enable_x(struct module *mod, const char *section)
+{
+	frob_section(mod, section, set_memory_x);
+}
+
+#endif /* ONFIG_STRICT_MODULE_RWX */
+
 #else /* !CONFIG_LIVEPATCH */
 static int copy_module_elf(struct module *mod, struct load_info *info)
 {

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH v3 5/6] x86/ftrace: Use text_poke()
  2019-10-16 10:20                                     ` Miroslav Benes
@ 2019-10-16 13:29                                       ` Miroslav Benes
  0 siblings, 0 replies; 96+ messages in thread
From: Miroslav Benes @ 2019-10-16 13:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, Joe Lawrence, Jessica Yu, x86, linux-kernel,
	mhiramat, bristot, jbaron, torvalds, tglx, mingo, namit, hpa,
	luto, ard.biesheuvel, jpoimboe, live-patching, pmladek

On Wed, 16 Oct 2019, Miroslav Benes wrote:

> On Wed, 16 Oct 2019, Peter Zijlstra wrote:
> 
> > On Tue, Oct 15, 2019 at 06:27:05PM -0400, Steven Rostedt wrote:
> > 
> > > (7) Seventh session, titled "klp-convert and livepatch relocations", was led
> > > by Joe Lawrence.
> > > 
> > > Joe started the session with problem statement: accessing non exported / static
> > > symbols from inside the patch module. One possible workardound is manually via
> > > kallsyms. Second workaround is klp-convert, which actually creates proper
> > > relocations inside the livepatch module from the symbol database during the
> > > final .ko link.
> > > Currently module loader looks for special livepatch relocations and resolves
> > > those during runtime; kernel support for these relocations have so far been
> > > added for x86 only. Special livepatch relocations are supported and processed
> > > also on other architectures. Special quirks/sections are not yet supported.
> > > Plus klp-convert would still be needed even with late module patching update.
> > > vmlinux or modules could have ambiguous static symbols.
> > > 
> > > It turns out that the features / bugs below have to be resolved before we
> > > can claim the klp-convert support for relocation complete:
> > >     - handle all the corner cases (jump labels, static keys, ...) properly and
> > >       have a good regression tests in place
> > 
> > I suppose all the patches in this series-of-series here will make life
> > harder for KLP, static_call() and 2 byte jumps etc..
> 
> Yes, I think so. We'll have to deal with that once it lands. That is why 
> we want to get rid of all this arch-specific code in livepatch and 
> reinvent the late module patching. So it is perhaps better to start 
> working on it sooner than later. Adding Petr, who hesitantly signed up for 
> the task...

Thinking about it more... crazy idea. I think we could leverage these new 
ELF .text per vmlinux/module sections for the reinvention I was talking 
about. If we teach module loader to relocate (and apply alternatives and 
so on, everything in arch-specific module_finalize()) not the whole module 
in case of live patch modules, but separate ELF .text sections, it could 
solve the issue with late module patching we have. It is a variation on 
Steven's idea. When live patch module is loaded, only its section for 
present modules would be processed. Then whenever a to-be-patched module 
is loaded, its .text section in all present patch module would be 
processed.

The upside is that almost no work would be required on patch modules 
creation side. The downside is that klp_modinfo must stay. Module loader 
needs to be hacked a lot in both cases. So it remains to be seen which 
idea is easier to implement.

Jessica, do you think it would be feasible?

Petr, Joe, Josh, am I missing something or would it work?

Miroslav

^ permalink raw reply	[flat|nested] 96+ messages in thread

end of thread, back to index

Thread overview: 96+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-07  9:02 [RESEND] everything text-poke: ftrace, modules, static_call and jump_label Peter Zijlstra
2019-10-07  8:17 ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Peter Zijlstra
2019-10-07  8:17   ` [PATCH v3 1/6] x86/alternatives: Teach text_poke_bp() to emulate instructions Peter Zijlstra
2019-10-08 14:29     ` Borislav Petkov
2019-10-08 14:40       ` Steven Rostedt
2019-10-08 14:50         ` Borislav Petkov
2019-10-08 14:48       ` Peter Zijlstra
2019-10-08 14:54         ` Borislav Petkov
2019-10-08 15:04           ` Steven Rostedt
2019-10-08 15:24             ` Borislav Petkov
2019-10-09 12:03     ` Daniel Bristot de Oliveira
2019-10-07  8:17   ` [PATCH v3 2/6] x86/alternatives: Update int3_emulate_push() comment Peter Zijlstra
2019-10-07  8:17   ` [PATCH v3 3/6] x86/alternatives,jump_label: Provide better text_poke() batching interface Peter Zijlstra
2019-10-09 12:04     ` Daniel Bristot de Oliveira
2019-10-07  8:17   ` [PATCH v3 4/6] x86/alternatives: Add and use text_gen_insn() helper Peter Zijlstra
2019-10-08  6:23     ` Masami Hiramatsu
2019-10-08  8:15       ` Peter Zijlstra
2019-10-07  8:17   ` [PATCH v3 5/6] x86/ftrace: Use text_poke() Peter Zijlstra
2019-10-08 14:43     ` Steven Rostedt
2019-10-08 17:11       ` Peter Zijlstra
2019-10-08 17:27         ` Steven Rostedt
2019-10-10  2:41       ` Steven Rostedt
2019-10-10  9:20         ` Peter Zijlstra
2019-10-10 13:19           ` Steven Rostedt
2019-10-10 14:05             ` Peter Zijlstra
2019-10-10 15:54               ` Steven Rostedt
2019-10-10 17:28                 ` Peter Zijlstra
2019-10-10 17:48                   ` Steven Rostedt
2019-10-11 10:45                     ` Peter Zijlstra
2019-10-11 10:47                       ` Peter Zijlstra
2019-10-11 10:50                         ` Peter Zijlstra
2019-10-11 12:59                   ` Peter Zijlstra
2019-10-11 13:33                     ` Steven Rostedt
2019-10-11 13:45                       ` Peter Zijlstra
2019-10-15 13:07                     ` Jessica Yu
2019-10-15 13:56                       ` Peter Zijlstra
2019-10-15 14:11                         ` Peter Zijlstra
2019-10-15 14:13                         ` Miroslav Benes
2019-10-15 15:06                           ` Joe Lawrence
2019-10-15 15:31                             ` Jessica Yu
2019-10-15 22:17                               ` Joe Lawrence
2019-10-15 22:27                                 ` Steven Rostedt
2019-10-16  7:42                                   ` Peter Zijlstra
2019-10-16 10:15                                     ` Miroslav Benes
2019-10-16  7:49                                   ` Peter Zijlstra
2019-10-16 10:20                                     ` Miroslav Benes
2019-10-16 13:29                                       ` Miroslav Benes
2019-10-16  6:51                             ` Miroslav Benes
2019-10-16  9:23                               ` Peter Zijlstra
2019-10-16  9:36                                 ` Jessica Yu
2019-10-16  9:51                                   ` Peter Zijlstra
2019-10-16 12:39                               ` Peter Zijlstra
2019-10-15 14:42                         ` Peter Zijlstra
2019-10-15 18:31                           ` Peter Zijlstra
2019-10-15 15:51                         ` Jessica Yu
2019-10-15 13:28                     ` Steven Rostedt
2019-10-15 13:42                       ` Peter Zijlstra
2019-10-15 16:09                       ` Jessica Yu
2019-10-07  8:17   ` [PATCH v3 6/6] x86/mm: Remove set_kernel_text_r[ow]() Peter Zijlstra
2019-10-08 15:07   ` [PATCH v3 0/6] Rewrite x86/ftrace to use text_poke() Steven Rostedt
2019-10-07  8:25 ` [PATCH v2 0/4] Propagate module notifier errors Peter Zijlstra
2019-10-07  8:25   ` [PATCH v2 1/4] notifier: Fix broken error handling pattern Peter Zijlstra
2019-10-10 22:01     ` Rafael J. Wysocki
2019-10-07  8:25   ` [PATCH v2 2/4] module: Fix up module_notifier return values Peter Zijlstra
2019-10-07  8:25   ` [PATCH v2 3/4] module: Properly propagate MODULE_STATE_COMING failure Peter Zijlstra
2019-10-08 13:08     ` Miroslav Benes
2019-10-07  8:25   ` [PATCH v2 4/4] jump_label,module: Fix module lifetime for __jump_label_mod_text_reserved Peter Zijlstra
2019-10-07  8:27 ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 01/13] compiler.h: Make __ADDRESSABLE() symbol truly unique Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 02/13] static_call: Add basic static call infrastructure Peter Zijlstra
2019-10-07 11:33     ` Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 03/13] static_call: Add inline " Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 04/13] static_call: Avoid kprobes on inline static_call()s Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 05/13] x86/static_call: Add out-of-line static call implementation Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 06/13] x86/static_call: Add inline static call implementation for x86-64 Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 07/13] static_call: Simple self-test Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 08/13] tracepoints: Use static_call Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 09/13] x86/alternatives: Teach text_poke_bp() to emulate RET Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 10/13] static_call: Add static_cond_call() Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 11/13] static_call: Handle tail-calls Peter Zijlstra
2019-10-07  8:27   ` [PATCH v2 12/13] static_call: Allow early init Peter Zijlstra
2019-10-07  8:27   ` [RFC][PATCH v2 13/13] x86/perf, static_call: Optimize x86_pmu methods Peter Zijlstra
2019-10-07 11:33   ` [PATCH v2 00/13] Add static_call() Peter Zijlstra
2019-10-07  8:44 ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
2019-10-07  8:44   ` [RFC][PATCH 1/9] jump_label, x86: Strip ASM " Peter Zijlstra
2019-10-07  8:44   ` [RFC][PATCH 2/9] jump_label, x86: Factor out the __jump_table generation Peter Zijlstra
2019-10-07  8:44   ` [RFC][PATCH 3/9] jump_label, x86: Remove init NOP optimization Peter Zijlstra
2019-10-07  8:44   ` [RFC][PATCH 4/9] jump_label, x86: Improve error when we fail expected text Peter Zijlstra
2019-10-07  8:44   ` [RFC][PATCH 5/9] jump_label, x86: Introduce jump_entry_size() Peter Zijlstra
2019-10-07  8:44   ` [RFC][PATCH 6/9] jump_label, x86: Add variable length patching support Peter Zijlstra
2019-10-07  8:44   ` [RFC][PATCH 7/9] jump_label,objtool: Validate variable size jump labels Peter Zijlstra
2019-10-07  8:44   ` [RFC][PATCH 8/9] jump_label,objtool: Generate possible statistics Peter Zijlstra
2019-10-07  8:44   ` [RFC][PATCH 9/9] jump_label, x86: Enable JMP8/NOP2 support Peter Zijlstra
2019-10-07 12:07   ` [RFC][PATCH 0/9] Variable size jump_label support Peter Zijlstra
2019-10-07 12:55     ` Ingo Molnar
2019-10-07 15:08       ` Steven Rostedt

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org linux-kernel@archiver.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/ public-inbox