linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
@ 2008-01-09  2:34 Dave Airlie
  2008-01-09  2:37 ` Andi Kleen
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Dave Airlie @ 2008-01-09  2:34 UTC (permalink / raw)
  To: Andrew Morton, torvalds, hch; +Cc: ak, pq, jbeulich, mingo, linux-kernel


[This an initial RFC but I'd like to have this patch in before 2.6.24 goes 
final as it really breaks this useful feature]

mmiotrace the MMIO access tracer used to reverse engineer binary blobs
used this notifier interface and is planned on being pushed upstream.

Having users able to just use the tracer module without having to rebuild 
their kernel to add in a page fault handler hack means we get a lot 
greater coverage for reverse engineering efforts.

Signed-off-by: David Airlie <airlied@linux.ie>

This reverts commit 74a0b5762713a26496db72eac34fbbed46f20fce.
Conflicts:

	include/asm-avr32/kprobes.h
	include/asm-ia64/kprobes.h
	include/asm-s390/kprobes.h
	include/asm-x86/kdebug_32.h
	include/asm-x86/kdebug_64.h
	include/asm-x86/kprobes_64.h
---
 arch/x86/kernel/kprobes_32.c  |    3 +-
 arch/x86/kernel/kprobes_64.c  |    1 +
 arch/x86/mm/fault_32.c        |   43 ++++++++++++++++++++++-----------------
 arch/x86/mm/fault_64.c        |   44 +++++++++++++++++++++++-----------------
 include/asm-avr32/kdebug.h    |   16 ++++++++++++++
 include/asm-avr32/kprobes.h   |    1 +
 include/asm-ia64/kdebug.h     |   15 ++++++++++++++
 include/asm-ia64/kprobes.h    |    1 +
 include/asm-powerpc/kdebug.h  |   19 +++++++++++++++++
 include/asm-powerpc/kprobes.h |    1 +
 include/asm-s390/kdebug.h     |   15 ++++++++++++++
 include/asm-s390/kprobes.h    |    1 +
 include/asm-sh/kdebug.h       |    2 +
 include/asm-sparc64/kdebug.h  |   18 ++++++++++++++++
 include/asm-sparc64/kprobes.h |    1 +
 include/asm-x86/kdebug.h      |    3 ++
 include/asm-x86/kprobes_32.h  |    2 +-
 include/asm-x86/kprobes_64.h  |    1 +
 kernel/kprobes.c              |   39 +++++++++++++++++++++++++++++++++--
 19 files changed, 183 insertions(+), 43 deletions(-)

diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
index 3a020f7..1ba8fee 100644
--- a/arch/x86/kernel/kprobes_32.c
+++ b/arch/x86/kernel/kprobes_32.c
@@ -586,7 +586,7 @@ out:
 	return 1;
 }
 
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -668,6 +668,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 			ret = NOTIFY_STOP;
 		break;
 	case DIE_GPF:
+	case DIE_PAGE_FAULT:
 		/* kprobe_running() needs smp_processor_id() */
 		preempt_disable();
 		if (kprobe_running() &&
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c
index 5df19a9..279cea7 100644
--- a/arch/x86/kernel/kprobes_64.c
+++ b/arch/x86/kernel/kprobes_64.c
@@ -654,6 +654,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 			ret = NOTIFY_STOP;
 		break;
 	case DIE_GPF:
+	case DIE_PAGE_FAULT:
 		/* kprobe_running() needs smp_processor_id() */
 		preempt_disable();
 		if (kprobe_running() &&
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
index a2273d4..f03cc93 100644
--- a/arch/x86/mm/fault_32.c
+++ b/arch/x86/mm/fault_32.c
@@ -25,7 +25,6 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
-#include <linux/kprobes.h>
 
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -33,27 +32,33 @@
 
 extern void die(const char *,struct pt_regs *,long);
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
+static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+
+int register_page_fault_notifier(struct notifier_block *nb)
 {
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode_vm(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 14))
-			ret = 1;
-		preempt_enable();
-	}
+	vmalloc_sync_all();
+	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_page_fault_notifier);
 
-	return ret;
+int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
 }
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
+EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
+
+static inline int notify_page_fault(struct pt_regs *regs, long err)
 {
-	return 0;
+	struct die_args args = {
+		.regs = regs,
+		.str = "page fault",
+		.err = err,
+		.trapnr = 14,
+		.signr = SIGSEGV
+	};
+	return atomic_notifier_call_chain(&notify_page_fault_chain,
+	                                  DIE_PAGE_FAULT, &args);
 }
-#endif
 
 /*
  * Return EIP plus the CS segment base.  The segment limit is also
@@ -331,7 +336,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 	if (unlikely(address >= TASK_SIZE)) {
 		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
 			return;
-		if (notify_page_fault(regs))
+		if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
 			return;
 		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -340,7 +345,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
-	if (notify_page_fault(regs))
+	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
 		return;
 
 	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
index 0e26230..0b0d3d4 100644
--- a/arch/x86/mm/fault_64.c
+++ b/arch/x86/mm/fault_64.c
@@ -25,7 +25,6 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
-#include <linux/kprobes.h>
 
 #include <asm/system.h>
 #include <asm/pgalloc.h>
@@ -41,27 +40,34 @@
 #define PF_RSVD	(1<<3)
 #define PF_INSTR	(1<<4)
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
+static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+
+/* Hook to register for page fault notifications */
+int register_page_fault_notifier(struct notifier_block *nb)
 {
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 14))
-			ret = 1;
-		preempt_enable();
-	}
+	vmalloc_sync_all();
+	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_page_fault_notifier);
 
-	return ret;
+int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
 }
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
+EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
+
+static inline int notify_page_fault(struct pt_regs *regs, long err)
 {
-	return 0;
+	struct die_args args = {
+		.regs = regs,
+		.str = "page fault",
+		.err = err,
+		.trapnr = 14,
+		.signr = SIGSEGV
+	};
+	return atomic_notifier_call_chain(&notify_page_fault_chain,
+	                                  DIE_PAGE_FAULT, &args);
 }
-#endif
 
 /* Sometimes the CPU reports invalid exceptions on prefetch.
    Check that here and ignore.
@@ -343,7 +349,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 			if (vmalloc_fault(address) >= 0)
 				return;
 		}
-		if (notify_page_fault(regs))
+		if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
 			return;
 		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -352,7 +358,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
-	if (notify_page_fault(regs))
+	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
 		return;
 
 	if (likely(regs->eflags & X86_EFLAGS_IF))
diff --git a/include/asm-avr32/kdebug.h b/include/asm-avr32/kdebug.h
index fd7e990..7f54e2b 100644
--- a/include/asm-avr32/kdebug.h
+++ b/include/asm-avr32/kdebug.h
@@ -1,10 +1,26 @@
 #ifndef __ASM_AVR32_KDEBUG_H
 #define __ASM_AVR32_KDEBUG_H
 
+#include <linux/notifier.h>
+
 /* Grossly misnamed. */
 enum die_val {
 	DIE_BREAKPOINT,
 	DIE_SSTEP,
 };
 
+/*
+ * These are only here because kprobes.c wants them to implement a
+ * blatant layering violation.  Will hopefully go away soon once all
+ * architectures are updated.
+ */
+static inline int register_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+
 #endif /* __ASM_AVR32_KDEBUG_H */
diff --git a/include/asm-avr32/kprobes.h b/include/asm-avr32/kprobes.h
index 996cb65..99de3a6 100644
--- a/include/asm-avr32/kprobes.h
+++ b/include/asm-avr32/kprobes.h
@@ -18,6 +18,7 @@ typedef u16	kprobe_opcode_t;
 #define MAX_INSN_SIZE		2
 
 #define kretprobe_blacklist_size 0
+#define ARCH_INACTIVE_KPROBE_COUNT 1
 
 #define arch_remove_kprobe(p)	do { } while (0)
 
diff --git a/include/asm-ia64/kdebug.h b/include/asm-ia64/kdebug.h
index 35e4940..320cd8e 100644
--- a/include/asm-ia64/kdebug.h
+++ b/include/asm-ia64/kdebug.h
@@ -26,6 +26,21 @@
  * 2005-Oct	Keith Owens <kaos@sgi.com>.  Expand notify_die to cover more
  *		events.
  */
+#include <linux/notifier.h>
+
+/*
+ * These are only here because kprobes.c wants them to implement a
+ * blatant layering violation.  Will hopefully go away soon once all
+ * architectures are updated.
+ */
+static inline int register_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
 
 enum die_val {
 	DIE_BREAK = 1,
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index a93ce9e..b8467a0 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -84,6 +84,7 @@ struct kprobe_ctlblk {
 
 #define ARCH_SUPPORTS_KRETPROBES
 #define kretprobe_blacklist_size 0
+#define  ARCH_INACTIVE_KPROBE_COUNT 1
 
 #define SLOT0_OPCODE_SHIFT	(37)
 #define SLOT1_p1_OPCODE_SHIFT	(37 - (64-46))
diff --git a/include/asm-powerpc/kdebug.h b/include/asm-powerpc/kdebug.h
index ae6d206..295f016 100644
--- a/include/asm-powerpc/kdebug.h
+++ b/include/asm-powerpc/kdebug.h
@@ -2,6 +2,25 @@
 #define _ASM_POWERPC_KDEBUG_H
 #ifdef __KERNEL__
 
+/* nearly identical to x86_64/i386 code */
+
+#include <linux/notifier.h>
+
+/*
+ * These are only here because kprobes.c wants them to implement a
+ * blatant layering violation.  Will hopefully go away soon once all
+ * architectures are updated.
+ */
+static inline int register_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+extern struct atomic_notifier_head powerpc_die_chain;
+
 /* Grossly misnamed. */
 enum die_val {
 	DIE_OOPS = 1,
diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h
index afabad2..54de529 100644
--- a/include/asm-powerpc/kprobes.h
+++ b/include/asm-powerpc/kprobes.h
@@ -81,6 +81,7 @@ typedef unsigned int kprobe_opcode_t;
 #endif
 
 #define ARCH_SUPPORTS_KRETPROBES
+#define  ARCH_INACTIVE_KPROBE_COUNT 1
 #define flush_insn_slot(p)	do { } while (0)
 #define kretprobe_blacklist_size 0
 
diff --git a/include/asm-s390/kdebug.h b/include/asm-s390/kdebug.h
index 40db27c..04418af 100644
--- a/include/asm-s390/kdebug.h
+++ b/include/asm-s390/kdebug.h
@@ -4,9 +4,24 @@
 /*
  * Feb 2006 Ported to s390 <grundym@us.ibm.com>
  */
+#include <linux/notifier.h>
 
 struct pt_regs;
 
+/*
+ * These are only here because kprobes.c wants them to implement a
+ * blatant layering violation. Will hopefully go away soon once all
+ * architectures are updated.
+ */
+static inline int register_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+
 enum die_val {
 	DIE_OOPS = 1,
 	DIE_BPT,
diff --git a/include/asm-s390/kprobes.h b/include/asm-s390/kprobes.h
index 948db3d..a504709 100644
--- a/include/asm-s390/kprobes.h
+++ b/include/asm-s390/kprobes.h
@@ -48,6 +48,7 @@ typedef u16 kprobe_opcode_t;
 
 #define ARCH_SUPPORTS_KRETPROBES
 #define kretprobe_blacklist_size 0
+#define ARCH_INACTIVE_KPROBE_COUNT 0
 
 #define KPROBE_SWAP_INST	0x10
 
diff --git a/include/asm-sh/kdebug.h b/include/asm-sh/kdebug.h
index 49cd690..382cfc7 100644
--- a/include/asm-sh/kdebug.h
+++ b/include/asm-sh/kdebug.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_SH_KDEBUG_H
 #define __ASM_SH_KDEBUG_H
 
+#include <linux/notifier.h>
+
 /* Grossly misnamed. */
 enum die_val {
 	DIE_TRAP,
diff --git a/include/asm-sparc64/kdebug.h b/include/asm-sparc64/kdebug.h
index f905b77..9974c7b 100644
--- a/include/asm-sparc64/kdebug.h
+++ b/include/asm-sparc64/kdebug.h
@@ -1,8 +1,26 @@
 #ifndef _SPARC64_KDEBUG_H
 #define _SPARC64_KDEBUG_H
 
+/* Nearly identical to x86_64/i386 code. */
+
+#include <linux/notifier.h>
+
 struct pt_regs;
 
+/*
+ * These are only here because kprobes.c wants them to implement a
+ * blatant layering violation.  Will hopefully go away soon once all
+ * architectures are updated.
+ */
+static inline int register_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+
 extern void bad_trap(struct pt_regs *, long);
 
 /* Grossly misnamed. */
diff --git a/include/asm-sparc64/kprobes.h b/include/asm-sparc64/kprobes.h
index 5020eaf..a646a12 100644
--- a/include/asm-sparc64/kprobes.h
+++ b/include/asm-sparc64/kprobes.h
@@ -13,6 +13,7 @@ typedef u32 kprobe_opcode_t;
 #define kretprobe_blacklist_size 0
 
 #define arch_remove_kprobe(p)	do {} while (0)
+#define  ARCH_INACTIVE_KPROBE_COUNT 0
 
 #define flush_insn_slot(p)		\
 do { 	flushi(&(p)->ainsn.insn[0]);	\
diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h
index e2f9b62..375acd7 100644
--- a/include/asm-x86/kdebug.h
+++ b/include/asm-x86/kdebug.h
@@ -5,6 +5,9 @@
 
 struct pt_regs;
 
+extern int register_page_fault_notifier(struct notifier_block *);
+extern int unregister_page_fault_notifier(struct notifier_block *);
+
 /* Grossly misnamed. */
 enum die_val {
 	DIE_OOPS = 1,
diff --git a/include/asm-x86/kprobes_32.h b/include/asm-x86/kprobes_32.h
index 9fe8f3b..44268fa 100644
--- a/include/asm-x86/kprobes_32.h
+++ b/include/asm-x86/kprobes_32.h
@@ -43,6 +43,7 @@ typedef u8 kprobe_opcode_t;
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
 
 #define ARCH_SUPPORTS_KRETPROBES
+#define  ARCH_INACTIVE_KPROBE_COUNT 0
 #define flush_insn_slot(p)	do { } while (0)
 
 extern const int kretprobe_blacklist_size;
@@ -90,5 +91,4 @@ static inline void restore_interrupts(struct pt_regs *regs)
 
 extern int kprobe_exceptions_notify(struct notifier_block *self,
 				    unsigned long val, void *data);
-extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 #endif				/* _ASM_KPROBES_H */
diff --git a/include/asm-x86/kprobes_64.h b/include/asm-x86/kprobes_64.h
index 743d762..b2878e2 100644
--- a/include/asm-x86/kprobes_64.h
+++ b/include/asm-x86/kprobes_64.h
@@ -43,6 +43,7 @@ typedef u8 kprobe_opcode_t;
 
 #define ARCH_SUPPORTS_KRETPROBES
 extern const int kretprobe_blacklist_size;
+#define  ARCH_INACTIVE_KPROBE_COUNT 1
 
 void kretprobe_trampoline(void);
 extern void arch_remove_kprobe(struct kprobe *p);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e3a5d81..a655f36 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -64,6 +64,7 @@
 
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
+static atomic_t kprobe_count;
 
 /* NOTE: change this value only with kprobe_mutex held */
 static bool kprobe_enabled;
@@ -72,6 +73,11 @@ DEFINE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
 DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
+static struct notifier_block kprobe_page_fault_nb = {
+	.notifier_call = kprobe_exceptions_notify,
+	.priority = 0x7fffffff /* we need to notified first */
+};
+
 #ifdef __ARCH_WANT_KPROBES_INSN_SLOT
 /*
  * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -550,6 +556,8 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 	old_p = get_kprobe(p->addr);
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
+		if (!ret)
+			atomic_inc(&kprobe_count);
 		goto out;
 	}
 
@@ -561,9 +569,13 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 	hlist_add_head_rcu(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-	if (kprobe_enabled)
-		arch_arm_kprobe(p);
+	if (kprobe_enabled) {
+		if (atomic_add_return(1, &kprobe_count) == \
+				(ARCH_INACTIVE_KPROBE_COUNT + 1))
+			register_page_fault_notifier(&kprobe_page_fault_nb);
 
+		arch_arm_kprobe(p);
+	}
 out:
 	mutex_unlock(&kprobe_mutex);
 
@@ -646,6 +658,16 @@ valid_p:
 		}
 		mutex_unlock(&kprobe_mutex);
 	}
+
+	/* Call unregister_page_fault_notifier()
+	 * if no probes are active
+	 */
+	mutex_lock(&kprobe_mutex);
+	if (atomic_add_return(-1, &kprobe_count) == \
+				ARCH_INACTIVE_KPROBE_COUNT)
+		unregister_page_fault_notifier(&kprobe_page_fault_nb);
+	mutex_unlock(&kprobe_mutex);
+	return;
 }
 
 static struct notifier_block kprobe_exceptions_nb = {
@@ -805,6 +827,7 @@ static int __init init_kprobes(void)
 		INIT_HLIST_HEAD(&kprobe_table[i]);
 		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
 	}
+	atomic_set(&kprobe_count, 0);
 
 	if (kretprobe_blacklist_size) {
 		/* lookup the function address from its name */
@@ -921,6 +944,13 @@ static void __kprobes enable_all_kprobes(void)
 	if (kprobe_enabled)
 		goto already_enabled;
 
+	/*
+	 * Re-register the page fault notifier only if there are any
+	 * active probes at the time of enabling kprobes globally
+	 */
+	if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT)
+		register_page_fault_notifier(&kprobe_page_fault_nb);
+
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist)
@@ -961,7 +991,10 @@ static void __kprobes disable_all_kprobes(void)
 	mutex_unlock(&kprobe_mutex);
 	/* Allow all currently running kprobes to complete */
 	synchronize_sched();
-	return;
+
+	mutex_lock(&kprobe_mutex);
+	/* Unconditionally unregister the page_fault notifier */
+	unregister_page_fault_notifier(&kprobe_page_fault_nb);
 
 already_disabled:
 	mutex_unlock(&kprobe_mutex);
-- 
1.5.3.3


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  2:34 [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft" Dave Airlie
@ 2008-01-09  2:37 ` Andi Kleen
  2008-01-09  3:06 ` Andrew Morton
  2008-01-09  7:17 ` Christoph Hellwig
  2 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2008-01-09  2:37 UTC (permalink / raw)
  To: Dave Airlie
  Cc: Andrew Morton, torvalds, hch, ak, pq, jbeulich, mingo, linux-kernel

On Wed, Jan 09, 2008 at 02:34:46AM +0000, Dave Airlie wrote:
> 
> [This an initial RFC but I'd like to have this patch in before 2.6.24 goes 
> final as it really breaks this useful feature]
> 
> mmiotrace the MMIO access tracer used to reverse engineer binary blobs
> used this notifier interface and is planned on being pushed upstream.
> 
> Having users able to just use the tracer module without having to rebuild 
> their kernel to add in a page fault handler hack means we get a lot 
> greater coverage for reverse engineering efforts.
> 
> Signed-off-by: David Airlie <airlied@linux.ie>

Acked-by: Andi Kleen <ak@suse.de>

I never liked the original patch.

-Andi


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  2:34 [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft" Dave Airlie
  2008-01-09  2:37 ` Andi Kleen
@ 2008-01-09  3:06 ` Andrew Morton
  2008-01-09  3:17   ` Dave Airlie
  2008-01-09  7:17 ` Christoph Hellwig
  2 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2008-01-09  3:06 UTC (permalink / raw)
  To: Dave Airlie; +Cc: torvalds, hch, ak, pq, jbeulich, mingo, linux-kernel

On Wed, 9 Jan 2008 02:34:46 +0000 (GMT) Dave Airlie <airlied@linux.ie> wrote:

> 
> [This an initial RFC but I'd like to have this patch in before 2.6.24 goes 
> final as it really breaks this useful feature]
> 
> mmiotrace the MMIO access tracer used to reverse engineer binary blobs
> used this notifier interface and is planned on being pushed upstream.
> 
> Having users able to just use the tracer module without having to rebuild 
> their kernel to add in a page fault handler hack means we get a lot 
> greater coverage for reverse engineering efforts.

Sorry, but that's a really really small benefit.  This very small number of
fairly (or very) technical users will be able to work out a way of getting
this to work in 2.6.24.  And in 2.6.25 with a merged mmiotrace we can do
something different.

It's a modest convenience to a very small number of people.  And the cost? 
Multiple functions calls and multiple cachelines hit for every pagefault
on, what?  Tens of millions of machines?

Plus the code which is getting restored isn't even very good.  For every
pagefault it populates a struct on the stack, passes that around for a
while, does a bit of RCU stuff only to find that there was nothing to do. 
Surely we should at least be doing something along the lines of

	if (unlikely(notify_page_fault_chain.notifier_call != NULL)) {
		all that crap
	}


But that's all speculation.  Has anyone actually measured the pagefault
latency impact of this change?

> +/*
> + * These are only here because kprobes.c wants them to implement a
> + * blatant layering violation.  Will hopefully go away soon once all
> + * architectures are updated.
> + */
> +static inline int register_page_fault_notifier(struct notifier_block *nb)
> +{
> +	return 0;
> +}
> +static inline int unregister_page_fault_notifier(struct notifier_block *nb)
> +{
> +	return 0;
> +}
> +

And this doesn't look very good either.  For how long did this fixme remain
unfixed?


So I'd suggest that we leave things as they are for 2.6.24 - mmiotrace
people will work something out, I'm sure.  For 2.6.25 if we merge mmiotrace
we can look at doing something which is vaguely efficient and tasteful.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  3:06 ` Andrew Morton
@ 2008-01-09  3:17   ` Dave Airlie
  2008-01-09  3:29     ` Andrew Morton
  2008-01-09 15:18     ` Arjan van de Ven
  0 siblings, 2 replies; 23+ messages in thread
From: Dave Airlie @ 2008-01-09  3:17 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, hch, ak, pq, jbeulich, mingo, linux-kernel


> On Wed, 9 Jan 2008 02:34:46 +0000 (GMT) Dave Airlie <airlied@linux.ie> wrote:
> 
> > 
> > [This an initial RFC but I'd like to have this patch in before 2.6.24 goes 
> > final as it really breaks this useful feature]
> > 
> > mmiotrace the MMIO access tracer used to reverse engineer binary blobs
> > used this notifier interface and is planned on being pushed upstream.
> > 
> > Having users able to just use the tracer module without having to rebuild 
> > their kernel to add in a page fault handler hack means we get a lot 
> > greater coverage for reverse engineering efforts.
> 
> Sorry, but that's a really really small benefit.  This very small number of
> fairly (or very) technical users will be able to work out a way of getting
> this to work in 2.6.24.  And in 2.6.25 with a merged mmiotrace we can do
> something different.

mmiotrace isn't targetted at fairly or technical users, its whole 
usefulness is that you don't need a kernel re-build, the distro kernels 
all contain enough support for us to just get a user to grab mmiotrace, 
run make and get a trace.... so in my eyes this a major feature regression 
to have to go back to custom kernel builds...

> It's a modest convenience to a very small number of people.  And the cost? 
> Multiple functions calls and multiple cachelines hit for every pagefault
> on, what?  Tens of millions of machines?

Which has been happening for how many months? perhaps if we merge 
mmiotrace in 2.6.25 we can clean up this function, otherwise I just count 
it as a feature regression...

> pagefault it populates a struct on the stack, passes that around for a
> while, does a bit of RCU stuff only to find that there was nothing to do. 
> Surely we should at least be doing something along the lines of
> 
> 	if (unlikely(notify_page_fault_chain.notifier_call != NULL)) {
> 		all that crap
> 	}
> 
> 
> But that's all speculation.  Has anyone actually measured the pagefault
> latency impact of this change?
> 
> > +/*
> > + * These are only here because kprobes.c wants them to implement a
> > + * blatant layering violation.  Will hopefully go away soon once all
> > + * architectures are updated.
> > + */
> > +static inline int register_page_fault_notifier(struct notifier_block *nb)
> > +{
> > +	return 0;
> > +}
> > +static inline int unregister_page_fault_notifier(struct notifier_block *nb)
> > +{
> > +	return 0;
> > +}
> > +
> 
> And this doesn't look very good either.  For how long did this fixme remain
> unfixed?
> 
> 
> So I'd suggest that we leave things as they are for 2.6.24 - mmiotrace
> people will work something out, I'm sure.  For 2.6.25 if we merge mmiotrace
> we can look at doing something which is vaguely efficient and tasteful.
> 

I just reverted Christophs patch I didn't try and work out if the old code 
had problems no one has fixed...

So all distros with 2.6.24 kernels are useless to mmiotrace I don't see 
why leaving things as is until a suitable replacement mechanism can be 
used.. I've heard others give out about this also madwifi and SuSE kernel 
folks...

Dave.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  3:17   ` Dave Airlie
@ 2008-01-09  3:29     ` Andrew Morton
  2008-01-09  3:55       ` Dave Airlie
                         ` (2 more replies)
  2008-01-09 15:18     ` Arjan van de Ven
  1 sibling, 3 replies; 23+ messages in thread
From: Andrew Morton @ 2008-01-09  3:29 UTC (permalink / raw)
  To: Dave Airlie; +Cc: torvalds, hch, ak, pq, jbeulich, mingo, linux-kernel

On Wed, 9 Jan 2008 03:17:37 +0000 (GMT) Dave Airlie <airlied@linux.ie> wrote:

> 
> > On Wed, 9 Jan 2008 02:34:46 +0000 (GMT) Dave Airlie <airlied@linux.ie> wrote:
> > 
> > > 
> > > [This an initial RFC but I'd like to have this patch in before 2.6.24 goes 
> > > final as it really breaks this useful feature]
> > > 
> > > mmiotrace the MMIO access tracer used to reverse engineer binary blobs
> > > used this notifier interface and is planned on being pushed upstream.
> > > 
> > > Having users able to just use the tracer module without having to rebuild 
> > > their kernel to add in a page fault handler hack means we get a lot 
> > > greater coverage for reverse engineering efforts.
> > 
> > Sorry, but that's a really really small benefit.  This very small number of
> > fairly (or very) technical users will be able to work out a way of getting
> > this to work in 2.6.24.  And in 2.6.25 with a merged mmiotrace we can do
> > something different.
> 
> mmiotrace isn't targetted at fairly or technical users, its whole 
> usefulness is that you don't need a kernel re-build, the distro kernels 
> all contain enough support for us to just get a user to grab mmiotrace, 
> run make and get a trace.... so in my eyes this a major feature regression 
> to have to go back to custom kernel builds...

An alternative might be to come up with something decent and target 2.6.24.x

> > It's a modest convenience to a very small number of people.  And the cost? 
> > Multiple functions calls and multiple cachelines hit for every pagefault
> > on, what?  Tens of millions of machines?
> 
> Which has been happening for how many months? perhaps if we merge 
> mmiotrace in 2.6.25 we can clean up this function, otherwise I just count 
> it as a feature regression...

We put the crappy code back in for 2.6.24 then take it out immediately
after 2.6.24 and put something else in to support mmiotrace and perhaps the
other new mystery features to which you refer below.  hm.

> > pagefault it populates a struct on the stack, passes that around for a
> > while, does a bit of RCU stuff only to find that there was nothing to do. 
> > Surely we should at least be doing something along the lines of
> > 
> > 	if (unlikely(notify_page_fault_chain.notifier_call != NULL)) {
> > 		all that crap
> > 	}
> > 
> > 
> > But that's all speculation.  Has anyone actually measured the pagefault
> > latency impact of this change?

^^ this.

> > > +/*
> > > + * These are only here because kprobes.c wants them to implement a
> > > + * blatant layering violation.  Will hopefully go away soon once all
> > > + * architectures are updated.
> > > + */
> > > +static inline int register_page_fault_notifier(struct notifier_block *nb)
> > > +{
> > > +	return 0;
> > > +}
> > > +static inline int unregister_page_fault_notifier(struct notifier_block *nb)
> > > +{
> > > +	return 0;
> > > +}
> > > +
> > 
> > And this doesn't look very good either.  For how long did this fixme remain
> > unfixed?
> > 
> > 
> > So I'd suggest that we leave things as they are for 2.6.24 - mmiotrace
> > people will work something out, I'm sure.  For 2.6.25 if we merge mmiotrace
> > we can look at doing something which is vaguely efficient and tasteful.
> > 
> 
> I just reverted Christophs patch I didn't try and work out if the old code 
> had problems no one has fixed...
> 
> So all distros with 2.6.24 kernels are useless to mmiotrace I don't see 
> why leaving things as is until a suitable replacement mechanism can be 
> used.. I've heard others give out about this also madwifi and SuSE kernel 
> folks...

That change has been in the mainline tree for nearly three months.  All
these affected parties have left it until the eve of 2.6.24 to actually
tell us about it.  This is causing me sympathy problems :(

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  3:29     ` Andrew Morton
@ 2008-01-09  3:55       ` Dave Airlie
  2008-01-09  7:19         ` Christoph Hellwig
  2008-01-09  4:17       ` Andi Kleen
  2008-01-09  9:12       ` Jan Beulich
  2 siblings, 1 reply; 23+ messages in thread
From: Dave Airlie @ 2008-01-09  3:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, hch, ak, pq, jbeulich, mingo, linux-kernel


> 
> An alternative might be to come up with something decent and target 2.6.24.x

I don't see mmiotrace getting merged into a stable kernel... how do 
however see it getting cleaned up for 2.6.25 now that people know how 
fragile the kernel hooks for it are..

> We put the crappy code back in for 2.6.24 then take it out immediately
> after 2.6.24 and put something else in to support mmiotrace and perhaps the
> other new mystery features to which you refer below.  hm.

(I think the other mystery feature is actually a Novell kernel debugger 
but I'm not sure, madwifi use it for similiar reasons to mmiotrace I 
think..)

> > > 		all that crap
> > > 	}
> > > 
> > > 
> > > But that's all speculation.  Has anyone actually measured the pagefault
> > > latency impact of this change?

Message-Id: <20070427140516.523272684@de.ibm.com>
Subject: [patch 20/38] Minor fault path optimization.
Date:	Fri, 27 Apr 2007 16:05:23 +0200

was a patch to do exactly that.. hch decided the feature wasn't useful and 
posted a patch to remove it..

> 
> That change has been in the mainline tree for nearly three months.  All
> these affected parties have left it until the eve of 2.6.24 to actually
> tell us about it.  This is causing me sympathy problems :(
> 

Jan first complained on the 4th Decemeber last year, I'm just posting this 
now because Linus said send him a patch to revert regressions rather than 
just complain, I've prepared the patch to put back the old behaviour from 
2.6.23. This was only brought to my notice this morning but I'm not going 
to let that stop me from trying to find a correct fix rather than just 
ripping the feature out..

I think we could apply the page fault cleanup patch I mentioned earlier on 
top of this patch and get back the 300 cycles and that would make people 
happy, it makes sense for mmiotrace to use kprobes hooks and not have to 
do this stuff directly but if that is what is wanted the mmiotrace guys 
can do it directly in the future.

Dave.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  3:29     ` Andrew Morton
  2008-01-09  3:55       ` Dave Airlie
@ 2008-01-09  4:17       ` Andi Kleen
  2008-01-09  9:12       ` Jan Beulich
  2 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2008-01-09  4:17 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Dave Airlie, torvalds, hch, pq, jbeulich, mingo, linux-kernel


> An alternative might be to come up with something decent and target 2.6.24.x

If you want zero cache line cost the only way is to handle that using Mathieu's 
inline patch infrastructure. Having a generic notifier type based on that would be 
probably a good idea.

-Andi

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  2:34 [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft" Dave Airlie
  2008-01-09  2:37 ` Andi Kleen
  2008-01-09  3:06 ` Andrew Morton
@ 2008-01-09  7:17 ` Christoph Hellwig
  2008-01-09  7:22   ` David Miller
  2 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2008-01-09  7:17 UTC (permalink / raw)
  To: Dave Airlie
  Cc: Andrew Morton, torvalds, hch, ak, pq, jbeulich, mingo, linux-kernel

On Wed, Jan 09, 2008 at 02:34:46AM +0000, Dave Airlie wrote:
> 
> [This an initial RFC but I'd like to have this patch in before 2.6.24 goes 
> final as it really breaks this useful feature]
> 
> mmiotrace the MMIO access tracer used to reverse engineer binary blobs
> used this notifier interface and is planned on being pushed upstream.
> 
> Having users able to just use the tracer module without having to rebuild 
> their kernel to add in a page fault handler hack means we get a lot 
> greater coverage for reverse engineering efforts.
> 
> Signed-off-by: David Airlie <airlied@linux.ie>
> 
> This reverts commit 74a0b5762713a26496db72eac34fbbed46f20fce.
> Conflicts:

NACK.   If you want to do it you'll need a much better reason and an
in-tree user.  And if you want to redo it it should be available for
all platforms with a consistant API.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  3:55       ` Dave Airlie
@ 2008-01-09  7:19         ` Christoph Hellwig
  2008-01-09  7:23           ` David Miller
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2008-01-09  7:19 UTC (permalink / raw)
  To: Dave Airlie
  Cc: Andrew Morton, torvalds, hch, ak, pq, jbeulich, mingo, linux-kernel

On Wed, Jan 09, 2008 at 03:55:20AM +0000, Dave Airlie wrote:
> now because Linus said send him a patch to revert regressions rather than 
> just complain,

this is not a regression by any definition.  You were abusing exported
symbols for out of tree junk, so you'll lose.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  7:17 ` Christoph Hellwig
@ 2008-01-09  7:22   ` David Miller
  2008-01-09 13:19     ` Jiri Kosina
  2008-01-09 14:23     ` Andi Kleen
  0 siblings, 2 replies; 23+ messages in thread
From: David Miller @ 2008-01-09  7:22 UTC (permalink / raw)
  To: hch; +Cc: airlied, akpm, torvalds, ak, pq, jbeulich, mingo, linux-kernel

From: Christoph Hellwig <hch@lst.de>
Date: Wed, 9 Jan 2008 08:17:27 +0100

> NACK.   If you want to do it you'll need a much better reason and an
> in-tree user.  And if you want to redo it it should be available for
> all platforms with a consistant API.

I majorly NACK this as well, we don't want to bring this thing
back especially for specialized debugging hacks.

You can set a kprobe on the x86 fault handler to do things like
mmiotrace.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  7:19         ` Christoph Hellwig
@ 2008-01-09  7:23           ` David Miller
  2008-01-09 10:41             ` Ingo Molnar
  0 siblings, 1 reply; 23+ messages in thread
From: David Miller @ 2008-01-09  7:23 UTC (permalink / raw)
  To: hch; +Cc: airlied, akpm, torvalds, ak, pq, jbeulich, mingo, linux-kernel

From: Christoph Hellwig <hch@lst.de>
Date: Wed, 9 Jan 2008 08:19:45 +0100

> On Wed, Jan 09, 2008 at 03:55:20AM +0000, Dave Airlie wrote:
> > now because Linus said send him a patch to revert regressions rather than 
> > just complain,
> 
> this is not a regression by any definition.  You were abusing exported
> symbols for out of tree junk, so you'll lose.

And furthermore, they don't even need it, use a kprobe.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  3:29     ` Andrew Morton
  2008-01-09  3:55       ` Dave Airlie
  2008-01-09  4:17       ` Andi Kleen
@ 2008-01-09  9:12       ` Jan Beulich
  2 siblings, 0 replies; 23+ messages in thread
From: Jan Beulich @ 2008-01-09  9:12 UTC (permalink / raw)
  To: Andrew Morton; +Cc: mingo, pq, torvalds, Dave Airlie, hch, ak, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 381 bytes --]

>That change has been in the mainline tree for nearly three months.  All
>these affected parties have left it until the eve of 2.6.24 to actually
>tell us about it.  This is causing me sympathy problems :(

Not true - I complained about this on Dec 3rd (attached), with the result of
not getting a response from anyone but Andi (agreeing to restore these
notifiers).

Jan

[-- Attachment #2: Type: message/rfc822, Size: 1620 bytes --]

From: "Jan Beulich" <jbeulich@novell.com>
To: <akpm@linux-foundation.org>,<torvalds@linux-foundation.org>, <hch@lst.de>
Cc: <ananth@in.ibm.com>,<prasanna@in.ibm.com>, <anil.s.keshavamurthy@intel.com>, <ak@suse.de>, <linux-arch@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Subject: Re: x86: optimize page faults like all other achitectures and kill notifier cruft
Date: Mon, 03 Dec 2007 13:31:07 +0000

Ever since I started to try to get at least some fundamental infrastructure
pieces merged for using NLKD on Linux I was told that direct calls out of
exception handlers for the sake of an individual (and perhaps even small)
sub-system is undesirable.

Making the exception notifiers report the right (correct) information (and,
specific to x86, ensuring they get called in the right place) was one of the
fundamental things, and just now I see that this is being reverted for (in
my eyes) no good reason: Instead of adding direct calls to x86, all the
other architectures should have followed the notifier model in order for
the infrastructure to be usable by external components, especially if
these aren't allowed into the kernel.

Am I to conclude that replacing direct calls elsewhere in the tree (in order
to e.g. avoid all kinds of small sub-components leaving their footprint in
core files like kernel/fork.c) is no longer a desirable goal, thereby making
it almost impossible to ever host a kernel debugger *without* having to
patch core files.

Thanks, Jan


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  7:23           ` David Miller
@ 2008-01-09 10:41             ` Ingo Molnar
  2008-01-09 16:52               ` Mathieu Desnoyers
  2008-01-09 20:01               ` Pekka Paalanen
  0 siblings, 2 replies; 23+ messages in thread
From: Ingo Molnar @ 2008-01-09 10:41 UTC (permalink / raw)
  To: David Miller
  Cc: hch, airlied, akpm, torvalds, ak, pq, jbeulich, linux-kernel,
	Mathieu Desnoyers, Ananth N Mavinakayanahalli, Masami Hiramatsu


(kprobes folks Cc:-ed)

* David Miller <davem@davemloft.net> wrote:

> From: Christoph Hellwig <hch@lst.de>
> Date: Wed, 9 Jan 2008 08:19:45 +0100
> 
> > On Wed, Jan 09, 2008 at 03:55:20AM +0000, Dave Airlie wrote:
> > > now because Linus said send him a patch to revert regressions rather than 
> > > just complain,
> > 
> > this is not a regression by any definition.  You were abusing 
> > exported symbols for out of tree junk, so you'll lose.
> 
> And furthermore, they don't even need it, use a kprobe.

i agree. There a few practical complication on x86: the do_page_fault() 
function is currently excluded from kprobe probing, for recursion 
reasons. handle_mm_fault() can be probed OTOH - but that does not catch 
vmalloc()-ed faults. The middle of do_page_fault() [line 348] should 
work better [the point after notify_page_fault()] - but it's usually 
more fragile to insert probes to such middle-of-the-function places.

So probing pagefaults is not as easy as it should/could be. We should 
put a practical NOP marker to around line 348, to make it easier (and 
faster) for systemtap to probe there.

(__kprobes is a highly confusing newspeak name btw - it should be 
__noprobe instead.)

	Ingo

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  7:22   ` David Miller
@ 2008-01-09 13:19     ` Jiri Kosina
  2008-01-09 13:48       ` David Miller
  2008-01-09 14:23     ` Andi Kleen
  1 sibling, 1 reply; 23+ messages in thread
From: Jiri Kosina @ 2008-01-09 13:19 UTC (permalink / raw)
  To: David Miller
  Cc: hch, airlied, Andrew Morton, Linus Torvalds, Andi Kleen, pq,
	jbeulich, Ingo Molnar, linux-kernel

On Tue, 8 Jan 2008, David Miller wrote:

> You can set a kprobe on the x86 fault handler to do things like
> mmiotrace.

Currently, on x86, you can not, because:

	fastcall void __kprobes do_page_fault( ... );

-- 
Jiri Kosina

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09 13:19     ` Jiri Kosina
@ 2008-01-09 13:48       ` David Miller
  0 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2008-01-09 13:48 UTC (permalink / raw)
  To: jikos; +Cc: hch, airlied, akpm, torvalds, ak, pq, jbeulich, mingo, linux-kernel

From: Jiri Kosina <jikos@jikos.cz>
Date: Wed, 9 Jan 2008 14:19:58 +0100 (CET)

> On Tue, 8 Jan 2008, David Miller wrote:
> 
> > You can set a kprobe on the x86 fault handler to do things like
> > mmiotrace.
> 
> Currently, on x86, you can not, because:
> 
> 	fastcall void __kprobes do_page_fault( ... );

Read Ingo's reply.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  7:22   ` David Miller
  2008-01-09 13:19     ` Jiri Kosina
@ 2008-01-09 14:23     ` Andi Kleen
  1 sibling, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2008-01-09 14:23 UTC (permalink / raw)
  To: David Miller
  Cc: hch, airlied, akpm, torvalds, pq, jbeulich, mingo, linux-kernel


> You can set a kprobe on the x86 fault handler to do things like
> mmiotrace.

That would mean that if the kprobe faults it goes into an endless loop.
Most of do_page_fault() is not really safe for kprobes.

-Andi


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09  3:17   ` Dave Airlie
  2008-01-09  3:29     ` Andrew Morton
@ 2008-01-09 15:18     ` Arjan van de Ven
  2008-01-09 15:21       ` Andi Kleen
  1 sibling, 1 reply; 23+ messages in thread
From: Arjan van de Ven @ 2008-01-09 15:18 UTC (permalink / raw)
  To: Dave Airlie
  Cc: Andrew Morton, torvalds, hch, ak, pq, jbeulich, mingo, linux-kernel

On Wed, 9 Jan 2008 03:17:37 +0000 (GMT)
Dave Airlie <airlied@linux.ie> wrote:

> So all distros with 2.6.24 kernels are useless to mmiotrace I don't
> see why leaving things as is until a suitable replacement mechanism
> can be used.. 

you work for a distro.. surely you can convince your own distro to carry this patch for one release?
Maybe if it's this important.. so can the others...

-- 
If you want to reach me at my work email, use arjan@linux.intel.com
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09 15:18     ` Arjan van de Ven
@ 2008-01-09 15:21       ` Andi Kleen
  2008-01-09 15:26         ` Ingo Molnar
  0 siblings, 1 reply; 23+ messages in thread
From: Andi Kleen @ 2008-01-09 15:21 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Dave Airlie, Andrew Morton, torvalds, hch, ak, pq, jbeulich,
	mingo, linux-kernel

On Wed, Jan 09, 2008 at 07:18:46AM -0800, Arjan van de Ven wrote:
> On Wed, 9 Jan 2008 03:17:37 +0000 (GMT)
> Dave Airlie <airlied@linux.ie> wrote:
> 
> > So all distros with 2.6.24 kernels are useless to mmiotrace I don't
> > see why leaving things as is until a suitable replacement mechanism
> > can be used.. 
> 
> you work for a distro.. surely you can convince your own distro to carry this patch for one release?
> Maybe if it's this important.. so can the others...

But if it's good for a major distribution why is it not good for mainline?

We had this discussion at last kernel summit and the answer was clear.

-Andi

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09 15:21       ` Andi Kleen
@ 2008-01-09 15:26         ` Ingo Molnar
  0 siblings, 0 replies; 23+ messages in thread
From: Ingo Molnar @ 2008-01-09 15:26 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Arjan van de Ven, Dave Airlie, Andrew Morton, torvalds, hch, pq,
	jbeulich, linux-kernel


* Andi Kleen <ak@suse.de> wrote:

> On Wed, Jan 09, 2008 at 07:18:46AM -0800, Arjan van de Ven wrote:
> > On Wed, 9 Jan 2008 03:17:37 +0000 (GMT)
> > Dave Airlie <airlied@linux.ie> wrote:
> > 
> > > So all distros with 2.6.24 kernels are useless to mmiotrace I don't
> > > see why leaving things as is until a suitable replacement mechanism
> > > can be used.. 
> > 
> > you work for a distro.. surely you can convince your own distro to 
> > carry this patch for one release? Maybe if it's this important.. so 
> > can the others...
> 
> But if it's good for a major distribution why is it not good for 
> mainline?

it's a bit too late to get the out-of-tree module into v2.6.24, and the 
revert makes little sense without the extra out-of-tree module. We at a 
minimum need a clear explanation of why this functionality cannot be 
provided via kprobes/systemtap.

> We had this discussion at last kernel summit and the answer was clear.

it's a clear answer 1 week after the merge window opens, but not 1 week 
before the next stable kernel is released.

	Ingo

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09 10:41             ` Ingo Molnar
@ 2008-01-09 16:52               ` Mathieu Desnoyers
  2008-01-09 19:07                 ` Andi Kleen
  2008-01-09 20:01               ` Pekka Paalanen
  1 sibling, 1 reply; 23+ messages in thread
From: Mathieu Desnoyers @ 2008-01-09 16:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Miller, hch, airlied, akpm, torvalds, ak, pq, jbeulich,
	linux-kernel, Ananth N Mavinakayanahalli, Masami Hiramatsu

* Ingo Molnar (mingo@elte.hu) wrote:
> 
> (kprobes folks Cc:-ed)
> 
> * David Miller <davem@davemloft.net> wrote:
> 
> > From: Christoph Hellwig <hch@lst.de>
> > Date: Wed, 9 Jan 2008 08:19:45 +0100
> > 
> > > On Wed, Jan 09, 2008 at 03:55:20AM +0000, Dave Airlie wrote:
> > > > now because Linus said send him a patch to revert regressions rather than 
> > > > just complain,
> > > 
> > > this is not a regression by any definition.  You were abusing 
> > > exported symbols for out of tree junk, so you'll lose.
> > 
> > And furthermore, they don't even need it, use a kprobe.
> 
> i agree. There a few practical complication on x86: the do_page_fault() 
> function is currently excluded from kprobe probing, for recursion 
> reasons. handle_mm_fault() can be probed OTOH - but that does not catch 
> vmalloc()-ed faults. The middle of do_page_fault() [line 348] should 
> work better [the point after notify_page_fault()] - but it's usually 
> more fragile to insert probes to such middle-of-the-function places.
> 
> So probing pagefaults is not as easy as it should/could be. We should 
> put a practical NOP marker to around line 348, to make it easier (and 
> faster) for systemtap to probe there.
> 
> (__kprobes is a highly confusing newspeak name btw - it should be 
> __noprobe instead.)
> 
> 	Ingo


Probing vmalloc faults is _really_ tricky : it also implies that the
handler (let's call it probe) connected to the probe point (marker or
kprobe) should _never_ cause a vmalloc page fault, it should therefore
never touch vmalloc'd memory, which is a very restrictive constraint,
especially for tracing which may need large buffers (the only sane
alternative is to allocate the buffers statically before the kernel
boots).

As for the location of the probe point, we have to determine how we want
to handle a OOPSing probe. If we put the probe point too soon in the
do_page_fault function, we will end up doing recursive page_fault rather
than a OOPS, which may make things harder to debug.

In the LTTng instrumentation, I volountarily excluded the
bad_area and bad_area_nosemaphore paths from the page fault
instrumentation for this exact reason.

Currently, I have markers around the handle_mm_fault call :

        trace_mark(kernel_arch_trap_entry, "trap_id %d ip #p%ld",
                14, instruction_pointer(regs));
        fault = handle_mm_fault(mm, vma, address, write);
        trace_mark(kernel_arch_trap_exit, MARK_NOARGS);

I also instrument handle_mm_fault, but I leave these markers in
do_page_fault to get the architecture specific trap id (the "trap_entry"
and "trap_exit" events) and the instruction pointer causing the fault.


My handle_mm_fault instrumentation :

(note that handle_mm_fault is also called by get_user_pages, not only
do_page_fault)

int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, int write_access)
{
        int res;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        trace_mark(mm_handle_fault_entry,
                "address %lu ip #p%ld write_access %d",
                address, KSTK_EIP(current), write_access);

        __set_current_state(TASK_RUNNING);

        count_vm_event(PGFAULT);

        if (unlikely(is_vm_hugetlb_page(vma))) {
                res = hugetlb_fault(mm, vma, address, write_access);
                goto end;
        }

        pgd = pgd_offset(mm, address);
        pud = pud_alloc(mm, pgd, address);
        if (!pud) {
                res = VM_FAULT_OOM;
                goto end;
        }
        pmd = pmd_alloc(mm, pud, address);
        if (!pmd) {
                res = VM_FAULT_OOM;
                goto end;
        }
        pte = pte_alloc_map(mm, pmd, address);
        if (!pte) {
                res = VM_FAULT_OOM;
                goto end;
        }

        res = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
end:
        trace_mark(mm_handle_fault_exit, MARK_NOARGS);
        return res;
}

Mathieu


-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09 16:52               ` Mathieu Desnoyers
@ 2008-01-09 19:07                 ` Andi Kleen
  0 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2008-01-09 19:07 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Ingo Molnar, David Miller, hch, airlied, akpm, torvalds, pq,
	jbeulich, linux-kernel, Ananth N Mavinakayanahalli,
	Masami Hiramatsu


> Probing vmalloc faults is _really_ tricky : it also implies that the
> handler (let's call it probe) connected to the probe point (marker or
> kprobe) should _never_ cause a vmalloc page fault, 

That is why vmalloc_sync_all() was invented. It might make sense
to just call that on kprobe registration.

But I agree the other problems makes it a bad idea.

I think the better solution is to keep the notifier, but make 
it cheaper (e.g. by using constant patching ...)

-Andi

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09 10:41             ` Ingo Molnar
  2008-01-09 16:52               ` Mathieu Desnoyers
@ 2008-01-09 20:01               ` Pekka Paalanen
  2008-01-09 20:08                 ` Andi Kleen
  1 sibling, 1 reply; 23+ messages in thread
From: Pekka Paalanen @ 2008-01-09 20:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Miller, hch, airlied, akpm, torvalds, ak, jbeulich,
	linux-kernel, Mathieu Desnoyers, Ananth N Mavinakayanahalli,
	Masami Hiramatsu

On Wed, 9 Jan 2008 11:41:49 +0100
Ingo Molnar <mingo@elte.hu> wrote:

> i agree. There a few practical complication on x86: the
> do_page_fault() function is currently excluded from kprobe probing,
> for recursion reasons. handle_mm_fault() can be probed OTOH - but
> that does not catch vmalloc()-ed faults. The middle of
> do_page_fault() [line 348] should work better [the point after
> notify_page_fault()] - but it's usually more fragile to insert probes
> to such middle-of-the-function places.

I have been reading about kprobes and one thing particularly bothers me
in the case of mmio-trace. The probe will actually service the page
fault, therefore it should be able force do_page_fault() to return at
the probe point. I could not figure out a way to do that.

Is it possible to do reliably with kprobes or markers?


Thanks for the replies.

-- 
Pekka Paalanen
http://www.iki.fi/pq/

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft"
  2008-01-09 20:01               ` Pekka Paalanen
@ 2008-01-09 20:08                 ` Andi Kleen
  0 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2008-01-09 20:08 UTC (permalink / raw)
  To: Pekka Paalanen
  Cc: Ingo Molnar, David Miller, hch, airlied, akpm, torvalds,
	jbeulich, linux-kernel, Mathieu Desnoyers,
	Ananth N Mavinakayanahalli, Masami Hiramatsu


> I have been reading about kprobes and one thing particularly bothers me
> in the case of mmio-trace. The probe will actually service the page
> fault, therefore it should be able force do_page_fault() to return at
> the probe point. I could not figure out a way to do that.
> 
> Is it possible to do reliably with kprobes or markers?

Probes are generally not designed to change the flow of the 
underlying code.

While it's in theory possible it will be always unreliable.

For checks that result in logic changes you'll always need
some kind of explicit hook.

-Andi




^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2008-01-09 20:08 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-01-09  2:34 [PATCH] Revert "x86: optimize page faults like all other achitectures and kill notifier cruft" Dave Airlie
2008-01-09  2:37 ` Andi Kleen
2008-01-09  3:06 ` Andrew Morton
2008-01-09  3:17   ` Dave Airlie
2008-01-09  3:29     ` Andrew Morton
2008-01-09  3:55       ` Dave Airlie
2008-01-09  7:19         ` Christoph Hellwig
2008-01-09  7:23           ` David Miller
2008-01-09 10:41             ` Ingo Molnar
2008-01-09 16:52               ` Mathieu Desnoyers
2008-01-09 19:07                 ` Andi Kleen
2008-01-09 20:01               ` Pekka Paalanen
2008-01-09 20:08                 ` Andi Kleen
2008-01-09  4:17       ` Andi Kleen
2008-01-09  9:12       ` Jan Beulich
2008-01-09 15:18     ` Arjan van de Ven
2008-01-09 15:21       ` Andi Kleen
2008-01-09 15:26         ` Ingo Molnar
2008-01-09  7:17 ` Christoph Hellwig
2008-01-09  7:22   ` David Miller
2008-01-09 13:19     ` Jiri Kosina
2008-01-09 13:48       ` David Miller
2008-01-09 14:23     ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).