[RFC PATCH 0/2] Debugging infrastructure for Futexes using Markers

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH 0/2] Debugging infrastructure for Futexes using Markers
@ 2008-04-15 11:50 K. Prasad
  2008-04-15 11:53 ` [RFC PATCH 1/2] Marker probes in futex.c K. Prasad
  0 siblings, 1 reply; 46+ messages in thread
From: K. Prasad @ 2008-04-15 11:50 UTC (permalink / raw)
  To: linux-kernel; +Cc: tglx, a.p.zijlstra, mingo, mathieu.desnoyers

Hi All,
	The following set of patches a)Implant marker probes in futex.c
and b)define a sample use-case for the markers in futex.

The marker handler patches also make use of the enhancements proposed to
the 'trace' infrastructure i.e. debugfs_printk() interface.
http://lkml.org/lkml/2008/4/15/117

The patches are generated over 2.6.25-rc8-mm1 and have been tested on an
i386 machine.

Thanks,
K.Prasad

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 11:50 [RFC PATCH 0/2] Debugging infrastructure for Futexes using Markers K. Prasad
@ 2008-04-15 11:53 ` K. Prasad
  2008-04-15 11:55   ` [RFC PATCH 2/2] Marker handler for the probes in futex file K. Prasad
  2008-04-15 12:02   ` [RFC PATCH 1/2] Marker probes in futex.c Peter Zijlstra
  0 siblings, 2 replies; 46+ messages in thread
From: K. Prasad @ 2008-04-15 11:53 UTC (permalink / raw)
  To: linux-kernel; +Cc: tglx, a.p.zijlstra, mingo, mathieu.desnoyers

We place a probe at the function entry for each futex operation,
and also at each point where a futex operation fails.
 
Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
---
 kernel/futex.c |  110 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 88 insertions(+), 22 deletions(-)

Index: linux-2.6.25-rc8-mm2/kernel/futex.c
===================================================================
--- linux-2.6.25-rc8-mm2.orig/kernel/futex.c
+++ linux-2.6.25-rc8-mm2/kernel/futex.c
@@ -737,8 +737,14 @@ static int futex_wake(u32 __user *uaddr,
 	union futex_key key;
 	int ret;
 
-	if (!bitset)
-		return -EINVAL;
+	trace_mark(futex_wake_called, "uaddr:%p fshared:%p nr_wake:%d "
+			"bitset:%d",
+			uaddr, fshared, nr_wake, bitset);
+
+	if (unlikely(!bitset)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	futex_lock_mm(fshared);
 
@@ -770,6 +776,8 @@ static int futex_wake(u32 __user *uaddr,
 	spin_unlock(&hb->lock);
 out:
 	futex_unlock_mm(fshared);
+	if (unlikely(ret != 0))
+		trace_mark(futex_wake_failed, "uaddr:%p ret:%d", uaddr, ret);
 	return ret;
 }
 
@@ -901,6 +909,14 @@ static int futex_requeue(u32 __user *uad
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
 
+	/*
+	 * Call this probe futex_cmp_requeue_called, although this function is
+	 * common to both FUTEX_REQUEUE and FUTEX_CMP_REQUEUE. This is because
+	 * FUTEX_REQUEUE is deprecated.
+	 */
+	trace_mark(futex_cmp_requeue_called, "uaddr1:%p fshared:%p uaddr2:%p "
+			"nr_wake:%d nr_requeue:%d cmpval:%p",
+			uaddr1, fshared, uaddr2, nr_wake, nr_requeue, cmpval);
  retry:
 	futex_lock_mm(fshared);
 
@@ -908,8 +924,12 @@ static int futex_requeue(u32 __user *uad
 	if (unlikely(ret != 0))
 		goto out;
 	ret = get_futex_key(uaddr2, fshared, &key2);
-	if (unlikely(ret != 0))
+	if (unlikely(ret != 0)) {
+		trace_mark(futex_cmp_requeue_uaddr2_failed, "uaddr1:%p "
+				"uaddr2:%p ret:%d",
+				uaddr1, uaddr2, ret);
 		goto out;
+	}
 
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
@@ -984,6 +1004,9 @@ out_unlock:
 
 out:
 	futex_unlock_mm(fshared);
+	if (unlikely(ret != 0))
+		trace_mark(futex_cmp_requeue_failed, "uaddr1:%p ret:%d", uaddr1,
+				ret);
 	return ret;
 }
 
@@ -1182,8 +1205,14 @@ static int futex_wait(u32 __user *uaddr,
 	struct hrtimer_sleeper t;
 	int rem = 0;
 
-	if (!bitset)
-		return -EINVAL;
+	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
+			"abs_time:%p bitset:%d",
+			uaddr, fshared, val, abs_time, bitset);
+
+	if (!bitset) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	q.pi_state = NULL;
 	q.bitset = bitset;
@@ -1231,11 +1260,13 @@ static int futex_wait(u32 __user *uaddr,
 
 		if (!ret)
 			goto retry;
-		return ret;
+		goto out;
 	}
 	ret = -EWOULDBLOCK;
-	if (uval != val)
+	if (uval != val) {
+		trace_mark(futex_wait_uval, "uaddr:%p uval:%u", uval, uaddr);
 		goto out_unlock_release_sem;
+	}
 
 	/* Only actually queue if *uaddr contained val.  */
 	__queue_me(&q, hb);
@@ -1292,6 +1323,7 @@ static int futex_wait(u32 __user *uaddr,
 			destroy_hrtimer_on_stack(&t.timer);
 		}
 	}
+	trace_mark(futex_wait_after_sched, "uaddr:%p", uaddr);
 	__set_current_state(TASK_RUNNING);
 
 	/*
@@ -1302,16 +1334,19 @@ static int futex_wait(u32 __user *uaddr,
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	if (!unqueue_me(&q))
 		return 0;
-	if (rem)
-		return -ETIMEDOUT;
+	if (rem) {
+		ret = -ETIMEDOUT;
+		goto out;
+	}
 
 	/*
 	 * We expect signal_pending(current), but another thread may
 	 * have handled it for us already.
 	 */
-	if (!abs_time)
-		return -ERESTARTSYS;
-	else {
+	if (!abs_time) {
+		ret = -ERESTARTSYS;
+		goto out;
+	} else {
 		struct restart_block *restart;
 		restart = &current_thread_info()->restart_block;
 		restart->fn = futex_wait_restart;
@@ -1323,7 +1358,8 @@ static int futex_wait(u32 __user *uaddr,
 
 		if (fshared)
 			restart->futex.flags |= FLAGS_SHARED;
-		return -ERESTART_RESTARTBLOCK;
+		ret = -ERESTART_RESTARTBLOCK;
+		goto out;
 	}
 
  out_unlock_release_sem:
@@ -1331,6 +1367,10 @@ static int futex_wait(u32 __user *uaddr,
 
  out_release_sem:
 	futex_unlock_mm(fshared);
+
+ out:
+	if (unlikely(ret != 0))
+		trace_mark(futex_wait_failed, "uaddr:%p ret:%d", uaddr, ret);
 	return ret;
 }
 
@@ -1366,8 +1406,14 @@ static int futex_lock_pi(u32 __user *uad
 	struct futex_q q;
 	int ret, lock_taken, ownerdied = 0, attempt = 0;
 
-	if (refill_pi_state_cache())
-		return -ENOMEM;
+	trace_mark(futex_lock_pi_called, "uaddr:%p fshared:%p detect:%d "
+			"time:%p trylock:%d",
+			uaddr, fshared, detect, time, trylock);
+
+	if (refill_pi_state_cache()) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	if (time) {
 		to = &timeout;
@@ -1588,7 +1634,9 @@ static int futex_lock_pi(u32 __user *uad
 
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
-	return ret != -EINTR ? ret : -ERESTARTNOINTR;
+	if (ret == -EINTR)
+		ret = -ERESTARTNOINTR;
+	goto out;
 
  out_unlock_release_sem:
 	queue_unlock(&q, hb);
@@ -1597,7 +1645,7 @@ static int futex_lock_pi(u32 __user *uad
 	futex_unlock_mm(fshared);
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
-	return ret;
+	goto out;
 
  uaddr_faulted:
 	/*
@@ -1624,6 +1672,9 @@ static int futex_lock_pi(u32 __user *uad
 	if (!ret && (uval != -EFAULT))
 		goto retry;
 
+ out:
+	if (unlikely(ret != 0))
+		trace_mark(futex_lock_pi_failed, "uaddr:%p ret:%d", uaddr, ret);
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
 	return ret;
@@ -1643,14 +1694,20 @@ static int futex_unlock_pi(u32 __user *u
 	union futex_key key;
 	int ret, attempt = 0;
 
+	trace_mark(futex_unlock_pi_called, "uaddr:%p fshared:%p",
+							uaddr, fshared);
 retry:
-	if (get_user(uval, uaddr))
-		return -EFAULT;
+	if (get_user(uval, uaddr)) {
+		ret = -EFAULT;
+		goto out_return;
+	}
 	/*
 	 * We release only a lock we actually own:
 	 */
-	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
-		return -EPERM;
+	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) {
+		ret = -EPERM;
+		goto out_return;
+	}
 	/*
 	 * First take all the futex related locks:
 	 */
@@ -1715,7 +1772,7 @@ out_unlock:
 out:
 	futex_unlock_mm(fshared);
 
-	return ret;
+	goto out_return;
 
 pi_faulted:
 	/*
@@ -1743,6 +1800,10 @@ pi_faulted:
 	if (!ret && (uval != -EFAULT))
 		goto retry;
 
+out_return:
+	if (unlikely(ret != 0))
+		trace_mark(futex_unlock_pi_failed, "uaddr:%p ret:%d",
+							uaddr, ret);
 	return ret;
 }
 
@@ -2120,6 +2181,11 @@ long do_futex(u32 __user *uaddr, int op,
 	default:
 		ret = -ENOSYS;
 	}
+	trace_mark(do_futex_probe, "uaddr: %lu op:%d val:%lu timeout:%p "
+			"uaddr2:%lu val2:%lu val3:%lu ret:%d",
+			(unsigned long)uaddr, op, (unsigned long)val,
+			timeout, (unsigned long)uaddr2, (unsigned long)val2,
+			(unsigned long)val3, ret);
 	return ret;
 }
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [RFC PATCH 2/2] Marker handler for the probes in futex file
  2008-04-15 11:53 ` [RFC PATCH 1/2] Marker probes in futex.c K. Prasad
@ 2008-04-15 11:55   ` K. Prasad
  2008-04-15 12:02   ` [RFC PATCH 1/2] Marker probes in futex.c Peter Zijlstra
  1 sibling, 0 replies; 46+ messages in thread
From: K. Prasad @ 2008-04-15 11:55 UTC (permalink / raw)
  To: linux-kernel; +Cc: tglx, a.p.zijlstra, mingo, mathieu.desnoyers

These files define and implement marker handlers for the probes in
futex.c. They may be built by choosing the CONFIG_FUTEX_DEBUG option
(can be built as a kernel module also).
 
They export data to the user using debugfs_printk and the output
is available in futex_debug_trace/ directory under the debugfs mount.
Separate directories under this are created for each marker probe.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
---
 include/linux/futex_markers.h |   63 +++++++++++++
 init/Kconfig                  |   13 ++
 kernel/Makefile               |    1 
 kernel/futex_markers.c        |  201 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 278 insertions(+)

Index: linux-2.6.25-rc8-mm1/init/Kconfig
===================================================================
--- linux-2.6.25-rc8-mm1.orig/init/Kconfig
+++ linux-2.6.25-rc8-mm1/init/Kconfig
@@ -646,6 +646,19 @@ config FUTEX
 	  support for "fast userspace mutexes".  The resulting kernel may not
 	  run glibc-based applications correctly.
 
+config FUTEX_DEBUG
+	tristate "Enable debugging for Futex - currently stats in debugfs"
+	depends on FUTEX
+	depends on DEBUG_FS && MARKERS && TRACE
+	default m
+	help
+	  This option provides debugging data for Futex system calls
+	  in debugfs.
+
+	  Say Y/M here if you want to enable Futex debugging
+	  in-kernel/module.
+	  Say N if you are unsure.
+
 config ANON_INODES
 	bool
 
Index: linux-2.6.25-rc8-mm1/kernel/futex_markers.c
===================================================================
--- /dev/null
+++ linux-2.6.25-rc8-mm1/kernel/futex_markers.c
@@ -0,0 +1,201 @@
+/*
+ * futex_markers.c - Marker handler for probes in futex.c
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/futex_markers.h>
+
+#define FUTEX_DEBUG_STRING_SIZE 1024
+#define FUTEX_DEBUG_DIR_MAXLEN 25
+
+static void futex_event_debug_worker(const char *marker_name,
+				const char *format_str,
+				enum futex_ops_ futex_event, va_list *ap);
+
+/*
+ * This function helps to direct the output to one of the directories
+ */
+static inline char *choose_output_dir(enum futex_ops_ futex_event)
+{
+	char *temp = NULL;
+
+	switch (futex_event) {
+	case do_futex_probe:
+		temp = "do_futex_probe_debug";
+		break;
+	case futex_wait_called:
+	case futex_wait_uval:
+	case futex_wait_after_sched:
+	case futex_wait_failed:
+		temp = "futex_wait_debug";
+		break;
+	case futex_wake_called:
+	case futex_wake_failed:
+		temp = "futex_wake_debug";
+		break;
+	case futex_cmp_requeue_called:
+	case futex_cmp_requeue_uaddr2_failed:
+	case futex_cmp_requeue_failed:
+		temp = "futex_cmp_requeue_debug";
+		break;
+	case futex_lock_pi_called:
+	case futex_lock_pi_failed:
+		temp = "futex_lock_pi_debug";
+		break;
+	case futex_unlock_pi_called:
+	case futex_unlock_pi_failed:
+		temp = "futex_unlock_pi_debug";
+		break;
+	}
+	return temp;
+}
+
+/*
+ * Use the macro to define the other marker handlers
+ */
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_wake_called);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_cmp_requeue_called);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_wait_called);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_wait_uval);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_wait_after_sched);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_lock_pi_called);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_unlock_pi_called);
+
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_wait_failed);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_wake_failed);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_cmp_requeue_failed);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_cmp_requeue_uaddr2_failed);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_lock_pi_failed);
+DEFINE_FUTEX_OP_MARKER_HANDLER(futex_unlock_pi_failed);
+
+DEFINE_FUTEX_OP_MARKER_HANDLER(do_futex_probe);
+
+static struct futex_debug_probe futex_debug_probe_array[] =
+{
+	INIT_FUTEX_DEBUG_PROBE(futex_wake_called,
+			"uaddr:%p fshared:%p nr_wake:%d bitset:%d"),
+	INIT_FUTEX_DEBUG_PROBE(futex_wait_called,
+			"uaddr:%p fshared:%p val:%u abs_time:%p bitset:%d"),
+	INIT_FUTEX_DEBUG_PROBE(futex_wait_uval, "uaddr:%p uval:%u"),
+	INIT_FUTEX_DEBUG_PROBE(futex_wait_after_sched, "uaddr:%p"),
+	INIT_FUTEX_DEBUG_PROBE(futex_cmp_requeue_called,
+			"uaddr1:%p fshared:%p uaddr2:%p nr_wake:%d "
+			"nr_requeue:%d cmpval:%p"),
+	INIT_FUTEX_DEBUG_PROBE(futex_lock_pi_called,
+			"uaddr:%p fshared:%p detect:%d time:%p trylock:%d"),
+	INIT_FUTEX_DEBUG_PROBE(futex_unlock_pi_called, "uaddr:%p fshared:%p"),
+
+	INIT_FUTEX_DEBUG_PROBE(futex_wait_failed, "uaddr:%p ret:%d"),
+	INIT_FUTEX_DEBUG_PROBE(futex_wake_failed, "uaddr:%p ret:%d"),
+	INIT_FUTEX_DEBUG_PROBE(futex_cmp_requeue_failed, "uaddr1:%p ret:%d"),
+	INIT_FUTEX_DEBUG_PROBE(futex_cmp_requeue_uaddr2_failed,
+			"uaddr1:%p uaddr2:%p ret:%d"),
+	INIT_FUTEX_DEBUG_PROBE(futex_lock_pi_failed, "uaddr:%p ret:%d"),
+	INIT_FUTEX_DEBUG_PROBE(futex_unlock_pi_failed, "uaddr:%p ret:%d"),
+
+	INIT_FUTEX_DEBUG_PROBE(do_futex_probe,
+			"uaddr: %lu op:%d val:%lu timeout:%p "
+			"uaddr2:%lu val2:%lu val3:%lu ret:%d")
+};
+
+static void futex_event_debug_worker(const char *marker_name,
+				const char *format_str,
+				enum futex_ops_ futex_event, va_list *ap)
+{
+	unsigned long long temp_ts = 0;
+	struct debugfs_printk_data *dpk;
+	char *print_string;
+	int err_condition = 0;
+
+	/* Capture the kernel timestamp as soon as we enter the handler */
+	temp_ts = ktime_to_ns(ktime_get());
+
+
+	dpk = kzalloc(sizeof(*dpk), GFP_KERNEL);
+	if (!dpk)
+		err_condition = 1;
+
+	dpk->parent_dir = FUTEX_TRACE_ROOT_DIR;
+	dpk->dir = choose_output_dir(futex_event);
+	print_string = kzalloc(FUTEX_DEBUG_STRING_SIZE, GFP_KERNEL);
+	if (!print_string) {
+		err_condition = 1;
+		goto out;
+	}
+
+	snprintf(print_string, FUTEX_DEBUG_STRING_SIZE,
+		"%s TS:%llu PID:%u Process:%s %s\n", marker_name,
+		temp_ts, current->pid, current->comm, format_str);
+
+	debugfs_printk(dpk, print_string, ap);
+
+	/* Exit code. Free the memory held */
+ out:
+	kfree(dpk);
+
+	if (err_condition)
+		printk(KERN_ERR "Unable to find required free memory. "
+				"Futex Debug message discarded");
+}
+
+static int __init futex_debug_init(void)
+{
+	int ret;
+	int i;
+	struct futex_debug_probe *p;
+
+	for (i = 0; i < ARRAY_SIZE(futex_debug_probe_array); i++) {
+		p = &futex_debug_probe_array[i];
+		ret = marker_probe_register(p->name, p->format,
+							p->probe_func, p);
+		if (ret)
+			printk(KERN_INFO "Unable to register Futex Debug "
+				"probe %s\n", futex_debug_probe_array[i].name);
+
+		if (ret)
+			printk(KERN_INFO "Unable to arm Futex Debug probe %s\n",
+				p->name);
+	}
+	printk(KERN_INFO "Futex Debug markers registered\n");
+
+	return ret;
+}
+
+static void __exit futex_debug_cleanup(void)
+{
+	int i;
+	struct futex_debug_probe *p;
+
+	for (i = 0; i < ARRAY_SIZE(futex_debug_probe_array); i++) {
+		p = &futex_debug_probe_array[i];
+		marker_probe_unregister(p->name, p->probe_func, p);
+	}
+	printk(KERN_INFO "Futex Debug markers unregistered\n");
+
+	trace_cleanup_all(FUTEX_TRACE_ROOT_DIR);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(futex_debug_init);
+module_exit(futex_debug_cleanup);
Index: linux-2.6.25-rc8-mm1/include/linux/futex_markers.h
===================================================================
--- /dev/null
+++ linux-2.6.25-rc8-mm1/include/linux/futex_markers.h
@@ -0,0 +1,63 @@
+/*
+ * futex_markers.h
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/marker.h>
+#include <linux/ktime.h>
+#include <linux/trace.h>
+
+#define FUTEX_TRACE_BUF_SIZE 4096
+#define FUTEX_TRACE_SUB_BUF_NR 40
+#define FUTEX_TRACE_ROOT_DIR "futex_debug_trace"
+
+struct futex_debug_probe {
+	char *name;
+	char *format;
+	marker_probe_func *probe_func;
+};
+
+enum futex_ops_ {
+	do_futex_probe,
+	futex_wait_called, futex_wait_uval,
+	futex_wait_after_sched, futex_wait_failed,
+	futex_wake_called, futex_wake_failed,
+	futex_cmp_requeue_called, futex_cmp_requeue_uaddr2_failed,
+	futex_cmp_requeue_failed,
+	futex_lock_pi_called, futex_lock_pi_failed,
+	futex_unlock_pi_called, futex_unlock_pi_failed
+};
+
+#define DEFINE_FUTEX_OP_MARKER_HANDLER(futex_event) \
+static void futex_event##_callback(const struct marker *mdata, \
+			void *private_data, const char *format, va_list *ap) \
+{ \
+	/* \
+	 * futex_event - Used to identify the type of futex event (enum) \
+	 * __stringify(futex_event) - helps print the type of futex event as a \
+	 *                            string in trace_printf() \
+	 * ap - Contains the parameters exported to the marker handler \
+	 */ \
+	futex_event_debug_worker(mdata->name, mdata->format, futex_event, ap); \
+ \
+}
+
+#define INIT_FUTEX_DEBUG_PROBE(futex_debug_event, format_specifier) \
+{ \
+	.name = __stringify(futex_debug_event), \
+	.format = format_specifier, \
+	.probe_func = futex_debug_event##_callback \
+}
Index: linux-2.6.25-rc8-mm1/kernel/Makefile
===================================================================
--- linux-2.6.25-rc8-mm1.orig/kernel/Makefile
+++ linux-2.6.25-rc8-mm1/kernel/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
+obj-$(CONFIG_FUTEX_DEBUG) += futex_markers.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 11:53 ` [RFC PATCH 1/2] Marker probes in futex.c K. Prasad
  2008-04-15 11:55   ` [RFC PATCH 2/2] Marker handler for the probes in futex file K. Prasad
@ 2008-04-15 12:02   ` Peter Zijlstra
  2008-04-15 12:32     ` Mathieu Desnoyers
                       ` (2 more replies)
  1 sibling, 3 replies; 46+ messages in thread
From: Peter Zijlstra @ 2008-04-15 12:02 UTC (permalink / raw)
  To: prasad; +Cc: linux-kernel, tglx, mingo, mathieu.desnoyers

On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:

> +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> +			"abs_time:%p bitset:%d",
> +			uaddr, fshared, val, abs_time, bitset);

This is some seriuosly ugly looking gunk, why would we want stuff like
that scattered across the code?

What is wrong with a few simple hooks like:

  trace_futex_wait(uaddr, fshares, val, abs_time, bitset);

and then deal with that.

Also, you seem to expose way too much futex internals; do you really
need that? People will go use this marker crap like ABI and further
restrain us from changing the code.

/me unhappy.




^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 12:02   ` [RFC PATCH 1/2] Marker probes in futex.c Peter Zijlstra
@ 2008-04-15 12:32     ` Mathieu Desnoyers
  2008-04-15 12:50       ` Alexey Dobriyan
  2008-04-15 12:56       ` Peter Zijlstra
  2008-04-15 15:52     ` K. Prasad
  2008-04-18 10:44     ` Andrew Morton
  2 siblings, 2 replies; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-15 12:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: prasad, linux-kernel, tglx, mingo, Christoph Hellwig, Frank Ch. Eigler

* Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:
> 
> > +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> > +			"abs_time:%p bitset:%d",
> > +			uaddr, fshared, val, abs_time, bitset);
> 
> This is some seriuosly ugly looking gunk, why would we want stuff like
> that scattered across the code?
> 

I don't really see how it differs so much from printks, which kernel
developers are already familiar with.

> What is wrong with a few simple hooks like:
> 
>   trace_futex_wait(uaddr, fshares, val, abs_time, bitset);
> 
> and then deal with that.
> 

If any of your variable type changes, then you are exporting an unknown
data structure to user-space. _That_ would break a userspace tracer
whenever you change any of these kernel variables and you don't want
that.

Exporting the field names and variable types helps to identify the
variables by their given names rather than their respective order.
Having the field type insures binary compatibility.

Clearly we can turn your trace_futex_wait(uaddr, fshares, val, abs_time,
bitset); into a trace_mark() with a simple define, and I don't see any
problem with that. I just want to make sure the event name, field names
and field types are exported, and this is done by markers. However, I
wonder why none of the kernel printk() are turned into specialized
defines to make the code "cleaner".. maybe it's because it is useful to
have everything declared in one spot after all.

> Also, you seem to expose way too much futex internals; do you really
> need that? People will go use this marker crap like ABI and further
> restrain us from changing the code.
> 

Because we extract the field names and types, we can create tracer
plugins that would hook on field names rather than expect a specific
number of fields and fixed field types. It makes it possible to tolerate
missing fields pretty easily. But yes, tracer tools might have to be
adapted to internal kernel changes, since they must follow kernel
structure changes. However, staying as close as possible to a canonical
representation of event fields, staying far from the specific
implemetation, would help to lessen the inter-dependency. On the other
hand, it would probably hurt trace compactness and efficiency.

Mathieu

> /me unhappy.
> 
> 
> 

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 12:32     ` Mathieu Desnoyers
@ 2008-04-15 12:50       ` Alexey Dobriyan
  2008-04-15 16:13         ` K. Prasad
  2008-04-15 12:56       ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Alexey Dobriyan @ 2008-04-15 12:50 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, mingo,
	Christoph Hellwig, Frank Ch. Eigler

On Tue, Apr 15, 2008 at 08:32:34AM -0400, Mathieu Desnoyers wrote:
> * Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> > On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:
> > 
> > > +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> > > +			"abs_time:%p bitset:%d",
> > > +			uaddr, fshared, val, abs_time, bitset);
> > 
> > This is some seriuosly ugly looking gunk, why would we want stuff like
> > that scattered across the code?
> > 
> 
> I don't really see how it differs so much from printks, which kernel
> developers are already familiar with.

They aren't in every -E codepath, nor they are at the start
and at the end of every important function (like system call).

Such printks are usually inserted during debugging when you don't care
about ugliness and these patches will eventually make kernel looks like
being permanently debugged one.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 12:32     ` Mathieu Desnoyers
  2008-04-15 12:50       ` Alexey Dobriyan
@ 2008-04-15 12:56       ` Peter Zijlstra
  2008-04-15 13:17         ` Ingo Molnar
  2008-04-15 13:25         ` [RFC PATCH 1/2] Marker probes in futex.c Mathieu Desnoyers
  1 sibling, 2 replies; 46+ messages in thread
From: Peter Zijlstra @ 2008-04-15 12:56 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: prasad, linux-kernel, tglx, mingo, Christoph Hellwig, Frank Ch. Eigler

On Tue, 2008-04-15 at 08:32 -0400, Mathieu Desnoyers wrote:
> * Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> > On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:
> > 
> > > +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> > > +			"abs_time:%p bitset:%d",
> > > +			uaddr, fshared, val, abs_time, bitset);
> > 
> > This is some seriuosly ugly looking gunk, why would we want stuff like
> > that scattered across the code?
> > 
> 
> I don't really see how it differs so much from printks, which kernel
> developers are already familiar with.

Which never last longer than the debug session and are never exposed to
enterprise kABI crap.

> > What is wrong with a few simple hooks like:
> > 
> >   trace_futex_wait(uaddr, fshares, val, abs_time, bitset);
> > 
> > and then deal with that.
> > 
> 
> If any of your variable type changes, then you are exporting an unknown
> data structure to user-space. _That_ would break a userspace tracer
> whenever you change any of these kernel variables and you don't want
> that.

trace_futex_wait()'s signature would make the compiler issue a complaint
when the arguments suddenly changes type, no?

> Exporting the field names and variable types helps to identify the
> variables by their given names rather than their respective order.
> Having the field type insures binary compatibility.
> 
> Clearly we can turn your trace_futex_wait(uaddr, fshares, val, abs_time,
> bitset); into a trace_mark() with a simple define, and I don't see any
> problem with that. I just want to make sure the event name, field names
> and field types are exported, and this is done by markers. However, I
> wonder why none of the kernel printk() are turned into specialized
> defines to make the code "cleaner".. maybe it's because it is useful to
> have everything declared in one spot after all.

I must be missing something here, printks don't need to look pretty
because they never see the light of lkml. They get ripped out as soon as
I understand what the heck happened.

> > Also, you seem to expose way too much futex internals; do you really
> > need that? People will go use this marker crap like ABI and further
> > restrain us from changing the code.
> > 
> 
> Because we extract the field names and types, we can create tracer
> plugins that would hook on field names rather than expect a specific
> number of fields and fixed field types. It makes it possible to tolerate
> missing fields pretty easily. But yes, tracer tools might have to be
> adapted to internal kernel changes, since they must follow kernel
> structure changes. However, staying as close as possible to a canonical
> representation of event fields, staying far from the specific
> implemetation, would help to lessen the inter-dependency. On the other
> hand, it would probably hurt trace compactness and efficiency.

See, these tracer tools are my nightmare as member of an enterprise
linux team. They'll make an already hard job even harder, no thanks!

At least reduce the hooks to the very bare minimum; only log when the
mutex changes state; not this: we entered futex_wait; we exited with
state crap.

Just log:

  futex: <uaddr> wait
  futex: <uaddr> wakeup

And other fundamental events, the rest is just not needed.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 12:56       ` Peter Zijlstra
@ 2008-04-15 13:17         ` Ingo Molnar
  2008-04-15 13:47           ` Mathieu Desnoyers
  2008-04-15 13:25         ` [RFC PATCH 1/2] Marker probes in futex.c Mathieu Desnoyers
  1 sibling, 1 reply; 46+ messages in thread
From: Ingo Molnar @ 2008-04-15 13:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, prasad, linux-kernel, tglx, Christoph Hellwig,
	Frank Ch. Eigler


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> > Because we extract the field names and types, we can create tracer 
> > plugins that would hook on field names rather than expect a specific 
> > number of fields and fixed field types. It makes it possible to 
> > tolerate missing fields pretty easily. But yes, tracer tools might 
> > have to be adapted to internal kernel changes, since they must 
> > follow kernel structure changes. However, staying as close as 
> > possible to a canonical representation of event fields, staying far 
> > from the specific implemetation, would help to lessen the 
> > inter-dependency. On the other hand, it would probably hurt trace 
> > compactness and efficiency.
> 
> See, these tracer tools are my nightmare as member of an enterprise 
> linux team. They'll make an already hard job even harder, no thanks!

i'm clearly NAK-ing all futex trace hooks until the true impact of the 
whole marker facility is better understood. I've tried them for the 
scheduler and they were a clear failure: too bloated and too persistent.

but more importantly, as things stand today i've yet to see a _any_ 
bugreport where these 'tracer' tools that are being referred to were 
actually used in the field to fix something. The latency tracers (and 
the other tracer variants in -rt) on the other hand have a documented 
track record of being useful in fixing bugs and instrumenting the 
kernel.

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 12:56       ` Peter Zijlstra
  2008-04-15 13:17         ` Ingo Molnar
@ 2008-04-15 13:25         ` Mathieu Desnoyers
  2008-04-16 15:51           ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-15 13:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: prasad, linux-kernel, tglx, mingo, Christoph Hellwig, Frank Ch. Eigler

* Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> On Tue, 2008-04-15 at 08:32 -0400, Mathieu Desnoyers wrote:
> > * Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> > > On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:
> > > 
> > > > +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> > > > +			"abs_time:%p bitset:%d",
> > > > +			uaddr, fshared, val, abs_time, bitset);
> > > 
> > > This is some seriuosly ugly looking gunk, why would we want stuff like
> > > that scattered across the code?
> > > 
> > 
> > I don't really see how it differs so much from printks, which kernel
> > developers are already familiar with.
> 
> Which never last longer than the debug session and are never exposed to
> enterprise kABI crap.
> 
> > > What is wrong with a few simple hooks like:
> > > 
> > >   trace_futex_wait(uaddr, fshares, val, abs_time, bitset);
> > > 
> > > and then deal with that.
> > > 
> > 
> > If any of your variable type changes, then you are exporting an unknown
> > data structure to user-space. _That_ would break a userspace tracer
> > whenever you change any of these kernel variables and you don't want
> > that.
> 
> trace_futex_wait()'s signature would make the compiler issue a complaint
> when the arguments suddenly changes type, no?
> 

Yes, but then you would have to create new code for each event you want
to trace. In the end, it would increase the icache footprint
considerably and would also make addition of new events cumbersome.

> > Exporting the field names and variable types helps to identify the
> > variables by their given names rather than their respective order.
> > Having the field type insures binary compatibility.
> > 
> > Clearly we can turn your trace_futex_wait(uaddr, fshares, val, abs_time,
> > bitset); into a trace_mark() with a simple define, and I don't see any
> > problem with that. I just want to make sure the event name, field names
> > and field types are exported, and this is done by markers. However, I
> > wonder why none of the kernel printk() are turned into specialized
> > defines to make the code "cleaner".. maybe it's because it is useful to
> > have everything declared in one spot after all.
> 
> I must be missing something here, printks don't need to look pretty
> because they never see the light of lkml. They get ripped out as soon as
> I understand what the heck happened.
> 

The thing is that the trace_marks really fills two purpose : they
extract information from the core kernel, which is meant to be shipped
on production systems so tracing tools can report what is happening on
the system and they also allow kernel hackers to add markers of their
own, so they can extract information about specific events they are
interested in along with the standard kernel instrumentation.

So, part of it is meant to be standard kernel information, part of it
can be used for debugging. And since the kernel code evolves through
time, it makes sense to have an infrastructure flexible enough to follow
these changes easily.

Your proposal is interesting though. We could keep the flexible markers
as the core infrastructure to declare static instrumentation and add
static inlines in C files to map function prototypes to markers, such
as:

static inline void trace_futex_wait(void *uaddr)
{
  trace_mark(futex_wait, "uaddr %p", uaddr);
}

static inline void trace_futex_wakeup(void *uaddr)
{
  trace_mark(futex_wakeup, "uaddr %p", uaddr);
}

But personnally I don't really see how these static inlines will look
cleaner than the trace_marks in them.


> > > Also, you seem to expose way too much futex internals; do you really
> > > need that? People will go use this marker crap like ABI and further
> > > restrain us from changing the code.
> > > 
> > 
> > Because we extract the field names and types, we can create tracer
> > plugins that would hook on field names rather than expect a specific
> > number of fields and fixed field types. It makes it possible to tolerate
> > missing fields pretty easily. But yes, tracer tools might have to be
> > adapted to internal kernel changes, since they must follow kernel
> > structure changes. However, staying as close as possible to a canonical
> > representation of event fields, staying far from the specific
> > implemetation, would help to lessen the inter-dependency. On the other
> > hand, it would probably hurt trace compactness and efficiency.
> 
> See, these tracer tools are my nightmare as member of an enterprise
> linux team. They'll make an already hard job even harder, no thanks!
> 
> At least reduce the hooks to the very bare minimum; only log when the
> mutex changes state; not this: we entered futex_wait; we exited with
> state crap.
> 
> Just log:
> 
>   futex: <uaddr> wait
>   futex: <uaddr> wakeup
> 
> And other fundamental events, the rest is just not needed.
> 

I totally agree with you on that. This is the approach I've used in the
LTTng instrumentation.

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 13:17         ` Ingo Molnar
@ 2008-04-15 13:47           ` Mathieu Desnoyers
  2008-04-15 14:04             ` Ingo Molnar
  2008-04-15 16:48             ` Ingo Molnar
  0 siblings, 2 replies; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-15 13:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, Christoph Hellwig,
	Frank Ch. Eigler

* Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > > Because we extract the field names and types, we can create tracer 
> > > plugins that would hook on field names rather than expect a specific 
> > > number of fields and fixed field types. It makes it possible to 
> > > tolerate missing fields pretty easily. But yes, tracer tools might 
> > > have to be adapted to internal kernel changes, since they must 
> > > follow kernel structure changes. However, staying as close as 
> > > possible to a canonical representation of event fields, staying far 
> > > from the specific implemetation, would help to lessen the 
> > > inter-dependency. On the other hand, it would probably hurt trace 
> > > compactness and efficiency.
> > 
> > See, these tracer tools are my nightmare as member of an enterprise 
> > linux team. They'll make an already hard job even harder, no thanks!
> 
> i'm clearly NAK-ing all futex trace hooks until the true impact of the 
> whole marker facility is better understood. I've tried them for the 
> scheduler and they were a clear failure: too bloated and too persistent.
> 

I have not seen any counter argument for the in-depth analysis of the
instruction cache impact of the optimized markers I've done. Arguing
that the markers are "bloated" based only on "size kernel/sched.o"
output is a bit misleading.

> but more importantly, as things stand today i've yet to see a _any_ 
> bugreport where these 'tracer' tools that are being referred to were 
> actually used in the field to fix something. The latency tracers (and 
> the other tracer variants in -rt) on the other hand have a documented 
> track record of being useful in fixing bugs and instrumenting the 
> kernel.
> 

You will probably be interested in the following paper, which explains
various situations in which using a tracer has solved real problems at
Google, IBM, Autodesk, which are Linux users running large clusters or
Linux systems with soft RT constraints.

Linux Kernel Debugging on Google-sized clusters at Ottawa Linux Symposium 2007 
http://ltt.polymtl.ca/papers/bligh-Reprint.pdf

Now for some performance impact :

Here are some results I have taken comparing the optimized markers
approach with the dynamic ftrace approach. These runs with some ALU work
in tight loops, using clflush() to flush the cache lines pointing to
"global" data (pointer read : current->pid) used in the loop. I also
have the numbers for running the loop without the ALU work, but I leave
them out since they only make the tables harder to read : basically, the
cached impact for running the empty loop with markers or ftrace
instrumentation is about 0 to 3 cycles. It's the uncached impact which
clearly makes the difference between both approaches.

On AMD64, adding the markers or ftrace statement actually accelerates
the runs when executed with an ALU work baseline. It adds 1 to 2 cycles
with executed alone in the loop without any work.

Frank Ch. Eigler is preparing some macrobenchmarks. I hope he will find
time to post them soon.

Results in cycles per loop

baseline :
Cycles for ALU loop                    28.10013
(will be substracted for cached runs)

Cycles for clflush() and ALU loop     230.11087
(will be substracted from non-cached runs)

gcc version 4.1.3 20070812 (prerelease) (Debian 4.1.2-15), -O2

------------------------------------------------------------------------------
|x86 Pentium 4, 3.0GHz, Linux 2.6.25-rc7           |  cached    |  uncached  |
------------------------------------------------------------------------------
|Added cycles for optimized marker                 |  0.002     |   0.07     |
|Added cycles for normal marker                    |  0.004     | 154.7      |
|Added cycles for stack setup + (1+4 bytes) NOPs   |            |            |
|(6 local vars)                                    |  0.035     |   0.6      |
|Added cycles for stack setup + (1+4 bytes) NOPs   |            |            |
|(1 pointer read, 5 local vars)                    |  0.030     | 222.8      |
------------------------------------------------------------------------------

Results in cycles per loop

baseline :
Cycles for ALU and loop                 25.32369
(will be substracted for cached runs)

Cycles for clflush() and ALU loop      118.24227
(will be substracted from non-cached runs)

gcc version 4.1.2 20061115 (prerelease) (Debian 4.1.1-21), -O2

------------------------------------------------------------------------------
|AMD64, 2.0GHz, Linux 2.6.25-rc7                   |  cached    |  uncached  |
------------------------------------------------------------------------------
|Added cycles for optimized marker                 | -1.0       |    0.2     |
|Added cycles for normal marker                    | -0.3       |   41.8     |
|Added cycles for stack setup + (1+4 bytes) NOPs   | -0.5       |    0.01    |
|(6 local vars)                                    |            |            |
|Added cycles for stack setup + (1+4 bytes) NOPs   |  2.7       |   51.8     |
|(1 pointer read, 5 local vars)                    |            |            |
------------------------------------------------------------------------------

test bench at : http://ltt.polymtl.ca/svn/markers-test/

Regards,

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 13:47           ` Mathieu Desnoyers
@ 2008-04-15 14:04             ` Ingo Molnar
  2008-04-15 14:21               ` Frank Ch. Eigler
  2008-04-15 14:39               ` Mathieu Desnoyers
  2008-04-15 16:48             ` Ingo Molnar
  1 sibling, 2 replies; 46+ messages in thread
From: Ingo Molnar @ 2008-04-15 14:04 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, Christoph Hellwig,
	Frank Ch. Eigler


* Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:

> * Ingo Molnar (mingo@elte.hu) wrote:
> > 
> > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > 
> > > See, these tracer tools are my nightmare as member of an 
> > > enterprise linux team. They'll make an already hard job even 
> > > harder, no thanks!
> > 
> > i'm clearly NAK-ing all futex trace hooks until the true impact of 
> > the whole marker facility is better understood. I've tried them for 
> > the scheduler and they were a clear failure: too bloated and too 
> > persistent.
> 
> I have not seen any counter argument for the in-depth analysis of the 
> instruction cache impact of the optimized markers I've done. Arguing 
> that the markers are "bloated" based only on "size kernel/sched.o" 
> output is a bit misleading.

uhm, i'm not sure what you mean - how else would you quantify bloat than 
to look at the size of the affected subsystem?

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 14:04             ` Ingo Molnar
@ 2008-04-15 14:21               ` Frank Ch. Eigler
  2008-04-15 14:39               ` Mathieu Desnoyers
  1 sibling, 0 replies; 46+ messages in thread
From: Frank Ch. Eigler @ 2008-04-15 14:21 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mathieu Desnoyers, Peter Zijlstra, prasad, linux-kernel, tglx,
	Christoph Hellwig

Hi -

On Tue, Apr 15, 2008 at 04:04:30PM +0200, Ingo Molnar wrote:
> [...]
> > I have not seen any counter argument for the in-depth analysis of the 
> > instruction cache impact of the optimized markers I've done. Arguing 
> > that the markers are "bloated" based only on "size kernel/sched.o" 
> > output is a bit misleading.
> 
> uhm, i'm not sure what you mean - how else would you quantify bloat than 
> to look at the size of the affected subsystem?

For example, by measuring how much of that "bloat" is actually
experienced (e.g., in the form of presence in cache or additional time
taken.) when that instrumentation is turned off vs. on.  That should
sound familiar, as this is the sort of data that you outlined (but I
don't recall whether you actually provided numbers/recipes) when
describing the "dyn-ftrace" nop-padding work.

- FChE

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 14:04             ` Ingo Molnar
  2008-04-15 14:21               ` Frank Ch. Eigler
@ 2008-04-15 14:39               ` Mathieu Desnoyers
  1 sibling, 0 replies; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-15 14:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, Christoph Hellwig,
	Frank Ch. Eigler

* Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> 
> > * Ingo Molnar (mingo@elte.hu) wrote:
> > > 
> > > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > > 
> > > > See, these tracer tools are my nightmare as member of an 
> > > > enterprise linux team. They'll make an already hard job even 
> > > > harder, no thanks!
> > > 
> > > i'm clearly NAK-ing all futex trace hooks until the true impact of 
> > > the whole marker facility is better understood. I've tried them for 
> > > the scheduler and they were a clear failure: too bloated and too 
> > > persistent.
> > 
> > I have not seen any counter argument for the in-depth analysis of the 
> > instruction cache impact of the optimized markers I've done. Arguing 
> > that the markers are "bloated" based only on "size kernel/sched.o" 
> > output is a bit misleading.
> 
> uhm, i'm not sure what you mean - how else would you quantify bloat than 
> to look at the size of the affected subsystem?
> 
> 	Ingo

Data cache bloat inspection :
If you use the "size" output, it will take into account all the data
placed in special sections. At link time, these sections are put
together far from the actual cache hot kernel data.

Instruction cache bloat inspection :
If a code region is placed with cache cold instructions (unlikely
branches), it should not increase the cache impact, since although we
might use one more cache line, it won't be often loaded in cache because
all the code that shares this cache line is unlikely.

TLB entries bloat :
If code is added in unlikely branches, the instruction size increase
could increase the number of TLB entries required to keep cache hot
code. However, in our case, adding 10 (hot) + 50 (cold) bytes to the
scheduler code per optimized marker would require 68 markers to occupy a
whole 4kB TLB entry. Statistically, we could suppose that adding less
than 34 markers to the scheduler should not use any supplementary TLB
entry.  Adding 3 markers is therefore very unlikely to increase the TLB
impact. Given we have about 1024 TLB entries, adding 1/25th of a TLB
entry to the cache hot kernel instructions should not matter much,
especially since it might be absorbed by alignment.

And since the kernel core code is placed in "Huge TLB pages" on many
architectures nowadays, I really don't think the impact of a few bytes
out of 4MB is significant.

I therefore think that looking only at code size is misleading when
considering the cache impact of markers, since they have been designed
to put the bytes as far away as possible from cache-hot memory.

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 12:02   ` [RFC PATCH 1/2] Marker probes in futex.c Peter Zijlstra
  2008-04-15 12:32     ` Mathieu Desnoyers
@ 2008-04-15 15:52     ` K. Prasad
  2008-04-16 15:51       ` Peter Zijlstra
  2008-04-18 10:44     ` Andrew Morton
  2 siblings, 1 reply; 46+ messages in thread
From: K. Prasad @ 2008-04-15 15:52 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, tglx, mingo, mathieu.desnoyers

On Tue, Apr 15, 2008 at 02:02:22PM +0200, Peter Zijlstra wrote:
> On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:
> 
> > +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> > +			"abs_time:%p bitset:%d",
> > +			uaddr, fshared, val, abs_time, bitset);
> 
> This is some seriuosly ugly looking gunk, why would we want stuff like
> that scattered across the code?
> 
> What is wrong with a few simple hooks like:
> 
>   trace_futex_wait(uaddr, fshares, val, abs_time, bitset);
> 
> and then deal with that.
> 
> Also, you seem to expose way too much futex internals; do you really
> need that? People will go use this marker crap like ABI and further
> restrain us from changing the code.
The idea is to export as much data as possible (considering that the
cost involved in doing so is less) and use them only when required.

If we were to log just the futex_ops, just as you had suggested,
"Just log:
 
  futex: <uaddr> wait
  futex: <uaddr> wakeup"
it would provide only cursory information about code-flow and not enough
debug information to help the user solve the issue he's trying to debug.

Say for e.g. in futex_wait you may want to know if abs_time had a
non-zero value but the primitive debugging information would end
up being a handicap from discovering that.

If you can specifically point me to information you think would be
absolutely unnecessary, I can get them out of the trace_mark().

--K.Prasad

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 12:50       ` Alexey Dobriyan
@ 2008-04-15 16:13         ` K. Prasad
  0 siblings, 0 replies; 46+ messages in thread
From: K. Prasad @ 2008-04-15 16:13 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Mathieu Desnoyers, Peter Zijlstra, prasad, linux-kernel, tglx,
	mingo, Christoph Hellwig, Frank Ch. Eigler

On Tue, Apr 15, 2008 at 04:50:09PM +0400, Alexey Dobriyan wrote:
> On Tue, Apr 15, 2008 at 08:32:34AM -0400, Mathieu Desnoyers wrote:
> > * Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> > > On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:
> > > 
> > > > +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> > > > +			"abs_time:%p bitset:%d",
> > > > +			uaddr, fshared, val, abs_time, bitset);
> > > 
> > > This is some seriuosly ugly looking gunk, why would we want stuff like
> > > that scattered across the code?
> > > 
> > 
> > I don't really see how it differs so much from printks, which kernel
> > developers are already familiar with.
> 
> They aren't in every -E codepath, nor they are at the start
> and at the end of every important function (like system call).
> 
> Such printks are usually inserted during debugging when you don't care
> about ugliness and these patches will eventually make kernel looks like
> being permanently debugged one.
>
Wondering how useful would printks be when debugging large systems (say
analysing futex contention on a 32-way CPU system)....also to mention
the inability to do a binary dump of the interested data structures.

--K.Prasad
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 13:47           ` Mathieu Desnoyers
  2008-04-15 14:04             ` Ingo Molnar
@ 2008-04-15 16:48             ` Ingo Molnar
  2008-04-15 21:38               ` Mathieu Desnoyers
  1 sibling, 1 reply; 46+ messages in thread
From: Ingo Molnar @ 2008-04-15 16:48 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, Christoph Hellwig,
	Frank Ch. Eigler

* Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:

> Now for some performance impact :

absent from your figures is dyn-ftrace :-)

dyn-ftrace is a young but rather nifty tool that extends on old-style 
static tracepoints rather forcefully, by adding 75,000+ tracepoints to 
the kernel here and now, with no in-source maintainence overhead at all. 
(It's a nice add-on to SystemTap because it adds space for a jprobe, but 
i digress ;-)

anyway, my objections against markers would be reduced quite 
significantly, if we were able to improve the immediate values 
optimization to not actually patch the MOV's constant portion, but to 
just patch both the MOV and the branch instruction, into two NOPs.

in fact, we can do even better: we should turn the immediate values 
optimization into a simple "patch out a jump to the slowpath" 
optimization. I.e. just a single instruction to patch in and out. (and 
that makes the NMI impact all that easier to handle as well)

That would pretty much meet my "the typical trace point should have the 
cost of a single NOP" threshold for fastpath tracing overhead, which 
i've been repeating for 2 years now, and which would make the scheduler 
markers a lot more palatable ;-)

hm?

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 16:48             ` Ingo Molnar
@ 2008-04-15 21:38               ` Mathieu Desnoyers
  2008-04-16 13:17                 ` Ingo Molnar
  0 siblings, 1 reply; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-15 21:38 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, Christoph Hellwig,
	Frank Ch. Eigler

* Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> 
> > Now for some performance impact :
> 
> absent from your figures is dyn-ftrace :-)
> 
> dyn-ftrace is a young but rather nifty tool that extends on old-style 
> static tracepoints rather forcefully, by adding 75,000+ tracepoints to 
> the kernel here and now, with no in-source maintainence overhead at all. 
> (It's a nice add-on to SystemTap because it adds space for a jprobe, but 
> i digress ;-)
> 
> anyway, my objections against markers would be reduced quite 
> significantly, if we were able to improve the immediate values 
> optimization to not actually patch the MOV's constant portion, but to 
> just patch both the MOV and the branch instruction, into two NOPs.
> 
> in fact, we can do even better: we should turn the immediate values 
> optimization into a simple "patch out a jump to the slowpath" 
> optimization. I.e. just a single instruction to patch in and out. (and 
> that makes the NMI impact all that easier to handle as well)
> 
> That would pretty much meet my "the typical trace point should have the 
> cost of a single NOP" threshold for fastpath tracing overhead, which 
> i've been repeating for 2 years now, and which would make the scheduler 
> markers a lot more palatable ;-)
> 
> hm?
> 

I think we could do something there. Let's have a look at a few
marker+immediate values fast paths on x86_32 :

    4631:       b0 00                   mov    $0x0,%al
    4633:       84 c0                   test   %al,%al
    4635:       0f 85 c6 00 00 00       jne    4701 <try_to_wake_up+0xea>

    7059:       b0 00                   mov    $0x0,%al
    705b:       84 c0                   test   %al,%al
    705d:       75 63                   jne    70c2 <sched_exec+0xb6>

    83ac:       b0 00                   mov    $0x0,%al
    83ae:       84 c0                   test   %al,%al
    83b0:       75 29                   jne    83db <wait_task_inactive+0x69>

If we want to support NMI context and have the ability to instrument
preemptable code without too much headache, we must insure that every
modification will leave the code in a "correct" state and that we do not
grow the size of any reachable instruction.  Also, we must insure gcc
did not put code between these instructions. Modifying non-relocatable
instructions would also be a pain, since we would have to deal with
instruction pointer relocation in the breakpoint code when the code
modification is being done.

Luckily, gcc almost never place any code between the mov, test and jne
instructions. But since we cannot we sure, we could dynamically check
for this code pattern after the mov instruction. If we find it, then we
play with it as if it was a single asm block, but if we don't find what
we expect, then we use standard immediate values for that. I expect the
heavily optimised version will be usable almost all the time.

This heavily optimized version could consist of a simple jump to the
address following the "jne" instruction. To activate the immediate
value, we could simply put back a mov $0x1,%al. I don't think we care
_that_ much about the active tracing performance, since we take a
supplementary function call already in that case.

We could probably force the mov into %al to make sure we search for only
one test pattern (%al,%al). We would have to decode the jne instruction
to see how big it is so we can put the correct offset in the jmp
instruction replacing the original mov.

The only problem that arises is if the gcc compiler uses the zero flag
set by testb by code following the jne instruction, but in our case, I
don't see how gcc could ever want to reuse the zero flag set by a test
on our own mov to a register unless we re-use the value loaded somewhere
else.

Dealing with the non-relocatable jmp instruction could be done by
checking, in the int3 immediate values notifiy callback, if the
instruction being modified is a jmp. If it is, we simply update the
return address without executing the bypass code.

What do you think of these ideas ?

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 21:38               ` Mathieu Desnoyers
@ 2008-04-16 13:17                 ` Ingo Molnar
  2008-04-16 13:46                   ` Arjan van de Ven
  2008-04-16 20:10                   ` text_poke, vmap and vmalloc on x86_64 Mathieu Desnoyers
  0 siblings, 2 replies; 46+ messages in thread
From: Ingo Molnar @ 2008-04-16 13:17 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, Christoph Hellwig,
	Frank Ch. Eigler, Arjan van de Ven


* Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:

> I think we could do something there. Let's have a look at a few 
> marker+immediate values fast paths on x86_32 :
> 
> 
>     4631:       b0 00                   mov    $0x0,%al
>     4633:       84 c0                   test   %al,%al
>     4635:       0f 85 c6 00 00 00       jne    4701 <try_to_wake_up+0xea>
> 
> 
>     7059:       b0 00                   mov    $0x0,%al
>     705b:       84 c0                   test   %al,%al
>     705d:       75 63                   jne    70c2 <sched_exec+0xb6>
> 
> 
>     83ac:       b0 00                   mov    $0x0,%al
>     83ae:       84 c0                   test   %al,%al
>     83b0:       75 29                   jne    83db <wait_task_inactive+0x69>
> 
> 
> If we want to support NMI context and have the ability to instrument 
> preemptable code without too much headache, we must insure that every 
> modification will leave the code in a "correct" state and that we do 
> not grow the size of any reachable instruction.  Also, we must insure 
> gcc did not put code between these instructions. Modifying 
> non-relocatable instructions would also be a pain, since we would have 
> to deal with instruction pointer relocation in the breakpoint code 
> when the code modification is being done.
> 
> Luckily, gcc almost never place any code between the mov, test and jne 
> instructions. But since we cannot we sure, we could dynamically check 
> for this code pattern after the mov instruction. If we find it, then 
> we play with it as if it was a single asm block, but if we don't find 
> what we expect, then we use standard immediate values for that. I 
> expect the heavily optimised version will be usable almost all the 
> time.
> 
> This heavily optimized version could consist of a simple jump to the 
> address following the "jne" instruction. To activate the immediate 
> value, we could simply put back a mov $0x1,%al. I don't think we care 
> _that_ much about the active tracing performance, since we take a 
> supplementary function call already in that case.
> 
> We could probably force the mov into %al to make sure we search for 
> only one test pattern (%al,%al). We would have to decode the jne 
> instruction to see how big it is so we can put the correct offset in 
> the jmp instruction replacing the original mov.
> 
> The only problem that arises is if the gcc compiler uses the zero flag 
> set by testb by code following the jne instruction, but in our case, I 
> don't see how gcc could ever want to reuse the zero flag set by a test 
> on our own mov to a register unless we re-use the value loaded 
> somewhere else.
> 
> Dealing with the non-relocatable jmp instruction could be done by 
> checking, in the int3 immediate values notifiy callback, if the 
> instruction being modified is a jmp. If it is, we simply update the 
> return address without executing the bypass code.
> 
> What do you think of these ideas ?

sounds like a promising plan to me.

would you be interested in putting back the sched-tracer markers into 
sched.c, using the redo-markers patch i posted some time ago (attached 
below again) - and send me a series combined with immediate values 
against sched-devel/latest, so that i can have another look at their 
impact? [That would be the current optimized-markers approach initially, 
not the proposed super-optimized-markers approach.]

i wont worry much about the slowpath impact, you and Frank are right in 
that -freorder-blocks will probably hide the slowpath nicely and it will 
have other benefits as well.

Maybe a functional -freorder-blocks patch for x86 would help in judging 
that impact precisely - i think Arjan made one some time ago? Last time 
i tried it (a few weeks ago) it crashed early during bootup - but it 
would be an interesting feature to have.

	Ingo

---------------------------------->
Subject: sched: reintroduce markers
From: Ingo Molnar <mingo@elte.hu>

---
 include/linux/sched.h             |   36 --------
 kernel/sched.c                    |   11 +-
 kernel/trace/trace.h              |   12 --
 kernel/trace/trace_functions.c    |    2 
 kernel/trace/trace_sched_switch.c |  163 +++++++++++++++++++++++++++++++++++---
 kernel/trace/trace_sched_wakeup.c |  146 +++++++++++++++++++++++++++++++---
 6 files changed, 299 insertions(+), 71 deletions(-)

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -2031,42 +2031,6 @@ extern int sched_mc_power_savings, sched
 
 extern void normalize_rt_tasks(void);
 
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-extern void
-ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next);
-#else
-static inline void
-ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
-{
-}
-#endif
-
-#ifdef CONFIG_SCHED_TRACER
-extern void
-ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr);
-extern void
-ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr);
-#else
-static inline void
-ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
-{
-}
-static inline void
-ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
-{
-}
-#endif
-
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-extern void
-ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next);
-#else
-static inline void
-ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
-{
-}
-#endif
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 extern struct task_group init_task_group;
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -1886,7 +1886,9 @@ static int try_to_wake_up(struct task_st
 
 out_activate:
 #endif /* CONFIG_SMP */
-	ftrace_wake_up_task(p, rq->curr);
+	trace_mark(kernel_sched_wakeup,
+		   "p %p rq->curr %p",
+		   p, rq->curr);
 
 	schedstat_inc(p, se.nr_wakeups);
 	if (sync)
@@ -2028,7 +2030,9 @@ void fastcall wake_up_new_task(struct ta
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	ftrace_wake_up_new_task(p, rq->curr);
+	trace_mark(kernel_sched_wakeup_new,
+		   "p %p rq->curr %p",
+		   p, rq->curr);
 
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -2202,7 +2206,8 @@ context_switch(struct rq *rq, struct tas
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	ftrace_ctx_switch(prev, next);
+	trace_mark(kernel_sched_schedule,
+		   "prev %p next %p", prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
Index: linux/kernel/trace/trace.h
===================================================================
--- linux.orig/kernel/trace/trace.h
+++ linux/kernel/trace/trace.h
@@ -134,6 +134,8 @@ void tracing_start_function_trace(void);
 void tracing_stop_function_trace(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
+void tracing_start_sched_switch(void);
+void tracing_stop_sched_switch(void);
 
 extern int tracing_sched_switch_enabled;
 extern unsigned long tracing_max_latency;
@@ -148,16 +150,6 @@ static inline notrace cycle_t now(int cp
 	return cpu_clock(cpu);
 }
 
-#ifdef CONFIG_SCHED_TRACER
-extern void notrace
-wakeup_sched_switch(struct task_struct *prev, struct task_struct *next);
-#else
-static inline void
-wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
-{
-}
-#endif
-
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 typedef void
 (*tracer_switch_func_t)(void *private,
Index: linux/kernel/trace/trace_functions.c
===================================================================
--- linux.orig/kernel/trace/trace_functions.c
+++ linux/kernel/trace/trace_functions.c
@@ -30,10 +30,12 @@ static notrace void start_function_trace
 {
 	function_reset(tr);
 	tracing_start_function_trace();
+	tracing_start_sched_switch();
 }
 
 static notrace void stop_function_trace(struct trace_array *tr)
 {
+	tracing_stop_sched_switch();
 	tracing_stop_function_trace();
 }
 
Index: linux/kernel/trace/trace_sched_switch.c
===================================================================
--- linux.orig/kernel/trace/trace_sched_switch.c
+++ linux/kernel/trace/trace_sched_switch.c
@@ -16,12 +16,17 @@
 
 static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
+static atomic_t			sched_ref;
 int __read_mostly		tracing_sched_switch_enabled;
 
+static DEFINE_SPINLOCK(sched_switch_func_lock);
+
 static void notrace
-ctx_switch_func(struct task_struct *prev, struct task_struct *next)
+sched_switch_func(void *private, struct task_struct *prev,
+		  struct task_struct *next)
 {
-	struct trace_array *tr = ctx_trace;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	long disabled;
@@ -42,20 +47,115 @@ ctx_switch_func(struct task_struct *prev
 	raw_local_irq_restore(flags);
 }
 
-void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
+static struct tracer_switch_ops sched_switch_ops __read_mostly =
+{
+	.func		= sched_switch_func,
+	.private	= &ctx_trace,
+};
+
+static tracer_switch_func_t __read_mostly
+	tracer_switch_func = sched_switch_func;
+
+static struct tracer_switch_ops __read_mostly
+	*tracer_switch_func_ops = &sched_switch_ops;
+
+static void notrace
+sched_switch_func_loop(void *private, struct task_struct *prev,
+		       struct task_struct *next)
 {
-	tracing_record_cmdline(prev);
+	struct tracer_switch_ops *ops = tracer_switch_func_ops;
+
+	/* take care of alpha acting funny */
+	read_barrier_depends();
 
+	for (; ops != NULL; ops = ops->next) {
+		/* silly alpha */
+		read_barrier_depends();
+		ops->func(ops->private, prev, next);
+	}
+}
+
+notrace int register_tracer_switch(struct tracer_switch_ops *ops)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&sched_switch_func_lock, flags);
+	ops->next = tracer_switch_func_ops;
 	/*
-	 * If tracer_switch_func only points to the local
-	 * switch func, it still needs the ptr passed to it.
+	 * We are entering ops into the switch list but another
+	 * CPU might be walking that list. We need to make sure
+	 * the ops->next pointer is valid before another CPU sees
+	 * the ops pointer included into the switch list.
 	 */
-	ctx_switch_func(prev, next);
+	smp_wmb();
+	tracer_switch_func_ops = ops;
+
+	if (ops->next == &sched_switch_ops)
+		tracer_switch_func = sched_switch_func_loop;
+
+	spin_unlock_irqrestore(&sched_switch_func_lock, flags);
+
+	return 0;
+}
+
+notrace int unregister_tracer_switch(struct tracer_switch_ops *ops)
+{
+	unsigned long flags;
+	struct tracer_switch_ops **p = &tracer_switch_func_ops;
+	int ret;
+
+	spin_lock_irqsave(&sched_switch_func_lock, flags);
 
 	/*
-	 * Chain to the wakeup tracer (this is a NOP if disabled):
+	 * If the sched_switch is the only one left, then
+	 *  only call that function.
 	 */
-	wakeup_sched_switch(prev, next);
+	if (*p == ops && ops->next == &sched_switch_ops) {
+		tracer_switch_func = sched_switch_func;
+		tracer_switch_func_ops = &sched_switch_ops;
+		goto out;
+	}
+
+	for (; *p != &sched_switch_ops; p = &(*p)->next)
+		if (*p == ops)
+			break;
+
+	if (*p != ops) {
+		ret = -1;
+		goto out;
+	}
+
+	*p = (*p)->next;
+
+ out:
+	spin_unlock_irqrestore(&sched_switch_func_lock, flags);
+
+	return 0;
+}
+
+static notrace void
+sched_switch_callback(const struct marker *mdata, void *private_data,
+		      const char *format, ...)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	va_list ap;
+
+	if (!atomic_read(&sched_ref))
+		return;
+
+	tracing_record_cmdline(current);
+
+	va_start(ap, format);
+	prev = va_arg(ap, typeof(prev));
+	next = va_arg(ap, typeof(next));
+	va_end(ap);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	tracer_switch_func(mdata->private, prev, next);
 }
 
 static notrace void sched_switch_reset(struct trace_array *tr)
@@ -72,10 +172,12 @@ static notrace void start_sched_trace(st
 {
 	sched_switch_reset(tr);
 	tracer_enabled = 1;
+	tracing_start_sched_switch();
 }
 
 static notrace void stop_sched_trace(struct trace_array *tr)
 {
+	tracing_stop_sched_switch();
 	tracer_enabled = 0;
 }
 
@@ -110,6 +212,35 @@ static struct tracer sched_switch_trace 
 	.ctrl_update	= sched_switch_trace_ctrl_update,
 };
 
+static int tracing_sched_arm(void)
+{
+	int ret;
+
+	ret = marker_arm("kernel_sched_schedule");
+	if (ret)
+		pr_info("sched trace: Couldn't arm probe switch_to\n");
+
+	return ret;
+}
+
+void tracing_start_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_inc_return(&sched_ref);
+	if (tracing_sched_switch_enabled && ref == 1)
+		tracing_sched_arm();
+}
+
+void tracing_stop_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_dec_and_test(&sched_ref);
+	if (tracing_sched_switch_enabled && ref)
+		marker_disarm("kernel_sched_schedule");
+}
+
 __init static int init_sched_switch_trace(void)
 {
 	int ret;
@@ -118,8 +249,22 @@ __init static int init_sched_switch_trac
 	if (ret)
 		return ret;
 
+	ret = marker_probe_register("kernel_sched_schedule",
+				    "prev %p next %p",
+				    sched_switch_callback,
+				    &ctx_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to switch_to\n");
+		goto out;
+	}
+
+	if (atomic_read(&sched_ref))
+		ret = tracing_sched_arm();
+
 	tracing_sched_switch_enabled = 1;
 
+out:
 	return ret;
 }
 device_initcall(init_sched_switch_trace);
Index: linux/kernel/trace/trace_sched_wakeup.c
===================================================================
--- linux.orig/kernel/trace/trace_sched_wakeup.c
+++ linux/kernel/trace/trace_sched_wakeup.c
@@ -44,12 +44,66 @@ static int notrace report_latency(cycle_
 	return 1;
 }
 
-void notrace
-wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
+#ifdef CONFIG_FTRACE
+/* irqsoff uses its own function trace to keep the overhead down */
+static void notrace
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 {
-	unsigned long latency = 0, t0 = 0, t1 = 0;
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int resched;
+	int tcpu;
+	int cpu;
+
+	if (likely(!tracer_enabled) || !tr)
+		return;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (likely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+	if (wakeup_task)
+		tcpu = task_cpu(wakeup_task);
+	else
+		tcpu = -1;
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+
+	if (cpu == tcpu)
+		ftrace(tr, data, ip, parent_ip, flags);
+
+out:
+	atomic_dec(&data->disabled);
+
+	/* prevent recursion with scheduler */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func		= wakeup_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+static void notrace
+wakeup_sched_switch(void *private, struct task_struct *prev,
+		    struct task_struct *next)
+{
+	unsigned long latency = 0, t0 = 0, t1 = 0;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
+	struct trace_array_cpu *data;
 	cycle_t T0, T1, delta;
 	unsigned long flags;
 	long disabled;
@@ -130,8 +184,14 @@ out_unlock:
 	spin_unlock_irqrestore(&wakeup_lock, flags);
 out:
 	atomic_dec(&tr->data[cpu]->disabled);
+
 }
 
+static struct tracer_switch_ops switch_ops __read_mostly = {
+	.func		= wakeup_sched_switch,
+	.private	= &wakeup_trace,
+};
+
 static void notrace __wakeup_reset(struct trace_array *tr)
 {
 	struct trace_array_cpu *data;
@@ -206,26 +266,51 @@ out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
 
-notrace void
-ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+static notrace void
+wake_up_callback(const struct marker *mdata, void *private_data,
+		 const char *format, ...)
 {
+	struct trace_array **ptr = mdata->private;
+	struct trace_array *tr = *ptr;
+	struct task_struct *curr;
+	struct task_struct *p;
+	va_list ap;
+
 	if (likely(!tracer_enabled))
 		return;
 
-	wakeup_check_start(wakeup_trace, wakee, curr);
-}
+	va_start(ap, format);
 
-notrace void
-ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
-{
-	if (likely(!tracer_enabled))
-		return;
+	/* now get the meat: "p %p rq->curr %p" */
+	p = va_arg(ap, typeof(p));
+	curr = va_arg(ap, typeof(curr));
+
+	va_end(ap);
 
-	wakeup_check_start(wakeup_trace, wakee, curr);
+	wakeup_check_start(tr, p, curr);
 }
 
 static notrace void start_wakeup_tracer(struct trace_array *tr)
 {
+	int ret;
+
+	ret = marker_arm("kernel_sched_wakeup");
+	if (ret) {
+		pr_info("wakeup trace: Couldn't arm probe"
+			" kernel_sched_wakeup\n");
+		return;
+	}
+
+	ret = marker_arm("kernel_sched_wakeup_new");
+	if (ret) {
+		pr_info("wakeup trace: Couldn't arm probe"
+			" kernel_sched_wakeup_new\n");
+		goto out;
+	}
+
+	register_tracer_switch(&switch_ops);
+	tracing_start_sched_switch();
+
 	wakeup_reset(tr);
 
 	/*
@@ -238,13 +323,22 @@ static notrace void start_wakeup_tracer(
 	smp_wmb();
 
 	tracer_enabled = 1;
+	register_ftrace_function(&trace_ops);
 
 	return;
+
+out:
+	marker_disarm("kernel_sched_wakeup");
 }
 
 static notrace void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
+	tracing_stop_sched_switch();
+	unregister_tracer_switch(&switch_ops);
+	marker_disarm("kernel_sched_wakeup");
+	marker_disarm("kernel_sched_wakeup_new");
+	unregister_ftrace_function(&trace_ops);
 }
 
 static notrace void wakeup_tracer_init(struct trace_array *tr)
@@ -305,6 +399,32 @@ __init static int init_wakeup_tracer(voi
 	if (ret)
 		return ret;
 
+	ret = marker_probe_register("kernel_sched_wakeup",
+				    "p %p rq->curr %p",
+				    wake_up_callback,
+				    &wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		goto fail;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+				    "p %p rq->curr %p",
+				    wake_up_callback,
+				    &wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	return 0;
+
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup");
+fail:
+	unregister_tracer(&wakeup_tracer);
 	return 0;
 }
 device_initcall(init_wakeup_tracer);

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-16 13:17                 ` Ingo Molnar
@ 2008-04-16 13:46                   ` Arjan van de Ven
  2008-04-16 14:00                     ` Mathieu Desnoyers
  2008-04-16 20:10                   ` text_poke, vmap and vmalloc on x86_64 Mathieu Desnoyers
  1 sibling, 1 reply; 46+ messages in thread
From: Arjan van de Ven @ 2008-04-16 13:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mathieu Desnoyers, Peter Zijlstra, prasad, linux-kernel, tglx,
	Christoph Hellwig, Frank Ch. Eigler


> >     4631:       b0 00                   mov    $0x0,%al
> >     4633:       84 c0                   test   %al,%al
> >     4635:       0f 85 c6 00 00 00       jne    4701

the use of partial registers here is unfortunate and probably quite expensive ;(


> > If we want to support NMI context and have the ability to
> > instrument preemptable code without too much headache, we must
> > insure that every modification will leave the code in a "correct"
> > state and that we do not grow the size of any reachable
> > instruction.  Also, we must insure gcc did not put code between
> > these instructions. Modifying non-relocatable instructions would
> > also be a pain, since we would have to deal with instruction
> > pointer relocation in the breakpoint code when the code
> > modification is being done.

you also need to make sure no cpu is executing that code ever.. 
but you already deal with that right?

> > 
> > Luckily, gcc almost never place any code between the mov, test and
> > jne instructions. But since we cannot we sure, we could dynamically
> > check for this code pattern after the mov instruction. If we find
> > it, then we play with it as if it was a single asm block, but if we
> > don't find what we expect, then we use standard immediate values
> > for that. I expect the heavily optimised version will be usable
> > almost all the time.

I expect gcc to start using the macro-fusion capable ones more and more over time at least,
and for that the compare and jmp need to be consecutive.


-- 
If you want to reach me at my work email, use arjan@linux.intel.com
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-16 13:46                   ` Arjan van de Ven
@ 2008-04-16 14:00                     ` Mathieu Desnoyers
  2008-04-16 14:24                       ` Arjan van de Ven
  0 siblings, 1 reply; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-16 14:00 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, Peter Zijlstra, prasad, linux-kernel, tglx,
	Christoph Hellwig, Frank Ch. Eigler

* Arjan van de Ven (arjan@infradead.org) wrote:
> 
> > >     4631:       b0 00                   mov    $0x0,%al
> > >     4633:       84 c0                   test   %al,%al
> > >     4635:       0f 85 c6 00 00 00       jne    4701
> 
> the use of partial registers here is unfortunate and probably quite expensive ;(
> 
> 

Yes, but it saves instruction cache. That's a tradeoff.

> > > If we want to support NMI context and have the ability to
> > > instrument preemptable code without too much headache, we must
> > > insure that every modification will leave the code in a "correct"
> > > state and that we do not grow the size of any reachable
> > > instruction.  Also, we must insure gcc did not put code between
> > > these instructions. Modifying non-relocatable instructions would
> > > also be a pain, since we would have to deal with instruction
> > > pointer relocation in the breakpoint code when the code
> > > modification is being done.
> 
> you also need to make sure no cpu is executing that code ever.. 
> but you already deal with that right?
> 

By "insure that every modification will leave the code in a "correct"
state", I mean that at any given time before, during or after the code
modification, if an NMI comes on any CPU and try to run the modified
code, it should have a valid version of the code to execute. Does it
make more sense ?

> > > 
> > > Luckily, gcc almost never place any code between the mov, test and
> > > jne instructions. But since we cannot we sure, we could dynamically
> > > check for this code pattern after the mov instruction. If we find
> > > it, then we play with it as if it was a single asm block, but if we
> > > don't find what we expect, then we use standard immediate values
> > > for that. I expect the heavily optimised version will be usable
> > > almost all the time.
> 
> I expect gcc to start using the macro-fusion capable ones more and more over time at least,
> and for that the compare and jmp need to be consecutive.
> 

Early reasults of the work I've done last night : I can detect about 96%
of the ~120 markers I've put in my instrumented kernel.

Not only does the compare and jmp need to be consecutive, but the movb
$0x0,%al also does. I *could* try to detect specific code inserted in
between, but I really have to make sure I don't get burned by the
compiler inserting a jmp there.

I'll post my work shortly.

Mathieu

> 
> -- 
> If you want to reach me at my work email, use arjan@linux.intel.com
> For development, discussion and tips for power savings, 
> visit http://www.lesswatts.org

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-16 14:00                     ` Mathieu Desnoyers
@ 2008-04-16 14:24                       ` Arjan van de Ven
  2008-04-16 14:29                         ` Ingo Molnar
  2008-04-16 14:48                         ` Mathieu Desnoyers
  0 siblings, 2 replies; 46+ messages in thread
From: Arjan van de Ven @ 2008-04-16 14:24 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Ingo Molnar, Peter Zijlstra, prasad, linux-kernel, tglx,
	Christoph Hellwig, Frank Ch. Eigler

On Wed, 16 Apr 2008 10:00:09 -0400
Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:


> > > > If we want to support NMI context and have the ability to
> > > > instrument preemptable code without too much headache, we must
> > > > insure that every modification will leave the code in a
> > > > "correct" state and that we do not grow the size of any
> > > > reachable instruction.  Also, we must insure gcc did not put
> > > > code between these instructions. Modifying non-relocatable
> > > > instructions would also be a pain, since we would have to deal
> > > > with instruction pointer relocation in the breakpoint code when
> > > > the code modification is being done.
> > 
> > you also need to make sure no cpu is executing that code ever.. 
> > but you already deal with that right?
> > 
> 
> By "insure that every modification will leave the code in a "correct"
> state", I mean that at any given time before, during or after the code
> modification, if an NMI comes on any CPU and try to run the modified
> code, it should have a valid version of the code to execute. Does it
> make more sense ?

I understand your words. My concern is that I don't quite understand how you
guarantee that you'll not be executing the code you're modifying.
Just saying "it's consistent before and after" sounds nice but probably isn't
enough to be safe.



> Not only does the compare and jmp need to be consecutive, but the movb
> $0x0,%al also does. I *could* try to detect specific code inserted in
> between, but I really have to make sure I don't get burned by the
> compiler inserting a jmp there.

I wonder if just sticking in 2 barriers around your code make gcc stop moving stuff too much

-- 
If you want to reach me at my work email, use arjan@linux.intel.com
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-16 14:24                       ` Arjan van de Ven
@ 2008-04-16 14:29                         ` Ingo Molnar
  2008-04-16 14:48                         ` Mathieu Desnoyers
  1 sibling, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2008-04-16 14:29 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Mathieu Desnoyers, Peter Zijlstra, prasad, linux-kernel, tglx,
	Christoph Hellwig, Frank Ch. Eigler

* Arjan van de Ven <arjan@infradead.org> wrote:

> > Not only does the compare and jmp need to be consecutive, but the 
> > movb $0x0,%al also does. I *could* try to detect specific code 
> > inserted in between, but I really have to make sure I don't get 
> > burned by the compiler inserting a jmp there.
> 
> I wonder if just sticking in 2 barriers around your code make gcc stop 
> moving stuff too much

hm, an extra optimization barrier might have worse effects than even an 
extra instruction. I think if the detection and patching can be made 
100% safe, we dont care about the remaining 4% of markers that gcc 
somehow reorders.

and in parallel gcc folks might want to start helping us achieve 
single-instruction branch points? Currently there's no way to get flags 
values out of inline assembly, except via a register intermediary which 
adds another instruction. For flags that are unaffected by gcc's 
input/output constraint generation code it would make sense to allow 
them to be exported out of inline assembly.

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-16 14:24                       ` Arjan van de Ven
  2008-04-16 14:29                         ` Ingo Molnar
@ 2008-04-16 14:48                         ` Mathieu Desnoyers
  2008-04-16 15:32                           ` Arjan van de Ven
  1 sibling, 1 reply; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-16 14:48 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, Peter Zijlstra, prasad, linux-kernel, tglx,
	Christoph Hellwig, Frank Ch. Eigler

* Arjan van de Ven (arjan@infradead.org) wrote:
> On Wed, 16 Apr 2008 10:00:09 -0400
> Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> 
> 
> > > > > If we want to support NMI context and have the ability to
> > > > > instrument preemptable code without too much headache, we must
> > > > > insure that every modification will leave the code in a
> > > > > "correct" state and that we do not grow the size of any
> > > > > reachable instruction.  Also, we must insure gcc did not put
> > > > > code between these instructions. Modifying non-relocatable
> > > > > instructions would also be a pain, since we would have to deal
> > > > > with instruction pointer relocation in the breakpoint code when
> > > > > the code modification is being done.
> > > 
> > > you also need to make sure no cpu is executing that code ever.. 
> > > but you already deal with that right?
> > > 
> > 
> > By "insure that every modification will leave the code in a "correct"
> > state", I mean that at any given time before, during or after the code
> > modification, if an NMI comes on any CPU and try to run the modified
> > code, it should have a valid version of the code to execute. Does it
> > make more sense ?
> 
> I understand your words. My concern is that I don't quite understand how you
> guarantee that you'll not be executing the code you're modifying.
> Just saying "it's consistent before and after" sounds nice but probably isn't
> enough to be safe.
> 
Ah, I see. I insert a breakpoint and execute a bypass rather than the
code being modified. I only put back the 1st instruction byte after the
rest of the instruction has been modified.

> 
> 
> > Not only does the compare and jmp need to be consecutive, but the movb
> > $0x0,%al also does. I *could* try to detect specific code inserted in
> > between, but I really have to make sure I don't get burned by the
> > compiler inserting a jmp there.
> 
> I wonder if just sticking in 2 barriers around your code make gcc stop moving stuff too much
> 

I'm not sure people would like the side-effect for the rest of
optimizations, but it should be tried.

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-16 14:48                         ` Mathieu Desnoyers
@ 2008-04-16 15:32                           ` Arjan van de Ven
  2008-04-16 15:39                             ` Mathieu Desnoyers
  0 siblings, 1 reply; 46+ messages in thread
From: Arjan van de Ven @ 2008-04-16 15:32 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Ingo Molnar, Peter Zijlstra, prasad, linux-kernel, tglx,
	Christoph Hellwig, Frank Ch. Eigler

On Wed, 16 Apr 2008 10:48:14 -0400
Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:

> * Arjan van de Ven (arjan@infradead.org) wrote:
> > On Wed, 16 Apr 2008 10:00:09 -0400
> > Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> > 
> > 
> > > > > > If we want to support NMI context and have the ability to
> > > > > > instrument preemptable code without too much headache, we
> > > > > > must insure that every modification will leave the code in a
> > > > > > "correct" state and that we do not grow the size of any
> > > > > > reachable instruction.  Also, we must insure gcc did not put
> > > > > > code between these instructions. Modifying non-relocatable
> > > > > > instructions would also be a pain, since we would have to
> > > > > > deal with instruction pointer relocation in the breakpoint
> > > > > > code when the code modification is being done.
> > > > 
> > > > you also need to make sure no cpu is executing that code ever.. 
> > > > but you already deal with that right?
> > > > 
> > > 
> > > By "insure that every modification will leave the code in a
> > > "correct" state", I mean that at any given time before, during or
> > > after the code modification, if an NMI comes on any CPU and try
> > > to run the modified code, it should have a valid version of the
> > > code to execute. Does it make more sense ?
> > 
> > I understand your words. My concern is that I don't quite
> > understand how you guarantee that you'll not be executing the code
> > you're modifying. Just saying "it's consistent before and after"
> > sounds nice but probably isn't enough to be safe.
> > 
> Ah, I see. I insert a breakpoint and execute a bypass rather than the
> code being modified. I only put back the 1st instruction byte after
> the rest of the instruction has been modified.

sorry but I'm not convinced that that is safe without a real exclusion mechanism.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-16 15:32                           ` Arjan van de Ven
@ 2008-04-16 15:39                             ` Mathieu Desnoyers
  0 siblings, 0 replies; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-16 15:39 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, Peter Zijlstra, prasad, linux-kernel, tglx,
	Christoph Hellwig, Frank Ch. Eigler

* Arjan van de Ven (arjan@infradead.org) wrote:
> On Wed, 16 Apr 2008 10:48:14 -0400
> Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> 
> > * Arjan van de Ven (arjan@infradead.org) wrote:
> > > On Wed, 16 Apr 2008 10:00:09 -0400
> > > Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> > > 
> > > 
> > > > > > > If we want to support NMI context and have the ability to
> > > > > > > instrument preemptable code without too much headache, we
> > > > > > > must insure that every modification will leave the code in a
> > > > > > > "correct" state and that we do not grow the size of any
> > > > > > > reachable instruction.  Also, we must insure gcc did not put
> > > > > > > code between these instructions. Modifying non-relocatable
> > > > > > > instructions would also be a pain, since we would have to
> > > > > > > deal with instruction pointer relocation in the breakpoint
> > > > > > > code when the code modification is being done.
> > > > > 
> > > > > you also need to make sure no cpu is executing that code ever.. 
> > > > > but you already deal with that right?
> > > > > 
> > > > 
> > > > By "insure that every modification will leave the code in a
> > > > "correct" state", I mean that at any given time before, during or
> > > > after the code modification, if an NMI comes on any CPU and try
> > > > to run the modified code, it should have a valid version of the
> > > > code to execute. Does it make more sense ?
> > > 
> > > I understand your words. My concern is that I don't quite
> > > understand how you guarantee that you'll not be executing the code
> > > you're modifying. Just saying "it's consistent before and after"
> > > sounds nice but probably isn't enough to be safe.
> > > 
> > Ah, I see. I insert a breakpoint and execute a bypass rather than the
> > code being modified. I only put back the 1st instruction byte after
> > the rest of the instruction has been modified.
> 
> sorry but I'm not convinced that that is safe without a real exclusion mechanism.

I'll post the patchset soon. Please have a look at the rather lengthy
comment in arch/x86/kernel/immediate.c for details. I'll be glad to
answer any question that remains.

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 13:25         ` [RFC PATCH 1/2] Marker probes in futex.c Mathieu Desnoyers
@ 2008-04-16 15:51           ` Peter Zijlstra
  2008-04-17 19:19             ` Frank Ch. Eigler
  0 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2008-04-16 15:51 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: prasad, linux-kernel, tglx, mingo, Christoph Hellwig, Frank Ch. Eigler

On Tue, 2008-04-15 at 09:25 -0400, Mathieu Desnoyers wrote:
> * Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> > On Tue, 2008-04-15 at 08:32 -0400, Mathieu Desnoyers wrote:
> > > * Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> > > > On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:
> > > > 
> > > > > +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> > > > > +			"abs_time:%p bitset:%d",
> > > > > +			uaddr, fshared, val, abs_time, bitset);
> > > > 
> > > > This is some seriuosly ugly looking gunk, why would we want stuff like
> > > > that scattered across the code?
> > > > 
> > > 
> > > I don't really see how it differs so much from printks, which kernel
> > > developers are already familiar with.
> > 
> > Which never last longer than the debug session and are never exposed to
> > enterprise kABI crap.
> > 
> > > > What is wrong with a few simple hooks like:
> > > > 
> > > >   trace_futex_wait(uaddr, fshares, val, abs_time, bitset);
> > > > 
> > > > and then deal with that.
> > > > 
> > > 
> > > If any of your variable type changes, then you are exporting an unknown
> > > data structure to user-space. _That_ would break a userspace tracer
> > > whenever you change any of these kernel variables and you don't want
> > > that.
> > 
> > trace_futex_wait()'s signature would make the compiler issue a complaint
> > when the arguments suddenly changes type, no?
> > 
> 
> Yes, but then you would have to create new code for each event you want
> to trace. In the end, it would increase the icache footprint
> considerably and would also make addition of new events cumbersome.

Looking at patch 2/2 in this series that seems to be needed anyway. So
I'm not sure what adding all these character strings buy you.

> > > Exporting the field names and variable types helps to identify the
> > > variables by their given names rather than their respective order.
> > > Having the field type insures binary compatibility.
> > > 
> > > Clearly we can turn your trace_futex_wait(uaddr, fshares, val, abs_time,
> > > bitset); into a trace_mark() with a simple define, and I don't see any
> > > problem with that. I just want to make sure the event name, field names
> > > and field types are exported, and this is done by markers. However, I
> > > wonder why none of the kernel printk() are turned into specialized
> > > defines to make the code "cleaner".. maybe it's because it is useful to
> > > have everything declared in one spot after all.
> > 
> > I must be missing something here, printks don't need to look pretty
> > because they never see the light of lkml. They get ripped out as soon as
> > I understand what the heck happened.
> > 
> 
> The thing is that the trace_marks really fills two purpose : they
> extract information from the core kernel, which is meant to be shipped
> on production systems so tracing tools can report what is happening on
> the system and they also allow kernel hackers to add markers of their
> own, so they can extract information about specific events they are
> interested in along with the standard kernel instrumentation.
> 
> So, part of it is meant to be standard kernel information, part of it
> can be used for debugging. And since the kernel code evolves through
> time, it makes sense to have an infrastructure flexible enough to follow
> these changes easily.

The trouble I have with these free form thingies is that its too hard to
keep them consistent.

So we have this horrible printf syntax; but even worse:


> +       trace_mark(futex_wake_called, "uaddr:%p fshared:%p nr_wake:%d "
> +                       "bitset:%d",
> +                       uaddr, fshared, nr_wake, bitset);


> +       INIT_FUTEX_DEBUG_PROBE(futex_wake_called,
> +                       "uaddr:%p fshared:%p nr_wake:%d bitset:%d"),


Why the need to duplicate it; that's utter madness.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 15:52     ` K. Prasad
@ 2008-04-16 15:51       ` Peter Zijlstra
  2008-04-17 22:02         ` Frank Ch. Eigler
  0 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2008-04-16 15:51 UTC (permalink / raw)
  To: prasad; +Cc: linux-kernel, tglx, mingo, mathieu.desnoyers

On Tue, 2008-04-15 at 21:22 +0530, K. Prasad wrote:
> On Tue, Apr 15, 2008 at 02:02:22PM +0200, Peter Zijlstra wrote:
> > On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:
> > 
> > > +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> > > +			"abs_time:%p bitset:%d",
> > > +			uaddr, fshared, val, abs_time, bitset);
> > 
> > This is some seriuosly ugly looking gunk, why would we want stuff like
> > that scattered across the code?
> > 
> > What is wrong with a few simple hooks like:
> > 
> >   trace_futex_wait(uaddr, fshares, val, abs_time, bitset);
> > 
> > and then deal with that.
> > 
> > Also, you seem to expose way too much futex internals; do you really
> > need that? People will go use this marker crap like ABI and further
> > restrain us from changing the code.
> The idea is to export as much data as possible (considering that the
> cost involved in doing so is less) and use them only when required.
> 
> If we were to log just the futex_ops, just as you had suggested,
> "Just log:
>  
>   futex: <uaddr> wait
>   futex: <uaddr> wakeup"
> it would provide only cursory information about code-flow and not enough
> debug information to help the user solve the issue he's trying to debug.
> 
> Say for e.g. in futex_wait you may want to know if abs_time had a
> non-zero value but the primitive debugging information would end
> up being a handicap from discovering that.
> 
> If you can specifically point me to information you think would be
> absolutely unnecessary, I can get them out of the trace_mark().

I'm thinking everything is superflous; you're basically logging what
strace already gives you; except worse by encoding local variable names
and exposing kernel pointers.

Also get rid of that futex_markers.h thing - its utterly useless and
makes the code less readable.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* text_poke, vmap and vmalloc on x86_64
  2008-04-16 13:17                 ` Ingo Molnar
  2008-04-16 13:46                   ` Arjan van de Ven
@ 2008-04-16 20:10                   ` Mathieu Desnoyers
  2008-04-16 21:22                     ` Mathieu Desnoyers
  1 sibling, 1 reply; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-16 20:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, Christoph Hellwig,
	Frank Ch. Eigler, Arjan van de Ven

Hi,

I am trying to figure out why the following code seems to work fine on
x86_32, but the update fails to happen on x86_64. Any idea why ? From
what I see the memcpy seems to fall in a memory area which is not the
same as the one I test at the end of the function. It only fails for
module addresses, which are vmalloc'd. Once I fix this, I'll be able to
send the patchset.

(from my own alternative.c) :

void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
{
        unsigned long flags;
        char *vaddr;
        int nr_pages = 2;
        struct page *pages[2];
        int i;

        if (*((uint8_t *)addr - 1) != BREAKPOINT_INSTRUCTION) {
                BUG_ON(len > sizeof(long));
                BUG_ON((((long)addr + len - 1) & ~(sizeof(long) - 1))
                        - ((long)addr & ~(sizeof(long) - 1)));
        }
        if (is_vmalloc_addr(addr)) {
                pages[0] = vmalloc_to_page(addr);
                pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
        } else {
                pages[0] = virt_to_page(addr);
                pages[1] = virt_to_page(addr + PAGE_SIZE);
        }
        BUG_ON(!pages[0]);
        if (!pages[1])
                nr_pages = 1;
        vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
        BUG_ON(!vaddr);
        local_irq_save(flags);
        memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
        local_irq_restore(flags);
        vunmap(vaddr);
        sync_core();
        /* Could also do a CLFLUSH here to speed up CPU recovery; but
           that causes hangs on some VIA CPUs. */
        for (i = 0; i < len; i++)
                BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
        return addr;
}

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: text_poke, vmap and vmalloc on x86_64
  2008-04-16 20:10                   ` text_poke, vmap and vmalloc on x86_64 Mathieu Desnoyers
@ 2008-04-16 21:22                     ` Mathieu Desnoyers
  0 siblings, 0 replies; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-16 21:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, Christoph Hellwig,
	Frank Ch. Eigler, Arjan van de Ven

* Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) wrote:
> Hi,
> 
> I am trying to figure out why the following code seems to work fine on
> x86_32, but the update fails to happen on x86_64. Any idea why ? From
> what I see the memcpy seems to fall in a memory area which is not the
> same as the one I test at the end of the function. It only fails for
> module addresses, which are vmalloc'd. Once I fix this, I'll be able to
> send the patchset.
> 

Ah, found it. the is_vmalloc_addr() is somewhat misleading because it
does not include the vmalloc'd memory for modules, which is in a
different address space. I wonder if it has other unknown side-effects.

I used !core_kernel_text() instead.

Mathieu

> (from my own alternative.c) :
> 
> void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
> {
>         unsigned long flags;
>         char *vaddr;
>         int nr_pages = 2;
>         struct page *pages[2];
>         int i;
> 
>         if (*((uint8_t *)addr - 1) != BREAKPOINT_INSTRUCTION) {
>                 BUG_ON(len > sizeof(long));
>                 BUG_ON((((long)addr + len - 1) & ~(sizeof(long) - 1))
>                         - ((long)addr & ~(sizeof(long) - 1)));
>         }
>         if (is_vmalloc_addr(addr)) {
>                 pages[0] = vmalloc_to_page(addr);
>                 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
>         } else {
>                 pages[0] = virt_to_page(addr);
>                 pages[1] = virt_to_page(addr + PAGE_SIZE);
>         }
>         BUG_ON(!pages[0]);
>         if (!pages[1])
>                 nr_pages = 1;
>         vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
>         BUG_ON(!vaddr);
>         local_irq_save(flags);
>         memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
>         local_irq_restore(flags);
>         vunmap(vaddr);
>         sync_core();
>         /* Could also do a CLFLUSH here to speed up CPU recovery; but
>            that causes hangs on some VIA CPUs. */
>         for (i = 0; i < len; i++)
>                 BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
>         return addr;
> }
> 
> Mathieu
> 
> -- 
> Mathieu Desnoyers
> Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
> OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-16 15:51           ` Peter Zijlstra
@ 2008-04-17 19:19             ` Frank Ch. Eigler
  2008-04-17 20:16               ` Mathieu Desnoyers
  2008-04-18  6:56               ` Peter Zijlstra
  0 siblings, 2 replies; 46+ messages in thread
From: Frank Ch. Eigler @ 2008-04-17 19:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, prasad, linux-kernel, tglx, mingo, Christoph Hellwig

Hi -

On Wed, Apr 16, 2008 at 05:51:24PM +0200, Peter Zijlstra wrote:
> [...]
> > > > > What is wrong with a few simple hooks like:
> > > > >   trace_futex_wait(uaddr, fshares, val, abs_time, bitset);
> > > > > and then deal with that.
> [...]
> > Yes, but then you would have to create new code for each event you want
> > to trace. In the end, it would increase the icache footprint
> > considerably and would also make addition of new events cumbersome.
> > [...]

That, plus the new hand-written function (trace_futex_wait) would
still need to manage the packaging of the arguments for consumption by
separately compiled pieces.  It is desirable not to require such
hand-written functions to *also* be declared in headers for these
event consumers to compile against.

> So I'm not sure what adding all these character strings buy you.

The main thing is type checking by engaging gcc's printf format
checking logic.  In my original markers proposal, the types were
encoded into the function name, sort of as in C++:

  trace_mark_nnnnn(futex_wake_called, uaddr, fshares, val, abs_time, bitset);

where each "n" stands for some integral value, and could be chosen
amongst a small number of other types (say -- "s": char* string, "p":
void*, "l":64-bit long).  Then, type checking could be done by the
core compiler for both event producers and consumers.  One downside
was that the trace_mark_* permutations themselves would have to be
generated by some shell/perl script [1], and some deemed this probably
unacceptable.  I'm still not sure...

[1] some systemtap archaeology:
http://sourceware.org/git/?p=systemtap.git;a=commit;h=b171146c8e8d4fa749b8829c47750750dc19f11c

> >+       trace_mark(futex_wake_called, "uaddr:%p fshared:%p nr_wake:%d "
> > +                       "bitset:%d",
> > +                       uaddr, fshared, nr_wake, bitset);
> 
> > +       INIT_FUTEX_DEBUG_PROBE(futex_wake_called,
> > +                       "uaddr:%p fshared:%p nr_wake:%d bitset:%d"),
> 
> Why the need to duplicate it; that's utter madness.

This second instance is optional and is used as a consistency check
for the event consumer to hook up exactly to the intended producer.
The string could be empty.

- FChE

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-17 19:19             ` Frank Ch. Eigler
@ 2008-04-17 20:16               ` Mathieu Desnoyers
  2008-04-19 12:13                 ` K. Prasad
  2008-04-18  6:56               ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-17 20:16 UTC (permalink / raw)
  To: Frank Ch. Eigler
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, mingo, Christoph Hellwig

* Frank Ch. Eigler (fche@redhat.com) wrote:
[..]
> > >+       trace_mark(futex_wake_called, "uaddr:%p fshared:%p nr_wake:%d "
> > > +                       "bitset:%d",
> > > +                       uaddr, fshared, nr_wake, bitset);
> > 
> > > +       INIT_FUTEX_DEBUG_PROBE(futex_wake_called,
> > > +                       "uaddr:%p fshared:%p nr_wake:%d bitset:%d"),
> > 
> > Why the need to duplicate it; that's utter madness.
> 
> This second instance is optional and is used as a consistency check
> for the event consumer to hook up exactly to the intended producer.
> The string could be empty.
> 

empty -> NULL , yes :)

> 
> - FChE

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-16 15:51       ` Peter Zijlstra
@ 2008-04-17 22:02         ` Frank Ch. Eigler
  2008-04-18  6:46           ` Peter Zijlstra
  0 siblings, 1 reply; 46+ messages in thread
From: Frank Ch. Eigler @ 2008-04-17 22:02 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: prasad, linux-kernel, tglx, mingo, mathieu.desnoyers

Peter Zijlstra <a.p.zijlstra@chello.nl> writes:

> [...]
>> If we were to log just the futex_ops, just as you had suggested,
>> "Just log:
>>  
>>   futex: <uaddr> wait
>>   futex: <uaddr> wakeup"
>> [...]
>> If you can specifically point me to information you think would be
>> absolutely unnecessary, I can get them out of the trace_mark().
>
> I'm thinking everything is superflous; you're basically logging what
> strace already gives you

But we don't want to run strace just for this stuff.  As you probably
know, strace involves invasive user-space context-switching between
the target and the tracer.

> except worse by encoding local variable names and exposing kernel
> pointers.

The pointers are probably excessive, the and the names don't really
matter.  What does matter is providing enough information for a
problem diagnosis tool & person to reconstruct what the kernel must
have been thinking when it did something noteworthy.

- FChE

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-17 22:02         ` Frank Ch. Eigler
@ 2008-04-18  6:46           ` Peter Zijlstra
  2008-04-18 14:29             ` Frank Ch. Eigler
  2008-04-19 14:13             ` Mathieu Desnoyers
  0 siblings, 2 replies; 46+ messages in thread
From: Peter Zijlstra @ 2008-04-18  6:46 UTC (permalink / raw)
  To: Frank Ch. Eigler; +Cc: prasad, linux-kernel, tglx, mingo, mathieu.desnoyers

On Thu, 2008-04-17 at 18:02 -0400, Frank Ch. Eigler wrote:
> Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
> 
> > [...]
> >> If we were to log just the futex_ops, just as you had suggested,
> >> "Just log:
> >>  
> >>   futex: <uaddr> wait
> >>   futex: <uaddr> wakeup"
> >> [...]
> >> If you can specifically point me to information you think would be
> >> absolutely unnecessary, I can get them out of the trace_mark().
> >
> > I'm thinking everything is superflous; you're basically logging what
> > strace already gives you
> 
> But we don't want to run strace just for this stuff.  As you probably
> know, strace involves invasive user-space context-switching between
> the target and the tracer.
> 
> > except worse by encoding local variable names and exposing kernel
> > pointers.
> 
> The pointers are probably excessive, the and the names don't really
> matter.

Then what do we do when someone comes along and changes one of those
names; do we go around changing the markers and then requiring all tools
to change as well?

(And no this isn't far fetched; I'm thinking of changing fshared in the
near future).

Sounds like people will complain and generate back pressure against such
changes - something we should avoid. As soon as these markers place a
significant burden on code maintenance I'm against it.

>   What does matter is providing enough information for a
> problem diagnosis tool & person to reconstruct what the kernel must
> have been thinking when it did something noteworthy.

Sure, but then just make a strace like tracer and be done with it - no
need to pollute the futex code with that.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-17 19:19             ` Frank Ch. Eigler
  2008-04-17 20:16               ` Mathieu Desnoyers
@ 2008-04-18  6:56               ` Peter Zijlstra
  1 sibling, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2008-04-18  6:56 UTC (permalink / raw)
  To: Frank Ch. Eigler
  Cc: Mathieu Desnoyers, prasad, linux-kernel, tglx, mingo, Christoph Hellwig

On Thu, 2008-04-17 at 15:19 -0400, Frank Ch. Eigler wrote:
> Hi -
> 
> On Wed, Apr 16, 2008 at 05:51:24PM +0200, Peter Zijlstra wrote:
> > [...]
> > > > > > What is wrong with a few simple hooks like:
> > > > > >   trace_futex_wait(uaddr, fshares, val, abs_time, bitset);
> > > > > > and then deal with that.
> > [...]
> > > Yes, but then you would have to create new code for each event you want
> > > to trace. In the end, it would increase the icache footprint
> > > considerably and would also make addition of new events cumbersome.
> > > [...]
> 
> That, plus the new hand-written function (trace_futex_wait) would
> still need to manage the packaging of the arguments for consumption by
> separately compiled pieces.  It is desirable not to require such
> hand-written functions to *also* be declared in headers for these
> event consumers to compile against.

*blink* so all this is so you don't have to put a declarion in a header
file?

How about we put these premanent markers in a header - Mathieu says
there are <200. Surely that's not too much trouble.

Then you can keep this trace_mark() (perhaps trace_printf() is a better
name) around for the ad-hoc debug hacks.

> > So I'm not sure what adding all these character strings buy you.
> 
> The main thing is type checking by engaging gcc's printf format
> checking logic.  In my original markers proposal, the types were
> encoded into the function name, sort of as in C++:
> 
>   trace_mark_nnnnn(futex_wake_called, uaddr, fshares, val, abs_time, bitset);
> 
> where each "n" stands for some integral value, and could be chosen
> amongst a small number of other types (say -- "s": char* string, "p":
> void*, "l":64-bit long).  Then, type checking could be done by the
> core compiler for both event producers and consumers.  One downside
> was that the trace_mark_* permutations themselves would have to be
> generated by some shell/perl script [1], and some deemed this probably
> unacceptable.  I'm still not sure...
> 
> [1] some systemtap archaeology:
> http://sourceware.org/git/?p=systemtap.git;a=commit;h=b171146c8e8d4fa749b8829c47750750dc19f11c
> 
> 
> > >+       trace_mark(futex_wake_called, "uaddr:%p fshared:%p nr_wake:%d "
> > > +                       "bitset:%d",
> > > +                       uaddr, fshared, nr_wake, bitset);
> > 
> > > +       INIT_FUTEX_DEBUG_PROBE(futex_wake_called,
> > > +                       "uaddr:%p fshared:%p nr_wake:%d bitset:%d"),
> > 
> > Why the need to duplicate it; that's utter madness.
> 
> This second instance is optional and is used as a consistency check
> for the event consumer to hook up exactly to the intended producer.
> The string could be empty.

So instead of writing normal C code and placing a declarion in a header,
you've come up with a scheme that needs to duplicate a text string to
check integrity. Sounds like a real good way to confuse people.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-15 12:02   ` [RFC PATCH 1/2] Marker probes in futex.c Peter Zijlstra
  2008-04-15 12:32     ` Mathieu Desnoyers
  2008-04-15 15:52     ` K. Prasad
@ 2008-04-18 10:44     ` Andrew Morton
  2 siblings, 0 replies; 46+ messages in thread
From: Andrew Morton @ 2008-04-18 10:44 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: prasad, linux-kernel, tglx, mingo, mathieu.desnoyers

On Tue, 15 Apr 2008 14:02:22 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Tue, 2008-04-15 at 17:23 +0530, K. Prasad wrote:
> 
> > +	trace_mark(futex_wait_called, "uaddr:%p fshared:%p val:%u "
> > +			"abs_time:%p bitset:%d",
> > +			uaddr, fshared, val, abs_time, bitset);
> 
> This is some seriuosly ugly looking gunk, why would we want stuff like
> that scattered across the code?

My 2c: if we even begin to countenance merging stuff like that, it's time
for all the trace code to be removed.

<checks the email headers>

Nope, it wasn't April 1st.  Guys, get a grip.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-18  6:46           ` Peter Zijlstra
@ 2008-04-18 14:29             ` Frank Ch. Eigler
  2008-04-19 12:28               ` K. Prasad
  2008-04-19 14:52               ` Peter Zijlstra
  2008-04-19 14:13             ` Mathieu Desnoyers
  1 sibling, 2 replies; 46+ messages in thread
From: Frank Ch. Eigler @ 2008-04-18 14:29 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: prasad, linux-kernel, tglx, mingo, mathieu.desnoyers

Hi -

On Fri, Apr 18, 2008 at 08:46:44AM +0200, Peter Zijlstra wrote:
> [...]
> > > except worse by encoding local variable names and exposing kernel
> > > pointers.
> > 
> > The pointers are probably excessive, the and the names don't really
> > matter.
>
> Then what do we do when someone comes along and changes one of those
> names; do we go around changing the markers and then requiring all
> tools to change as well? (And no this isn't far fetched; I'm
> thinking of changing fshared in the near future).

At least two answers apply.  The markers being put in should be chosen
with the concurrence of the subsystem maintainer, who should help
identify those key quantities that are likely to be both long-lived
and good diagnostic value.  So that's the discussion we're having
right now: which values should be passed.  If they're long-lived, then
they can be given a long-lived name - and it doesn't have to be a
low-level C variable name.  (There's no reason why we can't also have
a slew of short-lived pure-debugging sorts of markers compiled in.  A
marker naming convention like "__futex..." can be adopted for such
purposes - where nothing is promised for version n+1, just hoping to
help diagnose problems in this version.)

The other answer is that we should ensure that tools do not assume
that the set of markers is fixed.  Let's never set such an
expectation.  (In systemtap, we have several abstraction and
version-adaptation facilities that aim to hide such changes.)
The kernel guy can choose at least two methods to help tools
without contraining himself too much.  He can change

        trace_mark(futex_foo, "p1 %d p2 %d", p1, p2);

to a pair:

        trace_mark(futex_foo, "p1 %d p2 %d", 0, p2); // backward compat.
        trace_mark(futex_foo2, "p2 %d p3 %d", p2, p3); // new marker

or even just drop the backward compatibility one altogether.

It will need judicious choices by the kernel and willingness by the
tools to keep up.  Those that don't will simply notice fewer events
coming in, but nothing important *breaking*.

The current crop of tools (lttng, systemtap) are both from friendly
groups who recognize that they have more of an expendable diagnostic
rather than operational role, and thus are willing to carry that
burden.  By the time new tools will show up, we will have more
experience with the degree and type of marker changes over time, and
they won't be in a position to place new constraints on the
establishment.

> Sounds like people will complain and generate back pressure against
> such changes - something we should avoid. As soon as these markers
> place a significant burden on code maintenance I'm against it.

Indeed.  This is why it's important for the subsystem maintainer to
wisely influence the markers as they go in.

> > That, plus the new hand-written function (trace_futex_wait) would
> > still need to manage the packaging of the arguments for consumption by
> > separately compiled pieces.  It is desirable not to require such
> > hand-written functions to *also* be declared in headers for these
> > event consumers to compile against.

> *blink* so all this is so you don't have to put a declarion in a
> header file? How about we put these premanent markers in a header -
> Mathieu says there are <200. Surely that's not too much trouble.
> [...]

It's not just that - it's a whole package including easy creation of
new markers, the code that manages their activation and deactivation,
the tool code that connects up to receive new events *and parameters*.
The current approach does not require tight compilation-level
coupling.  Indeed, for a new marker, the current approach requires
*no* code changes to anywhere other than the one-line inserted marker,
for tools like systemtap to connect and use them.  Cool eh?

- FChE

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-17 20:16               ` Mathieu Desnoyers
@ 2008-04-19 12:13                 ` K. Prasad
  2008-04-19 21:33                   ` Mathieu Desnoyers
  0 siblings, 1 reply; 46+ messages in thread
From: K. Prasad @ 2008-04-19 12:13 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Frank Ch. Eigler, Peter Zijlstra, prasad, linux-kernel, tglx,
	mingo, Christoph Hellwig

On Thu, Apr 17, 2008 at 04:16:44PM -0400, Mathieu Desnoyers wrote:
> * Frank Ch. Eigler (fche@redhat.com) wrote:
> [..]
> > > >+       trace_mark(futex_wake_called, "uaddr:%p fshared:%p nr_wake:%d "
> > > > +                       "bitset:%d",
> > > > +                       uaddr, fshared, nr_wake, bitset);
> > > 
> > > > +       INIT_FUTEX_DEBUG_PROBE(futex_wake_called,
> > > > +                       "uaddr:%p fshared:%p nr_wake:%d bitset:%d"),
> > > 
> > > Why the need to duplicate it; that's utter madness.
> > 
> > This second instance is optional and is used as a consistency check
> > for the event consumer to hook up exactly to the intended producer.
> > The string could be empty.
> > 
> 
> empty -> NULL , yes :)
Atleast until 2.6.25-rc8-mm2, I would expect the marker registration to
fail if the format strings don't match.
I find an early return in set_marker() in marker.c if the format
strings don't match.

Mathieu,
	Can you let me know if you would find the patch below - which
eliminates the need to pass format string to marker_probe_register()
function, acceptable?
I have done some elementary tests with the patch, but I know it would
fail if there are duplicate markers with different format strings.
However I presume that duplicate markers are meant to appear primarily
when markers are present in inlined functions (in which case their
format strings would be the same).

Thanks,
K.Prasad

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
---
 include/linux/marker.h          |    4 ++--
 kernel/marker.c                 |   40 +++++++++++++++++-----------------------
 samples/markers/probe-example.c |    3 ---
 3 files changed, 19 insertions(+), 28 deletions(-)

Index: linux-register_marker_patch/kernel/marker.c
===================================================================
--- linux-register_marker_patch.orig/kernel/marker.c
+++ linux-register_marker_patch/kernel/marker.c
@@ -364,7 +364,7 @@ static struct marker_entry *get_marker(c
  * Add the marker to the marker hash table. Must be called with markers_mutex
  * held.
  */
-static struct marker_entry *add_marker(const char *name, const char *format)
+static struct marker_entry *add_marker(const char *name)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
@@ -372,9 +372,15 @@ static struct marker_entry *add_marker(c
 	size_t name_len = strlen(name) + 1;
 	size_t format_len = 0;
 	u32 hash = jhash(name, name_len-1, 0);
+	struct marker *iter;
+
+	/* Search for the marker with 'name' in __markers section */
+	for (iter = __start___markers;
+		((iter < __stop___markers) && (strcmp(name, iter->name)));
+			iter++);
 
-	if (format)
-		format_len = strlen(format) + 1;
+	if (iter->format)
+		format_len = strlen(iter->format) + 1;
 	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
 	hlist_for_each_entry(e, node, head, hlist) {
 		if (!strcmp(name, e->name)) {
@@ -392,9 +398,9 @@ static struct marker_entry *add_marker(c
 	if (!e)
 		return ERR_PTR(-ENOMEM);
 	memcpy(&e->name[0], name, name_len);
-	if (format) {
+	if (iter->format) {
 		e->format = &e->name[name_len];
-		memcpy(e->format, format, format_len);
+		memcpy(e->format, iter->format, format_len);
 		if (strcmp(e->format, MARK_NOARGS) == 0)
 			e->call = marker_probe_cb_noarg;
 		else
@@ -494,21 +500,9 @@ static int set_marker(struct marker_entr
 	int ret;
 	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
 
-	if ((*entry)->format) {
-		if (strcmp((*entry)->format, elem->format) != 0) {
-			printk(KERN_NOTICE
-				"Format mismatch for probe %s "
-				"(%s), marker (%s)\n",
-				(*entry)->name,
-				(*entry)->format,
-				elem->format);
-			return -EPERM;
-		}
-	} else {
-		ret = marker_set_format(entry, elem->format);
-		if (ret)
-			return ret;
-	}
+	ret = marker_set_format(entry, elem->format);
+	if (ret)
+		return ret;
 
 	/*
 	 * probe_cb setup (statically known) is done here. It is
@@ -638,8 +632,8 @@ static void marker_update_probes(void)
  * Returns 0 if ok, error value on error.
  * The probe address must at least be aligned on the architecture pointer size.
  */
-int marker_probe_register(const char *name, const char *format,
-			marker_probe_func *probe, void *probe_private)
+int marker_probe_register(const char *name, marker_probe_func *probe,
+							void *probe_private)
 {
 	struct marker_entry *entry;
 	int ret = 0;
@@ -648,7 +642,7 @@ int marker_probe_register(const char *na
 	mutex_lock(&markers_mutex);
 	entry = get_marker(name);
 	if (!entry) {
-		entry = add_marker(name, format);
+		entry = add_marker(name);
 		if (IS_ERR(entry)) {
 			ret = PTR_ERR(entry);
 			goto end;
Index: linux-register_marker_patch/include/linux/marker.h
===================================================================
--- linux-register_marker_patch.orig/include/linux/marker.h
+++ linux-register_marker_patch/include/linux/marker.h
@@ -119,8 +119,8 @@ extern void marker_probe_cb_noarg(const 
  * Connect a probe to a marker.
  * private data pointer must be a valid allocated memory address, or NULL.
  */
-extern int marker_probe_register(const char *name, const char *format,
-				marker_probe_func *probe, void *probe_private);
+extern int marker_probe_register(const char *name, marker_probe_func *probe,
+				void *probe_private);
 
 /*
  * Returns the private data given to marker_probe_register.
Index: linux-register_marker_patch/samples/markers/probe-example.c
===================================================================
--- linux-register_marker_patch.orig/samples/markers/probe-example.c
+++ linux-register_marker_patch/samples/markers/probe-example.c
@@ -49,10 +49,8 @@ void probe_subsystem_eventb(void *probe_
 static struct probe_data probe_array[] =
 {
 	{	.name = "subsystem_event",
-		.format = "integer %d string %s",
 		.probe_func = probe_subsystem_event },
 	{	.name = "subsystem_eventb",
-		.format = MARK_NOARGS,
 		.probe_func = probe_subsystem_eventb },
 };
 
@@ -63,7 +61,6 @@ static int __init probe_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
 		result = marker_probe_register(probe_array[i].name,
-				probe_array[i].format,
 				probe_array[i].probe_func, &probe_array[i]);
 		if (result)
 			printk(KERN_INFO "Unable to register probe %s\n",

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-18 14:29             ` Frank Ch. Eigler
@ 2008-04-19 12:28               ` K. Prasad
  2008-04-19 14:52               ` Peter Zijlstra
  1 sibling, 0 replies; 46+ messages in thread
From: K. Prasad @ 2008-04-19 12:28 UTC (permalink / raw)
  To: a.p.zijlstra; +Cc: fche, prasad, linux-kernel, tglx, mingo, mathieu.desnoyers

On Fri, Apr 18, 2008 at 10:29:52AM -0400, Frank Ch. Eigler wrote:
> Hi -
> 
> On Fri, Apr 18, 2008 at 08:46:44AM +0200, Peter Zijlstra wrote:
> > [...]
> > > > except worse by encoding local variable names and exposing kernel
> > > > pointers.
> > > 
> > > The pointers are probably excessive, the and the names don't really
> > > matter.
> >
> > Then what do we do when someone comes along and changes one of those
> > names; do we go around changing the markers and then requiring all
> > tools to change as well? (And no this isn't far fetched; I'm
> > thinking of changing fshared in the near future).
> 
> At least two answers apply.  The markers being put in should be chosen
> with the concurrence of the subsystem maintainer, who should help
> identify those key quantities that are likely to be both long-lived
> and good diagnostic value.  So that's the discussion we're having
> right now: which values should be passed.  If they're long-lived, then
> they can be given a long-lived name - and it doesn't have to be a
> low-level C variable name.  (There's no reason why we can't also have
> a slew of short-lived pure-debugging sorts of markers compiled in.  A
> marker naming convention like "__futex..." can be adopted for such
> purposes - where nothing is promised for version n+1, just hoping to
> help diagnose problems in this version.)
> 
> The other answer is that we should ensure that tools do not assume
> that the set of markers is fixed.  Let's never set such an
> expectation.  (In systemtap, we have several abstraction and
> version-adaptation facilities that aim to hide such changes.)
> The kernel guy can choose at least two methods to help tools
> without contraining himself too much.  He can change
> 
>         trace_mark(futex_foo, "p1 %d p2 %d", p1, p2);
> 
> to a pair:
> 
>         trace_mark(futex_foo, "p1 %d p2 %d", 0, p2); // backward compat.
>         trace_mark(futex_foo2, "p2 %d p3 %d", p2, p3); // new marker
> 
> or even just drop the backward compatibility one altogether.
> 
> It will need judicious choices by the kernel and willingness by the
> tools to keep up.  Those that don't will simply notice fewer events
> coming in, but nothing important *breaking*.
> 
> The current crop of tools (lttng, systemtap) are both from friendly
> groups who recognize that they have more of an expendable diagnostic
> rather than operational role, and thus are willing to carry that
> burden.  By the time new tools will show up, we will have more
> experience with the degree and type of marker changes over time, and
> they won't be in a position to place new constraints on the
> establishment.
> 
> 
> > Sounds like people will complain and generate back pressure against
> > such changes - something we should avoid. As soon as these markers
> > place a significant burden on code maintenance I'm against it.
> 
> Indeed.  This is why it's important for the subsystem maintainer to
> wisely influence the markers as they go in.
> 
> 
> > > That, plus the new hand-written function (trace_futex_wait) would
> > > still need to manage the packaging of the arguments for consumption by
> > > separately compiled pieces.  It is desirable not to require such
> > > hand-written functions to *also* be declared in headers for these
> > > event consumers to compile against.
> 
> > *blink* so all this is so you don't have to put a declarion in a
> > header file? How about we put these premanent markers in a header -
> > Mathieu says there are <200. Surely that's not too much trouble.
> > [...]
> 
> It's not just that - it's a whole package including easy creation of
> new markers, the code that manages their activation and deactivation,
> the tool code that connects up to receive new events *and parameters*.
> The current approach does not require tight compilation-level
> coupling.  Indeed, for a new marker, the current approach requires
> *no* code changes to anywhere other than the one-line inserted marker,
> for tools like systemtap to connect and use them.  Cool eh?
> 
> 
> - FChE

Hi Peter,
	Adding to what Frank and others have said before, it would be
nice to have you point out which markers or parameters exported by
them are superfluous and I'm willing to have them removed from my patch.

I've just sent out another patch over marker infrastructure to Mathieu
which will eliminate the need to re-send the format string during
initialisation and if accepted, the patch should take care of any
concerns about duplication of code.

Indeed any markers in futex.c that you think would be absolutely
unnecessary can be removed on a case-by-case basis, but do let us know
them.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-18  6:46           ` Peter Zijlstra
  2008-04-18 14:29             ` Frank Ch. Eigler
@ 2008-04-19 14:13             ` Mathieu Desnoyers
  2008-04-19 14:56               ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-19 14:13 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Frank Ch. Eigler, prasad, linux-kernel, tglx, mingo

* Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> On Thu, 2008-04-17 at 18:02 -0400, Frank Ch. Eigler wrote:
> > Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
> > 
> > > [...]
> > >> If we were to log just the futex_ops, just as you had suggested,
> > >> "Just log:
> > >>  
> > >>   futex: <uaddr> wait
> > >>   futex: <uaddr> wakeup"
> > >> [...]
> > >> If you can specifically point me to information you think would be
> > >> absolutely unnecessary, I can get them out of the trace_mark().
> > >
> > > I'm thinking everything is superflous; you're basically logging what
> > > strace already gives you
> > 
> > But we don't want to run strace just for this stuff.  As you probably
> > know, strace involves invasive user-space context-switching between
> > the target and the tracer.
> > 
> > > except worse by encoding local variable names and exposing kernel
> > > pointers.
> > 
> > The pointers are probably excessive, the and the names don't really
> > matter.
> 
> Then what do we do when someone comes along and changes one of those
> names; do we go around changing the markers and then requiring all tools
> to change as well?
> 

We should really think about what we are doing before we add a marker in
the kernel code. The information extracted should be both useful and
expected not to change too much between versions. Ideally,
implementation details should not be exported. Exporting useless
information "just because we can" would indeed put pressure on
maintainers. That's where I expect them to be the best persons to tell
what is an implementation detail likely to change, and what is a more
"conceptually stable" information. e.g. a context switch is a context
switch, this does not change with the underlying implementation.

I think that whenever we can add a more "generic" marker which solves
many special cases, we should do so. In this case, using the system call
instrumentation found in my architecture specific instrumentation
patchset would comprehend futex instrumentation. By adding extraction of
all system call parameters, things such as futexes should be covered.
However, we would still need to instrument read() or exec() to extract
the file names. Otherwise, we would have to start doing
architecture-specific code which would "know" what arguments are passed
to each system call. I guess we could do that if it lessens
instrumentation intrusiveness, but we would have to deal with a system
call tracing infrastructure tied closely to system call parameters.
System call audit code seems to already do that, so I guess we could go
that way.

Then, I think we should turn to inner-kernel instrumentation only when
the information extracted from the stable kernel ABI (e.g. system calls)
is not complete enough to understand how things work. That would be the
case for block I/O tracing for instance.

Mathieu

> (And no this isn't far fetched; I'm thinking of changing fshared in the
> near future).
> 
> Sounds like people will complain and generate back pressure against such
> changes - something we should avoid. As soon as these markers place a
> significant burden on code maintenance I'm against it.
> 
> >   What does matter is providing enough information for a
> > problem diagnosis tool & person to reconstruct what the kernel must
> > have been thinking when it did something noteworthy.
> 
> Sure, but then just make a strace like tracer and be done with it - no
> need to pollute the futex code with that.
> 

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-18 14:29             ` Frank Ch. Eigler
  2008-04-19 12:28               ` K. Prasad
@ 2008-04-19 14:52               ` Peter Zijlstra
  2008-04-19 18:03                 ` Frank Ch. Eigler
  1 sibling, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2008-04-19 14:52 UTC (permalink / raw)
  To: Frank Ch. Eigler; +Cc: prasad, linux-kernel, tglx, mingo, mathieu.desnoyers

On Fri, 2008-04-18 at 10:29 -0400, Frank Ch. Eigler wrote:

> > > That, plus the new hand-written function (trace_futex_wait) would
> > > still need to manage the packaging of the arguments for consumption by
> > > separately compiled pieces.  It is desirable not to require such
> > > hand-written functions to *also* be declared in headers for these
> > > event consumers to compile against.
> 
> > *blink* so all this is so you don't have to put a declarion in a
> > header file? How about we put these premanent markers in a header -
> > Mathieu says there are <200. Surely that's not too much trouble.
> > [...]
> 
> It's not just that - it's a whole package including easy creation of
> new markers, the code that manages their activation and deactivation,
> the tool code that connects up to receive new events *and parameters*.
> The current approach does not require tight compilation-level
> coupling.  Indeed, for a new marker, the current approach requires
> *no* code changes to anywhere other than the one-line inserted marker,
> for tools like systemtap to connect and use them.  Cool eh?


I'm thinking the two use-cases are confused here. So we have

 a) permanent markers
 b) ad-hoc debug markers

I'm thinking that for the first class the compilation level coupling is
no problem at all. And for the second class it doesn't matter how ugly
they are as long as it works on the spot.

So I'm arguing these two should be separated.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-19 14:13             ` Mathieu Desnoyers
@ 2008-04-19 14:56               ` Peter Zijlstra
  0 siblings, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2008-04-19 14:56 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Frank Ch. Eigler, prasad, linux-kernel, tglx, mingo

On Sat, 2008-04-19 at 10:13 -0400, Mathieu Desnoyers wrote:
> * Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> > On Thu, 2008-04-17 at 18:02 -0400, Frank Ch. Eigler wrote:
> > > Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
> > > 
> > > > [...]
> > > >> If we were to log just the futex_ops, just as you had suggested,
> > > >> "Just log:
> > > >>  
> > > >>   futex: <uaddr> wait
> > > >>   futex: <uaddr> wakeup"
> > > >> [...]
> > > >> If you can specifically point me to information you think would be
> > > >> absolutely unnecessary, I can get them out of the trace_mark().
> > > >
> > > > I'm thinking everything is superflous; you're basically logging what
> > > > strace already gives you
> > > 
> > > But we don't want to run strace just for this stuff.  As you probably
> > > know, strace involves invasive user-space context-switching between
> > > the target and the tracer.
> > > 
> > > > except worse by encoding local variable names and exposing kernel
> > > > pointers.
> > > 
> > > The pointers are probably excessive, the and the names don't really
> > > matter.
> > 
> > Then what do we do when someone comes along and changes one of those
> > names; do we go around changing the markers and then requiring all tools
> > to change as well?
> > 
> 
> We should really think about what we are doing before we add a marker in
> the kernel code. The information extracted should be both useful and
> expected not to change too much between versions. Ideally,
> implementation details should not be exported. Exporting useless
> information "just because we can" would indeed put pressure on
> maintainers. That's where I expect them to be the best persons to tell
> what is an implementation detail likely to change, and what is a more
> "conceptually stable" information. e.g. a context switch is a context
> switch, this does not change with the underlying implementation.
> 
> I think that whenever we can add a more "generic" marker which solves
> many special cases, we should do so. In this case, using the system call
> instrumentation found in my architecture specific instrumentation
> patchset would comprehend futex instrumentation. By adding extraction of
> all system call parameters, things such as futexes should be covered.
> However, we would still need to instrument read() or exec() to extract
> the file names. Otherwise, we would have to start doing
> architecture-specific code which would "know" what arguments are passed
> to each system call. I guess we could do that if it lessens
> instrumentation intrusiveness, but we would have to deal with a system
> call tracing infrastructure tied closely to system call parameters.
> System call audit code seems to already do that, so I guess we could go
> that way.
> 
> Then, I think we should turn to inner-kernel instrumentation only when
> the information extracted from the stable kernel ABI (e.g. system calls)
> is not complete enough to understand how things work. That would be the
> case for block I/O tracing for instance.

Agreed - so this futex instrumentation will not go anywhere. Prasad
could perhaps help out with your arch specific syscall tracer.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-19 14:52               ` Peter Zijlstra
@ 2008-04-19 18:03                 ` Frank Ch. Eigler
  2008-04-19 18:29                   ` Peter Zijlstra
  0 siblings, 1 reply; 46+ messages in thread
From: Frank Ch. Eigler @ 2008-04-19 18:03 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: prasad, linux-kernel, tglx, mingo, mathieu.desnoyers

Hi -

On Sat, Apr 19, 2008 at 04:52:26PM +0200, Peter Zijlstra wrote:
> > It's not just that - it's a whole package including easy creation of
> > new markers, the code that manages their activation and deactivation,
> > the tool code that connects up to receive new events *and parameters*.
> > [...]
> I'm thinking the two use-cases are confused here. So we have
> 
>  a) permanent markers
>  b) ad-hoc debug markers

OTOH I think this is a false dichotomy.  Debugging is not only done by
a subsystem maintainer during the merge/rc period.  When something
goes wrong on a deployed machine, problem diagnosis requires data,
which ideally should be gathered as non-intrusively as possible - that
means no recompiling / rebooting, and ideally very little slowdown.

I bet that many of those markers you might consider "ad-hoc" would in
fact have some post-release diagnostic value.  Now, maybe in the case
of futexes, the kernel makes no sophisticated decisions that may need
subsequent review.  Maybe all the internals are directly calculable
from the results visible at the system call level (which threads block
and which return).  But this is certainly not so for many other parts
of the kernel.

With markers, you don't have to make this an agonizing decision.  You
just insert markers, regardless of whether it's high-level, universal,
"permanent"; or whether it's low-level, diagnostic, temporary.  The
same consumer tools will work for them all.

- FChE

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-19 18:03                 ` Frank Ch. Eigler
@ 2008-04-19 18:29                   ` Peter Zijlstra
  2008-04-19 18:48                     ` Frank Ch. Eigler
  0 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2008-04-19 18:29 UTC (permalink / raw)
  To: Frank Ch. Eigler; +Cc: prasad, linux-kernel, tglx, mingo, mathieu.desnoyers

On Sat, 2008-04-19 at 14:03 -0400, Frank Ch. Eigler wrote:
> Hi -
> 
> On Sat, Apr 19, 2008 at 04:52:26PM +0200, Peter Zijlstra wrote:
> > > It's not just that - it's a whole package including easy creation of
> > > new markers, the code that manages their activation and deactivation,
> > > the tool code that connects up to receive new events *and parameters*.
> > > [...]
> > I'm thinking the two use-cases are confused here. So we have
> > 
> >  a) permanent markers
> >  b) ad-hoc debug markers
> 
> OTOH I think this is a false dichotomy.  Debugging is not only done by
> a subsystem maintainer during the merge/rc period.  When something
> goes wrong on a deployed machine, problem diagnosis requires data,
> which ideally should be gathered as non-intrusively as possible - that
> means no recompiling / rebooting, and ideally very little slowdown.
> 
> I bet that many of those markers you might consider "ad-hoc" would in
> fact have some post-release diagnostic value.  Now, maybe in the case
> of futexes, the kernel makes no sophisticated decisions that may need
> subsequent review.  Maybe all the internals are directly calculable
> from the results visible at the system call level (which threads block
> and which return).  But this is certainly not so for many other parts
> of the kernel.
> 
> With markers, you don't have to make this an agonizing decision.  You
> just insert markers, regardless of whether it's high-level, universal,
> "permanent"; or whether it's low-level, diagnostic, temporary.  The
> same consumer tools will work for them all.

This again tries into the argument about not making markers depend on
the code structure or implementation details.

I'm really wanting to avoid ever having to be obstructed by a marker. So
any marker that does not represent a solid high level event (to take
Mathieu's example: a context switch is a context switch, and we'll
always have one) I'm not comfortable with merging that upstream.

So even though these ad-hoc markers might have some diagnostic value -
I'll never support merging them. If a customer might have some issue I
can hand him a custom kernel with these markers added in - I see
absolutely no reason to burden upstream with these.

So we _do_ have to make this decision; some shite should just not be
merged but be kept separate.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-19 18:29                   ` Peter Zijlstra
@ 2008-04-19 18:48                     ` Frank Ch. Eigler
  2008-04-22 17:50                       ` Nicholas Miell
  0 siblings, 1 reply; 46+ messages in thread
From: Frank Ch. Eigler @ 2008-04-19 18:48 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: prasad, linux-kernel, tglx, mingo, mathieu.desnoyers

Hi -

On Sat, Apr 19, 2008 at 08:29:27PM +0200, Peter Zijlstra wrote:
> [...]
> > OTOH I think this is a false dichotomy.  Debugging is not only done by
> > a subsystem maintainer during the merge/rc period.  When something
> > goes wrong on a deployed machine, problem diagnosis requires data,
> > which ideally should be gathered as non-intrusively as possible - that
> > means no recompiling / rebooting, and ideally very little slowdown.
> > [...]

> This again tries into the argument about not making markers depend on
> the code structure or implementation details.

It should not *unnecessarily* depend on those.

> I'm really wanting to avoid ever having to be obstructed by a marker. So
> any marker that does not represent a solid high level event (to take
> Mathieu's example: a context switch is a context switch, and we'll
> always have one) I'm not comfortable with merging that upstream.

It is your prerogative as a subsystem maintainer to make a guess about
this.  Others may make their own decisions differently, considering
the small costs and potential benefits.

> So even though these ad-hoc markers might have some diagnostic value
> - I'll never support merging them. If a customer might have some
> issue I can hand him a custom kernel with these markers added in - I
> see absolutely no reason to burden upstream with these.

Perhaps the kinds of bugs in your code, coupled with the kinds of
customers who experience those bugs, make tolerable this means of
diagnosis (requiring a reboot of their machines into a custom
debugging kernel).  The customers I have dealt with (and frankly, I
too) need to diagnose problems on a live running system as much as
possible.  They don't run every kernel du jour, so they don't need the
purely hypothetical tools that are dependent on a permanent set of
markers.

- FChE

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-19 12:13                 ` K. Prasad
@ 2008-04-19 21:33                   ` Mathieu Desnoyers
  0 siblings, 0 replies; 46+ messages in thread
From: Mathieu Desnoyers @ 2008-04-19 21:33 UTC (permalink / raw)
  To: K. Prasad
  Cc: Frank Ch. Eigler, Peter Zijlstra, linux-kernel, tglx, mingo,
	Christoph Hellwig

* K. Prasad (prasad@linux.vnet.ibm.com) wrote:
> On Thu, Apr 17, 2008 at 04:16:44PM -0400, Mathieu Desnoyers wrote:
> > * Frank Ch. Eigler (fche@redhat.com) wrote:
> > [..]
> > > > >+       trace_mark(futex_wake_called, "uaddr:%p fshared:%p nr_wake:%d "
> > > > > +                       "bitset:%d",
> > > > > +                       uaddr, fshared, nr_wake, bitset);
> > > > 
> > > > > +       INIT_FUTEX_DEBUG_PROBE(futex_wake_called,
> > > > > +                       "uaddr:%p fshared:%p nr_wake:%d bitset:%d"),
> > > > 
> > > > Why the need to duplicate it; that's utter madness.
> > > 
> > > This second instance is optional and is used as a consistency check
> > > for the event consumer to hook up exactly to the intended producer.
> > > The string could be empty.
> > > 
> > 
> > empty -> NULL , yes :)
> Atleast until 2.6.25-rc8-mm2, I would expect the marker registration to
> fail if the format strings don't match.
> I find an early return in set_marker() in marker.c if the format
> strings don't match.
> 
> Mathieu,
> 	Can you let me know if you would find the patch below - which
> eliminates the need to pass format string to marker_probe_register()
> function, acceptable?
> I have done some elementary tests with the patch, but I know it would
> fail if there are duplicate markers with different format strings.
> However I presume that duplicate markers are meant to appear primarily
> when markers are present in inlined functions (in which case their
> format strings would be the same).
> 

Sorry, I will have to NACK this one. A probe which registers on a marker
has two choices : either is passes a NULL format string, which tells the
marker infrastructure that th probe will dynamically parse the format
string to grab the arguments, like printk.

However, if the probe is expecting a particular layout of arguments, It
must tell so upon registration, so the marker infrastructure can check
if the marker it's trying to connect to still ave the same arguments.
That will make sure specialized probes won't break between kernel
version changes.

Mathieu

> Thanks,
> K.Prasad
> 
> Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
> ---
>  include/linux/marker.h          |    4 ++--
>  kernel/marker.c                 |   40 +++++++++++++++++-----------------------
>  samples/markers/probe-example.c |    3 ---
>  3 files changed, 19 insertions(+), 28 deletions(-)
> 
> Index: linux-register_marker_patch/kernel/marker.c
> ===================================================================
> --- linux-register_marker_patch.orig/kernel/marker.c
> +++ linux-register_marker_patch/kernel/marker.c
> @@ -364,7 +364,7 @@ static struct marker_entry *get_marker(c
>   * Add the marker to the marker hash table. Must be called with markers_mutex
>   * held.
>   */
> -static struct marker_entry *add_marker(const char *name, const char *format)
> +static struct marker_entry *add_marker(const char *name)
>  {
>  	struct hlist_head *head;
>  	struct hlist_node *node;
> @@ -372,9 +372,15 @@ static struct marker_entry *add_marker(c
>  	size_t name_len = strlen(name) + 1;
>  	size_t format_len = 0;
>  	u32 hash = jhash(name, name_len-1, 0);
> +	struct marker *iter;
> +
> +	/* Search for the marker with 'name' in __markers section */
> +	for (iter = __start___markers;
> +		((iter < __stop___markers) && (strcmp(name, iter->name)));
> +			iter++);
>  
> -	if (format)
> -		format_len = strlen(format) + 1;
> +	if (iter->format)
> +		format_len = strlen(iter->format) + 1;
>  	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
>  	hlist_for_each_entry(e, node, head, hlist) {
>  		if (!strcmp(name, e->name)) {
> @@ -392,9 +398,9 @@ static struct marker_entry *add_marker(c
>  	if (!e)
>  		return ERR_PTR(-ENOMEM);
>  	memcpy(&e->name[0], name, name_len);
> -	if (format) {
> +	if (iter->format) {
>  		e->format = &e->name[name_len];
> -		memcpy(e->format, format, format_len);
> +		memcpy(e->format, iter->format, format_len);
>  		if (strcmp(e->format, MARK_NOARGS) == 0)
>  			e->call = marker_probe_cb_noarg;
>  		else
> @@ -494,21 +500,9 @@ static int set_marker(struct marker_entr
>  	int ret;
>  	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
>  
> -	if ((*entry)->format) {
> -		if (strcmp((*entry)->format, elem->format) != 0) {
> -			printk(KERN_NOTICE
> -				"Format mismatch for probe %s "
> -				"(%s), marker (%s)\n",
> -				(*entry)->name,
> -				(*entry)->format,
> -				elem->format);
> -			return -EPERM;
> -		}
> -	} else {
> -		ret = marker_set_format(entry, elem->format);
> -		if (ret)
> -			return ret;
> -	}
> +	ret = marker_set_format(entry, elem->format);
> +	if (ret)
> +		return ret;
>  
>  	/*
>  	 * probe_cb setup (statically known) is done here. It is
> @@ -638,8 +632,8 @@ static void marker_update_probes(void)
>   * Returns 0 if ok, error value on error.
>   * The probe address must at least be aligned on the architecture pointer size.
>   */
> -int marker_probe_register(const char *name, const char *format,
> -			marker_probe_func *probe, void *probe_private)
> +int marker_probe_register(const char *name, marker_probe_func *probe,
> +							void *probe_private)
>  {
>  	struct marker_entry *entry;
>  	int ret = 0;
> @@ -648,7 +642,7 @@ int marker_probe_register(const char *na
>  	mutex_lock(&markers_mutex);
>  	entry = get_marker(name);
>  	if (!entry) {
> -		entry = add_marker(name, format);
> +		entry = add_marker(name);
>  		if (IS_ERR(entry)) {
>  			ret = PTR_ERR(entry);
>  			goto end;
> Index: linux-register_marker_patch/include/linux/marker.h
> ===================================================================
> --- linux-register_marker_patch.orig/include/linux/marker.h
> +++ linux-register_marker_patch/include/linux/marker.h
> @@ -119,8 +119,8 @@ extern void marker_probe_cb_noarg(const 
>   * Connect a probe to a marker.
>   * private data pointer must be a valid allocated memory address, or NULL.
>   */
> -extern int marker_probe_register(const char *name, const char *format,
> -				marker_probe_func *probe, void *probe_private);
> +extern int marker_probe_register(const char *name, marker_probe_func *probe,
> +				void *probe_private);
>  
>  /*
>   * Returns the private data given to marker_probe_register.
> Index: linux-register_marker_patch/samples/markers/probe-example.c
> ===================================================================
> --- linux-register_marker_patch.orig/samples/markers/probe-example.c
> +++ linux-register_marker_patch/samples/markers/probe-example.c
> @@ -49,10 +49,8 @@ void probe_subsystem_eventb(void *probe_
>  static struct probe_data probe_array[] =
>  {
>  	{	.name = "subsystem_event",
> -		.format = "integer %d string %s",
>  		.probe_func = probe_subsystem_event },
>  	{	.name = "subsystem_eventb",
> -		.format = MARK_NOARGS,
>  		.probe_func = probe_subsystem_eventb },
>  };
>  
> @@ -63,7 +61,6 @@ static int __init probe_init(void)
>  
>  	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
>  		result = marker_probe_register(probe_array[i].name,
> -				probe_array[i].format,
>  				probe_array[i].probe_func, &probe_array[i]);
>  		if (result)
>  			printk(KERN_INFO "Unable to register probe %s\n",

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC PATCH 1/2] Marker probes in futex.c
  2008-04-19 18:48                     ` Frank Ch. Eigler
@ 2008-04-22 17:50                       ` Nicholas Miell
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Miell @ 2008-04-22 17:50 UTC (permalink / raw)
  To: Frank Ch. Eigler
  Cc: Peter Zijlstra, prasad, linux-kernel, tglx, mingo, mathieu.desnoyers


On Sat, 2008-04-19 at 14:48 -0400, Frank Ch. Eigler wrote:
> Hi -
> 
> On Sat, Apr 19, 2008 at 08:29:27PM +0200, Peter Zijlstra wrote:
> > [...]
> > > OTOH I think this is a false dichotomy.  Debugging is not only done by
> > > a subsystem maintainer during the merge/rc period.  When something
> > > goes wrong on a deployed machine, problem diagnosis requires data,
> > > which ideally should be gathered as non-intrusively as possible - that
> > > means no recompiling / rebooting, and ideally very little slowdown.
> > > [...]
> 
> > This again tries into the argument about not making markers depend on
> > the code structure or implementation details.
> 
> It should not *unnecessarily* depend on those.
> 
> > I'm really wanting to avoid ever having to be obstructed by a marker. So
> > any marker that does not represent a solid high level event (to take
> > Mathieu's example: a context switch is a context switch, and we'll
> > always have one) I'm not comfortable with merging that upstream.
> 
> It is your prerogative as a subsystem maintainer to make a guess about
> this.  Others may make their own decisions differently, considering
> the small costs and potential benefits.
> 
> > So even though these ad-hoc markers might have some diagnostic value
> > - I'll never support merging them. If a customer might have some
> > issue I can hand him a custom kernel with these markers added in - I
> > see absolutely no reason to burden upstream with these.
> 
> Perhaps the kinds of bugs in your code, coupled with the kinds of
> customers who experience those bugs, make tolerable this means of
> diagnosis (requiring a reboot of their machines into a custom
> debugging kernel).  The customers I have dealt with (and frankly, I
> too) need to diagnose problems on a live running system as much as
> possible.  They don't run every kernel du jour, so they don't need the
> purely hypothetical tools that are dependent on a permanent set of
> markers.
>  

Sun solved this problem for DTrace by applying their stability
attributes to all probe points -- i.e. the probes that represent an ABI
are marked as such and the temporary debugging probes are marked as
unstable.

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2008-04-22 17:50 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-15 11:50 [RFC PATCH 0/2] Debugging infrastructure for Futexes using Markers K. Prasad
2008-04-15 11:53 ` [RFC PATCH 1/2] Marker probes in futex.c K. Prasad
2008-04-15 11:55   ` [RFC PATCH 2/2] Marker handler for the probes in futex file K. Prasad
2008-04-15 12:02   ` [RFC PATCH 1/2] Marker probes in futex.c Peter Zijlstra
2008-04-15 12:32     ` Mathieu Desnoyers
2008-04-15 12:50       ` Alexey Dobriyan
2008-04-15 16:13         ` K. Prasad
2008-04-15 12:56       ` Peter Zijlstra
2008-04-15 13:17         ` Ingo Molnar
2008-04-15 13:47           ` Mathieu Desnoyers
2008-04-15 14:04             ` Ingo Molnar
2008-04-15 14:21               ` Frank Ch. Eigler
2008-04-15 14:39               ` Mathieu Desnoyers
2008-04-15 16:48             ` Ingo Molnar
2008-04-15 21:38               ` Mathieu Desnoyers
2008-04-16 13:17                 ` Ingo Molnar
2008-04-16 13:46                   ` Arjan van de Ven
2008-04-16 14:00                     ` Mathieu Desnoyers
2008-04-16 14:24                       ` Arjan van de Ven
2008-04-16 14:29                         ` Ingo Molnar
2008-04-16 14:48                         ` Mathieu Desnoyers
2008-04-16 15:32                           ` Arjan van de Ven
2008-04-16 15:39                             ` Mathieu Desnoyers
2008-04-16 20:10                   ` text_poke, vmap and vmalloc on x86_64 Mathieu Desnoyers
2008-04-16 21:22                     ` Mathieu Desnoyers
2008-04-15 13:25         ` [RFC PATCH 1/2] Marker probes in futex.c Mathieu Desnoyers
2008-04-16 15:51           ` Peter Zijlstra
2008-04-17 19:19             ` Frank Ch. Eigler
2008-04-17 20:16               ` Mathieu Desnoyers
2008-04-19 12:13                 ` K. Prasad
2008-04-19 21:33                   ` Mathieu Desnoyers
2008-04-18  6:56               ` Peter Zijlstra
2008-04-15 15:52     ` K. Prasad
2008-04-16 15:51       ` Peter Zijlstra
2008-04-17 22:02         ` Frank Ch. Eigler
2008-04-18  6:46           ` Peter Zijlstra
2008-04-18 14:29             ` Frank Ch. Eigler
2008-04-19 12:28               ` K. Prasad
2008-04-19 14:52               ` Peter Zijlstra
2008-04-19 18:03                 ` Frank Ch. Eigler
2008-04-19 18:29                   ` Peter Zijlstra
2008-04-19 18:48                     ` Frank Ch. Eigler
2008-04-22 17:50                       ` Nicholas Miell
2008-04-19 14:13             ` Mathieu Desnoyers
2008-04-19 14:56               ` Peter Zijlstra
2008-04-18 10:44     ` Andrew Morton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).