All of lore.kernel.org
 help / color / mirror / Atom feed
* [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
       [not found] <20090319234044.410725944@K.Prasad>
@ 2009-03-19 23:48 ` K.Prasad
  2009-03-20 14:33   ` Alan Stern
  2009-03-19 23:48 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 1 --]
[-- Type: text/plain, Size: 16776 bytes --]

This patch introduces two new files hw_breakpoint.[ch] which defines the 
generic interfaces to use hardware breakpoint infrastructure of the system. 

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/Kconfig                        |    3 
 include/asm-generic/hw_breakpoint.h |  140 +++++++++++++
 kernel/Makefile                     |    1 
 kernel/hw_breakpoint.c              |  361 ++++++++++++++++++++++++++++++++++++
 4 files changed, 505 insertions(+)

Index: linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
@@ -0,0 +1,361 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ *
+ * This file contains the arch-independent routines.  It is not meant
+ * to be compiled as a standalone source file; rather it should be
+ * #include'd by the arch-specific implementation.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Array of kernel-space breakpoint structures */
+struct hw_breakpoint *hbkpt_kernel[HB_NUM];
+/*
+ * Kernel breakpoints grow downwards, starting from HB_NUM
+ * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
+ * kernel-space request
+ */
+unsigned int hbkpt_kernel_pos;
+
+/* An array containing refcount of threads using a given bkpt register */
+unsigned int hbkpt_user_max_refcount[HB_NUM];
+
+/* One higher than the highest counted user-space breakpoint register */
+unsigned int hbkpt_user_max;
+
+struct task_struct *last_debugged_task;
+
+/*
+ * Install the debug register values for a new thread.
+ */
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	/* Set the debug register */
+	arch_install_thread_hbkpt(tsk);
+	last_debugged_task = current;
+
+	put_cpu_no_resched();
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void switch_to_none_hw_breakpoint(void)
+{
+	arch_install_none();
+	put_cpu_no_resched();
+}
+
+/*
+ * Load the debug registers during startup of a CPU.
+ */
+void load_debug_registers(void)
+{
+	int i;
+	unsigned long flags;
+
+	/* Prevent IPIs for new kernel breakpoint updates */
+	local_irq_save(flags);
+
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		if (hbkpt_kernel[i])
+			on_each_cpu(arch_install_kernel_hbkpt,
+				(void *)hbkpt_kernel[i], 0);
+	if (current->thread.dr7)
+		arch_install_thread_hbkpt(current);
+
+	local_irq_restore(flags);
+}
+
+/*
+ * Erase all the hardware breakpoint info associated with a thread.
+ *
+ * If tsk != current then tsk must not be usable (for example, a
+ * child being cleaned up from a failed fork).
+ */
+void flush_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Let the breakpoints know they are being uninstalled */
+
+	/* The thread no longer has any breakpoints associated with it */
+	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	for (i = 0; i < HB_NUM; i++) {
+		if (thread->hbkpt[i]) {
+			hbkpt_user_max_refcount[i]--;
+			if (!hbkpt_user_max_refcount[i])
+				hbkpt_user_max--;
+			kfree(thread->hbkpt[i]);
+			thread->hbkpt[i] = NULL;
+		}
+	}
+	thread->hbkpt_num_installed = 0;
+
+	/* Actually uninstall the breakpoints if necessary */
+	if (tsk == current)
+		switch_to_none_hw_breakpoint();
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/*
+ * Copy the hardware breakpoint info from a thread to its cloned child.
+ */
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags)
+{
+	/* We will assume that breakpoint settings are not inherited
+	 * and the child starts out with no debug registers set.
+	 * But what about CLONE_PTRACE?
+	 */
+	clear_tsk_thread_flag(child, TIF_DEBUG);
+	return 0;
+}
+
+/*
+ * Validate the settings in a hw_breakpoint structure.
+ */
+static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
+{
+	int ret;
+	unsigned int align;
+
+	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
+	if (ret < 0)
+		goto err;
+
+	/* Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (bp->info.address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(bp->info.address, tsk))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(bp->info.address))
+			return -EFAULT;
+	}
+ err:
+	return ret;
+}
+
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int rc;
+
+	/* Do not overcommit. Fail if kernel has used the hbkpt registers */
+	if (pos >= hbkpt_kernel_pos)
+		return -ENOSPC;
+
+	rc = validate_settings(bp, tsk);
+	if (rc)
+		return rc;
+
+	thread->hbkpt[pos] = bp;
+	thread->hbkpt_num_installed++;
+	hbkpt_user_max_refcount[pos]++;
+	/* 'tsk' is the thread that uses max number of hbkpt registers */
+	if (hbkpt_user_max < thread->hbkpt_num_installed)
+		hbkpt_user_max++;
+
+	arch_register_user_hw_breakpoint(pos, bp, tsk);
+
+	/*
+	 * Does it need to be installed right now?
+	 * Otherwise it will get installed the next time tsk runs
+	 */
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	return rc;
+}
+
+/*
+ * Modify the address of a hbkpt register already in use by the task
+ * Do not invoke this in-lieu of a __unregister_user_hw_breakpoint()
+ */
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	int rc;
+	struct thread_struct *thread = &(tsk->thread);
+
+	if ((pos >= hbkpt_kernel_pos) || (validate_settings(bp, tsk)))
+		return -EINVAL;
+
+	thread->hbkpt[pos] = bp;
+
+	/*
+	 * 'pos' must be that of a hbkpt register already used by 'tsk'
+	 * Otherwise arch_modify_user_hw_breakpoint() will fail
+	 */
+	rc = arch_modify_user_hw_breakpoint(pos, bp, tsk);
+	if (rc)
+		return rc;
+
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	return 0;
+}
+
+/*
+ * Actual implementation of unregister_user_hw_breakpoint.
+ */
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+						struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if (!bp)
+		return;
+
+	hbkpt_user_max_refcount[pos]--;
+	if ((hbkpt_user_max == pos + 1) && (hbkpt_user_max_refcount[pos] == 0))
+		hbkpt_user_max--;
+	thread->hbkpt_num_installed--;
+
+	arch_unregister_user_hw_breakpoint(pos, bp, tsk);
+
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	kfree(tsk->thread.hbkpt[pos]);
+	tsk->thread.hbkpt[pos] = NULL;
+}
+
+/**
+ * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to register
+ *
+ * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
+ * @bp->triggered must be set properly before invocation
+ *
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int rc;
+
+	rc = validate_settings(bp, NULL);
+	if (rc)
+		return rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Check if we are over-committing */
+	if (hbkpt_kernel_pos <= hbkpt_user_max) {
+		mutex_unlock(&hw_breakpoint_mutex);
+		return -EINVAL;
+	}
+
+	hbkpt_kernel_pos--;
+	hbkpt_kernel[hbkpt_kernel_pos] = bp;
+	arch_register_kernel_hw_breakpoint(bp);
+
+	mutex_unlock(&hw_breakpoint_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
+
+/**
+ * unregister_kernel_hw_breakpoint - unregister a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int i, j;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		if (bp == hbkpt_kernel[i])
+			break;
+
+	arch_unregister_kernel_hw_breakpoint(i);
+
+	/*
+	 * We'll shift the breakpoints one-level above to accomodate new thread
+	 * requests
+	 */
+	if (i > hbkpt_kernel_pos)
+		for (j = i; j == hbkpt_kernel_pos; j--)
+			hbkpt_kernel[j] = hbkpt_kernel[j-1];
+	hbkpt_kernel_pos++;
+
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
+/*
+ * Handle debug exception notifications.
+ */
+static int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+	return hw_breakpoint_handler(data);
+}
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+	.notifier_call = hw_breakpoint_exceptions_notify,
+	/* we need to be notified first */
+	.priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+	int i;
+
+	hbkpt_kernel_pos = HB_NUM;
+	for (i = 0; i < HB_NUM; i++)
+		hbkpt_user_max_refcount[i] = 0;
+	hbkpt_user_max = 0;
+	load_debug_registers();
+
+	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+
+core_initcall(init_hw_breakpoint);
Index: linux-2.6-tip.hbkpt/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/Makefile
+++ linux-2.6-tip.hbkpt/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
@@ -0,0 +1,140 @@
+#ifndef	_ASM_GENERIC_HW_BREAKPOINT_H
+#define	_ASM_GENERIC_HW_BREAKPOINT_H
+
+#ifndef __ARCH_HW_BREAKPOINT_H
+#error "Please don't include this file directly"
+#endif
+
+#ifdef	__KERNEL__
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+
+/**
+ * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
+ * @triggered: callback invoked after target address access
+ * @info: arch-specific breakpoint info (address, length, and type)
+ *
+ * %hw_breakpoint structures are the kernel's way of representing
+ * hardware breakpoints.  These are data breakpoints
+ * (also known as "watchpoints", triggered on data access), and the breakpoint's
+ * target address can be located in either kernel space or user space.
+ *
+ * The breakpoint's address, length, and type are highly
+ * architecture-specific.  The values are encoded in the @info field; you
+ * specify them when registering the breakpoint.  To examine the encoded
+ * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
+ * below.
+ *
+ * The address is specified as a regular kernel pointer (for kernel-space
+ * breakponts) or as an %__user pointer (for user-space breakpoints).
+ * With register_user_hw_breakpoint(), the address must refer to a
+ * location in user space.  The breakpoint will be active only while the
+ * requested task is running.  Conversely with
+ * register_kernel_hw_breakpoint(), the address must refer to a location
+ * in kernel space, and the breakpoint will be active on all CPUs
+ * regardless of the current task.
+ *
+ * The length is the breakpoint's extent in bytes, which is subject to
+ * certain limitations.  include/asm/hw_breakpoint.h contains macros
+ * defining the available lengths for a specific architecture.  Note that
+ * the address's alignment must match the length.  The breakpoint will
+ * catch accesses to any byte in the range from address to address +
+ * (length - 1).
+ *
+ * The breakpoint's type indicates the sort of access that will cause it
+ * to trigger.  Possible values may include:
+ *
+ * 	%HW_BREAKPOINT_RW (triggered on read or write access),
+ * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
+ * 	%HW_BREAKPOINT_READ (triggered on read access).
+ *
+ * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
+ * possibilities are available on all architectures.  Execute breakpoints
+ * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
+ *
+ * When a breakpoint gets hit, the @triggered callback is
+ * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
+ * processor registers.
+ * Data breakpoints occur after the memory access has taken place.
+ * Breakpoints are disabled during execution @triggered, to avoid
+ * recursive traps and allow unhindered access to breakpointed memory.
+ *
+ * This sample code sets a breakpoint on pid_max and registers a callback
+ * function for writes to that variable.  Note that it is not portable
+ * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * #include <asm/hw_breakpoint.h>
+ *
+ * struct hw_breakpoint my_bp;
+ *
+ * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
+ * 	dump_stack();
+ *  	.......<more debugging output>........
+ * }
+ *
+ * static struct hw_breakpoint my_bp;
+ *
+ * static int init_module(void)
+ * {
+ *	..........<do anything>............
+ *	my_bp.info.type = HW_BREAKPOINT_WRITE;
+ *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
+ *
+ *	my_bp.installed = (void *)my_bp_installed;
+ *
+ *	rc = register_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ *	..........<do anything>............
+ *	unregister_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * ----------------------------------------------------------------------
+ */
+struct hw_breakpoint {
+	void (*triggered)(struct hw_breakpoint *, struct pt_regs *);
+	struct arch_hw_breakpoint info;
+};
+
+/*
+ * len and type values are defined in include/asm/hw_breakpoint.h.
+ * Available values vary according to the architecture.  On i386 the
+ * possibilities are:
+ *
+ *	HW_BREAKPOINT_LEN_1
+ *	HW_BREAKPOINT_LEN_2
+ *	HW_BREAKPOINT_LEN_4
+ *	HW_BREAKPOINT_LEN_EXECUTE
+ *	HW_BREAKPOINT_RW
+ *	HW_BREAKPOINT_READ
+ *	HW_BREAKPOINT_EXECUTE
+ *
+ * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
+ * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
+ * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
+ */
+
+static DEFINE_MUTEX(hw_breakpoint_mutex);	/* Protects everything */
+
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void switch_to_none_hw_breakpoint(void);
+
+extern unsigned int hbkpt_kernel_pos;
+extern unsigned int hbkpt_user_max;
+extern struct task_struct *last_debugged_task;
+
+#endif	/* __KERNEL__ */
+#endif	/* _ASM_GENERIC_HW_BREAKPOINT_H */
Index: linux-2.6-tip.hbkpt/arch/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/Kconfig
+++ linux-2.6-tip.hbkpt/arch/Kconfig
@@ -106,3 +106,6 @@ config HAVE_CLK
 	help
 	  The <linux/clk.h> calls support software clock gating and
 	  thus are a key power management tool on many systems.
+
+config HAVE_HW_BREAKPOINT
+	bool


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090319234044.410725944@K.Prasad>
  2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
@ 2009-03-19 23:48 ` K.Prasad
  2009-03-19 23:48 ` [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers K.Prasad
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 14025 bytes --]

This patch introduces two new files named hw_breakpoint.[ch] inside x86 specific
directories. They contain functions which help validate and serve requests for 
using Hardware Breakpoint registers on x86 processors.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/Kconfig                     |    1 
 arch/x86/include/asm/hw_breakpoint.h |   69 ++++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  384 +++++++++++++++++++++++++++++++++++
 4 files changed, 455 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,384 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Unmasked kernel DR7 value */
+static unsigned long kdr7;
+
+/*
+ * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
+ * Used to clear and verify the status of bits corresponding to DR0 - DR3
+ */
+static const unsigned long	dr7_masks[HB_NUM] = {
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
+	0x0f000030,	/* LEN2, R/W2, G2, L2 */
+	0xf00000c0	/* LEN3, R/W3, G3, L3 */
+};
+
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long temp;
+
+	temp = (len | type) & 0xf;
+	temp <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	temp |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+				DR_GLOBAL_SLOWDOWN;
+	return temp;
+}
+
+/*
+ * Install the kernel breakpoints in their debug registers.
+ */
+void arch_install_kernel_hbkpt(void *bkpt)
+{
+	struct hw_breakpoint *bp;
+	int i;
+	unsigned long dr7;
+
+	bp = (struct hw_breakpoint *)bkpt;
+
+	kdr7 &= ~(dr7_masks[hbkpt_kernel_pos]);
+	kdr7 |= encode_dr7(hbkpt_kernel_pos, bp->info.len, bp->info.type);
+
+	get_debugreg(dr7, 7);
+	/* Clear the bits corresponding to 'pos' register in dr7 */
+	dr7 &= ~(dr7_masks[hbkpt_kernel_pos]);
+	dr7 |= kdr7;
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+
+	/* Kernel hbkpts always begin at 'hbkpt_kernel_pos' and upto HB_NUM */
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		set_debugreg(hbkpt_kernel[i]->info.address, i);
+
+	/* No need to set DR6 */
+	set_debugreg(dr7, 7);
+}
+
+/*
+ * Install the thread breakpoints in their debug registers.
+ */
+void arch_install_thread_hbkpt(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	for (i = 0; i < hbkpt_user_max; i++)
+		if (thread->hbkpt[i])
+			set_debugreg(thread->hbkpt[i]->info.address, i);
+
+	/* No need to set DR6 */
+
+	set_debugreg((kdr7 | thread->dr7), 7);
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void arch_install_none()
+{
+	/* Clear the user-space portion of dr7 by setting only kdr7 */
+	set_debugreg(kdr7, 7);
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
+{
+#ifdef CONFIG_X86_32
+	return (va <= TASK_SIZE - 3);
+#else /* X86_64 */
+	return (va <= TASK_SIZE - 7);
+#endif
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_va_in_kernelspace(unsigned long va)
+{
+	return (va >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	/*
+	 * User-space requests will always have the address field populated
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (bp->info.address)
+		return;
+	if (bp->info.name)
+		bp->info.address = (unsigned long)
+					kallsyms_lookup_name(bp->info.name);
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/* Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address, tsk)) &&
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+				break;
+	case HW_BREAKPOINT_RW:
+				break;
+	default:
+		return ret;
+	}
+
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+	default:
+		return ret;
+	}
+
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Register a new user breakpoint structure.
+ */
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+}
+
+/*
+ * Modify an existing user breakpoint structure.
+ */
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	/* Check if the register to be modified was enabled by the thread */
+	if (!(thread->dr7 & (1 << (pos * DR_ENABLE_SIZE))))
+		return -EINVAL;
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+
+	return 0;
+}
+
+/*
+ * Unregister a user breakpoint structure.
+ */
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if (!thread->hbkpt[pos])
+		return;
+
+	thread->hbkpt[pos]->info.address = 0;
+	thread->dr7 &= ~dr7_masks[pos];
+}
+
+/*
+ * Register a kernel breakpoint structure.
+ */
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	on_each_cpu(arch_install_kernel_hbkpt, (void *)bp, 0);
+}
+
+/*
+ * Unregister a kernel breakpoint structure.
+ */
+void arch_unregister_kernel_hw_breakpoint(int pos)
+{
+	unsigned long dr7;
+
+	kdr7 &= ~(dr7_masks[pos]);
+
+	get_debugreg(dr7, 7);
+	dr7  &= ~(dr7_masks[pos]);
+	set_debugreg(dr7, 7);
+}
+
+/* End of arch-specific hook routines */
+
+/*
+ * Copy out the debug register information for a core dump.
+ *
+ * tsk must be equal to current.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int i;
+
+	memset(u_debugreg, 0, sizeof u_debugreg);
+	for (i = 0; i < thread->hbkpt_num_installed && thread->hbkpt[i]; ++i)
+		u_debugreg[i] = thread->hbkpt[i]->info.address;
+	u_debugreg[7] = thread->dr7;
+	u_debugreg[6] = thread->dr6;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	int i;
+	struct hw_breakpoint *bp;
+	/* The DR6 value is stored in args->err */
+	unsigned long dr7, dr6 = args->err;
+
+	if (dr6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	get_debugreg(dr7, 7);
+
+	/* Disable breakpoints during exception handling */
+	set_debugreg(0UL, 7);
+
+	/* Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.dr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Lazy debug register switching */
+	if (last_debugged_task != current)
+		switch_to_none_hw_breakpoint();
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(dr6 & (DR_TRAP0 << i))))
+			continue;
+
+		/* Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (i < hbkpt_user_max)
+			bp = current->thread.hbkpt[i];
+		else if (i >= hbkpt_kernel_pos)
+			bp = hbkpt_kernel[i];
+		else		/* False alarm due to lazy DR switching */
+			continue;
+		if (!bp)
+			goto ret_path;
+
+		switch (bp->info.type) {
+		case HW_BREAKPOINT_WRITE:
+		case HW_BREAKPOINT_RW:
+			if (bp->triggered)
+				(bp->triggered)(bp, args->regs);
+			/* Re-enable the breakpoints */
+			put_cpu_no_resched();
+			if (arch_check_va_in_userspace(bp->info.address,
+							current))
+				goto ret_notify_done;
+			else
+				goto ret_notify_stop;
+		/*
+		 * Presently we allow instruction breakpoints only in
+		 * user-space when requested through ptrace.
+		 */
+		case HW_BREAKPOINT_EXECUTE:
+			if (arch_check_va_in_userspace(bp->info.address,
+							current)) {
+				(bp->triggered)(bp, args->regs);
+			/*
+			 * do_debug will notify user through a SIGTRAP signal
+			 * So we are not requesting a NOTIFY_STOP here
+			 */
+				goto ret_notify_done;
+			}
+		}
+	}
+
+ret_path:
+	/* Stop processing further if the exception is a stray one */
+	if (!(dr6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		goto ret_notify_stop;
+
+ret_notify_done:
+	set_debugreg(dr7, 7);
+	return NOTIFY_DONE;
+ret_notify_stop:
+	set_debugreg(dr7, 7);
+	return NOTIFY_STOP;
+}
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,69 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+};
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define HW_BREAKPOINT_EXECUTE	0x80
+/* trigger on memory write */
+#define HW_BREAKPOINT_WRITE	0x81
+/* trigger on memory read or write */
+#define HW_BREAKPOINT_RW	0x83
+
+/* Total number of available HW breakpoint registers */
+#define HB_NUM 4
+
+extern struct hw_breakpoint *hbkpt_kernel[HB_NUM];
+extern unsigned int hbkpt_user_max_refcount[HB_NUM];
+
+/*
+ * Ptrace support: breakpoint trigger routine.
+ */
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+void arch_install_thread_hbkpt(struct task_struct *tsk);
+void arch_install_none(void);
+void arch_install_kernel_hbkpt(void *);
+int arch_check_va_in_userspace(unsigned long va,
+						struct task_struct *tsk);
+int arch_check_va_in_kernelspace(unsigned long va);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk);
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void arch_unregister_kernel_hw_breakpoint(int pos);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
Index: linux-2.6-tip.hbkpt/arch/x86/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/Kconfig
+++ linux-2.6-tip.hbkpt/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_HW_BREAKPOINT
 
 config ARCH_DEFCONFIG
 	string


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers
       [not found] <20090319234044.410725944@K.Prasad>
  2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
  2009-03-19 23:48 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
@ 2009-03-19 23:48 ` K.Prasad
  2009-03-19 23:49 ` [Patch 04/11] Introduce user-space " K.Prasad
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 3 --]
[-- Type: text/plain, Size: 3661 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch modifies the breakpoint exception handler code to use the abstract
register names.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/traps.c |   73 ++++++++++++++++--------------------------------
 1 file changed, 25 insertions(+), 48 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/traps.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/traps.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/traps.c
@@ -530,13 +530,14 @@ asmlinkage __kprobes struct pt_regs *syn
 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	unsigned long condition;
+	unsigned long dr6;
 	int si_code;
 
-	get_debugreg(condition, 6);
+	get_debugreg(dr6, 6);
+	set_debugreg(0, 6);	/* DR6 may or may not be cleared by the CPU */
 
 	/* Catch kmemcheck conditions first of all! */
-	if (condition & DR_STEP && kmemcheck_trap(regs))
+	if (dr6 & DR_STEP && kmemcheck_trap(regs))
 		return;
 
 	/*
@@ -545,61 +546,37 @@ dotraplinkage void __kprobes do_debug(st
 	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
 	tsk->thread.debugctlmsr = 0;
 
-	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+	/* Store the virtualized DR6 value */
+	tsk->thread.dr6 = dr6;
+
+	if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code,
 						SIGTRAP) == NOTIFY_STOP)
 		return;
 
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
-	/* Mask out spurious debug traps due to lazy DR7 setting */
-	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-		if (!tsk->thread.debugreg7)
-			goto clear_dr7;
-	}
-
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK)
-		goto debug_vm86;
-#endif
-
-	/* Save debug status register where ptrace can see it */
-	tsk->thread.debugreg6 = condition;
-
-	/*
-	 * Single-stepping through TF: make sure we ignore any events in
-	 * kernel space (but re-enable TF when returning to user mode).
-	 */
-	if (condition & DR_STEP) {
-		if (!user_mode(regs))
-			goto clear_TF_reenable;
+	if (regs->flags & X86_VM_MASK) {
+		handle_vm86_trap((struct kernel_vm86_regs *) regs,
+				error_code, 1);
+		return;
 	}
 
-	si_code = get_si_code(condition);
-	/* Ok, finally something we can handle */
-	send_sigtrap(tsk, regs, error_code, si_code);
-
 	/*
-	 * Disable additional traps. They'll be re-enabled when
-	 * the signal is delivered.
+	 * Single-stepping through system calls: ignore any exceptions in
+	 * kernel space, but re-enable TF when returning to user mode.
+	 *
+	 * We already checked v86 mode above, so we can check for kernel mode
+	 * by just checking the CPL of CS.
 	 */
-clear_dr7:
-	set_debugreg(0, 7);
-	preempt_conditional_cli(regs);
-	return;
-
-#ifdef CONFIG_X86_32
-debug_vm86:
-	/* reenable preemption: handle_vm86_trap() might sleep */
-	dec_preempt_count();
-	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-	conditional_cli(regs);
-	return;
-#endif
-
-clear_TF_reenable:
-	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-	regs->flags &= ~X86_EFLAGS_TF;
+	if ((dr6 & DR_STEP) && !user_mode(regs)) {
+		tsk->thread.dr6 &= ~DR_STEP;
+		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+		regs->flags &= ~X86_EFLAGS_TF;
+	}
+	si_code = get_si_code(dr6);
+	if (tsk->thread.dr6 & (DR_STEP|DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3))
+		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
 	return;
 }


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 04/11] Introduce user-space debug registers
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (2 preceding siblings ...)
  2009-03-19 23:48 ` [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 05/11] Use wrapper routines around debug registers in processor related functions K.Prasad
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 4 --]
[-- Type: text/plain, Size: 2977 bytes --]

This patch introduces virtual debug registers to used by the per-thread
structure and wrapper routines to manage debug registers by process-related
functions.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/include/asm/debugreg.h  |   23 +++++++++++++++++++++++
 arch/x86/include/asm/processor.h |   16 +++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/debugreg.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/include/asm/debugreg.h
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/debugreg.h
@@ -49,6 +49,8 @@
 
 #define DR_LOCAL_ENABLE_SHIFT 0    /* Extra shift to the local enable bit */
 #define DR_GLOBAL_ENABLE_SHIFT 1   /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1)      /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2)     /* Global enable for reg 0 */
 #define DR_ENABLE_SIZE 2           /* 2 enable bits per register */
 
 #define DR_LOCAL_ENABLE_MASK (0x55)  /* Set  local bits for all 4 regs */
@@ -67,4 +69,25 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+/* For process management */
+void flush_thread_hw_breakpoint(struct task_struct *tsk);
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags);
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8]);
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk);
+
+/* For CPU management */
+void load_debug_registers(void);
+static inline void hw_breakpoint_disable(void)
+{
+	set_debugreg(0UL, 7);
+}
+
+#endif	/* __KERNEL__ */
+
 #endif /* _ASM_X86_DEBUGREG_H */
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/processor.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/include/asm/processor.h
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/processor.h
@@ -29,6 +29,7 @@ struct mm_struct;
 #include <linux/threads.h>
 #include <linux/init.h>
 
+#define HB_NUM 4
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter").
@@ -424,13 +425,14 @@ struct thread_struct {
 	unsigned long		ip;
 	unsigned long		fs;
 	unsigned long		gs;
-	/* Hardware debugging registers: */
-	unsigned long		debugreg0;
-	unsigned long		debugreg1;
-	unsigned long		debugreg2;
-	unsigned long		debugreg3;
-	unsigned long		debugreg6;
-	unsigned long		debugreg7;
+	/* Hardware breakpoint info */
+	struct hw_breakpoint	*hbkpt[HB_NUM];
+	unsigned int		hbkpt_num_installed;
+	/* Thread's view of debug reg 6 */
+	unsigned long		dr6;
+	/* Thread's view of debug reg 7 */
+	unsigned long		dr7;
+
 	/* Fault info: */
 	unsigned long		cr2;
 	unsigned long		trap_no;


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 05/11] Use wrapper routines around debug registers in processor related functions
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (3 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 04/11] Introduce user-space " K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 5 --]
[-- Type: text/plain, Size: 3702 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of wrapper routines to access the debug/breakpoint
registers.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/smpboot.c |    3 +++
 arch/x86/power/cpu_32.c   |   16 +++-------------
 arch/x86/power/cpu_64.c   |   15 +++------------
 3 files changed, 9 insertions(+), 25 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/power/cpu_32.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/power/cpu_32.c
+++ linux-2.6-tip.hbkpt/arch/x86/power/cpu_32.c
@@ -12,6 +12,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/xcr.h>
+#include <asm/debugreg.h>
 
 static struct saved_context saved_context;
 
@@ -47,6 +48,7 @@ static void __save_processor_state(struc
 	ctxt->cr2 = read_cr2();
 	ctxt->cr3 = read_cr3();
 	ctxt->cr4 = read_cr4_safe();
+	hw_breakpoint_disable();
 }
 
 /* Needed by apm.c */
@@ -79,19 +81,7 @@ static void fix_processor_context(void)
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
 
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7) {
-		set_debugreg(current->thread.debugreg0, 0);
-		set_debugreg(current->thread.debugreg1, 1);
-		set_debugreg(current->thread.debugreg2, 2);
-		set_debugreg(current->thread.debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-	}
-
+	load_debug_registers();
 }
 
 static void __restore_processor_state(struct saved_context *ctxt)
Index: linux-2.6-tip.hbkpt/arch/x86/power/cpu_64.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/power/cpu_64.c
+++ linux-2.6-tip.hbkpt/arch/x86/power/cpu_64.c
@@ -15,6 +15,7 @@
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
 #include <asm/xcr.h>
+#include <asm/debugreg.h>
 
 static void fix_processor_context(void);
 
@@ -70,6 +71,7 @@ static void __save_processor_state(struc
 	ctxt->cr3 = read_cr3();
 	ctxt->cr4 = read_cr4();
 	ctxt->cr8 = read_cr8();
+	hw_breakpoint_disable();
 }
 
 void save_processor_state(void)
@@ -158,16 +160,5 @@ static void fix_processor_context(void)
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
 
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7){
-                loaddebug(&current->thread, 0);
-                loaddebug(&current->thread, 1);
-                loaddebug(&current->thread, 2);
-                loaddebug(&current->thread, 3);
-                /* no 4 and 5 */
-                loaddebug(&current->thread, 6);
-                loaddebug(&current->thread, 7);
-	}
+	load_debug_registers();
 }
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/smpboot.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/smpboot.c
@@ -63,6 +63,7 @@
 #include <asm/apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
+#include <asm/debugreg.h>
 #include <linux/mc146818rtc.h>
 
 #include <asm/smpboot_hooks.h>
@@ -331,6 +332,7 @@ notrace static void __cpuinit start_seco
 	setup_secondary_clock();
 
 	wmb();
+	load_debug_registers();
 	cpu_idle();
 }
 
@@ -1234,6 +1236,7 @@ void cpu_disable_common(void)
 	remove_cpu_from_maps(cpu);
 	unlock_vector_lock();
 	fixup_irqs();
+	hw_breakpoint_disable();
 }
 
 int native_cpu_disable(void)


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (4 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 05/11] Use wrapper routines around debug registers in processor related functions K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints K.Prasad
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 6.new --]
[-- Type: text/plain, Size: 7362 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of abstract debug registers in
process-handling routines.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/process.c    |   23 ++++++-----------------
 arch/x86/kernel/process_32.c |   31 +++++++++++++++++++++++++++++++
 arch/x86/kernel/process_64.c |   33 +++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 17 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/process.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/process.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/process.c
@@ -14,6 +14,8 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -83,6 +85,8 @@ void exit_thread(void)
 		put_cpu();
 		kfree(bp);
 	}
+	if (unlikely(t->dr7))
+		flush_thread_hw_breakpoint(me);
 
 	ds_exit_thread(current);
 }
@@ -103,14 +107,9 @@ void flush_thread(void)
 	}
 #endif
 
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	if (unlikely(tsk->thread.dr7))
+		flush_thread_hw_breakpoint(tsk);
 
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
@@ -192,16 +191,6 @@ void __switch_to_xtra(struct task_struct
 	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/process_32.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/process_32.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/process_32.c
@@ -59,6 +59,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -263,7 +265,14 @@ int copy_thread(int nr, unsigned long cl
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->thread.io_bitmap_ptr = NULL;
+
 	tsk = current;
+	err = -ENOMEM;
+	if (unlikely(tsk->thread.dr7)) {
+		if (copy_thread_hw_breakpoint(tsk, p, clone_flags))
+			goto out;
+	}
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
@@ -283,10 +292,13 @@ int copy_thread(int nr, unsigned long cl
 		err = do_set_thread_area(p, -1,
 			(struct user_desc __user *)childregs->si, 0);
 
+out:
 	if (err && p->thread.io_bitmap_ptr) {
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
 
 	ds_copy_thread(p, current);
 
@@ -424,6 +436,25 @@ __switch_to(struct task_struct *prev_p, 
 		lazy_load_gs(next->gs);
 
 	percpu_write(current_task, next_p);
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in the chbi
+	 * area, so it will think the task pointer is leftover from an old task
+	 * (lazy switching) and will erase it.  Then until the next context
+	 * switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * chbi->bp_task at the same instant, so there will always be a window
+	 * in which they disagree and a breakpoint might get triggered.  Since
+	 * we use lazy switching, we are forced to assume that a disagreement
+	 * means that current is correct and chbi->bp_task is old.  But if you
+	 * move the code above then you'll create a window in which current is
+	 * old and chbi->bp_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
 
 	return prev_p;
 }
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/process_64.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/process_64.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/process_64.c
@@ -55,6 +55,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -248,6 +250,8 @@ void release_thread(struct task_struct *
 			BUG();
 		}
 	}
+	if (unlikely(dead_task->thread.tdr7))
+		flush_thread_hw_breakpoint(dead_task);
 }
 
 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
@@ -303,12 +307,18 @@ int copy_thread(int nr, unsigned long cl
 
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
+	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
 	savesegment(fs, p->thread.fsindex);
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
+	err = -ENOMEM;
+	if (unlikely(me->thread.tdr7)) {
+		if (copy_thread_hw_breakpoint(me, p, clone_flags))
+			goto out;
+	}
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
@@ -346,6 +356,9 @@ out:
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
+
 	return err;
 }
 
@@ -491,6 +504,26 @@ __switch_to(struct task_struct *prev_p, 
 	 */
 	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
 		math_state_restore();
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in the chbi
+	 * area, so it will think the task pointer is leftover from an old task
+	 * (lazy switching) and will erase it.  Then until the next context
+	 * switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * chbi->bp_task at the same instant, so there will always be a window
+	 * in which they disagree and a breakpoint might get triggered.  Since
+	 * we use lazy switching, we are forced to assume that a disagreement
+	 * means that current is correct and chbi->bp_task is old.  But if you
+	 * move the code above then you'll create a window in which current is
+	 * old and chbi->bp_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
+
 	return prev_p;
 }
 


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (5 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 08/11] Modify Ptrace routines to access breakpoint registers K.Prasad
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 7 --]
[-- Type: text/plain, Size: 1144 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch disables re-enabling of Hardware Breakpoint registers through
the  signal handling code. This is now done during
hw_breakpoint_handler().

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/signal.c |    9 ---------
 1 file changed, 9 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/signal.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/signal.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/signal.c
@@ -794,15 +794,6 @@ static void do_signal(struct pt_regs *re
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		/*
-		 * Re-enable any watchpoints before delivering the
-		 * signal to user space. The processor register will
-		 * have been cleared if the watchpoint triggered
-		 * inside the kernel.
-		 */
-		if (current->thread.debugreg7)
-			set_debugreg(current->thread.debugreg7, 7);
-
 		/* Whee! Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
 			/*


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 08/11] Modify Ptrace routines to access breakpoint registers
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (6 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 09/11] Cleanup HW Breakpoint registers before kexec K.Prasad
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 8 --]
[-- Type: text/plain, Size: 8692 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch modifies the ptrace code to use the new wrapper routines around the 
debug/breakpoint registers.

[K.Prasad: Adapted the ptrace routines and to changes post x86/x86_64 merger,
	   split the minor patch from bigger patch. Re-wrote ptrace_write_dr7()
           and ptrace_set_debugreg() functions to use new data-structures]

[K.Prasad: Changed code to suit the simplified HW breakpoint implementation]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/ptrace.c |  229 ++++++++++++++++++++++++++++-------------------
 1 file changed, 138 insertions(+), 91 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/ptrace.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/ptrace.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/ptrace.c
@@ -34,6 +34,7 @@
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
 
 #include "tls.h"
 
@@ -134,11 +135,6 @@ static int set_segment_reg(struct task_s
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-	return TASK_SIZE - 3;
-}
-
 #else  /* CONFIG_X86_64 */
 
 #define FLAG_MASK		(FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -263,15 +259,6 @@ static int set_segment_reg(struct task_s
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(task, TIF_IA32))
-		return IA32_PAGE_OFFSET - 3;
-#endif
-	return TASK_SIZE_MAX - 7;
-}
-
 #endif	/* CONFIG_X86_32 */
 
 static unsigned long get_flags(struct task_struct *task)
@@ -462,95 +449,155 @@ static int genregs_set(struct task_struc
 }
 
 /*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
  */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len,
+		unsigned *type)
 {
-	switch (n) {
-	case 0:		return child->thread.debugreg0;
-	case 1:		return child->thread.debugreg1;
-	case 2:		return child->thread.debugreg2;
-	case 3:		return child->thread.debugreg3;
-	case 6:		return child->thread.debugreg6;
-	case 7:		return child->thread.debugreg7;
-	}
-	return 0;
+	int temp = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+	*len = (temp & 0xc) | 0x40;
+	*type = (temp & 0x3) | 0x80;
+	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
 }
 
-static int ptrace_set_debugreg(struct task_struct *child,
-			       int n, unsigned long data)
+static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
 {
+	struct thread_struct *thread = &(current->thread);
 	int i;
 
-	if (unlikely(n == 4 || n == 5))
-		return -EIO;
+	/* Store in the virtual DR6 register the fact that the breakpoint
+	 * was hit so the thread's debugger will see it.
+	 */
+	for (i = 0; i < hbkpt_user_max; i++)
+		if (bp->info.address == thread->hbkpt[i]->info.address)
+			break;
 
-	if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-		return -EIO;
+	thread->dr6 |= (DR_TRAP0 << i);
+}
 
-	switch (n) {
-	case 0:		child->thread.debugreg0 = data; break;
-	case 1:		child->thread.debugreg1 = data; break;
-	case 2:		child->thread.debugreg2 = data; break;
-	case 3:		child->thread.debugreg3 = data; break;
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+	struct hw_breakpoint *bp;
+	struct thread_struct *thread = &(tsk->thread);
+	int i;
+	int rc = 0;
+	unsigned long old_dr7 = thread->dr7;
 
-	case 6:
-		if ((data & ~0xffffffffUL) != 0)
-			return -EIO;
-		child->thread.debugreg6 = data;
-		break;
+	data &= ~DR_CONTROL_RESERVED;
+	/* Loop through all the hardware breakpoints, making the
+	 * appropriate changes to each.
+	 */
+restore_settings:
+	thread->dr7 = data;
+	for (i = 0; i < HB_NUM; i++) {
+		int enabled;
+		unsigned len, type;
+
+		bp = thread->hbkpt[i];
+		if (!bp)
+			continue;
+
+		enabled = decode_dr7(data, i, &len, &type);
+		if (!enabled) {
+			if (bp->triggered)
+				__unregister_user_hw_breakpoint(i, tsk, bp);
+			continue;
+		}
 
-	case 7:
-		/*
-		 * Sanity-check data. Take one half-byte at once with
-		 * check = (val >> (16 + 4*i)) & 0xf. It contains the
-		 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-		 * 2 and 3 are LENi. Given a list of invalid values,
-		 * we do mask |= 1 << invalid_value, so that
-		 * (mask >> check) & 1 is a correct test for invalid
-		 * values.
-		 *
-		 * R/Wi contains the type of the breakpoint /
-		 * watchpoint, LENi contains the length of the watched
-		 * data in the watchpoint case.
-		 *
-		 * The invalid values are:
-		 * - LENi == 0x10 (undefined), so mask |= 0x0f00.	[32-bit]
-		 * - R/Wi == 0x10 (break on I/O reads or writes), so
-		 *   mask |= 0x4444.
-		 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-		 *   0x1110.
-		 *
-		 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-		 *
-		 * See the Intel Manual "System Programming Guide",
-		 * 15.2.4
-		 *
-		 * Note that LENi == 0x10 is defined on x86_64 in long
-		 * mode (i.e. even for 32-bit userspace software, but
-		 * 64-bit kernel), so the x86_64 mask value is 0x5454.
-		 * See the AMD manual no. 24593 (AMD64 System Programming)
-		 */
-#ifdef CONFIG_X86_32
-#define	DR7_MASK	0x5f54
-#else
-#define	DR7_MASK	0x5554
-#endif
-		data &= ~DR_CONTROL_RESERVED;
-		for (i = 0; i < 4; i++)
-			if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-				return -EIO;
-		child->thread.debugreg7 = data;
-		if (data)
-			set_tsk_thread_flag(child, TIF_DEBUG);
+		if (bp->triggered)
+			rc = __modify_user_hw_breakpoint(i, tsk, bp);
+		else {
+			bp->triggered = ptrace_triggered;
+			bp->info.len = len;
+			bp->info.type = type;
+			rc = __register_user_hw_breakpoint(i, tsk, bp);
+		}
+		if (rc < 0)
+			break;
 		else
-			clear_tsk_thread_flag(child, TIF_DEBUG);
-		break;
+			set_tsk_thread_flag(tsk, TIF_DEBUG);
+	}
+	/* If anything above failed, restore the original settings */
+	if (rc < 0) {
+		data = old_dr7;
+		goto restore_settings;
 	}
+	return rc;
+}
 
-	return 0;
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	unsigned long val = 0;
+
+	mutex_lock(&hw_breakpoint_mutex);
+	if (n < HB_NUM) {
+		if (thread->hbkpt[n])
+			val = thread->hbkpt[n]->info.address;
+	} else if (n == 6) {
+		val = thread->dr6;
+	} else if (n == 7) {
+		val = thread->dr7;
+	}
+	mutex_unlock(&hw_breakpoint_mutex);
+	return val;
+}
+
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int rc = -EIO;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* There are no DR4 or DR5 registers */
+	if (n == 4 || n == 5)
+		goto ret_path;
+
+	/* Writes to DR6 modify the virtualized value */
+	if (n == 6) {
+		tsk->thread.dr6 = val;
+		rc = 0;
+		goto ret_path;
+	}
+
+	/* Writes to DR0 - DR3 change a breakpoint address */
+	rc = 0;
+	if (n < HB_NUM) {
+		if (!val)
+			goto ret_path;
+		if (thread->hbkpt[n]) {
+			thread->hbkpt[n]->info.address = val;
+			rc = __modify_user_hw_breakpoint(n, tsk,
+							  thread->hbkpt[n]);
+			goto ret_path;
+		}
+		thread->hbkpt[n] = kzalloc(sizeof(struct hw_breakpoint),
+								GFP_KERNEL);
+		if (!thread->hbkpt[n]) {
+			rc = -ENOMEM;
+			goto ret_path;
+		} else
+			thread->hbkpt[n]->info.address = val;
+	}
+	/* All that's left is DR7 */
+	if (n == 7)
+		rc = ptrace_write_dr7(tsk, val);
+
+ret_path:
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
 }
 
 /*
@@ -871,7 +918,7 @@ long arch_ptrace(struct task_struct *chi
 		else if (addr >= offsetof(struct user, u_debugreg[0]) &&
 			 addr <= offsetof(struct user, u_debugreg[7])) {
 			addr -= offsetof(struct user, u_debugreg[0]);
-			tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+			tmp = ptrace_get_debugreg(child, addr/sizeof(data));
 		}
 		ret = put_user(tmp, datap);
 		break;
@@ -889,7 +936,7 @@ long arch_ptrace(struct task_struct *chi
 			 addr <= offsetof(struct user, u_debugreg[7])) {
 			addr -= offsetof(struct user, u_debugreg[0]);
 			ret = ptrace_set_debugreg(child,
-						  addr / sizeof(data), data);
+						addr/sizeof(data), data);
 		}
 		break;
 


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 09/11] Cleanup HW Breakpoint registers before kexec
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (7 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 08/11] Modify Ptrace routines to access breakpoint registers K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:50 ` [Patch 10/11] Sample HW breakpoint over kernel data address K.Prasad
  2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
  10 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 9 --]
[-- Type: text/plain, Size: 1818 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch disables Hardware breakpoints before doing a 'kexec' on the machine.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/machine_kexec_32.c |    2 ++
 arch/x86/kernel/machine_kexec_64.c |    2 ++
 2 files changed, 4 insertions(+)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_32.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/machine_kexec_32.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
 #include <asm/desc.h>
 #include <asm/system.h>
 #include <asm/cacheflush.h>
+#include <asm/debugreg.h>
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_64.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/machine_kexec_64.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/debugreg.h>
 
 static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
 				unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 10/11] Sample HW breakpoint over kernel data address
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (8 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 09/11] Cleanup HW Breakpoint registers before kexec K.Prasad
@ 2009-03-19 23:50 ` K.Prasad
  2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
  10 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:50 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 10 --]
[-- Type: text/plain, Size: 4654 bytes --]

This patch introduces a sample kernel module to demonstrate the use of Hardware
Breakpoint feature. It places a breakpoint over the kernel variable 'pid_max'
to monitor all write operations and emits a function-backtrace when done.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
---
 samples/Kconfig                         |    6 ++
 samples/Makefile                        |    4 +
 samples/hw_breakpoint/Makefile          |    1 
 samples/hw_breakpoint/data_breakpoint.c |   79 ++++++++++++++++++++++++++++++++
 4 files changed, 89 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/samples/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/samples/Kconfig
+++ linux-2.6-tip.hbkpt/samples/Kconfig
@@ -39,5 +39,11 @@ config SAMPLE_KRETPROBES
 	default m
 	depends on SAMPLE_KPROBES && KRETPROBES
 
+config SAMPLE_HW_BREAKPOINT
+	tristate "Build kernel hardware breakpoint examples -- loadable modules only"
+	depends on HAVE_HW_BREAKPOINT && m
+	help
+	  This builds kernel hardware breakpoint example modules.
+
 endif # SAMPLES
 
Index: linux-2.6-tip.hbkpt/samples/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/samples/Makefile
+++ linux-2.6-tip.hbkpt/samples/Makefile
@@ -1,3 +1,5 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/
+obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/ \
+			   hw_breakpoint/
+
Index: linux-2.6-tip.hbkpt/samples/hw_breakpoint/Makefile
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/samples/hw_breakpoint/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
Index: linux-2.6-tip.hbkpt/samples/hw_breakpoint/data_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/samples/hw_breakpoint/data_breakpoint.c
@@ -0,0 +1,79 @@
+/*
+ * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * This file is a kernel module that places a breakpoint over 'pid_max' kernel
+ * variable using Hardware Breakpoint register. The corresponding handler which
+ * prints a backtrace is invoked everytime a write operation is performed on
+ * that variable.
+ *
+ * After inserting this module, invoke a write operation using
+ * 'echo <desired_value> > /proc/sys/kernel/pid_max'
+ * to find the function-call backtrace.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+#include <linux/module.h>	/* Needed by all modules */
+#include <linux/kernel.h>	/* Needed for KERN_INFO */
+#include <linux/init.h>		/* Needed for the macros */
+
+#include <asm/hw_breakpoint.h>
+
+struct hw_breakpoint pid_max_hbkpt;
+
+void pid_max_hbkpt_handler(struct hw_breakpoint *temp, struct pt_regs
+								*temp_regs)
+{
+	printk(KERN_INFO "pid_max value is changed\n");
+	dump_stack();
+	printk(KERN_INFO "Dump stack from pid_max_hbkpt_handler\n");
+}
+
+static int __init hw_break_module_init(void)
+{
+	int ret;
+
+#ifdef CONFIG_X86
+	pid_max_hbkpt.info.name = "pid_max";
+	pid_max_hbkpt.info.type = HW_BREAKPOINT_WRITE;
+	pid_max_hbkpt.info.len = HW_BREAKPOINT_LEN_4;
+
+	pid_max_hbkpt.triggered = (void *)pid_max_hbkpt_handler;
+#endif /* CONFIG_X86 */
+
+	ret = register_kernel_hw_breakpoint(&pid_max_hbkpt);
+
+	if (ret < 0) {
+		printk(KERN_INFO "Breakpoint registration failed\n");
+		return ret;
+	} else
+		printk(KERN_INFO "HW Breakpoint for pid_max write installed\n");
+
+	return 0;
+}
+
+static void __exit hw_break_module_exit(void)
+{
+	unregister_kernel_hw_breakpoint(&pid_max_hbkpt);
+	printk(KERN_INFO "HW Breakpoint for pid_max write uninstalled\n");
+}
+
+module_init(hw_break_module_init);
+module_exit(hw_break_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("K.Prasad");
+MODULE_DESCRIPTION("pid_max breakpoint");


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (9 preceding siblings ...)
  2009-03-19 23:50 ` [Patch 10/11] Sample HW breakpoint over kernel data address K.Prasad
@ 2009-03-19 23:50 ` K.Prasad
  2009-03-20  9:04   ` Frederic Weisbecker
  10 siblings, 1 reply; 55+ messages in thread
From: K.Prasad @ 2009-03-19 23:50 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: ftrace_hbkpt_12 --]
[-- Type: text/plain, Size: 19482 bytes --]

This patch adds an ftrace plugin to detect and profile memory access over
kernel variables. It uses HW Breakpoint interfaces to 'watch memory
addresses.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
---
 kernel/trace/Kconfig          |   21 +
 kernel/trace/Makefile         |    1 
 kernel/trace/trace.h          |   25 +
 kernel/trace/trace_ksym.c     |  555 ++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_selftest.c |   36 ++
 5 files changed, 638 insertions(+)

Index: linux-2.6-tip.hbkpt/kernel/trace/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/Kconfig
+++ linux-2.6-tip.hbkpt/kernel/trace/Kconfig
@@ -264,6 +264,27 @@ config POWER_TRACER
 	  power management decisions, specifically the C-state and P-state
 	  behavior.
 
+config KSYM_TRACER
+	bool "Trace read and write access on kernel memory locations"
+	depends on HAVE_HW_BREAKPOINT
+	select TRACING
+	help
+	  This tracer helps find read and write operations on any given kernel
+	  symbol i.e. /proc/kallsyms.
+
+config PROFILE_KSYM_TRACER
+	bool "Profile all kernel memory accesses on 'watched' variables"
+	depends on KSYM_TRACER
+	help
+	  This tracer profiles kernel accesses on variables watched through the
+	  ksym tracer ftrace plugin. Depending upon the hardware, all read
+	  and write operations on kernel variables can be monitored for
+	  accesses.
+
+	  The results will be displayed in:
+	  /debugfs/tracing/profile_ksym
+
+	  Say N if unsure.
 
 config STACK_TRACER
 	bool "Trace max stack"
Index: linux-2.6-tip.hbkpt/kernel/trace/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/Makefile
+++ linux-2.6-tip.hbkpt/kernel/trace/Makefile
@@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_even
 obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACER) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 
 libftrace-y := ftrace.o
Index: linux-2.6-tip.hbkpt/kernel/trace/trace.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/trace.h
+++ linux-2.6-tip.hbkpt/kernel/trace/trace.h
@@ -12,6 +12,10 @@
 #include <trace/kmemtrace.h>
 #include <trace/power.h>
 
+#ifdef CONFIG_KSYM_TRACER
+#include <asm/hw_breakpoint.h>
+#endif
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -37,6 +41,7 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
+	TRACE_KSYM,
 
 	__TRACE_LAST_TYPE,
 };
@@ -214,6 +219,23 @@ struct syscall_trace_exit {
 	unsigned long		ret;
 };
 
+#ifdef CONFIG_KSYM_TRACER
+struct trace_ksym {
+	struct trace_entry	ent;
+	struct hw_breakpoint	*ksym_hbkpt;
+	unsigned long		ksym_addr;
+	unsigned long		ip;
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	unsigned long 		counter;
+#endif
+	struct hlist_node	ksym_hlist;
+	char			ksym_name[KSYM_NAME_LEN];
+	char			p_name[TASK_COMM_LEN];
+};
+#else
+struct trace_ksym {
+};
+#endif /* CONFIG_KSYM_TRACER */
 
 /*
  * trace_flag_type is an enumeration that holds different
@@ -332,6 +354,7 @@ extern void __ftrace_bad_type(void);
 			  TRACE_SYSCALL_ENTER);				\
 		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
 			  TRACE_SYSCALL_EXIT);				\
+		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM);	\
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -593,6 +616,8 @@ extern int trace_selftest_startup_syspro
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
+extern int trace_selftest_startup_ksym(struct tracer *trace,
+					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
Index: linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
@@ -0,0 +1,555 @@
+/*
+ * trace_ksym.c - Kernel Symbol Tracer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/jhash.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+#include "trace_stat.h"
+#include "trace.h"
+
+/* For now, let us restrict the no. of symbols traced simultaneously to number
+ * of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HB_NUM
+
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
+
+#ifdef CONFIG_FTRACE_SELFTEST
+
+static int ksym_selftest_dummy;
+#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
+
+#endif /* CONFIG_FTRACE_SELFTEST */
+
+static struct trace_array *ksym_trace_array;
+
+DEFINE_MUTEX(ksym_tracer_mutex);
+
+static unsigned int ksym_filter_entry_count;
+static unsigned int ksym_tracing_enabled;
+
+static HLIST_HEAD(ksym_filter_head);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+
+#define MAX_UL_INT 0xffffffff
+DEFINE_SPINLOCK(ksym_stat_lock);
+
+void ksym_collect_stats(unsigned long hbkpt_hit_addr)
+{
+	struct hlist_node *node;
+	struct trace_ksym *entry;
+
+	spin_lock(&ksym_stat_lock);
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		if ((entry->ksym_addr == hbkpt_hit_addr) &&
+		    (entry->counter <= MAX_UL_INT)) {
+			entry->counter++;
+			break;
+		}
+	}
+	spin_unlock(&ksym_stat_lock);
+}
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
+{
+	struct ring_buffer_event *event;
+	struct trace_array *tr;
+	struct trace_ksym *entry;
+	int pc;
+
+	if (!ksym_tracing_enabled)
+		return;
+
+	tr = ksym_trace_array;
+	pc = preempt_count();
+
+	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
+							sizeof(*entry), 0, pc);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_SYMBOL_LEN);
+	entry->ksym_hbkpt = hbkpt;
+	entry->ip = instruction_pointer(regs);
+	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	ksym_collect_stats(hbkpt->info.address);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+	trace_buffer_unlock_commit(tr, event, 0, pc);
+}
+
+/* Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Break points (Not available yet)
+ *
+ */
+static int ksym_trace_get_access_type(char *access_str)
+{
+	int pos, access = 0;
+
+	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
+		switch (access_str[pos]) {
+		case 'r':
+			access += (pos == 0) ? 4 : -1;
+			break;
+		case 'w':
+			access += (pos == 1) ? 2 : -1;
+			break;
+		case '-':
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	switch (access) {
+	case 6:
+		access = HW_BREAKPOINT_RW;
+		break;
+	case 2:
+		access = HW_BREAKPOINT_WRITE;
+		break;
+	case 0:
+		access = 0;
+	}
+
+	return access;
+}
+
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
+ *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
+ *    <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ */
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+							unsigned long *addr)
+{
+	char *delimiter = ":";
+	int ret;
+
+	ret = -EINVAL;
+	*ksymname = strsep(&input_string, delimiter);
+	*addr = kallsyms_lookup_name(*ksymname);
+
+	/* Check for malformed request: (2), (1) and (5) */
+	if ((!input_string) ||
+		(strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) ||
+			(*addr == 0))
+		goto return_code;
+	ret = ksym_trace_get_access_type(input_string);
+
+return_code:
+	return ret;
+}
+
+static int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
+{
+	struct trace_ksym *entry;
+	int ret;
+
+	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
+		" new requests for tracing can be accepted now.\n",
+			KSYM_TRACER_MAX);
+		return -ENOSPC;
+	}
+
+	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
+	if (!entry->ksym_hbkpt) {
+		kfree(entry);
+		return -ENOMEM;
+	}
+
+	entry->ksym_hbkpt->info.name = ksymname;
+	entry->ksym_hbkpt->info.type = op;
+	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
+	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
+
+	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
+
+	ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
+	if (ret < 0) {
+		printk(KERN_INFO "ksym_tracer request failed. Try again"
+					" later!!\n");
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+		return -EAGAIN;
+	}
+	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
+	ksym_filter_entry_count++;
+
+	return 0;
+}
+
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
+	ssize_t ret, cnt = 0;
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
+				entry->ksym_hbkpt->info.name);
+		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"-w-\n");
+		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"rw-\n");
+	}
+	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
+	mutex_unlock(&ksym_tracer_mutex);
+
+	return ret;
+}
+
+static ssize_t ksym_trace_filter_write(struct file *file,
+					const char __user *buffer,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char *input_string, *ksymname = NULL;
+	unsigned long ksym_addr = 0;
+	int ret, op, changed = 0;
+
+	/* Ignore echo "" > ksym_trace_filter */
+	if (count == 0)
+		return 0;
+
+	input_string = kzalloc(count, GFP_KERNEL);
+	if (!input_string)
+		return -ENOMEM;
+
+	if (copy_from_user(input_string, buffer, count)) {
+		kfree(input_string);
+		return -EFAULT;
+	}
+
+	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+	if (ret < 0) {
+		kfree(input_string);
+		return ret;
+	}
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	ret = -EINVAL;
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		if (entry->ksym_addr == ksym_addr) {
+			/* Check for malformed request: (6) */
+			if (entry->ksym_hbkpt->info.type != op)
+				changed = 1;
+			else
+				goto err_ret;
+			break;
+		}
+	}
+	if (changed) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+		entry->ksym_hbkpt->info.type = op;
+		if (op > 0) {
+			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
+			if (ret == 0) {
+				ret = count;
+				goto unlock_ret_path;
+			}
+		}
+		ksym_filter_entry_count--;
+		hlist_del(&(entry->ksym_hlist));
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+		ret = count;
+		goto err_ret;
+	} else {
+		/* Check for malformed request: (4) */
+		if (op == 0)
+			goto err_ret;
+		ret = process_new_ksym_entry(ksymname, op, ksym_addr);
+		if (ret)
+			goto err_ret;
+	}
+	ret = count;
+	goto unlock_ret_path;
+
+err_ret:
+	kfree(input_string);
+
+unlock_ret_path:
+	mutex_unlock(&ksym_tracer_mutex);
+	return ret;
+}
+
+static const struct file_operations ksym_tracing_fops = {
+	.open		= tracing_open_generic,
+	.read		= ksym_trace_filter_read,
+	.write		= ksym_trace_filter_write,
+};
+
+static void ksym_trace_reset(struct trace_array *tr)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node, *node1;
+
+	ksym_tracing_enabled = 0;
+
+	mutex_lock(&ksym_tracer_mutex);
+	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+								ksym_hlist) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+		ksym_filter_entry_count--;
+		hlist_del(&(entry->ksym_hlist));
+
+		/* Free the 'input_string' only if reset
+		 * after startup self-test
+		 */
+#ifdef CONFIG_FTRACE_SELFTEST
+		if (strncmp(entry->ksym_hbkpt->info.name, KSYM_SELFTEST_ENTRY,
+					strlen(KSYM_SELFTEST_ENTRY)) != 0)
+#endif /* CONFIG_FTRACE_SELFTEST*/
+			kfree(entry->ksym_hbkpt->info.name);
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+	}
+	mutex_unlock(&ksym_tracer_mutex);
+
+}
+
+static int ksym_trace_init(struct trace_array *tr)
+{
+	int cpu, ret = 0;
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+
+	ksym_tracing_enabled = 1;
+	ksym_trace_array = tr;
+
+#ifdef CONFIG_FTRACE_SELFTEST
+	/* Check if we are re-entering self-test code during initialisation */
+	if (ksym_selftest_dummy)
+		goto ret_path;
+
+	ksym_selftest_dummy = 0;
+
+	/* Register the read-write tracing request */
+	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
+					(unsigned long)(&ksym_selftest_dummy));
+
+	if (ret < 0) {
+		printk(KERN_CONT "ksym_trace read-write startup test failed\n");
+		goto ret_path;
+	}
+	/* Perform a read and a write operation over the dummy variable to
+	 * trigger the tracer
+	 */
+	if (ksym_selftest_dummy == 0)
+		ksym_selftest_dummy++;
+
+ret_path:
+#endif /* CONFIG_FTRACE_SELFTEST */
+
+	return ret;
+}
+
+static void ksym_trace_print_header(struct seq_file *m)
+{
+
+	seq_puts(m,
+		 "#       TASK-PID      CPU#      Symbol         Type    "
+		 "Function         \n");
+	seq_puts(m,
+		 "#          |           |          |              |         "
+		 "|            \n");
+}
+
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct trace_ksym *field;
+	char str[KSYM_SYMBOL_LEN];
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
+				entry->pid, iter->cpu, field->ksym_name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	switch (field->ksym_hbkpt->info.type) {
+	case HW_BREAKPOINT_WRITE:
+		ret = trace_seq_printf(s, " W  ");
+		break;
+	case HW_BREAKPOINT_RW:
+		ret = trace_seq_printf(s, " RW ");
+		break;
+	default:
+		return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	sprint_symbol(str, field->ip);
+	ret = trace_seq_printf(s, "%-20s\n", str);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+struct tracer ksym_tracer __read_mostly =
+{
+	.name		= "ksym_tracer",
+	.init		= ksym_trace_init,
+	.reset		= ksym_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_ksym,
+#endif
+	.print_header   = ksym_trace_print_header,
+	.print_line	= ksym_trace_output
+};
+
+__init static int init_ksym_trace(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	ksym_filter_entry_count = 0;
+
+	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
+				    NULL, &ksym_tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ksym_trace_filter' file\n");
+
+	return register_tracer(&ksym_tracer);
+}
+device_initcall(init_ksym_trace);
+
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+static int ksym_tracer_stat_headers(struct seq_file *m)
+{
+	seq_printf(m, "   Access type    ");
+	seq_printf(m, "            Symbol                     Counter     \n");
+	return 0;
+}
+
+static int ksym_tracer_stat_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *stat = v;
+	struct trace_ksym *entry;
+	int access_type = 0;
+	char fn_name[KSYM_NAME_LEN];
+
+	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
+
+	if (entry->ksym_hbkpt)
+		access_type = entry->ksym_hbkpt->info.type;
+
+	switch (access_type) {
+	case HW_BREAKPOINT_WRITE:
+		seq_printf(m, "     W     ");
+		break;
+	case HW_BREAKPOINT_RW:
+		seq_printf(m, "     RW    ");
+		break;
+	default:
+		seq_printf(m, "     NA    ");
+	}
+
+	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
+		seq_printf(m, "               %s                 ", fn_name);
+	else
+		seq_printf(m, "               <NA>                ");
+
+	seq_printf(m, "%15lu\n", entry->counter);
+	return 0;
+}
+
+static void *ksym_tracer_stat_start(void)
+{
+	return &(ksym_filter_head.first);
+}
+
+static void *
+ksym_tracer_stat_next(void *v, int idx)
+{
+	struct hlist_node *stat = v;
+
+	return stat->next;
+}
+
+static struct tracer_stat ksym_tracer_stats = {
+	.name = "ksym_tracer",
+	.stat_start = ksym_tracer_stat_start,
+	.stat_next = ksym_tracer_stat_next,
+	.stat_headers = ksym_tracer_stat_headers,
+	.stat_show = ksym_tracer_stat_show
+};
+
+__init static int ksym_tracer_stat_init(void)
+{
+	int ret;
+
+	ret = register_stat_tracer(&ksym_tracer_stats);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register "
+				    "ksym tracer stats\n");
+		return 1;
+	}
+
+	return 0;
+}
+fs_initcall(ksym_tracer_stat_init);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
Index: linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/trace_selftest.c
+++ linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(stru
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
+	case TRACE_KSYM:
 		return 1;
 	}
 	return 0;
@@ -687,3 +688,38 @@ trace_selftest_startup_branch(struct tra
 	return ret;
 }
 #endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_KSYM_TRACER
+int
+trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tracing_stop();
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+	tracing_start();
+
+	/* read & write operations - one each is performed on the dummy variable
+	 * triggering two entries in the trace buffer
+	 */
+	if (!ret && count != 2) {
+		printk(KERN_CONT "Ksym tracer startup test failed");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_KSYM_TRACER */


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
@ 2009-03-20  9:04   ` Frederic Weisbecker
  2009-03-21 16:24     ` K.Prasad
  0 siblings, 1 reply; 55+ messages in thread
From: Frederic Weisbecker @ 2009-03-20  9:04 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Alan Stern,
	Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 05:20:32AM +0530, K.Prasad wrote:
> This patch adds an ftrace plugin to detect and profile memory access over
> kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> addresses.
> 
> Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> ---
>  kernel/trace/Kconfig          |   21 +
>  kernel/trace/Makefile         |    1 
>  kernel/trace/trace.h          |   25 +
>  kernel/trace/trace_ksym.c     |  555 ++++++++++++++++++++++++++++++++++++++++++
>  kernel/trace/trace_selftest.c |   36 ++
>  5 files changed, 638 insertions(+)
> 
> Index: linux-2.6-tip.hbkpt/kernel/trace/Kconfig
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/Kconfig
> +++ linux-2.6-tip.hbkpt/kernel/trace/Kconfig
> @@ -264,6 +264,27 @@ config POWER_TRACER
>  	  power management decisions, specifically the C-state and P-state
>  	  behavior.
>  
> +config KSYM_TRACER
> +	bool "Trace read and write access on kernel memory locations"
> +	depends on HAVE_HW_BREAKPOINT
> +	select TRACING
> +	help
> +	  This tracer helps find read and write operations on any given kernel
> +	  symbol i.e. /proc/kallsyms.
> +
> +config PROFILE_KSYM_TRACER
> +	bool "Profile all kernel memory accesses on 'watched' variables"
> +	depends on KSYM_TRACER
> +	help
> +	  This tracer profiles kernel accesses on variables watched through the
> +	  ksym tracer ftrace plugin. Depending upon the hardware, all read
> +	  and write operations on kernel variables can be monitored for
> +	  accesses.
> +
> +	  The results will be displayed in:
> +	  /debugfs/tracing/profile_ksym
> +
> +	  Say N if unsure.
>  
>  config STACK_TRACER
>  	bool "Trace max stack"
> Index: linux-2.6-tip.hbkpt/kernel/trace/Makefile
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/Makefile
> +++ linux-2.6-tip.hbkpt/kernel/trace/Makefile
> @@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_even
>  obj-$(CONFIG_EVENT_TRACER) += events.o
>  obj-$(CONFIG_EVENT_TRACER) += trace_export.o
>  obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
> +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
>  
>  libftrace-y := ftrace.o
> Index: linux-2.6-tip.hbkpt/kernel/trace/trace.h
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/trace.h
> +++ linux-2.6-tip.hbkpt/kernel/trace/trace.h
> @@ -12,6 +12,10 @@
>  #include <trace/kmemtrace.h>
>  #include <trace/power.h>
>  
> +#ifdef CONFIG_KSYM_TRACER
> +#include <asm/hw_breakpoint.h>
> +#endif
> +
>  enum trace_type {
>  	__TRACE_FIRST_TYPE = 0,
>  
> @@ -37,6 +41,7 @@ enum trace_type {
>  	TRACE_KMEM_FREE,
>  	TRACE_POWER,
>  	TRACE_BLK,
> +	TRACE_KSYM,
>  
>  	__TRACE_LAST_TYPE,
>  };
> @@ -214,6 +219,23 @@ struct syscall_trace_exit {
>  	unsigned long		ret;
>  };
>  
> +#ifdef CONFIG_KSYM_TRACER
> +struct trace_ksym {
> +	struct trace_entry	ent;
> +	struct hw_breakpoint	*ksym_hbkpt;
> +	unsigned long		ksym_addr;
> +	unsigned long		ip;
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +	unsigned long 		counter;
> +#endif
> +	struct hlist_node	ksym_hlist;
> +	char			ksym_name[KSYM_NAME_LEN];
> +	char			p_name[TASK_COMM_LEN];
> +};
> +#else
> +struct trace_ksym {
> +};
> +#endif /* CONFIG_KSYM_TRACER */
>  
>  /*
>   * trace_flag_type is an enumeration that holds different
> @@ -332,6 +354,7 @@ extern void __ftrace_bad_type(void);
>  			  TRACE_SYSCALL_ENTER);				\
>  		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
>  			  TRACE_SYSCALL_EXIT);				\
> +		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM);	\
>  		__ftrace_bad_type();					\
>  	} while (0)
>  
> @@ -593,6 +616,8 @@ extern int trace_selftest_startup_syspro
>  					       struct trace_array *tr);
>  extern int trace_selftest_startup_branch(struct tracer *trace,
>  					 struct trace_array *tr);
> +extern int trace_selftest_startup_ksym(struct tracer *trace,
> +					 struct trace_array *tr);
>  #endif /* CONFIG_FTRACE_STARTUP_TEST */
>  
>  extern void *head_page(struct trace_array_cpu *data);
> Index: linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
> @@ -0,0 +1,555 @@
> +/*
> + * trace_ksym.c - Kernel Symbol Tracer
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) IBM Corporation, 2009
> + */
> +
> +#include <linux/kallsyms.h>
> +#include <linux/uaccess.h>
> +#include <linux/debugfs.h>
> +#include <linux/ftrace.h>
> +#include <linux/module.h>
> +#include <linux/jhash.h>
> +#include <linux/fs.h>
> +
> +#include "trace_output.h"
> +#include "trace_stat.h"
> +#include "trace.h"
> +
> +/* For now, let us restrict the no. of symbols traced simultaneously to number
> + * of available hardware breakpoint registers.
> + */
> +#define KSYM_TRACER_MAX HB_NUM
> +
> +#define KSYM_TRACER_OP_LEN 3 /* rw- */
> +#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
> +
> +#ifdef CONFIG_FTRACE_SELFTEST
> +
> +static int ksym_selftest_dummy;
> +#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
> +
> +#endif /* CONFIG_FTRACE_SELFTEST */
> +
> +static struct trace_array *ksym_trace_array;
> +
> +DEFINE_MUTEX(ksym_tracer_mutex);
> +
> +static unsigned int ksym_filter_entry_count;
> +static unsigned int ksym_tracing_enabled;
> +
> +static HLIST_HEAD(ksym_filter_head);
> +
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +
> +#define MAX_UL_INT 0xffffffff
> +DEFINE_SPINLOCK(ksym_stat_lock);
> +
> +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> +{
> +	struct hlist_node *node;
> +	struct trace_ksym *entry;
> +
> +	spin_lock(&ksym_stat_lock);


I see that can be called from ksym_hbkpt_handler which in turn
can be called from interrupt context, right?
You can issue a deadlock if you don't disable interrupts here.

Thanks,
Frederic.

> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		if ((entry->ksym_addr == hbkpt_hit_addr) &&
> +		    (entry->counter <= MAX_UL_INT)) {
> +			entry->counter++;
> +			break;
> +		}
> +	}
> +	spin_unlock(&ksym_stat_lock);
> +}
> +#endif /* CONFIG_PROFILE_KSYM_TRACER */
> +
> +void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
> +{
> +	struct ring_buffer_event *event;
> +	struct trace_array *tr;
> +	struct trace_ksym *entry;
> +	int pc;
> +
> +	if (!ksym_tracing_enabled)
> +		return;
> +
> +	tr = ksym_trace_array;
> +	pc = preempt_count();
> +
> +	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
> +							sizeof(*entry), 0, pc);
> +	if (!event)
> +		return;
> +
> +	entry = ring_buffer_event_data(event);
> +	strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_SYMBOL_LEN);
> +	entry->ksym_hbkpt = hbkpt;
> +	entry->ip = instruction_pointer(regs);
> +	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +	ksym_collect_stats(hbkpt->info.address);
> +#endif /* CONFIG_PROFILE_KSYM_TRACER */
> +
> +	trace_buffer_unlock_commit(tr, event, 0, pc);
> +}
> +
> +/* Valid access types are represented as
> + *
> + * rw- : Set Read/Write Access Breakpoint
> + * -w- : Set Write Access Breakpoint
> + * --- : Clear Breakpoints
> + * --x : Set Execution Break points (Not available yet)
> + *
> + */
> +static int ksym_trace_get_access_type(char *access_str)
> +{
> +	int pos, access = 0;
> +
> +	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
> +		switch (access_str[pos]) {
> +		case 'r':
> +			access += (pos == 0) ? 4 : -1;
> +			break;
> +		case 'w':
> +			access += (pos == 1) ? 2 : -1;
> +			break;
> +		case '-':
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	}
> +
> +	switch (access) {
> +	case 6:
> +		access = HW_BREAKPOINT_RW;
> +		break;
> +	case 2:
> +		access = HW_BREAKPOINT_WRITE;
> +		break;
> +	case 0:
> +		access = 0;
> +	}
> +
> +	return access;
> +}
> +
> +/*
> + * There can be several possible malformed requests and we attempt to capture
> + * all of them. We enumerate some of the rules
> + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
> + *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
> + *    <module>:<ksym_name>:<op>.
> + * 2. No delimiter symbol ':' in the input string
> + * 3. Spurious operator symbols or symbols not in their respective positions
> + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
> + * 5. Kernel symbol not a part of /proc/kallsyms
> + * 6. Duplicate requests
> + */
> +static int parse_ksym_trace_str(char *input_string, char **ksymname,
> +							unsigned long *addr)
> +{
> +	char *delimiter = ":";
> +	int ret;
> +
> +	ret = -EINVAL;
> +	*ksymname = strsep(&input_string, delimiter);
> +	*addr = kallsyms_lookup_name(*ksymname);
> +
> +	/* Check for malformed request: (2), (1) and (5) */
> +	if ((!input_string) ||
> +		(strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) ||
> +			(*addr == 0))
> +		goto return_code;
> +	ret = ksym_trace_get_access_type(input_string);
> +
> +return_code:
> +	return ret;
> +}
> +
> +static int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
> +{
> +	struct trace_ksym *entry;
> +	int ret;
> +
> +	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
> +		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
> +		" new requests for tracing can be accepted now.\n",
> +			KSYM_TRACER_MAX);
> +		return -ENOSPC;
> +	}
> +
> +	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
> +	if (!entry)
> +		return -ENOMEM;
> +
> +	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
> +	if (!entry->ksym_hbkpt) {
> +		kfree(entry);
> +		return -ENOMEM;
> +	}
> +
> +	entry->ksym_hbkpt->info.name = ksymname;
> +	entry->ksym_hbkpt->info.type = op;
> +	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
> +	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
> +
> +	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
> +
> +	ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +	if (ret < 0) {
> +		printk(KERN_INFO "ksym_tracer request failed. Try again"
> +					" later!!\n");
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +		return -EAGAIN;
> +	}
> +	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
> +	ksym_filter_entry_count++;
> +
> +	return 0;
> +}
> +
> +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
> +	ssize_t ret, cnt = 0;
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
> +				entry->ksym_hbkpt->info.name);
> +		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"-w-\n");
> +		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"rw-\n");
> +	}
> +	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
> +	mutex_unlock(&ksym_tracer_mutex);
> +
> +	return ret;
> +}
> +
> +static ssize_t ksym_trace_filter_write(struct file *file,
> +					const char __user *buffer,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char *input_string, *ksymname = NULL;
> +	unsigned long ksym_addr = 0;
> +	int ret, op, changed = 0;
> +
> +	/* Ignore echo "" > ksym_trace_filter */
> +	if (count == 0)
> +		return 0;
> +
> +	input_string = kzalloc(count, GFP_KERNEL);
> +	if (!input_string)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(input_string, buffer, count)) {
> +		kfree(input_string);
> +		return -EFAULT;
> +	}
> +
> +	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
> +	if (ret < 0) {
> +		kfree(input_string);
> +		return ret;
> +	}
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	ret = -EINVAL;
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		if (entry->ksym_addr == ksym_addr) {
> +			/* Check for malformed request: (6) */
> +			if (entry->ksym_hbkpt->info.type != op)
> +				changed = 1;
> +			else
> +				goto err_ret;
> +			break;
> +		}
> +	}
> +	if (changed) {
> +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +		entry->ksym_hbkpt->info.type = op;
> +		if (op > 0) {
> +			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +			if (ret == 0) {
> +				ret = count;
> +				goto unlock_ret_path;
> +			}
> +		}
> +		ksym_filter_entry_count--;
> +		hlist_del(&(entry->ksym_hlist));
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +		ret = count;
> +		goto err_ret;
> +	} else {
> +		/* Check for malformed request: (4) */
> +		if (op == 0)
> +			goto err_ret;
> +		ret = process_new_ksym_entry(ksymname, op, ksym_addr);
> +		if (ret)
> +			goto err_ret;
> +	}
> +	ret = count;
> +	goto unlock_ret_path;
> +
> +err_ret:
> +	kfree(input_string);
> +
> +unlock_ret_path:
> +	mutex_unlock(&ksym_tracer_mutex);
> +	return ret;
> +}
> +
> +static const struct file_operations ksym_tracing_fops = {
> +	.open		= tracing_open_generic,
> +	.read		= ksym_trace_filter_read,
> +	.write		= ksym_trace_filter_write,
> +};
> +
> +static void ksym_trace_reset(struct trace_array *tr)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node, *node1;
> +
> +	ksym_tracing_enabled = 0;
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
> +								ksym_hlist) {
> +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +		ksym_filter_entry_count--;
> +		hlist_del(&(entry->ksym_hlist));
> +
> +		/* Free the 'input_string' only if reset
> +		 * after startup self-test
> +		 */
> +#ifdef CONFIG_FTRACE_SELFTEST
> +		if (strncmp(entry->ksym_hbkpt->info.name, KSYM_SELFTEST_ENTRY,
> +					strlen(KSYM_SELFTEST_ENTRY)) != 0)
> +#endif /* CONFIG_FTRACE_SELFTEST*/
> +			kfree(entry->ksym_hbkpt->info.name);
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +	}
> +	mutex_unlock(&ksym_tracer_mutex);
> +
> +}
> +
> +static int ksym_trace_init(struct trace_array *tr)
> +{
> +	int cpu, ret = 0;
> +
> +	for_each_online_cpu(cpu)
> +		tracing_reset(tr, cpu);
> +
> +	ksym_tracing_enabled = 1;
> +	ksym_trace_array = tr;
> +
> +#ifdef CONFIG_FTRACE_SELFTEST
> +	/* Check if we are re-entering self-test code during initialisation */
> +	if (ksym_selftest_dummy)
> +		goto ret_path;
> +
> +	ksym_selftest_dummy = 0;
> +
> +	/* Register the read-write tracing request */
> +	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
> +					(unsigned long)(&ksym_selftest_dummy));
> +
> +	if (ret < 0) {
> +		printk(KERN_CONT "ksym_trace read-write startup test failed\n");
> +		goto ret_path;
> +	}
> +	/* Perform a read and a write operation over the dummy variable to
> +	 * trigger the tracer
> +	 */
> +	if (ksym_selftest_dummy == 0)
> +		ksym_selftest_dummy++;
> +
> +ret_path:
> +#endif /* CONFIG_FTRACE_SELFTEST */
> +
> +	return ret;
> +}
> +
> +static void ksym_trace_print_header(struct seq_file *m)
> +{
> +
> +	seq_puts(m,
> +		 "#       TASK-PID      CPU#      Symbol         Type    "
> +		 "Function         \n");
> +	seq_puts(m,
> +		 "#          |           |          |              |         "
> +		 "|            \n");
> +}
> +
> +static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
> +{
> +	struct trace_entry *entry = iter->ent;
> +	struct trace_seq *s = &iter->seq;
> +	struct trace_ksym *field;
> +	char str[KSYM_SYMBOL_LEN];
> +	int ret;
> +
> +	trace_assign_type(field, entry);
> +
> +	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
> +				entry->pid, iter->cpu, field->ksym_name);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	switch (field->ksym_hbkpt->info.type) {
> +	case HW_BREAKPOINT_WRITE:
> +		ret = trace_seq_printf(s, " W  ");
> +		break;
> +	case HW_BREAKPOINT_RW:
> +		ret = trace_seq_printf(s, " RW ");
> +		break;
> +	default:
> +		return TRACE_TYPE_PARTIAL_LINE;
> +	}
> +
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	sprint_symbol(str, field->ip);
> +	ret = trace_seq_printf(s, "%-20s\n", str);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	return TRACE_TYPE_HANDLED;
> +}
> +
> +struct tracer ksym_tracer __read_mostly =
> +{
> +	.name		= "ksym_tracer",
> +	.init		= ksym_trace_init,
> +	.reset		= ksym_trace_reset,
> +#ifdef CONFIG_FTRACE_SELFTEST
> +	.selftest	= trace_selftest_startup_ksym,
> +#endif
> +	.print_header   = ksym_trace_print_header,
> +	.print_line	= ksym_trace_output
> +};
> +
> +__init static int init_ksym_trace(void)
> +{
> +	struct dentry *d_tracer;
> +	struct dentry *entry;
> +
> +	d_tracer = tracing_init_dentry();
> +	ksym_filter_entry_count = 0;
> +
> +	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
> +				    NULL, &ksym_tracing_fops);
> +	if (!entry)
> +		pr_warning("Could not create debugfs "
> +			   "'ksym_trace_filter' file\n");
> +
> +	return register_tracer(&ksym_tracer);
> +}
> +device_initcall(init_ksym_trace);
> +
> +
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +static int ksym_tracer_stat_headers(struct seq_file *m)
> +{
> +	seq_printf(m, "   Access type    ");
> +	seq_printf(m, "            Symbol                     Counter     \n");
> +	return 0;
> +}
> +
> +static int ksym_tracer_stat_show(struct seq_file *m, void *v)
> +{
> +	struct hlist_node *stat = v;
> +	struct trace_ksym *entry;
> +	int access_type = 0;
> +	char fn_name[KSYM_NAME_LEN];
> +
> +	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
> +
> +	if (entry->ksym_hbkpt)
> +		access_type = entry->ksym_hbkpt->info.type;
> +
> +	switch (access_type) {
> +	case HW_BREAKPOINT_WRITE:
> +		seq_printf(m, "     W     ");
> +		break;
> +	case HW_BREAKPOINT_RW:
> +		seq_printf(m, "     RW    ");
> +		break;
> +	default:
> +		seq_printf(m, "     NA    ");
> +	}
> +
> +	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
> +		seq_printf(m, "               %s                 ", fn_name);
> +	else
> +		seq_printf(m, "               <NA>                ");
> +
> +	seq_printf(m, "%15lu\n", entry->counter);
> +	return 0;
> +}
> +
> +static void *ksym_tracer_stat_start(void)
> +{
> +	return &(ksym_filter_head.first);
> +}
> +
> +static void *
> +ksym_tracer_stat_next(void *v, int idx)
> +{
> +	struct hlist_node *stat = v;
> +
> +	return stat->next;
> +}
> +
> +static struct tracer_stat ksym_tracer_stats = {
> +	.name = "ksym_tracer",
> +	.stat_start = ksym_tracer_stat_start,
> +	.stat_next = ksym_tracer_stat_next,
> +	.stat_headers = ksym_tracer_stat_headers,
> +	.stat_show = ksym_tracer_stat_show
> +};
> +
> +__init static int ksym_tracer_stat_init(void)
> +{
> +	int ret;
> +
> +	ret = register_stat_tracer(&ksym_tracer_stats);
> +	if (!ret) {
> +		printk(KERN_WARNING "Warning: could not register "
> +				    "ksym tracer stats\n");
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +fs_initcall(ksym_tracer_stat_init);
> +#endif /* CONFIG_PROFILE_KSYM_TRACER */
> Index: linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/trace_selftest.c
> +++ linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
> @@ -16,6 +16,7 @@ static inline int trace_valid_entry(stru
>  	case TRACE_BRANCH:
>  	case TRACE_GRAPH_ENT:
>  	case TRACE_GRAPH_RET:
> +	case TRACE_KSYM:
>  		return 1;
>  	}
>  	return 0;
> @@ -687,3 +688,38 @@ trace_selftest_startup_branch(struct tra
>  	return ret;
>  }
>  #endif /* CONFIG_BRANCH_TRACER */
> +
> +#ifdef CONFIG_KSYM_TRACER
> +int
> +trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
> +{
> +	unsigned long count;
> +	int ret;
> +
> +	/* start the tracing */
> +	ret = tracer_init(trace, tr);
> +	if (ret) {
> +		warn_failed_init_tracer(trace, ret);
> +		return ret;
> +	}
> +
> +	/* Sleep for a 1/10 of a second */
> +	msleep(100);
> +	/* stop the tracing. */
> +	tracing_stop();
> +	/* check the trace buffer */
> +	ret = trace_test_buffer(tr, &count);
> +	trace->reset(tr);
> +	tracing_start();
> +
> +	/* read & write operations - one each is performed on the dummy variable
> +	 * triggering two entries in the trace buffer
> +	 */
> +	if (!ret && count != 2) {
> +		printk(KERN_CONT "Ksym tracer startup test failed");
> +		ret = -1;
> +	}
> +
> +	return ret;
> +}
> +#endif /* CONFIG_KSYM_TRACER */
> 


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
@ 2009-03-20 14:33   ` Alan Stern
  2009-03-20 18:30     ` Ingo Molnar
                       ` (2 more replies)
  0 siblings, 3 replies; 55+ messages in thread
From: Alan Stern @ 2009-03-20 14:33 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, 20 Mar 2009, K.Prasad wrote:

> This patch introduces two new files hw_breakpoint.[ch] which defines the 
> generic interfaces to use hardware breakpoint infrastructure of the system. 

Prasad:

I'm sorry to say this is full of mistakes.  So far I have looked only 
at patch 01/11, but it's not good.

> + * Kernel breakpoints grow downwards, starting from HB_NUM
> + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> + * kernel-space request
> + */
> +unsigned int hbkpt_kernel_pos;

This doesn't make much sense.  All you need to know is which registers
are in use; all others are available.

For example, suppose the kernel allocated breakpoints 3, 2, and 1, and
then deallocated 2.  Then bp 2 would be available for use, even though
2 > 1.

It's also a poor choice of name.  Everywhere else (in my patches,
anyway) the code refers to hardware breakpoints using the abbreviation
"hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
"hbkpt".

> +/* An array containing refcount of threads using a given bkpt register */
> +unsigned int hbkpt_user_max_refcount[HB_NUM];

Why did you put "max" in the name?  Isn't this just a simple refcount?

> +/* One higher than the highest counted user-space breakpoint register */
> +unsigned int hbkpt_user_max;

Likewise, this variable isn't really needed.  It's just one more than
the largest i such that hbkpt_user_max_refcount[i] > 0.

> +/*
> + * Install the debug register values for a new thread.
> + */
> +void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
> +{
> +	/* Set the debug register */

Set _which_ debug register?

> +	arch_install_thread_hbkpt(tsk);
> +	last_debugged_task = current;
> +
> +	put_cpu_no_resched();

What's this line doing here?  It looks like something you forgot to
erase.

> +}
> +
> +/*
> + * Install the debug register values for just the kernel, no thread.
> + */
> +void switch_to_none_hw_breakpoint(void)
> +{
> +	arch_install_none();
> +	put_cpu_no_resched();

Same for this line.

> +}
> +
> +/*
> + * Load the debug registers during startup of a CPU.
> + */
> +void load_debug_registers(void)
> +{
> +	int i;
> +	unsigned long flags;
> +
> +	/* Prevent IPIs for new kernel breakpoint updates */
> +	local_irq_save(flags);
> +
> +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> +		if (hbkpt_kernel[i])
> +			on_each_cpu(arch_install_kernel_hbkpt,
> +				(void *)hbkpt_kernel[i], 0);

This is completely wrong.  First of all, it's dumb to send multiple
IPIs (one for each iteration through the loop).  Second, this routine
shouldn't send any IPIs at all!  It gets invoked when a CPU is
starting up and wants to load its _own_ debug registers -- not tell
another CPU to load anything.

> +	if (current->thread.dr7)
> +		arch_install_thread_hbkpt(current);
> +
> +	local_irq_restore(flags);
> +}
> +
> +/*
> + * Erase all the hardware breakpoint info associated with a thread.
> + *
> + * If tsk != current then tsk must not be usable (for example, a
> + * child being cleaned up from a failed fork).
> + */
> +void flush_thread_hw_breakpoint(struct task_struct *tsk)
> +{
> +	int i;
> +	struct thread_struct *thread = &(tsk->thread);
> +
> +	mutex_lock(&hw_breakpoint_mutex);
> +
> +	/* Let the breakpoints know they are being uninstalled */

This comment looks like a leftover which should have been erased.

> +/*
> + * Validate the settings in a hw_breakpoint structure.
> + */
> +static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
> +{
> +	int ret;
> +	unsigned int align;
> +
> +	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
> +	if (ret < 0)
> +		goto err;
> +
> +	/* Check that the low-order bits of the address are appropriate
> +	 * for the alignment implied by len.
> +	 */
> +	if (bp->info.address & align)
> +		return -EINVAL;
> +
> +	/* Check that the virtual address is in the proper range */
> +	if (tsk) {
> +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> +			return -EFAULT;
> +	} else {
> +		if (!arch_check_va_in_kernelspace(bp->info.address))
> +			return -EFAULT;
> +	}

Roland pointed out that these checks need to take into account the
length of the breakpoint.  For example, in
arch_check_va_in_userspace() it isn't sufficient for the start of the
breakpoint region to be a userspace address; the end of the
breakpoint region must also be in userspace.

> + err:
> +	return ret;
> +}
> +
> +int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
> +					struct hw_breakpoint *bp)
> +{
> +	struct thread_struct *thread = &(tsk->thread);
> +	int rc;
> +
> +	/* Do not overcommit. Fail if kernel has used the hbkpt registers */
> +	if (pos >= hbkpt_kernel_pos)
> +		return -ENOSPC;

In fact you should fail if the debug register is already in use,
regardless of whether it is being used by a kernel breakpoint.  And you 
shouldn't check against hbkpt_kernel_pos; you should check whether 
hbkpt_kernel[pos] is NULL and thread->hbkpt[pos] is NULL.

> +
> +	rc = validate_settings(bp, tsk);
> +	if (rc)
> +		return rc;
> +
> +	thread->hbkpt[pos] = bp;
> +	thread->hbkpt_num_installed++;
> +	hbkpt_user_max_refcount[pos]++;
> +	/* 'tsk' is the thread that uses max number of hbkpt registers */

This is a bad comment.  It sounds like it's saying that "tsk" is
defined as the thread using the maximum number of breakpoints, rather
than being defined as the thread for which the breakpoint is being
registered.

Besides, there's no reason to keep track of which thread uses the max 
number of breakpoints anyway.  Not to mention the fact that you don't 
update hbkpt_user_max when its thread exits.

> +	if (hbkpt_user_max < thread->hbkpt_num_installed)
> +		hbkpt_user_max++;

At this point I got tired of looking, but it seems obvious that the new 
patch series needs a bunch of improvements.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 14:33   ` Alan Stern
@ 2009-03-20 18:30     ` Ingo Molnar
  2009-03-21 17:32       ` K.Prasad
  2009-03-20 18:32     ` Ingo Molnar
  2009-03-21 17:26     ` K.Prasad
  2 siblings, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2009-03-20 18:30 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt


* Alan Stern <stern@rowland.harvard.edu> wrote:

> > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > + * kernel-space request
> > + */
> > +unsigned int hbkpt_kernel_pos;
> 
> This doesn't make much sense.  All you need to know is which 
> registers are in use; all others are available.
> 
> For example, suppose the kernel allocated breakpoints 3, 2, and 1, 
> and then deallocated 2.  Then bp 2 would be available for use, 
> even though 2 > 1.

it's a high/low watermark mechanism. Yes, it's not an allocator that 
can allocate into a debug registrs 'hole', but it is a simple one 
that matches current hardware breakpoint usages and enables the 
kernel to utilize them as well - and keeps all the code simple.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 14:33   ` Alan Stern
  2009-03-20 18:30     ` Ingo Molnar
@ 2009-03-20 18:32     ` Ingo Molnar
  2009-03-21 17:26     ` K.Prasad
  2 siblings, 0 replies; 55+ messages in thread
From: Ingo Molnar @ 2009-03-20 18:32 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt


* Alan Stern <stern@rowland.harvard.edu> wrote:

> > +	/* Check that the virtual address is in the proper range */
> > +	if (tsk) {
> > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > +			return -EFAULT;
> > +	} else {
> > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > +			return -EFAULT;
> > +	}
> 
> Roland pointed out that these checks need to take into account the 
> length of the breakpoint.  For example, in 
> arch_check_va_in_userspace() it isn't sufficient for the start of 
> the breakpoint region to be a userspace address; the end of the 
> breakpoint region must also be in userspace.

i pointed it out - but yes, this needs to be fixed.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-20  9:04   ` Frederic Weisbecker
@ 2009-03-21 16:24     ` K.Prasad
  2009-03-21 16:39       ` Steven Rostedt
  0 siblings, 1 reply; 55+ messages in thread
From: K.Prasad @ 2009-03-21 16:24 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Ingo Molnar, Linux Kernel Mailing List, Alan Stern,
	Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 10:04:52AM +0100, Frederic Weisbecker wrote:
> On Fri, Mar 20, 2009 at 05:20:32AM +0530, K.Prasad wrote:
> > This patch adds an ftrace plugin to detect and profile memory access over
> > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > addresses.
> > 
> > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > ---
> >  kernel/trace/Kconfig          |   21 +
> >  kernel/trace/Makefile         |    1 
> >  kernel/trace/trace.h          |   25 +
> >  kernel/trace/trace_ksym.c     |  555 ++++++++++++++++++++++++++++++++++++++++++
> >  kernel/trace/trace_selftest.c |   36 ++
> >  5 files changed, 638 insertions(+)
> > 

> > +
> > +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> > +{
> > +	struct hlist_node *node;
> > +	struct trace_ksym *entry;
> > +
> > +	spin_lock(&ksym_stat_lock);
> 
> 
> I see that can be called from ksym_hbkpt_handler which in turn
> can be called from interrupt context, right?
> You can issue a deadlock if you don't disable interrupts here.
> 
> Thanks,
> Frederic.
> 

ksym_collect_stats<--ksym_hbkpt_handler<--hw_breakpoint_handler<--do_debug
invocation happens with interrupts enabled (IF bit is set). I do find
that a few plugins in kernel/trace enclose the
trace_buffer_lock_reserve()--trace_buffer_unlock_commit() invocation
within interrupt-disabled code. Is that a requirement there?

The potential deadlock scenario you foresee isn't obvious to me. Can you
explain?

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-21 16:24     ` K.Prasad
@ 2009-03-21 16:39       ` Steven Rostedt
  2009-03-23 19:08         ` K.Prasad
  0 siblings, 1 reply; 55+ messages in thread
From: Steven Rostedt @ 2009-03-21 16:39 UTC (permalink / raw)
  To: K.Prasad
  Cc: Frederic Weisbecker, Ingo Molnar, Linux Kernel Mailing List,
	Alan Stern, Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath


On Sat, 21 Mar 2009, K.Prasad wrote:
> > > 
> 
> > > +
> > > +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> > > +{
> > > +	struct hlist_node *node;
> > > +	struct trace_ksym *entry;
> > > +
> > > +	spin_lock(&ksym_stat_lock);
> > 
> > 
> > I see that can be called from ksym_hbkpt_handler which in turn
> > can be called from interrupt context, right?
> > You can issue a deadlock if you don't disable interrupts here.
> > 
> > Thanks,
> > Frederic.
> > 
> 
> ksym_collect_stats<--ksym_hbkpt_handler<--hw_breakpoint_handler<--do_debug
> invocation happens with interrupts enabled (IF bit is set). I do find
> that a few plugins in kernel/trace enclose the
> trace_buffer_lock_reserve()--trace_buffer_unlock_commit() invocation
> within interrupt-disabled code. Is that a requirement there?
> 
> The potential deadlock scenario you foresee isn't obvious to me. Can you
> explain?

Can that lock ever be taken in an interrupt? If not, document that (and 
perhaps add a WARN_ON(in_interrupt()); ). Otherwise you have a possible:

	spin_lock(&ksym_stat_lock);

		===> take interrupt ...

			(from interrupt)
			spin_lock(&ksym_stat_lock); <== deadlock.


-- Steve


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 14:33   ` Alan Stern
  2009-03-20 18:30     ` Ingo Molnar
  2009-03-20 18:32     ` Ingo Molnar
@ 2009-03-21 17:26     ` K.Prasad
  2009-03-21 21:39       ` Alan Stern
  2 siblings, 1 reply; 55+ messages in thread
From: K.Prasad @ 2009-03-21 17:26 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 10:33:26AM -0400, Alan Stern wrote:
> On Fri, 20 Mar 2009, K.Prasad wrote:
> 
> > This patch introduces two new files hw_breakpoint.[ch] which defines the 
> > generic interfaces to use hardware breakpoint infrastructure of the system. 
> 
> Prasad:
> 
> I'm sorry to say this is full of mistakes.  So far I have looked only 
> at patch 01/11, but it's not good.
> 

After you pointed out, I realise that the code in load_debug_registers()
is an overkill and unregister_kernel_hw_breakpoint() has an obvious 
error which should have caught my attention. My next revision should 
fix them.

> > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > + * kernel-space request
> > + */
> > +unsigned int hbkpt_kernel_pos;
> 
> This doesn't make much sense.  All you need to know is which registers
> are in use; all others are available.
> 

As explained by Maneesh earlier, we compact the kernel-space requests
into registers (HB_NUM - 1) to hbkpt_kernel_pos. The kernel-space
requests aren't specific to any given register number too, and so
compaction would be suitable for this case (unlike when implemented for
user-space which might need virtualisation of registers).

> For example, suppose the kernel allocated breakpoints 3, 2, and 1, and
> then deallocated 2.  Then bp 2 would be available for use, even though
> 2 > 1.
> 
> It's also a poor choice of name.  Everywhere else (in my patches,
> anyway) the code refers to hardware breakpoints using the abbreviation
> "hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
> "hbkpt".
> 

I began using 'hbkpt' as a shorter naming convention (the longer one
being hw_breakpoint) without being really conscious of the 'hwbkpt'
usage by you (even some of the previous iterations contained them in
samples/hw_breakpoint and ftrace-plugin).

Well, I will rename my shorter name to 'hwbkpt' for uniformity.

> > +/* An array containing refcount of threads using a given bkpt register */
> > +unsigned int hbkpt_user_max_refcount[HB_NUM];
> 
> Why did you put "max" in the name?  Isn't this just a simple refcount?
> 

Ok. It will be hbkpt_user_refcount[].

> > +/* One higher than the highest counted user-space breakpoint register */
> > +unsigned int hbkpt_user_max;
> 
> Likewise, this variable isn't really needed.  It's just one more than
> the largest i such that hbkpt_user_max_refcount[i] > 0.
> 

It acts like a cache for determining the user-space breakpoint boundary.
It is used for sanity checks and in its absence we will have to compute from
hbkpt_user_max_refcount[] everytime.

> > +/*
> > + * Install the debug register values for a new thread.
> > + */
> > +void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
> > +{
> > +	/* Set the debug register */
> 
> Set _which_ debug register?
> 

Will change it to read:
/* Set all debug registers used by 'tsk' */

> > +	arch_install_thread_hbkpt(tsk);
> > +	last_debugged_task = current;
> > +
> > +	put_cpu_no_resched();
> 
> What's this line doing here?  It looks like something you forgot to
> erase.
> 
> > +}
> > +
> > +/*
> > + * Install the debug register values for just the kernel, no thread.
> > + */
> > +void switch_to_none_hw_breakpoint(void)
> > +{
> > +	arch_install_none();
> > +	put_cpu_no_resched();
> 
> Same for this line.
> 

These are carriages from the previous code. They are still invoked from
the same places (such as flush_thread_hw_breakpoint(),
hw_breakpoint_handler()) and hence I didn't analyse it enough to see if
they were to be removed.

However, having found that preempt_count() is already zero at places where
these are called I think they can be removed.

> > +}
> > +
> > +/*
> > + * Load the debug registers during startup of a CPU.
> > + */
> > +void load_debug_registers(void)
> > +{
> > +	int i;
> > +	unsigned long flags;
> > +
> > +	/* Prevent IPIs for new kernel breakpoint updates */
> > +	local_irq_save(flags);
> > +
> > +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> > +		if (hbkpt_kernel[i])
> > +			on_each_cpu(arch_install_kernel_hbkpt,
> > +				(void *)hbkpt_kernel[i], 0);
> 
> This is completely wrong.  First of all, it's dumb to send multiple
> IPIs (one for each iteration through the loop).  Second, this routine
> shouldn't send any IPIs at all!  It gets invoked when a CPU is
> starting up and wants to load its _own_ debug registers -- not tell
> another CPU to load anything.
> 

As I agreed before, it is an overkill (given the design of
arch_install_kernel_hbkpt()). I will create a new
arch_update_kernel_hbkpt(pos, bp) that will install breakpoints only
on the CPU starting up.

> > +	if (current->thread.dr7)
> > +		arch_install_thread_hbkpt(current);
> > +
> > +	local_irq_restore(flags);
> > +}
> > +
> > +/*
> > + * Erase all the hardware breakpoint info associated with a thread.
> > + *
> > + * If tsk != current then tsk must not be usable (for example, a
> > + * child being cleaned up from a failed fork).
> > + */
> > +void flush_thread_hw_breakpoint(struct task_struct *tsk)
> > +{
> > +	int i;
> > +	struct thread_struct *thread = &(tsk->thread);
> > +
> > +	mutex_lock(&hw_breakpoint_mutex);
> > +
> > +	/* Let the breakpoints know they are being uninstalled */
> 
> This comment looks like a leftover which should have been erased.
> 
> > +/*
> > + * Validate the settings in a hw_breakpoint structure.
> > + */
> > +static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
> > +{
> > +	int ret;
> > +	unsigned int align;
> > +
> > +	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
> > +	if (ret < 0)
> > +		goto err;
> > +
> > +	/* Check that the low-order bits of the address are appropriate
> > +	 * for the alignment implied by len.
> > +	 */
> > +	if (bp->info.address & align)
> > +		return -EINVAL;
> > +
> > +	/* Check that the virtual address is in the proper range */
> > +	if (tsk) {
> > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > +			return -EFAULT;
> > +	} else {
> > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > +			return -EFAULT;
> > +	}
> 
> Roland pointed out that these checks need to take into account the
> length of the breakpoint.  For example, in
> arch_check_va_in_userspace() it isn't sufficient for the start of the
> breakpoint region to be a userspace address; the end of the
> breakpoint region must also be in userspace.
> 

Ok. Will do something like:
return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));

> > + err:
> > +	return ret;
> > +}
> > +
> > +int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
> > +					struct hw_breakpoint *bp)
> > +{
> > +	struct thread_struct *thread = &(tsk->thread);
> > +	int rc;
> > +
> > +	/* Do not overcommit. Fail if kernel has used the hbkpt registers */
> > +	if (pos >= hbkpt_kernel_pos)
> > +		return -ENOSPC;
> 
> In fact you should fail if the debug register is already in use,
> regardless of whether it is being used by a kernel breakpoint.  And you 
> shouldn't check against hbkpt_kernel_pos; you should check whether 
> hbkpt_kernel[pos] is NULL and thread->hbkpt[pos] is NULL.
> 

As explained before, the intended design was like this:

ample layout:
hbkpt_kernel_pos = 1
hbkpt_user_max = 1

---------------------------------------------------------------------
|                |                |                |                |
|       DR3      |       DR2      |       DR1      |       DR0      |
|                |                |                |                |
---------------------------------------------------------------------
^                                                  ^                ^
|                                                  |                |
-----------------kernel-space addresses-------------------user-------

After removing breakpoint in say DR2, compaction occurs.
New layout will be:
hbkpt_kernel_pos = 2
hbkpt_user_max = 1

---------------------------------------------------------------------
|                |                |                |                |
|       DR3      |       DR2      |       DR1      |       DR0      |
|                |                |                |                |
---------------------------------------------------------------------
^                                 ^                ^                ^
|                                 |                |                |
-----------------kernel------------------empty-----------user--------

The above design, in my opinion is intuitive, allows re-use of
uninstalled registers and is simple to implement.

What was missing in the sent patch was the updation of dr7 and dr[pos]
register after compaction. I will add them in the next iteration of the
patch.

> > +
> > +	rc = validate_settings(bp, tsk);
> > +	if (rc)
> > +		return rc;
> > +
> > +	thread->hbkpt[pos] = bp;
> > +	thread->hbkpt_num_installed++;
> > +	hbkpt_user_max_refcount[pos]++;
> > +	/* 'tsk' is the thread that uses max number of hbkpt registers */
> 
> This is a bad comment.  It sounds like it's saying that "tsk" is
> defined as the thread using the maximum number of breakpoints, rather
> than being defined as the thread for which the breakpoint is being
> registered.
> 
> Besides, there's no reason to keep track of which thread uses the max 
> number of breakpoints anyway.  Not to mention the fact that you don't 
> update hbkpt_user_max when its thread exits.
> 

We don't keep track of the thread (in the sense the task_struct) but
'hbkpt_user_max' is used for validating requests and book-keeping. As
Maneesh mentioned before flush_thread_hw_breakpoint() updates
'hbkpt_user_max'.

I can change it to read like the one below if it sounds better to you.

/* 
 * 'tsk' uses more number of registers than 'hbkpt_user_max'. Update
 * the same.
 */

> > +	if (hbkpt_user_max < thread->hbkpt_num_installed)
> > +		hbkpt_user_max++;
> 
> At this point I got tired of looking, but it seems obvious that the new 
> patch series needs a bunch of improvements.
> 
> Alan Stern
>

As mentioned before the next iteration would contain the changes I've
discussed above.

Thanks,
K.Prasad
 

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 18:30     ` Ingo Molnar
@ 2009-03-21 17:32       ` K.Prasad
  0 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-21 17:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 07:30:58PM +0100, Ingo Molnar wrote:
> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > > + * kernel-space request
> > > + */
> > > +unsigned int hbkpt_kernel_pos;
> > 
> > This doesn't make much sense.  All you need to know is which 
> > registers are in use; all others are available.
> > 
> > For example, suppose the kernel allocated breakpoints 3, 2, and 1, 
> > and then deallocated 2.  Then bp 2 would be available for use, 
> > even though 2 > 1.
> 
> it's a high/low watermark mechanism. Yes, it's not an allocator that 
> can allocate into a debug registrs 'hole', but it is a simple one 
> that matches current hardware breakpoint usages and enables the 
> kernel to utilize them as well - and keeps all the code simple.
> 
> 	Ingo

I've explained the design here: http://lkml.org/lkml/2009/3/21/169 in a
and is slightly different from what you've explained above.

It involves shifting of kernel-space registers by one-level if a
kernel-register is uninstalled. We compact the kernel-space registers
since a)not to leave a 'hole' thereby wasting a register forever during
runtime b)kernel-space requests are not specific to a register number
and can be moved at will (unlike user-space requests).

Hope that the design is acceptable and the resultant code - simple.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-21 17:26     ` K.Prasad
@ 2009-03-21 21:39       ` Alan Stern
  2009-03-23 19:03         ` K.Prasad
  0 siblings, 1 reply; 55+ messages in thread
From: Alan Stern @ 2009-03-21 21:39 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Sat, 21 Mar 2009, K.Prasad wrote:

> > > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > > + * kernel-space request
> > > + */
> > > +unsigned int hbkpt_kernel_pos;
> > 
> > This doesn't make much sense.  All you need to know is which registers
> > are in use; all others are available.
> > 
> 
> As explained by Maneesh earlier, we compact the kernel-space requests
> into registers (HB_NUM - 1) to hbkpt_kernel_pos. The kernel-space
> requests aren't specific to any given register number too, and so
> compaction would be suitable for this case (unlike when implemented for
> user-space which might need virtualisation of registers).

Okay, that makes sense.  Perhaps you could add a short comment here
explaining that the register allocations get compacted when a kernel
breakpoint is unregistered, so they will always be contiguous.

> > It's also a poor choice of name.  Everywhere else (in my patches,
> > anyway) the code refers to hardware breakpoints using the abbreviation
> > "hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
> > "hbkpt".
> > 
> 
> I began using 'hbkpt' as a shorter naming convention (the longer one
> being hw_breakpoint) without being really conscious of the 'hwbkpt'
> usage by you (even some of the previous iterations contained them in
> samples/hw_breakpoint and ftrace-plugin).
> 
> Well, I will rename my shorter name to 'hwbkpt' for uniformity.

My patch never used "hwbkpt".  Besides "hw_breakpoint", it used only 
"bp".  On going back and checking, I found that it didn't even use 
"hwbp".  Some other abbreviations it did use were "kbp" for kernel 
breakpoint, "chbi" for per-CPU hardware breakpoint info, and "thbi" for 
per-thread hardware breakpoint info.

If you're looking for a good short name, and if you want to keep 
hardware breakpoints distinct from software breakpoints, I suggest 
"hbp" instead of "hbkpt".  But it's up to you, and it's worth noticing 
that the code already contains lots of variables named just "bp".

> > > +/* One higher than the highest counted user-space breakpoint register */
> > > +unsigned int hbkpt_user_max;
> > 
> > Likewise, this variable isn't really needed.  It's just one more than
> > the largest i such that hbkpt_user_max_refcount[i] > 0.
> > 
> 
> It acts like a cache for determining the user-space breakpoint boundary.
> It is used for sanity checks and in its absence we will have to compute from
> hbkpt_user_max_refcount[] everytime.

That's right.  Isn't it simpler to check

	kernel_pos > 0 && hbkpt_user_refcount[kernel_pos - 1] == 0

than to check

	kernel_pos - 1 >= hbkpt_user_max

_and_ to keep hbkpt_user_max set to the correct value at all times?

> > > +/*
> > > + * Load the debug registers during startup of a CPU.
> > > + */
> > > +void load_debug_registers(void)
> > > +{
> > > +	int i;
> > > +	unsigned long flags;
> > > +
> > > +	/* Prevent IPIs for new kernel breakpoint updates */
> > > +	local_irq_save(flags);
> > > +
> > > +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> > > +		if (hbkpt_kernel[i])
> > > +			on_each_cpu(arch_install_kernel_hbkpt,
> > > +				(void *)hbkpt_kernel[i], 0);
> > 
> > This is completely wrong.  First of all, it's dumb to send multiple
> > IPIs (one for each iteration through the loop).  Second, this routine
> > shouldn't send any IPIs at all!  It gets invoked when a CPU is
> > starting up and wants to load its _own_ debug registers -- not tell
> > another CPU to load anything.
> > 
> 
> As I agreed before, it is an overkill (given the design of
> arch_install_kernel_hbkpt()). I will create a new
> arch_update_kernel_hbkpt(pos, bp) that will install breakpoints only
> on the CPU starting up.

Doesn't arch_install_kernel_hbkpt() already install breakpoints
on only the current CPU?  So why do you need a new function?

> > > +	/* Check that the virtual address is in the proper range */
> > > +	if (tsk) {
> > > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > > +			return -EFAULT;
> > > +	} else {
> > > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > > +			return -EFAULT;
> > > +	}
> > 
> > Roland pointed out that these checks need to take into account the
> > length of the breakpoint.  For example, in
> > arch_check_va_in_userspace() it isn't sufficient for the start of the
> > breakpoint region to be a userspace address; the end of the
> > breakpoint region must also be in userspace.
> > 
> 
> Ok. Will do something like:
> return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));

What is the purpose of word_size here?  The breakpoint length should be 
specified in bytes, not words.

Don't forget that that in arch_check_va_in_kernelspace() you need to 
check both for values that are too low and values that are too high 
(they overflow and wrap around back to a user address).

> We don't keep track of the thread (in the sense the task_struct) but
> 'hbkpt_user_max' is used for validating requests and book-keeping. As
> Maneesh mentioned before flush_thread_hw_breakpoint() updates
> 'hbkpt_user_max'.
> 
> I can change it to read like the one below if it sounds better to you.
> 
> /* 
>  * 'tsk' uses more number of registers than 'hbkpt_user_max'. Update
>  * the same.
>  */

My preference is simply to eliminate hbkpt_user_max entirely.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-21 21:39       ` Alan Stern
@ 2009-03-23 19:03         ` K.Prasad
  2009-03-23 19:21           ` Alan Stern
  0 siblings, 1 reply; 55+ messages in thread
From: K.Prasad @ 2009-03-23 19:03 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Sat, Mar 21, 2009 at 05:39:59PM -0400, Alan Stern wrote:
> On Sat, 21 Mar 2009, K.Prasad wrote:
> 
> > > > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > > > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > > > + * kernel-space request
> > > > + */
> > > > +unsigned int hbkpt_kernel_pos;
> > > 
> > > This doesn't make much sense.  All you need to know is which registers
> > > are in use; all others are available.
> > > 
> > 
> > As explained by Maneesh earlier, we compact the kernel-space requests
> > into registers (HB_NUM - 1) to hbkpt_kernel_pos. The kernel-space
> > requests aren't specific to any given register number too, and so
> > compaction would be suitable for this case (unlike when implemented for
> > user-space which might need virtualisation of registers).
> 
> Okay, that makes sense.  Perhaps you could add a short comment here
> explaining that the register allocations get compacted when a kernel
> breakpoint is unregistered, so they will always be contiguous.
> 
> > > It's also a poor choice of name.  Everywhere else (in my patches,
> > > anyway) the code refers to hardware breakpoints using the abbreviation
> > > "hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
> > > "hbkpt".
> > > 
> > 
> > I began using 'hbkpt' as a shorter naming convention (the longer one
> > being hw_breakpoint) without being really conscious of the 'hwbkpt'
> > usage by you (even some of the previous iterations contained them in
> > samples/hw_breakpoint and ftrace-plugin).
> > 
> > Well, I will rename my shorter name to 'hwbkpt' for uniformity.
> 
> My patch never used "hwbkpt".  Besides "hw_breakpoint", it used only 
> "bp".  On going back and checking, I found that it didn't even use 
> "hwbp".  Some other abbreviations it did use were "kbp" for kernel 
> breakpoint, "chbi" for per-CPU hardware breakpoint info, and "thbi" for 
> per-thread hardware breakpoint info.
> 
> If you're looking for a good short name, and if you want to keep 
> hardware breakpoints distinct from software breakpoints, I suggest 
> "hbp" instead of "hbkpt".  But it's up to you, and it's worth noticing 
> that the code already contains lots of variables named just "bp".
> 

I am renaming all 'hbkpt' strings to 'hbp'.

> > > > +/* One higher than the highest counted user-space breakpoint register */
> > > > +unsigned int hbkpt_user_max;
> > > 
> > > Likewise, this variable isn't really needed.  It's just one more than
> > > the largest i such that hbkpt_user_max_refcount[i] > 0.
> > > 
> > 
> > It acts like a cache for determining the user-space breakpoint boundary.
> > It is used for sanity checks and in its absence we will have to compute from
> > hbkpt_user_max_refcount[] everytime.
> 
> That's right.  Isn't it simpler to check
> 
> 	kernel_pos > 0 && hbkpt_user_refcount[kernel_pos - 1] == 0
> 
> than to check
> 
> 	kernel_pos - 1 >= hbkpt_user_max
> 
> _and_ to keep hbkpt_user_max set to the correct value at all times?
>

Unfortunately the lines of code required to maintain the variable comes
close to the amount of lines it would potentially save. I will change to
code to compute it from hbkpt_user_refcount[] everytime.
 
> > > > +/*
> > > > + * Load the debug registers during startup of a CPU.
> > > > + */
> > > > +void load_debug_registers(void)
> > > > +{
> > > > +	int i;
> > > > +	unsigned long flags;
> > > > +
> > > > +	/* Prevent IPIs for new kernel breakpoint updates */
> > > > +	local_irq_save(flags);
> > > > +
> > > > +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> > > > +		if (hbkpt_kernel[i])
> > > > +			on_each_cpu(arch_install_kernel_hbkpt,
> > > > +				(void *)hbkpt_kernel[i], 0);
> > > 
> > > This is completely wrong.  First of all, it's dumb to send multiple
> > > IPIs (one for each iteration through the loop).  Second, this routine
> > > shouldn't send any IPIs at all!  It gets invoked when a CPU is
> > > starting up and wants to load its _own_ debug registers -- not tell
> > > another CPU to load anything.
> > > 
> > 
> > As I agreed before, it is an overkill (given the design of
> > arch_install_kernel_hbkpt()). I will create a new
> > arch_update_kernel_hbkpt(pos, bp) that will install breakpoints only
> > on the CPU starting up.
> 
> Doesn't arch_install_kernel_hbkpt() already install breakpoints
> on only the current CPU?  So why do you need a new function?
>

There will be a few more changes to arch_install_kernel_hbkpt() along
with this. Please find the changes in the ensuing patchset.
 
> > > > +	/* Check that the virtual address is in the proper range */
> > > > +	if (tsk) {
> > > > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > > > +			return -EFAULT;
> > > > +	} else {
> > > > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > > > +			return -EFAULT;
> > > > +	}
> > > 
> > > Roland pointed out that these checks need to take into account the
> > > length of the breakpoint.  For example, in
> > > arch_check_va_in_userspace() it isn't sufficient for the start of the
> > > breakpoint region to be a userspace address; the end of the
> > > breakpoint region must also be in userspace.
> > > 
> > 
> > Ok. Will do something like:
> > return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));
> 
> What is the purpose of word_size here?  The breakpoint length should be 
> specified in bytes, not words.
> 
> Don't forget that that in arch_check_va_in_kernelspace() you need to 
> check both for values that are too low and values that are too high 
> (they overflow and wrap around back to a user address).
> 

While I understand the user-space checking using the length of the HW
Breakpoint, I don't really see how I can check for an upper-bound for
kernel-space virtual addresses. Most usage in the kernel only checks for
the address >= TASK_SIZE (while they check for add + len if the length
of the memory is known). I will be glad to have any suggestions in this
regard.

> > We don't keep track of the thread (in the sense the task_struct) but
> > 'hbkpt_user_max' is used for validating requests and book-keeping. As
> > Maneesh mentioned before flush_thread_hw_breakpoint() updates
> > 'hbkpt_user_max'.
> > 
> > I can change it to read like the one below if it sounds better to you.
> > 
> > /* 
> >  * 'tsk' uses more number of registers than 'hbkpt_user_max'. Update
> >  * the same.
> >  */
> 
> My preference is simply to eliminate hbkpt_user_max entirely.
> 
> Alan Stern
>

Done.

Thanks,
K.Prasad
 

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-21 16:39       ` Steven Rostedt
@ 2009-03-23 19:08         ` K.Prasad
  0 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-23 19:08 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, Ingo Molnar, Linux Kernel Mailing List,
	Alan Stern, Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath

On Sat, Mar 21, 2009 at 12:39:08PM -0400, Steven Rostedt wrote:
> 
> On Sat, 21 Mar 2009, K.Prasad wrote:
> > > > 
> > 
> > > > +
> > > > +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> > > > +{
> > > > +	struct hlist_node *node;
> > > > +	struct trace_ksym *entry;
> > > > +
> > > > +	spin_lock(&ksym_stat_lock);
> > > 
> > > 
> > > I see that can be called from ksym_hbkpt_handler which in turn
> > > can be called from interrupt context, right?
> > > You can issue a deadlock if you don't disable interrupts here.
> > > 
> > > Thanks,
> > > Frederic.
> > > 
> > 
> > ksym_collect_stats<--ksym_hbkpt_handler<--hw_breakpoint_handler<--do_debug
> > invocation happens with interrupts enabled (IF bit is set). I do find
> > that a few plugins in kernel/trace enclose the
> > trace_buffer_lock_reserve()--trace_buffer_unlock_commit() invocation
> > within interrupt-disabled code. Is that a requirement there?
> > 
> > The potential deadlock scenario you foresee isn't obvious to me. Can you
> > explain?
> 
> Can that lock ever be taken in an interrupt? If not, document that (and 
> perhaps add a WARN_ON(in_interrupt()); ). Otherwise you have a possible:
> 
> 	spin_lock(&ksym_stat_lock);
> 
> 		===> take interrupt ...
> 
> 			(from interrupt)
> 			spin_lock(&ksym_stat_lock); <== deadlock.
> 
> 
> -- Steve
>

Given that the function pointed by the trigger() routine is invoked with
breakpoints disabled on that CPU, I don't think we'd enter into a loop
a cyclic dependancy as above.

On the other hand, my observation w.r.t. IF bit being set was misplaced
in the sense that it corresponded to the saved stack and not when inside
the breakpoint handler in which case interrupts were disabled.

So we are safe in either ways.

Thanks,
K.Prasad
 

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-23 19:03         ` K.Prasad
@ 2009-03-23 19:21           ` Alan Stern
  2009-03-23 20:42             ` K.Prasad
  0 siblings, 1 reply; 55+ messages in thread
From: Alan Stern @ 2009-03-23 19:21 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Tue, 24 Mar 2009, K.Prasad wrote:

> > > Ok. Will do something like:
> > > return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));
> > 
> > What is the purpose of word_size here?  The breakpoint length should be 
> > specified in bytes, not words.
> > 
> > Don't forget that that in arch_check_va_in_kernelspace() you need to 
> > check both for values that are too low and values that are too high 
> > (they overflow and wrap around back to a user address).
> > 
> 
> While I understand the user-space checking using the length of the HW
> Breakpoint, I don't really see how I can check for an upper-bound for
> kernel-space virtual addresses. Most usage in the kernel only checks for
> the address >= TASK_SIZE (while they check for add + len if the length
> of the memory is known). I will be glad to have any suggestions in this
> regard.

Isn't that exactly the check you need to implement?

	addr >= TASK_SIZE && (addr + len) >= TASK_SIZE,

or perhaps better,

	addr >= TASK_SIZE && (addr + len) >= addr.

In this case you _do_ know the length of the breakpoint.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-23 19:21           ` Alan Stern
@ 2009-03-23 20:42             ` K.Prasad
  2009-03-23 21:20               ` Alan Stern
  0 siblings, 1 reply; 55+ messages in thread
From: K.Prasad @ 2009-03-23 20:42 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Mon, Mar 23, 2009 at 03:21:49PM -0400, Alan Stern wrote:
> On Tue, 24 Mar 2009, K.Prasad wrote:
> 
> > > > Ok. Will do something like:
> > > > return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));
> > > 
> > > What is the purpose of word_size here?  The breakpoint length should be 
> > > specified in bytes, not words.
> > > 
> > > Don't forget that that in arch_check_va_in_kernelspace() you need to 
> > > check both for values that are too low and values that are too high 
> > > (they overflow and wrap around back to a user address).
> > > 
> > 
> > While I understand the user-space checking using the length of the HW
> > Breakpoint, I don't really see how I can check for an upper-bound for
> > kernel-space virtual addresses. Most usage in the kernel only checks for
> > the address >= TASK_SIZE (while they check for add + len if the length
> > of the memory is known). I will be glad to have any suggestions in this
> > regard.
> 
> Isn't that exactly the check you need to implement?
> 
> 	addr >= TASK_SIZE && (addr + len) >= TASK_SIZE,
> 
> or perhaps better,
> 
> 	addr >= TASK_SIZE && (addr + len) >= addr.
> 
> In this case you _do_ know the length of the breakpoint.
> 
> Alan Stern
>

Aren't we just checking if len is a positive number through the above
checks? The validation checks in the patchset should take care of
negative lengths. Or am I missing something?

I thought you wanted the code to check for an upper sane limit for addr
in kernel-space, say something like this:

TASK_SIZE <= addr <= (Upper limit for Kernel Virtual Address)

When I referred to 'len' in my previous mail, it meant the length
of the kernel virtual memory area (which can be used to find the upper
bound).

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-23 20:42             ` K.Prasad
@ 2009-03-23 21:20               ` Alan Stern
  0 siblings, 0 replies; 55+ messages in thread
From: Alan Stern @ 2009-03-23 21:20 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Tue, 24 Mar 2009, K.Prasad wrote:

> > Isn't that exactly the check you need to implement?
> > 
> > 	addr >= TASK_SIZE && (addr + len) >= TASK_SIZE,
> > 
> > or perhaps better,
> > 
> > 	addr >= TASK_SIZE && (addr + len) >= addr.
> > 
> > In this case you _do_ know the length of the breakpoint.
> > 
> > Alan Stern
> >
> 
> Aren't we just checking if len is a positive number through the above
> checks? The validation checks in the patchset should take care of
> negative lengths. Or am I missing something?

Well, 0x60000000 is a positive number, and 0xd0000000 is >= TASK_SIZE.  
But their sum is 0x30000000, which lies in userspace.  In other words, 
you are missing the possibility that the addition might overflow and 
wrap around.

> I thought you wanted the code to check for an upper sane limit for addr
> in kernel-space, say something like this:
> 
> TASK_SIZE <= addr <= (Upper limit for Kernel Virtual Address)

No, the test should be

    TASK_SIZE <= addr <= addr + (len-1) <= (Upper limit for Kernel VA)

By the way, is TASK_SIZE the correct lower bound for kernel virtual
addresses on x86-64?

> When I referred to 'len' in my previous mail, it meant the length
> of the kernel virtual memory area (which can be used to find the upper
> bound).

Oh, sorry, I misunderstood.  Isn't that limit always 0xffffffff on 
x86-32?

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090324152028.754123712@K.Prasad>
@ 2009-03-24 15:25 ` K.Prasad
  0 siblings, 0 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-24 15:25 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, maneesh, Roland McGrath, Steven Rostedt,
	K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 15088 bytes --]

This patch introduces two new files named hw_breakpoint.[ch] inside x86 specific
directories. They contain functions which help validate and serve requests for 
using Hardware Breakpoint registers on x86 processors.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/Kconfig                     |    1 
 arch/x86/include/asm/hw_breakpoint.h |   73 +++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  441 +++++++++++++++++++++++++++++++++++
 4 files changed, 516 insertions(+), 1 deletion(-)

Index: linux-2.6-tip/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,441 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Unmasked kernel DR7 value */
+static unsigned long kdr7;
+
+/*
+ * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
+ * Used to clear and verify the status of bits corresponding to DR0 - DR3
+ */
+static const unsigned long	dr7_masks[HB_NUM] = {
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
+	0x0f000030,	/* LEN2, R/W2, G2, L2 */
+	0xf00000c0	/* LEN3, R/W3, G3, L3 */
+};
+
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long temp;
+
+	temp = (len | type) & 0xf;
+	temp <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	temp |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+				DR_GLOBAL_SLOWDOWN;
+	return temp;
+}
+
+/*
+ * Install the kernel breakpoints in their debug registers.
+ * If 0 <= pos < HB_NUM, then set the debug register corresponding to that number
+ * If 'pos' is negative, then all debug registers are updated
+ */
+void arch_install_kernel_hw_breakpoint(void *idx)
+{
+	int pos = *(int *)idx;
+	unsigned long dr7;
+	int i;
+
+	get_debugreg(dr7, 7);
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+
+	for (i = hbp_kernel_pos; i < HB_NUM; i++) {
+		if ((pos >= 0) && (i != pos))
+			continue;
+		dr7 &= ~(dr7_masks[i]);
+		if (hbp_kernel[i])
+			set_debugreg(hbp_kernel[i]->info.address, i);
+	}
+
+	dr7 |= kdr7;
+	/* No need to set DR6 */
+	set_debugreg(dr7, 7);
+}
+
+void arch_load_debug_registers()
+{
+	int pos = -1;
+	/*
+	 * We want all debug registers to be initialised for this
+	 * CPU so pos = -1
+	 */
+	arch_install_kernel_hw_breakpoint((void *)&pos);
+}
+
+/*
+ * Install the thread breakpoints in their debug registers.
+ */
+void arch_install_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	for (i = 0;  (i < hbp_kernel_pos) && hbp_user_refcount[i]; i++)
+		if (thread->hbp[i])
+			set_debugreg(thread->hbp[i]->info.address, i);
+
+	/* No need to set DR6 */
+
+	set_debugreg((kdr7 | thread->dr7), 7);
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void arch_install_none()
+{
+	/* Clear the user-space portion of dr7 by setting only kdr7 */
+	set_debugreg(kdr7, 7);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+	unsigned int len_in_bytes = 0;
+
+	switch (hbp_len) {
+	case HW_BREAKPOINT_LEN_1:
+		len_in_bytes = 1;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		len_in_bytes = 2;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		len_in_bytes = 4;
+		break;
+#ifdef CONFIG_X86_64
+	case HW_BREAKPOINT_LEN_8:
+		len_in_bytes = 8;
+		break;
+#endif
+	}
+	return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return ((va >= TASK_SIZE) && ((va + len) >= TASK_SIZE));
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	/*
+	 * User-space requests will always have the address field populated
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (bp->info.address)
+		return;
+	if (bp->info.name)
+		bp->info.address = (unsigned long)
+					kallsyms_lookup_name(bp->info.name);
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/*
+	 * Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address,
+							bp->info.len)) &&
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+				break;
+	case HW_BREAKPOINT_RW:
+				break;
+	default:
+		return ret;
+	}
+
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+#ifdef CONFIG_X86_64
+	case HW_BREAKPOINT_LEN_8:
+		*align = 7;
+		break;
+#endif
+	default:
+		return ret;
+	}
+
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Register a new user breakpoint structure.
+ */
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+}
+
+/*
+ * Modify an existing user breakpoint structure.
+ */
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	/* Check if the register to be modified was enabled by the thread */
+	if (!(thread->dr7 & (1 << (pos * DR_ENABLE_SIZE))))
+		return -EINVAL;
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+
+	return 0;
+}
+
+/*
+ * Unregister a user breakpoint structure.
+ */
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if (!thread->hbp[pos])
+		return;
+
+	thread->hbp[pos]->info.address = 0;
+	thread->dr7 &= ~dr7_masks[pos];
+}
+
+/*
+ * Register a kernel breakpoint structure.
+ */
+void arch_register_kernel_hw_breakpoint(int pos)
+{
+	struct hw_breakpoint *bp;
+
+	bp = hbp_kernel[pos];
+
+	kdr7 &= ~(dr7_masks[pos]);
+	if (bp)
+		kdr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+
+	on_each_cpu(arch_install_kernel_hw_breakpoint, (void *)&pos, 0);
+}
+
+/*
+ * Unregister a kernel breakpoint structure.
+ */
+void arch_unregister_kernel_hw_breakpoint()
+{
+	int i, pos = -1;
+
+	/*
+	 * Modify kdr7 to reflect the new layout of kernel-space breakpoints
+	 * and invoke the routine to write breakpoint addresses onto the
+	 * physical registers
+	 */
+	for (i = hbp_kernel_pos; i < HB_NUM; i++) {
+		kdr7 &= ~(dr7_masks[i]);
+		if (hbp_kernel[i]) {
+			struct hw_breakpoint *bp = hbp_kernel[i];
+			kdr7 |= encode_dr7(i, bp->info.len, bp->info.type);
+		}
+	}
+	on_each_cpu(arch_install_kernel_hw_breakpoint, (void *)&pos, 0);
+}
+
+
+/*
+ * Copy out the debug register information for a core dump.
+ *
+ * tsk must be equal to current.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int i;
+
+	memset(u_debugreg, 0, sizeof u_debugreg);
+	for (i = 0; i < thread->hbp_num_installed && thread->hbp[i]; ++i)
+		u_debugreg[i] = thread->hbp[i]->info.address;
+	u_debugreg[7] = thread->dr7;
+	u_debugreg[6] = thread->dr6;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	int i, rc = NOTIFY_DONE;
+	struct hw_breakpoint *bp;
+	/* The DR6 value is stored in args->err */
+	unsigned long dr7, dr6 = args->err;
+
+	if (dr6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	get_debugreg(dr7, 7);
+
+	/* Disable breakpoints during exception handling */
+	set_debugreg(0UL, 7);
+
+	/*
+	 * Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.dr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Lazy debug register switching */
+	if (last_debugged_task != current)
+		switch_to_none_hw_breakpoint();
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(dr6 & (DR_TRAP0 << i))))
+			continue;
+		/*
+		 * Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (hbp_user_refcount[i])
+			bp = current->thread.hbp[i];
+		else if (i >= hbp_kernel_pos)
+			bp = hbp_kernel[i];
+		else	/* False alarm due to lazy DR switching */
+			continue;
+
+		if (!bp)
+			break;
+
+		switch (bp->info.type) {
+		case HW_BREAKPOINT_WRITE:
+		case HW_BREAKPOINT_RW:
+			if (bp->triggered)
+				(bp->triggered)(bp, args->regs);
+
+			if (arch_check_va_in_userspace(bp->info.address,
+							bp->info.len))
+				rc = NOTIFY_DONE;
+			else
+				rc = NOTIFY_STOP;;
+			goto exit;
+		case HW_BREAKPOINT_EXECUTE:
+			/*
+			 * Presently we allow instruction breakpoints only in
+			 * user-space when requested through ptrace.
+			 */
+			if (arch_check_va_in_userspace(bp->info.address,
+							bp->info.len)) {
+				(bp->triggered)(bp, args->regs);
+				/*
+				 * do_debug will notify user through a SIGTRAP
+				 * signal So we are not requesting a
+				 * NOTIFY_STOP here
+				 */
+				rc = NOTIFY_DONE;
+				goto exit;
+			}
+		}
+	}
+
+	/* Stop processing further if the exception is a stray one */
+	if (!(dr6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		rc = NOTIFY_STOP;
+exit:
+	set_debugreg(dr7, 7);
+	return rc;
+}
Index: linux-2.6-tip/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,73 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+};
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+#ifdef CONFIG_X86_64
+#define HW_BREAKPOINT_LEN_8		0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define HW_BREAKPOINT_EXECUTE	0x80
+/* trigger on memory write */
+#define HW_BREAKPOINT_WRITE	0x81
+/* trigger on memory read or write */
+#define HW_BREAKPOINT_RW	0x83
+
+/* Total number of available HW breakpoint registers */
+#define HB_NUM 4
+
+extern struct hw_breakpoint *hbp_kernel[HB_NUM];
+extern unsigned int hbp_user_refcount[HB_NUM];
+
+/*
+ * Ptrace support: breakpoint trigger routine.
+ */
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+void arch_install_thread_hw_breakpoint(struct task_struct *tsk);
+void arch_install_none(void);
+void arch_install_kernel_hw_breakpoint(void *);
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
+int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk);
+void arch_load_debug_registers(void);
+void arch_register_kernel_hw_breakpoint(int pos);
+void arch_unregister_kernel_hw_breakpoint(void);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
Index: linux-2.6-tip/arch/x86/Kconfig
===================================================================
--- linux-2.6-tip.orig/arch/x86/Kconfig
+++ linux-2.6-tip/arch/x86/Kconfig
@@ -47,6 +47,7 @@ config X86
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_DMA_API_DEBUG
+	select HAVE_HW_BREAKPOINT
 
 config ARCH_DEFCONFIG
 	string


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-14 16:10                   ` Alan Stern
@ 2009-03-14 16:39                     ` Ingo Molnar
  0 siblings, 0 replies; 55+ messages in thread
From: Ingo Molnar @ 2009-03-14 16:39 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Roland McGrath, Andrew Morton, Linux Kernel Mailing List


* Alan Stern <stern@rowland.harvard.edu> wrote:

> So in the end, you're _agreeing_ with what I wrote.  And yet 
> the tone of your reply suggests that you seemed to think that 
> my message had some deep, hostile intent.  It didn't.

Sorry about that - i didnt mean to convey any such message.

I guess i'll wait for the next series. All i'm striving for is 
for the whole series to be a lot simpler than what i've seen 
before.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-14 12:24                 ` Ingo Molnar
@ 2009-03-14 16:10                   ` Alan Stern
  2009-03-14 16:39                     ` Ingo Molnar
  0 siblings, 1 reply; 55+ messages in thread
From: Alan Stern @ 2009-03-14 16:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: K.Prasad, Roland McGrath, Andrew Morton, Linux Kernel Mailing List

On Sat, 14 Mar 2009, Ingo Molnar wrote:

> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > On Sat, 14 Mar 2009, K.Prasad wrote:
> > 
> > > Here's a summary of the intended changes to the patchset, which I hope
> > > to post early the following week. It tears down many features in the
> > > present submission (The write-up below is done without the benefit of
> > > actually having run into limitations while trying to chisel out code).
> > > 
> > > - Adopt a static allocation method for registers, say FCFS (and perhaps
> > >   botton-up for user-space allocations and the reverse for
> > >   kernel-space), although individual counters to do book-keeping should also
> > >   suffice.
> > 
> > You can't enforce bottom-up allocation for userspace breakpoint
> > requests. [...]
> 
> That's not the point.
> 
> The point is to offer a reasonable and simple static allocator 
> that will work fine with usual gdb usage. If something takes 
> away db4 that's as if user-space took away all registers - tough 
> luck.
> 
> You are trying to put complexity into a situation that is not 
> schedulable hence not resolvable _anyway_. There's just 4 debug 
> registers, not more. If the combined usage goes above four 
> someone will lose anyway - even with your allocator.

You are reading far more into my message than what I wrote.

I'm _not_ trying to put complexity anywhere.  All I did was point out
that Prasad was wrong to state that the kernel could adopt (or enforce)  
a bottom-up method for allocating debug registers for userspace 
breakpoints.  I trust you aren't trying to imply that he really was 
right?

> With my proposal the 'loss' can indeed come sooner if user-space 
> took db4 and there's nothing left for the kernel anymore - but 
> that's just an uninteresting special case that wont occur with 
> typical debug-register usage.
> 
> If it ever causes problems seriously _then_ will be the time to 
> consider "is it worth adding a more complex, dynamic allocator 
> for debug registers". Not now. This stuff is currently 
> over-designed and not acceptable to me in its current form.

My message didn't mention a word about more complex, dynamic
allocation.  Just the opposite, in fact -- because if we did virtualize
the debug registers then we _would_ be able to enforce bottom-up
allocation.

So in the end, you're _agreeing_ with what I wrote.  And yet the tone
of your reply suggests that you seemed to think that my message had
some deep, hostile intent.  It didn't.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13 21:21               ` Alan Stern
@ 2009-03-14 12:24                 ` Ingo Molnar
  2009-03-14 16:10                   ` Alan Stern
  0 siblings, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2009-03-14 12:24 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Roland McGrath, Andrew Morton, Linux Kernel Mailing List


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Sat, 14 Mar 2009, K.Prasad wrote:
> 
> > Here's a summary of the intended changes to the patchset, which I hope
> > to post early the following week. It tears down many features in the
> > present submission (The write-up below is done without the benefit of
> > actually having run into limitations while trying to chisel out code).
> > 
> > - Adopt a static allocation method for registers, say FCFS (and perhaps
> >   botton-up for user-space allocations and the reverse for
> >   kernel-space), although individual counters to do book-keeping should also
> >   suffice.
> 
> You can't enforce bottom-up allocation for userspace breakpoint
> requests. [...]

That's not the point.

The point is to offer a reasonable and simple static allocator 
that will work fine with usual gdb usage. If something takes 
away db4 that's as if user-space took away all registers - tough 
luck.

You are trying to put complexity into a situation that is not 
schedulable hence not resolvable _anyway_. There's just 4 debug 
registers, not more. If the combined usage goes above four 
someone will lose anyway - even with your allocator.

With my proposal the 'loss' can indeed come sooner if user-space 
took db4 and there's nothing left for the kernel anymore - but 
that's just an uninteresting special case that wont occur with 
typical debug-register usage.

If it ever causes problems seriously _then_ will be the time to 
consider "is it worth adding a more complex, dynamic allocator 
for debug registers". Not now. This stuff is currently 
over-designed and not acceptable to me in its current form.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-12  2:46     ` Roland McGrath
  2009-03-13  3:43       ` Ingo Molnar
@ 2009-03-14  3:51       ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 55+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:51 UTC (permalink / raw)
  To: Roland McGrath
  Cc: Ingo Molnar, prasad, Andrew Morton, Linux Kernel Mailing List,
	Alan Stern, David Gibson, Torez Smith

On Wed, 2009-03-11 at 19:46 -0700, Roland McGrath wrote:
> 
> I think it would be illustrative to have a second arch implementation to
> compare to the x86 one.  Ingo has a tendency to pretend everything is an
> x86 until shown the concrete evidence.  The obvious choice is powerpc.
> Its facility is very simple, so the arch-specific part of the
> implementation should be trivial--it's the "base case" of simplest
> available hw_breakpoint arch, really.  Also, it happens that Prasad's
> employer is interested in having that support.
> 
> For example, a sensible powerpc implementation would clearly demonstrate
> why you need accessors or at least either pre-registration setters or
> explicit type/len arguments in registration calls.

Well, we happen to be just in the middle of implementing support for
BookE HW debug facilities :-) (which have more HW breakpoints &
watchpoints than server PPCs along with fancy features like ranged
breakpoints or value compare) so it's a right time to give that a try.

I'm Ccing David Gibson and Torez Smith who both have been working on the
infrastructure to control the debug regs. For now we are just giving
pretty much direct access to the debug regs from ptrace (since they are
somewhat architected they are very similar if not identical between a
whole bunch of embedded powerpc's) but a more abstract interface would
be nice.

Ben.



^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 17:41                   ` K.Prasad
@ 2009-03-14  3:47                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 55+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:47 UTC (permalink / raw)
  To: prasad
  Cc: Alan Stern, Ingo Molnar, Andrew Morton,
	Linux Kernel Mailing List, Roland McGrath

On Wed, 2009-03-11 at 23:11 +0530, K.Prasad wrote:
> With FCFS or an allocation mechanism without the (un)installed()
> callbacks we'd lose the ability to record requests and service them
> later when registers become availabile.
> 
> Say when (un)installed() callbacks are implemented for the proposed
> ftrace-plugin to trace kernel symbols, they can automatically stop/start
> tracing as and when registers become (un)available. This can be helpful when
> we wish to profile memory access over a kernel variable for a long duration
> (where small loss of tracing data can be tolerated), while the system would
> permit simultaneous user-space access (say a GDB session using 'hbreak').
> 
> Are we fine with disallowing such usage, which if done will let the requester
> of the breakpoint register 'poll' periodically to check availability.

Is that such a big deal ? Can't we just have the kernel degrade to
classic SW breakpoints ?

Smells like overengineering to me ...

Ben.



^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 13:10                   ` Ingo Molnar
@ 2009-03-14  3:46                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 55+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: K.Prasad, Alan Stern, Andrew Morton, Linux Kernel Mailing List,
	Roland McGrath

On Wed, 2009-03-11 at 14:10 +0100, Ingo Molnar wrote:
> 
> Kernel gets debug registers in db4..db3..db2..db1 order, and its 
> allocation is essentially hardcoded - i.e. we dont try to be 
> fancy.
> 
> User-space (gdb) on the other hand will try to allocate in the 
> db1..db2..db3..db4 order.
> 
> Maintain a 'max debug register index' value driven by ptrace and 
> maintain a 'min debug register index' driven by kernel-space 
> hw-breakpoint allocations.

A few added details from the perspective of powerpc ...

breakpoints and watchpoints are separate resources with different
capacity depending on the chip, so far nothing fancy.

We also have the ability to do range breakpoints/watchpoints on some
processors by using pairs of registers, which adds some constraints to
the allocation.

We also have a value compare capability for watchpoint, but this can
also have a different capacity limitation from either the breakpoints
and the watchpoints themselves.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:12               ` Ingo Molnar
  2009-03-11 12:50                 ` K.Prasad
  2009-03-11 16:32                 ` Alan Stern
@ 2009-03-14  3:43                 ` Benjamin Herrenschmidt
  2 siblings, 0 replies; 55+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, prasad, Andrew Morton, Linux Kernel Mailing List,
	Roland McGrath

On Wed, 2009-03-11 at 13:12 +0100, Ingo Molnar wrote:
> 
> #3 is probably the most informative (and hence probably the
>    best) variant. It also leaves policy of how to resolve the 
>    conflict to the admin.

Agreed.
> 
> Would be nice to have it simple. Reluctance regarding this 
> patchset is mostly rooted in that diffstat above.
> 
> The changes it does in the x86 architecture code are nice 
> generalizations and cleanups. Both the scheduler, task 
> startup/exit and ptrace bits look pretty sane in terms of 
> factoring out debug register details. But the breakpoint 
> management looks very complex

I agree there is some interest in generalization and cleanup, especially
as far as userspace APIs go, though it's a hard nut to crack as every
CPU family out there has some subtle differences in the way breakpoints
or watchpoints work (for example, alignment constraints, ability to do
ranges, the way they handle kernel vs. user, etc...)

I'm not yet sold.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 20:30             ` Alan Stern
  2009-03-11 12:12               ` Ingo Molnar
@ 2009-03-14  3:41               ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 55+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:41 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, prasad, Andrew Morton, Linux Kernel Mailing List,
	Roland McGrath

On Tue, 2009-03-10 at 16:30 -0400, Alan Stern wrote:
> Suppose we never allow callers to register more breakpoints than will
> fit in the CPU's registers.  Do we then use a simple first-come
> first-served algorithm, with no prioritization?  If we do prioritize
> some breakpoint registrations more highly than others, how do we
> inform
> callers that their breakpoint has been kicked out by one of higher
> priority?  And how do we let them know when the higher-priority
> breakpoint has been unregistered, so they can try again?

Do we really need such a mess ? Honestly ... We've been living fine
before without any of that.

Ben.


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 17:26           ` Ingo Molnar
  2009-03-10 20:30             ` Alan Stern
@ 2009-03-14  3:40             ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 55+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, prasad, Andrew Morton, Linux Kernel Mailing List,
	Roland McGrath

On Tue, 2009-03-10 at 18:26 +0100, Ingo Molnar wrote:
> 
> That 'arbitrarily larg number of breakpoints' worries me. It's a 
> pretty broken concept for a 4-items resource that cannot be 
> time-shared and hence cannot be overcommitted.
> 
> Seems to me that much of the complexity of this patchset:
> 
>  28 files changed, 2439 insertions(+), 199 deletions(-)
> 
> Could be eliminated via a very simple exclusive reservation 
> mechanism.
> 
I also have some worries about the bloat of this infrastructure,
especially in the context switching code.

I would prefer the arch to be in control of the state in the task struct
and just context switch the actual HW registers at that stage.

Ben.



^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13 19:01             ` K.Prasad
@ 2009-03-13 21:21               ` Alan Stern
  2009-03-14 12:24                 ` Ingo Molnar
  0 siblings, 1 reply; 55+ messages in thread
From: Alan Stern @ 2009-03-13 21:21 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Roland McGrath, Andrew Morton, Linux Kernel Mailing List

On Sat, 14 Mar 2009, K.Prasad wrote:

> Here's a summary of the intended changes to the patchset, which I hope
> to post early the following week. It tears down many features in the
> present submission (The write-up below is done without the benefit of
> actually having run into limitations while trying to chisel out code).
> 
> - Adopt a static allocation method for registers, say FCFS (and perhaps
>   botton-up for user-space allocations and the reverse for
>   kernel-space), although individual counters to do book-keeping should also
>   suffice.

You can't enforce bottom-up allocation for userspace breakpoint
requests.  In fact, you'll have to add a parameter indicating which
debug register is requested.  The ptrace interface will use this
parameter; the utrace interface won't care so it will specify something
like HW_BREAKPOINT_ANY_REGISTER.

You will have to add an array of HB_NUM counters, to keep track of how
many tasks are using each debug register.

> - Use an array of HB_NUM size for storing the breakpoint requests (and
>   not a linked-list implementation as done now).
> 
> - Define a HAVE_HW_BREAKPOINTS in arch/x86/Kconfig unconditionally, but
>   build kernel/hw_breakpoint.o, samples/hw_breakpoint/data_breakpoint.o
>   and kernel/trace/trace_ksym.o build conditionally if
>   HAVE_HW_BREAKPOINTS is declared. Declaring this flag will help
>   a)prevent build failures in other archs b)Prevent ftrace from showing
>   up availability of kernel symbol tracing even in unsupported archs.

This isn't quite right.  At the moment kernel/hw_breakpoint.c isn't
built at all; instead it is #included by the corresponding
arch-specific source file.  Of course, you could change that.

> - Simplify the switch_to_thread_hw_breakpoint() function (any help from
>   Alan Stern here would be gladly accepted).

Sure.  It will depend on how you implement the other changes.

> - Remove callbacks such as unregister/register.
> 
> - remove the code to implement prioritisation of requests

Remove the inline accessors.  They can be added back when they are 
needed.

Some architectures have arbitrary-length debug regions, not 
fixed-length 1, 2, 4, or 8 bytes.  We should give some thought to 
making the interface compatible with such things.

> - Add histogram support to include a 'hit counter' to the traced kernel
>   symbols.
> 
> - Address coding-style related comments.
> 
> Hope they are not in sync with the comments received thus far. Let me
> know if there are changes to be made.

Another change we need to change is the way DR6 is passed to the debug 
notifier chain.  Currently it is passed by value when do_debug() 
calls notify_die().  Instead we need to pass it by reference so that 
the notifier routines can change its value.  Each time a notifier 
routine handles a breakpoint event, the corresponding bit in DR6 should 
be turned off.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13 14:13           ` Ingo Molnar
@ 2009-03-13 19:01             ` K.Prasad
  2009-03-13 21:21               ` Alan Stern
  0 siblings, 1 reply; 55+ messages in thread
From: K.Prasad @ 2009-03-13 19:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, Roland McGrath, Andrew Morton, Linux Kernel Mailing List

On Fri, Mar 13, 2009 at 03:13:04PM +0100, Ingo Molnar wrote:
> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > On Fri, 13 Mar 2009, Ingo Molnar wrote:
> > 
> > > The core issue being discussed is the debug register 
> > > allocation and scheduling model though, and you have not 
> > > directly commented on that.
> > > 
> > > My argument in a nutshell is that a bottom-up for user + 
> > > top-down for kernel use static allocator with no dynamic 
> > > scheduling will get us most of the benefits with a tenth of 
> > > the complexity.
> > 
> > Take this even farther: We shouldn't restrict userspace to 
> > bottom-up register allocation.  With very little additional 
> > effort we can virtualize the debug registers; then userspace 
> > can allocate them in whatever order it wants and still end up 
> > using the physical registers in bottom-up order (or top-down, 
> > which is the order used by the current patches).
> > 
> > After all, there's nothing to prevent programs other than gdb 
> > from using ptrace, and there's no guarantee that gdb will 
> > continue to allocate registers in increasing order.
> 
> If in ~10 years of its existence no such usage arose so i dont 
> think it will magically appear now.
> 
> The thing is, kernel-side use of debug registers is a borderline 
> item whose impact we should minimalize as much as possible. 
> Linus in the past expressed that it is fine to not have _any_ 
> management of user versus kernel debug registers. So we want to 
> approach this from the minimalistic side. I offered such a very 
> minimal design that is trivial in terms of correctness and 
> impact.
> 
> We can still get this simple allocation model into .30 if we 
> dont waste time arguing about unnecessarily. If someone runs 
> into limitations the model can be extended.
> 
> 	Ingo

Here's a summary of the intended changes to the patchset, which I hope
to post early the following week. It tears down many features in the
present submission (The write-up below is done without the benefit of
actually having run into limitations while trying to chisel out code).

- Adopt a static allocation method for registers, say FCFS (and perhaps
  botton-up for user-space allocations and the reverse for
  kernel-space), although individual counters to do book-keeping should also
  suffice.

- Use an array of HB_NUM size for storing the breakpoint requests (and
  not a linked-list implementation as done now).

- Define a HAVE_HW_BREAKPOINTS in arch/x86/Kconfig unconditionally, but
  build kernel/hw_breakpoint.o, samples/hw_breakpoint/data_breakpoint.o
  and kernel/trace/trace_ksym.o build conditionally if
  HAVE_HW_BREAKPOINTS is declared. Declaring this flag will help
  a)prevent build failures in other archs b)Prevent ftrace from showing
  up availability of kernel symbol tracing even in unsupported archs.

- Simplify the switch_to_thread_hw_breakpoint() function (any help from
  Alan Stern here would be gladly accepted).

- Remove callbacks such as unregister/register.

- remove the code to implement prioritisation of requests

- Add histogram support to include a 'hit counter' to the traced kernel
  symbols.

- Address coding-style related comments.

Hope they are not in sync with the comments received thus far. Let me
know if there are changes to be made.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13 14:04         ` Alan Stern
@ 2009-03-13 14:13           ` Ingo Molnar
  2009-03-13 19:01             ` K.Prasad
  0 siblings, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2009-03-13 14:13 UTC (permalink / raw)
  To: Alan Stern
  Cc: Roland McGrath, prasad, Andrew Morton, Linux Kernel Mailing List


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Fri, 13 Mar 2009, Ingo Molnar wrote:
> 
> > The core issue being discussed is the debug register 
> > allocation and scheduling model though, and you have not 
> > directly commented on that.
> > 
> > My argument in a nutshell is that a bottom-up for user + 
> > top-down for kernel use static allocator with no dynamic 
> > scheduling will get us most of the benefits with a tenth of 
> > the complexity.
> 
> Take this even farther: We shouldn't restrict userspace to 
> bottom-up register allocation.  With very little additional 
> effort we can virtualize the debug registers; then userspace 
> can allocate them in whatever order it wants and still end up 
> using the physical registers in bottom-up order (or top-down, 
> which is the order used by the current patches).
> 
> After all, there's nothing to prevent programs other than gdb 
> from using ptrace, and there's no guarantee that gdb will 
> continue to allocate registers in increasing order.

If in ~10 years of its existence no such usage arose so i dont 
think it will magically appear now.

The thing is, kernel-side use of debug registers is a borderline 
item whose impact we should minimalize as much as possible. 
Linus in the past expressed that it is fine to not have _any_ 
management of user versus kernel debug registers. So we want to 
approach this from the minimalistic side. I offered such a very 
minimal design that is trivial in terms of correctness and 
impact.

We can still get this simple allocation model into .30 if we 
dont waste time arguing about unnecessarily. If someone runs 
into limitations the model can be extended.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13  3:43       ` Ingo Molnar
@ 2009-03-13 14:04         ` Alan Stern
  2009-03-13 14:13           ` Ingo Molnar
  0 siblings, 1 reply; 55+ messages in thread
From: Alan Stern @ 2009-03-13 14:04 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Roland McGrath, prasad, Andrew Morton, Linux Kernel Mailing List

On Fri, 13 Mar 2009, Ingo Molnar wrote:

> The core issue being discussed is the debug register allocation 
> and scheduling model though, and you have not directly commented 
> on that.
> 
> My argument in a nutshell is that a bottom-up for user + 
> top-down for kernel use static allocator with no dynamic 
> scheduling will get us most of the benefits with a tenth of the 
> complexity.

Take this even farther: We shouldn't restrict userspace to bottom-up
register allocation.  With very little additional effort we can
virtualize the debug registers; then userspace can allocate them in
whatever order it wants and still end up using the physical registers
in bottom-up order (or top-down, which is the order used by the current
patches).

After all, there's nothing to prevent programs other than gdb from 
using ptrace, and there's no guarantee that gdb will continue to 
allocate registers in increasing order.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-12  2:46     ` Roland McGrath
@ 2009-03-13  3:43       ` Ingo Molnar
  2009-03-13 14:04         ` Alan Stern
  2009-03-14  3:51       ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2009-03-13  3:43 UTC (permalink / raw)
  To: Roland McGrath
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Alan Stern


* Roland McGrath <roland@redhat.com> wrote:

> Perhaps it would help if asm-generic/hw_breakpoint.h had some 
> kerneldoc comments for the arch-specific functions that the 
> arch's asm/hw_breakpoint.h must define (in the style of 
> asm-generic/syscall.h).  I note that Ingo didn't have any 
> comments about asm-generic/hw_breakpoint.h in his review. Its 
> purpose should be to make any arch maintainer understand why 
> the API it specifies for each arch to meet makes sense across 
> the arch's.
> 
> > why this redirection, why dont just use the structure as-is? 
> > If there's any arch weirdness then that arch should have 
> > arch-special accessors - not the generic code.
> 
> The fields of arch_hw_breakpoint are arch-specific.  Another 
> arch's struct will not have .type and .len fields at all.  
> e.g., on powerpc there is just one size supported, so 
> hw_breakpoint_get_len() would be an inline returning a 
> constant.  Its type is encoded in low bits of the address 
> word, and the arch implementation may not want to use 
> bit-field called .type for that (and if it did, it couldn't 
> use a bit-field called .address with the meaning you'd want it 
> to have).
> 
> Having any fields in arch_hw_breakpoint at all be part of the 
> API restricts the arch implementation unreasonably.  So it has 
> accessors to fetch them instead.  (Arguably we could punt 
> those accessors from the API for hw_breakpoint users, but the 
> arch-independent part of the hw_breakpoint implementation 
> might still want them, I'm not sure.) Likewise, they need to 
> be filled in by setters or by explicit type/len arguments to 
> the registration calls.  This appears to be a tenet we worked 
> out the first time around that has gotten lost in the shuffle 
> more recently.
> 
> I think it would be illustrative to have a second arch 
> implementation to compare to the x86 one.  Ingo has a tendency 
> to pretend everything is an x86 until shown the concrete 
> evidence.  The obvious choice is powerpc. Its facility is very 
> simple, so the arch-specific part of the implementation should 
> be trivial--it's the "base case" of simplest available 
> hw_breakpoint arch, really.  Also, it happens that Prasad's 
> employer is interested in having that support.
> 
> For example, a sensible powerpc implementation would clearly 
> demonstrate why you need accessors or at least either 
> pre-registration setters or explicit type/len arguments in 
> registration calls.

That would help. I indeed have a tendency to strike out code 
that's not immediately needed, i also tend to make sure that 
design is sane on the platform that 95%+ of our active 
developers/users use.

The core issue being discussed is the debug register allocation 
and scheduling model though, and you have not directly commented 
on that.

My argument in a nutshell is that a bottom-up for user + 
top-down for kernel use static allocator with no dynamic 
scheduling will get us most of the benefits with a tenth of the 
complexity.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 14:09   ` Ingo Molnar
  2009-03-10 14:59     ` Alan Stern
@ 2009-03-12  2:46     ` Roland McGrath
  2009-03-13  3:43       ` Ingo Molnar
  2009-03-14  3:51       ` Benjamin Herrenschmidt
  1 sibling, 2 replies; 55+ messages in thread
From: Roland McGrath @ 2009-03-12  2:46 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Alan Stern

Perhaps it would help if asm-generic/hw_breakpoint.h had some kerneldoc
comments for the arch-specific functions that the arch's asm/hw_breakpoint.h
must define (in the style of asm-generic/syscall.h).  I note that Ingo
didn't have any comments about asm-generic/hw_breakpoint.h in his review.
Its purpose should be to make any arch maintainer understand why the API it
specifies for each arch to meet makes sense across the arch's.

> why this redirection, why dont just use the structure as-is? If 
> there's any arch weirdness then that arch should have 
> arch-special accessors - not the generic code.

The fields of arch_hw_breakpoint are arch-specific.  Another arch's
struct will not have .type and .len fields at all.  e.g., on powerpc
there is just one size supported, so hw_breakpoint_get_len() would be an
inline returning a constant.  Its type is encoded in low bits of the
address word, and the arch implementation may not want to use bit-field
called .type for that (and if it did, it couldn't use a bit-field called
.address with the meaning you'd want it to have).  

Having any fields in arch_hw_breakpoint at all be part of the API
restricts the arch implementation unreasonably.  So it has accessors to
fetch them instead.  (Arguably we could punt those accessors from the
API for hw_breakpoint users, but the arch-independent part of the
hw_breakpoint implementation might still want them, I'm not sure.)
Likewise, they need to be filled in by setters or by explicit type/len
arguments to the registration calls.  This appears to be a tenet we
worked out the first time around that has gotten lost in the shuffle
more recently.

I think it would be illustrative to have a second arch implementation to
compare to the x86 one.  Ingo has a tendency to pretend everything is an
x86 until shown the concrete evidence.  The obvious choice is powerpc.
Its facility is very simple, so the arch-specific part of the
implementation should be trivial--it's the "base case" of simplest
available hw_breakpoint arch, really.  Also, it happens that Prasad's
employer is interested in having that support.

For example, a sensible powerpc implementation would clearly demonstrate
why you need accessors or at least either pre-registration setters or
explicit type/len arguments in registration calls.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 16:32                 ` Alan Stern
@ 2009-03-11 17:41                   ` K.Prasad
  2009-03-14  3:47                     ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 55+ messages in thread
From: K.Prasad @ 2009-03-11 17:41 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Wed, Mar 11, 2009 at 12:32:19PM -0400, Alan Stern wrote:
> On Wed, 11 Mar 2009, Ingo Molnar wrote:
> 
> > > > Not if what we do what the previous code did: reloaded the full 
> > > > array unconditionally. (it's just 4 entries)
> > > 
> > > But that array still has to be set up somehow.  It is private 
> > > to the task; the only logical place to set it up is when the 
> > > CPU switches to that task.
> > > 
> > > In the old code, it wasn't possible for task B or the kernel 
> > > to affect the contents of task A's debug registers.  With 
> > > hw-breakpoints it _is_ possible, because the balance between 
> > > debug registers allocated to kernel breakpoints and debug 
> > > registers allocated to userspace breakpoints can change.  
> > > That's why the additional complexity is needed.
> > 
> > Yes - but we dont really need any scheduler complexity for this.
> > 
> > An IPI is enough to reload debug registers in an affected task 
> > (and calculate the real debug register layout) - and the next 
> > context switches will pick up changes automatically.
> > 
> > Am i missing anything? I'm trying to find the design that has 
> > the minimal possible complexity. (without killing any necessary 
> > features)
> 
> I think you _are_ missing something, though it's not clear what.
> 
> "and the next context switches will pick up changes automatically" --
> that may not be entirely right.  Yes, the next context switch will pick
> up the changes to DR1-4, but it won't necessarily pick up the changes
> to DR7.  However the details depend very much on how debug registers
> are allocated; with no priorities or evictions much of the complexity
> will disappear anyway.
> 
> > For an un-shareable resource like this (and this is really a 
> > rare case [and we shouldnt even consider switching between user 
> > and kernel debug registers at system call time]), the best 
> > approach is to have a rigid reservation mechanism with clear, 
> > hard, early failures in the overcommit case.
> > 
> > Silently breaking a user-space debugging sessions just because 
> > the admin has a debug register based system-wide profiling 
> > running, is pretty much the worst usage model. It does not give 
> > user-space any idea about what happened - the breakpoints just 
> > "dont work".
> > 
> > So i'd suggest a really simple scheme (depicted for x86 bug 
> > applicable on other architectures too):
> > 
> >  - we have a system-wide resource of 4 debug registers.
> > 
> >  - kernel-side can allocate debug registers system-wide (it 
> >    takes effect on all CPUs, at once), up to 4 of them. The 5th 
> >    allocation will fail.
> > 
> >  - user-side uses the ptrace APIs - and if it runs into the 
> >    limit, ptrace should return a failure.
> 
> Roland, of course, is all in favor of making hw-breakpoints compatible 
> with utrace.  The API should be flexible enough to encompass both 
> legacy ptrace and utrace.
> 
> > There's the following special case: the kernel reserves a debug 
> > register when there's tasks in the system that already have 
> > reserved all debug registers. I.e. the constraint was not known 
> > when the user-space session started, and the kernel violates it 
> > afterwards.
> 
> Right.  Or the kernel tries to allocate 2 debug registers when 
> userspace has already allocated 3, and so on...
> 
> > There's a couple of choices here, with various scales of 
> > conflict resolution:
> > 
> >  1- silently override the user-space breakpoint
> > 
> >  2- notify the user-space task via a signal - SIGXCPU or so.
> > 
> >  3- reject the kernel-space allocation with a sufficiently 
> >     informative log message: "task 123 already uses 4 debug 
> >     registers, cannot allocate more kernel breakpoints" - 
> >     leaving the resolution of the conflict to the admin.
> 
> We can't necessarily assign a particular task to the debug registers 
> already in use.  There might be more than one task using them.  But of 
> course we can always just say that they are already in use, and if 
> necessary there could be a /proc interface with more information.
> 
> Besides, we have to be able to reject kernel breakpoint requests in any
> case ("the 5th allocation will fail").
> 
> > #1 isnt particularly good because it brings back a
> >    'silentfailure' mode.
> 
> Agreed.
> 
> > #2 might be too brutal: starting something innocous-looking
> >    might kill a debug session. OTOH user-space debuggers could 
> >    catch the signal and inform the user.
> > 
> > #3 is probably the most informative (and hence probably the
> >    best) variant. It also leaves policy of how to resolve the 
> >    conflict to the admin.
> 
> AFAICS, #3 really is "first come, first served".  What do you mean by 
> "policy of how to resolve the conflict"?  It sounds like there are no 
> policy choices involved; whoever requests the debug register first will 
> get it.
>

With FCFS or an allocation mechanism without the (un)installed()
callbacks we'd lose the ability to record requests and service them
later when registers become availabile.

Say when (un)installed() callbacks are implemented for the proposed
ftrace-plugin to trace kernel symbols, they can automatically stop/start
tracing as and when registers become (un)available. This can be helpful when
we wish to profile memory access over a kernel variable for a long duration
(where small loss of tracing data can be tolerated), while the system would
permit simultaneous user-space access (say a GDB session using 'hbreak').

Are we fine with disallowing such usage, which if done will let the requester
of the breakpoint register 'poll' periodically to check availability.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:50                 ` K.Prasad
  2009-03-11 13:10                   ` Ingo Molnar
@ 2009-03-11 16:39                   ` Alan Stern
  1 sibling, 0 replies; 55+ messages in thread
From: Alan Stern @ 2009-03-11 16:39 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Wed, 11 Mar 2009, K.Prasad wrote:

> The present implementation can be likened to #3 except that the
> uninstalled() callback is invoked (the user-space call through ptrace
> takes a higher priority and evicts the kernel-space requests even now).
> 
> After the task using four debug registers yield the CPU, the
> kernel-space breakpoint requests are 'restored' and installed() is
> called again.

No, that is wrong.  The kernel breakpoints do not get reinstalled until 
the userspace breakpoints are unregistered.  Merely yielding the CPU is 
not sufficient.

> Even if #3 was implemented as described, we would still retain a
> majority of the complexity in balance_kernel_vs_user() to check newer
> tasks with requests for breakpoint registers.

Some complexity is certainly needed, because at all times we need to
know the maximum number of breakpoints requested by any user task.  
The number of kernel breakpoints that can be allocated is limited to 4
minus this number.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:12               ` Ingo Molnar
  2009-03-11 12:50                 ` K.Prasad
@ 2009-03-11 16:32                 ` Alan Stern
  2009-03-11 17:41                   ` K.Prasad
  2009-03-14  3:43                 ` Benjamin Herrenschmidt
  2 siblings, 1 reply; 55+ messages in thread
From: Alan Stern @ 2009-03-11 16:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Wed, 11 Mar 2009, Ingo Molnar wrote:

> > > Not if what we do what the previous code did: reloaded the full 
> > > array unconditionally. (it's just 4 entries)
> > 
> > But that array still has to be set up somehow.  It is private 
> > to the task; the only logical place to set it up is when the 
> > CPU switches to that task.
> > 
> > In the old code, it wasn't possible for task B or the kernel 
> > to affect the contents of task A's debug registers.  With 
> > hw-breakpoints it _is_ possible, because the balance between 
> > debug registers allocated to kernel breakpoints and debug 
> > registers allocated to userspace breakpoints can change.  
> > That's why the additional complexity is needed.
> 
> Yes - but we dont really need any scheduler complexity for this.
> 
> An IPI is enough to reload debug registers in an affected task 
> (and calculate the real debug register layout) - and the next 
> context switches will pick up changes automatically.
> 
> Am i missing anything? I'm trying to find the design that has 
> the minimal possible complexity. (without killing any necessary 
> features)

I think you _are_ missing something, though it's not clear what.

"and the next context switches will pick up changes automatically" --
that may not be entirely right.  Yes, the next context switch will pick
up the changes to DR1-4, but it won't necessarily pick up the changes
to DR7.  However the details depend very much on how debug registers
are allocated; with no priorities or evictions much of the complexity
will disappear anyway.

> For an un-shareable resource like this (and this is really a 
> rare case [and we shouldnt even consider switching between user 
> and kernel debug registers at system call time]), the best 
> approach is to have a rigid reservation mechanism with clear, 
> hard, early failures in the overcommit case.
> 
> Silently breaking a user-space debugging sessions just because 
> the admin has a debug register based system-wide profiling 
> running, is pretty much the worst usage model. It does not give 
> user-space any idea about what happened - the breakpoints just 
> "dont work".
> 
> So i'd suggest a really simple scheme (depicted for x86 bug 
> applicable on other architectures too):
> 
>  - we have a system-wide resource of 4 debug registers.
> 
>  - kernel-side can allocate debug registers system-wide (it 
>    takes effect on all CPUs, at once), up to 4 of them. The 5th 
>    allocation will fail.
> 
>  - user-side uses the ptrace APIs - and if it runs into the 
>    limit, ptrace should return a failure.

Roland, of course, is all in favor of making hw-breakpoints compatible 
with utrace.  The API should be flexible enough to encompass both 
legacy ptrace and utrace.

> There's the following special case: the kernel reserves a debug 
> register when there's tasks in the system that already have 
> reserved all debug registers. I.e. the constraint was not known 
> when the user-space session started, and the kernel violates it 
> afterwards.

Right.  Or the kernel tries to allocate 2 debug registers when 
userspace has already allocated 3, and so on...

> There's a couple of choices here, with various scales of 
> conflict resolution:
> 
>  1- silently override the user-space breakpoint
> 
>  2- notify the user-space task via a signal - SIGXCPU or so.
> 
>  3- reject the kernel-space allocation with a sufficiently 
>     informative log message: "task 123 already uses 4 debug 
>     registers, cannot allocate more kernel breakpoints" - 
>     leaving the resolution of the conflict to the admin.

We can't necessarily assign a particular task to the debug registers 
already in use.  There might be more than one task using them.  But of 
course we can always just say that they are already in use, and if 
necessary there could be a /proc interface with more information.

Besides, we have to be able to reject kernel breakpoint requests in any
case ("the 5th allocation will fail").

> #1 isnt particularly good because it brings back a
>    'silentfailure' mode.

Agreed.

> #2 might be too brutal: starting something innocous-looking
>    might kill a debug session. OTOH user-space debuggers could 
>    catch the signal and inform the user.
> 
> #3 is probably the most informative (and hence probably the
>    best) variant. It also leaves policy of how to resolve the 
>    conflict to the admin.

AFAICS, #3 really is "first come, first served".  What do you mean by 
"policy of how to resolve the conflict"?  It sounds like there are no 
policy choices involved; whoever requests the debug register first will 
get it.

> Would be nice to have it simple. Reluctance regarding this 
> patchset is mostly rooted in that diffstat above.

I'd be happy to implement #3.  Mostly it would just involve removing 
code from the patches.

> The changes it does in the x86 architecture code are nice 
> generalizations and cleanups. Both the scheduler, task 
> startup/exit and ptrace bits look pretty sane in terms of 
> factoring out debug register details. But the breakpoint 
> management looks very complex.

Yes, there's no denying it.  But I don't want to commit to any 
particular changes without Roland's input.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:50                 ` K.Prasad
@ 2009-03-11 13:10                   ` Ingo Molnar
  2009-03-14  3:46                     ` Benjamin Herrenschmidt
  2009-03-11 16:39                   ` Alan Stern
  1 sibling, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2009-03-11 13:10 UTC (permalink / raw)
  To: K.Prasad
  Cc: Alan Stern, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* K.Prasad <prasad@linux.vnet.ibm.com> wrote:

> Even if #3 was implemented as described, we would still retain 
> a majority of the complexity in balance_kernel_vs_user() to 
> check newer tasks with requests for breakpoint registers.

Not if it's implemented in a really simple way:

Kernel gets debug registers in db4..db3..db2..db1 order, and its 
allocation is essentially hardcoded - i.e. we dont try to be 
fancy.

User-space (gdb) on the other hand will try to allocate in the 
db1..db2..db3..db4 order.

Maintain a 'max debug register index' value driven by ptrace and 
maintain a 'min debug register index' driven by kernel-space 
hw-breakpoint allocations.

If they meet somewhere inbetween then we have overcommit which 
we dont allow. In all other cases (which i proffer covers 100% 
of the sane cases) they will mix amicably.

Sure, user-space can in principle do db4..db3..db2..db1 
allocations as well, but it would be silly and GDB does not do 
that.

So there's no real overlap between register usage - hence no 
need for any complex scheduling smarts. Keep it simple first, 
and only add complexity when it's justified.

[ for the special case of an architecture having just a single 
  debug register this will bring the expected behavior of either 
  allowing gdb to use the breakpoint or allowing the kernel to 
  use it. ]

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:12               ` Ingo Molnar
@ 2009-03-11 12:50                 ` K.Prasad
  2009-03-11 13:10                   ` Ingo Molnar
  2009-03-11 16:39                   ` Alan Stern
  2009-03-11 16:32                 ` Alan Stern
  2009-03-14  3:43                 ` Benjamin Herrenschmidt
  2 siblings, 2 replies; 55+ messages in thread
From: K.Prasad @ 2009-03-11 12:50 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Wed, Mar 11, 2009 at 01:12:20PM +0100, Ingo Molnar wrote:
> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > On Tue, 10 Mar 2009, Ingo Molnar wrote:
> > 
> > > > More generally, it's there because kernel & userspace 
> > > > breakpoints can be installed and uninstalled while a task is 
> > > > running -- and yes, this is partially because breakpoints are 
> > > > prioritized.  (Although it's worth pointing out that even your 
> > > > suggestion of always prioritizing kernel breakpoints above 
> > > > userspace breakpoints would have the same effect.)  However 
> > > > the fact that the breakpoints are stored in a list rather than 
> > > > an array doesn't seem to be relevant.
> > > > 
> > > > > A list needs to be maintained and when updated it's 
> > > > > reloaded.
> > > > 
> > > > The same is true of an array.
> > > 
> > > Not if what we do what the previous code did: reloaded the full 
> > > array unconditionally. (it's just 4 entries)
> > 
> > But that array still has to be set up somehow.  It is private 
> > to the task; the only logical place to set it up is when the 
> > CPU switches to that task.
> > 
> > In the old code, it wasn't possible for task B or the kernel 
> > to affect the contents of task A's debug registers.  With 
> > hw-breakpoints it _is_ possible, because the balance between 
> > debug registers allocated to kernel breakpoints and debug 
> > registers allocated to userspace breakpoints can change.  
> > That's why the additional complexity is needed.
> 
> Yes - but we dont really need any scheduler complexity for this.
> 
> An IPI is enough to reload debug registers in an affected task 
> (and calculate the real debug register layout) - and the next 
> context switches will pick up changes automatically.
> 
> Am i missing anything? I'm trying to find the design that has 
> the minimal possible complexity. (without killing any necessary 
> features)
> 
> > > > Yes, kernel breakpoints have to be kept separate from 
> > > > userspace breakpoints.  But even if you focus just on 
> > > > userspace breakpoints, you still need to use a list 
> > > > because debuggers can try to register an arbitrarily large 
> > > > number of breakpoints.
> > > 
> > > That 'arbitrarily large number of breakpoints' worries me. 
> > > It's a pretty broken concept for a 4-items resource that 
> > > cannot be time-shared and hence cannot be overcommitted.
> > 
> > Suppose we never allow callers to register more breakpoints 
> > than will fit in the CPU's registers.  Do we then use a simple 
> > first-come first-served algorithm, with no prioritization?  If 
> > we do prioritize some breakpoint registrations more highly 
> > than others, how do we inform callers that their breakpoint 
> > has been kicked out by one of higher priority?  And how do we 
> > let them know when the higher-priority breakpoint has been 
> > unregistered, so they can try again?
> 
> For an un-shareable resource like this (and this is really a 
> rare case [and we shouldnt even consider switching between user 
> and kernel debug registers at system call time]), the best 
> approach is to have a rigid reservation mechanism with clear, 
> hard, early failures in the overcommit case.
> 
> Silently breaking a user-space debugging sessions just because 
> the admin has a debug register based system-wide profiling 
> running, is pretty much the worst usage model. It does not give 
> user-space any idea about what happened - the breakpoints just 
> "dont work".
> 
> So i'd suggest a really simple scheme (depicted for x86 bug 
> applicable on other architectures too):
> 
>  - we have a system-wide resource of 4 debug registers.
> 
>  - kernel-side can allocate debug registers system-wide (it 
>    takes effect on all CPUs, at once), up to 4 of them. The 5th 
>    allocation will fail.
> 
>  - user-side uses the ptrace APIs - and if it runs into the 
>    limit, ptrace should return a failure.
> 
> There's the following special case: the kernel reserves a debug 
> register when there's tasks in the system that already have 
> reserved all debug registers. I.e. the constraint was not known 
> when the user-space session started, and the kernel violates it 
> afterwards.
> 
> There's a couple of choices here, with various scales of 
> conflict resolution:
> 
>  1- silently override the user-space breakpoint
> 
>  2- notify the user-space task via a signal - SIGXCPU or so.
> 
>  3- reject the kernel-space allocation with a sufficiently 
>     informative log message: "task 123 already uses 4 debug 
>     registers, cannot allocate more kernel breakpoints" - 
>     leaving the resolution of the conflict to the admin.
> 
> #1 isnt particularly good because it brings back a
>    'silentfailure' mode.
> 
> #2 might be too brutal: starting something innocous-looking
>    might kill a debug session. OTOH user-space debuggers could 
>    catch the signal and inform the user.
> 
> #3 is probably the most informative (and hence probably the
>    best) variant. It also leaves policy of how to resolve the 
>    conflict to the admin.
> 

While reserving more discussions after Roland posts his views, I thought
I'd share some of mine here.

The present implementation can be likened to #3 except that the
uninstalled() callback is invoked (the user-space call through ptrace
takes a higher priority and evicts the kernel-space requests even now).

After the task using four debug registers yield the CPU, the
kernel-space breakpoint requests are 'restored' and installed() is
called again.

Even if #3 was implemented as described, we would still retain a
majority of the complexity in balance_kernel_vs_user() to check newer
tasks with requests for breakpoint registers.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 20:30             ` Alan Stern
@ 2009-03-11 12:12               ` Ingo Molnar
  2009-03-11 12:50                 ` K.Prasad
                                   ` (2 more replies)
  2009-03-14  3:41               ` Benjamin Herrenschmidt
  1 sibling, 3 replies; 55+ messages in thread
From: Ingo Molnar @ 2009-03-11 12:12 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > > More generally, it's there because kernel & userspace 
> > > breakpoints can be installed and uninstalled while a task is 
> > > running -- and yes, this is partially because breakpoints are 
> > > prioritized.  (Although it's worth pointing out that even your 
> > > suggestion of always prioritizing kernel breakpoints above 
> > > userspace breakpoints would have the same effect.)  However 
> > > the fact that the breakpoints are stored in a list rather than 
> > > an array doesn't seem to be relevant.
> > > 
> > > > A list needs to be maintained and when updated it's 
> > > > reloaded.
> > > 
> > > The same is true of an array.
> > 
> > Not if what we do what the previous code did: reloaded the full 
> > array unconditionally. (it's just 4 entries)
> 
> But that array still has to be set up somehow.  It is private 
> to the task; the only logical place to set it up is when the 
> CPU switches to that task.
> 
> In the old code, it wasn't possible for task B or the kernel 
> to affect the contents of task A's debug registers.  With 
> hw-breakpoints it _is_ possible, because the balance between 
> debug registers allocated to kernel breakpoints and debug 
> registers allocated to userspace breakpoints can change.  
> That's why the additional complexity is needed.

Yes - but we dont really need any scheduler complexity for this.

An IPI is enough to reload debug registers in an affected task 
(and calculate the real debug register layout) - and the next 
context switches will pick up changes automatically.

Am i missing anything? I'm trying to find the design that has 
the minimal possible complexity. (without killing any necessary 
features)

> > > Yes, kernel breakpoints have to be kept separate from 
> > > userspace breakpoints.  But even if you focus just on 
> > > userspace breakpoints, you still need to use a list 
> > > because debuggers can try to register an arbitrarily large 
> > > number of breakpoints.
> > 
> > That 'arbitrarily large number of breakpoints' worries me. 
> > It's a pretty broken concept for a 4-items resource that 
> > cannot be time-shared and hence cannot be overcommitted.
> 
> Suppose we never allow callers to register more breakpoints 
> than will fit in the CPU's registers.  Do we then use a simple 
> first-come first-served algorithm, with no prioritization?  If 
> we do prioritize some breakpoint registrations more highly 
> than others, how do we inform callers that their breakpoint 
> has been kicked out by one of higher priority?  And how do we 
> let them know when the higher-priority breakpoint has been 
> unregistered, so they can try again?

For an un-shareable resource like this (and this is really a 
rare case [and we shouldnt even consider switching between user 
and kernel debug registers at system call time]), the best 
approach is to have a rigid reservation mechanism with clear, 
hard, early failures in the overcommit case.

Silently breaking a user-space debugging sessions just because 
the admin has a debug register based system-wide profiling 
running, is pretty much the worst usage model. It does not give 
user-space any idea about what happened - the breakpoints just 
"dont work".

So i'd suggest a really simple scheme (depicted for x86 bug 
applicable on other architectures too):

 - we have a system-wide resource of 4 debug registers.

 - kernel-side can allocate debug registers system-wide (it 
   takes effect on all CPUs, at once), up to 4 of them. The 5th 
   allocation will fail.

 - user-side uses the ptrace APIs - and if it runs into the 
   limit, ptrace should return a failure.

There's the following special case: the kernel reserves a debug 
register when there's tasks in the system that already have 
reserved all debug registers. I.e. the constraint was not known 
when the user-space session started, and the kernel violates it 
afterwards.

There's a couple of choices here, with various scales of 
conflict resolution:

 1- silently override the user-space breakpoint

 2- notify the user-space task via a signal - SIGXCPU or so.

 3- reject the kernel-space allocation with a sufficiently 
    informative log message: "task 123 already uses 4 debug 
    registers, cannot allocate more kernel breakpoints" - 
    leaving the resolution of the conflict to the admin.

#1 isnt particularly good because it brings back a
   'silentfailure' mode.

#2 might be too brutal: starting something innocous-looking
   might kill a debug session. OTOH user-space debuggers could 
   catch the signal and inform the user.

#3 is probably the most informative (and hence probably the
   best) variant. It also leaves policy of how to resolve the 
   conflict to the admin.

> > Seems to me that much of the complexity of this patchset:
> > 
> >  28 files changed, 2439 insertions(+), 199 deletions(-)
> > 
> > Could be eliminated via a very simple exclusive reservation 
> > mechanism.
> 
> Can it really be as simple as all that?

Would be nice to have it simple. Reluctance regarding this 
patchset is mostly rooted in that diffstat above.

The changes it does in the x86 architecture code are nice 
generalizations and cleanups. Both the scheduler, task 
startup/exit and ptrace bits look pretty sane in terms of 
factoring out debug register details. But the breakpoint 
management looks very complex.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 17:26           ` Ingo Molnar
@ 2009-03-10 20:30             ` Alan Stern
  2009-03-11 12:12               ` Ingo Molnar
  2009-03-14  3:41               ` Benjamin Herrenschmidt
  2009-03-14  3:40             ` Benjamin Herrenschmidt
  1 sibling, 2 replies; 55+ messages in thread
From: Alan Stern @ 2009-03-10 20:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> > More generally, it's there because kernel & userspace 
> > breakpoints can be installed and uninstalled while a task is 
> > running -- and yes, this is partially because breakpoints are 
> > prioritized.  (Although it's worth pointing out that even your 
> > suggestion of always prioritizing kernel breakpoints above 
> > userspace breakpoints would have the same effect.)  However 
> > the fact that the breakpoints are stored in a list rather than 
> > an array doesn't seem to be relevant.
> > 
> > > A list needs to be maintained and when updated it's 
> > > reloaded.
> > 
> > The same is true of an array.
> 
> Not if what we do what the previous code did: reloaded the full 
> array unconditionally. (it's just 4 entries)

But that array still has to be set up somehow.  It is private to the 
task; the only logical place to set it up is when the CPU switches to 
that task.

In the old code, it wasn't possible for task B or the kernel to
affect the contents of task A's debug registers.  With hw-breakpoints 
it _is_ possible, because the balance between debug registers allocated 
to kernel breakpoints and debug registers allocated to userspace 
breakpoints can change.  That's why the additional complexity is 
needed.

> > Yes, kernel breakpoints have to be kept separate from 
> > userspace breakpoints.  But even if you focus just on 
> > userspace breakpoints, you still need to use a list because 
> > debuggers can try to register an arbitrarily large number of 
> > breakpoints.
> 
> That 'arbitrarily larg number of breakpoints' worries me. It's a 
> pretty broken concept for a 4-items resource that cannot be 
> time-shared and hence cannot be overcommitted.

Suppose we never allow callers to register more breakpoints than will
fit in the CPU's registers.  Do we then use a simple first-come
first-served algorithm, with no prioritization?  If we do prioritize
some breakpoint registrations more highly than others, how do we inform
callers that their breakpoint has been kicked out by one of higher
priority?  And how do we let them know when the higher-priority
breakpoint has been unregistered, so they can try again?

> Seems to me that much of the complexity of this patchset:
> 
>  28 files changed, 2439 insertions(+), 199 deletions(-)
> 
> Could be eliminated via a very simple exclusive reservation 
> mechanism.

Can it really be as simple as all that?

Roland, what do you think?

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 17:11         ` Alan Stern
@ 2009-03-10 17:26           ` Ingo Molnar
  2009-03-10 20:30             ` Alan Stern
  2009-03-14  3:40             ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 55+ messages in thread
From: Ingo Molnar @ 2009-03-10 17:26 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > > > why this redirection, why dont just use the structure as-is? 
> > > > If there's any arch weirdness then that arch should have 
> > > > arch-special accessors - not the generic code.
> > > 
> > > These _are_ the arch-specific accessors.  Look at the 
> > > filename: arch/x86/include/asm/hw_breakpoint.h.
> > 
> > I very well know which file this is, you need to read my reply 
> > again.
> > 
> > These are very generic-sounding fields and they should not be 
> > hidden via pointless wrappers by the generic code. Dont let the 
> > tail wag the dog. If there's architecture weirdness that does 
> > not fit the generic code, then _that_ architecture should have 
> > the ugliness - not the generic code. (note that these accessors 
> > are used by the generic code so the uglification spreads there)
> 
> Hm.  I haven't been keeping careful track of all the updates 
> Prasad has been making.  In my fairly-old copy of the 
> hw-breakpoint work, the accessors are _not_ used by the 
> generic code.  They are there for future users of the API, not 
> for internal use by the API itself.  Is there something I'm 
> missing?

Right, they do seem unused at the moment. I was going over the 
patches and this stuck out as wrong.

> I have the feeling that this doesn't really address your 
> comment, but I'm not sure if that's because I don't understand 
> your point or you don't understand mine...

Removing them addresses my comment.

> > These are very generic-sounding fields ...
> 
> Would you be happier if the field names were changed to be 
> less generic-sounding?

Not sure what to make of this kind of reply. This isnt about me 
being happier. Generic-sounding accessors for generic-sounding 
fields is an easily recognizable pattern for broken design.

> > > > > + int num_installed; /* Number of installed bps */ + 
> > > > > unsigned gennum; /* update-generation number */
> > > > 
> > > > i suspect the gennum we can get rid of if we get rid of the 
> > > > notion of priorities, right?
> > > 
> > > No.  gennum has nothing to do with priorities.
> > 
> > Well it's introduced because we have a priority-sorted list of 
> > breakpoints not an array.
> 
> More generally, it's there because kernel & userspace 
> breakpoints can be installed and uninstalled while a task is 
> running -- and yes, this is partially because breakpoints are 
> prioritized.  (Although it's worth pointing out that even your 
> suggestion of always prioritizing kernel breakpoints above 
> userspace breakpoints would have the same effect.)  However 
> the fact that the breakpoints are stored in a list rather than 
> an array doesn't seem to be relevant.
> 
> > A list needs to be maintained and when updated it's 
> > reloaded.
> 
> The same is true of an array.

Not if what we do what the previous code did: reloaded the full 
array unconditionally. (it's just 4 entries)

> > I was thinking about possibly getting rid of that list 
> > complication and go back to the simpler array. But it's hard 
> > because the lifetime of a kernel space breakpoint spans 
> > context-switches so there has to be separation.
> 
> Yes, kernel breakpoints have to be kept separate from 
> userspace breakpoints.  But even if you focus just on 
> userspace breakpoints, you still need to use a list because 
> debuggers can try to register an arbitrarily large number of 
> breakpoints.

That 'arbitrarily larg number of breakpoints' worries me. It's a 
pretty broken concept for a 4-items resource that cannot be 
time-shared and hence cannot be overcommitted.

Seems to me that much of the complexity of this patchset:

 28 files changed, 2439 insertions(+), 199 deletions(-)

Could be eliminated via a very simple exclusive reservation 
mechanism.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 15:18       ` Ingo Molnar
@ 2009-03-10 17:11         ` Alan Stern
  2009-03-10 17:26           ` Ingo Molnar
  0 siblings, 1 reply; 55+ messages in thread
From: Alan Stern @ 2009-03-10 17:11 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> > > why this redirection, why dont just use the structure as-is? 
> > > If there's any arch weirdness then that arch should have 
> > > arch-special accessors - not the generic code.
> > 
> > These _are_ the arch-specific accessors.  Look at the 
> > filename: arch/x86/include/asm/hw_breakpoint.h.
> 
> I very well know which file this is, you need to read my reply 
> again.
> 
> These are very generic-sounding fields and they should not be 
> hidden via pointless wrappers by the generic code. Dont let the 
> tail wag the dog. If there's architecture weirdness that does 
> not fit the generic code, then _that_ architecture should have 
> the ugliness - not the generic code. (note that these accessors 
> are used by the generic code so the uglification spreads there)

Hm.  I haven't been keeping careful track of all the updates Prasad has
been making.  In my fairly-old copy of the hw-breakpoint work, the
accessors are _not_ used by the generic code.  They are there for
future users of the API, not for internal use by the API itself.  Is 
there something I'm missing?

I have the feeling that this doesn't really address your comment, but
I'm not sure if that's because I don't understand your point or you
don't understand mine...

> These are very generic-sounding fields ...

Would you be happier if the field names were changed to be less 
generic-sounding?

> > > > + int num_installed; /* Number of installed bps */ + 
> > > > unsigned gennum; /* update-generation number */
> > > 
> > > i suspect the gennum we can get rid of if we get rid of the 
> > > notion of priorities, right?
> > 
> > No.  gennum has nothing to do with priorities.
> 
> Well it's introduced because we have a priority-sorted list of 
> breakpoints not an array.

More generally, it's there because kernel & userspace breakpoints can
be installed and uninstalled while a task is running -- and yes, this
is partially because breakpoints are prioritized.  (Although it's worth
pointing out that even your suggestion of always prioritizing kernel
breakpoints above userspace breakpoints would have the same effect.)  
However the fact that the breakpoints are stored in a list rather than
an array doesn't seem to be relevant.

> A list needs to be maintained and when 
> updated it's reloaded.

The same is true of an array.

> I was thinking about possibly getting rid 
> of that list complication and go back to the simpler array. But 
> it's hard because the lifetime of a kernel space breakpoint 
> spans context-switches so there has to be separation.

Yes, kernel breakpoints have to be kept separate from userspace 
breakpoints.  But even if you focus just on userspace breakpoints, you 
still need to use a list because debuggers can try to register an 
arbitrarily large number of breakpoints.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 14:59     ` Alan Stern
@ 2009-03-10 15:18       ` Ingo Molnar
  2009-03-10 17:11         ` Alan Stern
  0 siblings, 1 reply; 55+ messages in thread
From: Ingo Molnar @ 2009-03-10 15:18 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> > 
> > > +/*
> > > + * Handle debug exception notifications.
> > > + */
> > > +
> > > +int __kprobes hw_breakpoint_handler(struct die_args *args)
> > > +{
> > > +	struct cpu_hw_breakpoint *chbi;
> > > +	int i;
> > > +	struct hw_breakpoint *bp;
> > > +	struct thread_hw_breakpoint *thbi = NULL;
> > > +
> > > +	/* The DR6 value is stored in args->err */
> > > +#define DR6	(args->err)
> > 
> > that's ugly - what's wrong with an old-fashioned "int db6 = 
> > args->err" type of approach?
> 
> Yes, it is ugly.  It was a holdover from an earlier version, and in 
> fact it's likely to change in the future to become even more ugly.  But 
> for now, you're right -- a simple assignment would be better.
> 
> > > +++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
> > > @@ -0,0 +1,132 @@
> > > +#ifndef	_I386_HW_BREAKPOINT_H
> > > +#define	_I386_HW_BREAKPOINT_H
> > > +
> > > +#ifdef	__KERNEL__
> > > +#define	__ARCH_HW_BREAKPOINT_H
> > > +
> > > +struct arch_hw_breakpoint {
> > > +	char		*name; /* Contains name of the symbol to set bkpt */
> > > +	unsigned long	address;
> > > +	u8		len;
> > > +	u8		type;
> > > +} __attribute__((packed));
> > 
> > hm, why packed and why u8 ? We dont expose this to user-space, 
> > do we? (if yes then 'unsigned long' is wrong and __KERNEL__ is 
> > wrong as well)
> 
> I can't remember why this was made packed; there doesn't seem to be any 
> important reason for it.  The structure is not exposed to userspace.  
> The len and type fields are u8 because they contain values no larger 
> than 255.
> 
> > > +#include <linux/kdebug.h>
> > > +#include <asm-generic/hw_breakpoint.h>
> > > +
> > > +/* HW breakpoint accessor routines */
> > > +static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
> > > +{
> > > +	return (const void *) bp->info.address;
> > > +}
> > > +
> > > +static inline const void __user *hw_breakpoint_get_uaddress
> > > +						(struct hw_breakpoint *bp)
> > > +{
> > > +	return (const void __user *) bp->info.address;
> > > +}
> > > +
> > > +static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
> > > +{
> > > +	return bp->info.len;
> > > +}
> > > +
> > > +static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
> > > +{
> > > +	return bp->info.type;
> > > +}
> > 
> > why this redirection, why dont just use the structure as-is? 
> > If there's any arch weirdness then that arch should have 
> > arch-special accessors - not the generic code.
> 
> These _are_ the arch-specific accessors.  Look at the 
> filename: arch/x86/include/asm/hw_breakpoint.h.

I very well know which file this is, you need to read my reply 
again.

These are very generic-sounding fields and they should not be 
hidden via pointless wrappers by the generic code. Dont let the 
tail wag the dog. If there's architecture weirdness that does 
not fit the generic code, then _that_ architecture should have 
the ugliness - not the generic code. (note that these accessors 
are used by the generic code so the uglification spreads there)

> > > + int num_installed; /* Number of installed bps */ + 
> > > unsigned gennum; /* update-generation number */
> > 
> > i suspect the gennum we can get rid of if we get rid of the 
> > notion of priorities, right?
> 
> No.  gennum has nothing to do with priorities.

Well it's introduced because we have a priority-sorted list of 
breakpoints not an array. A list needs to be maintained and when 
updated it's reloaded. I was thinking about possibly getting rid 
of that list complication and go back to the simpler array. But 
it's hard because the lifetime of a kernel space breakpoint 
spans context-switches so there has to be separation.

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 14:09   ` Ingo Molnar
@ 2009-03-10 14:59     ` Alan Stern
  2009-03-10 15:18       ` Ingo Molnar
  2009-03-12  2:46     ` Roland McGrath
  1 sibling, 1 reply; 55+ messages in thread
From: Alan Stern @ 2009-03-10 14:59 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> 
> > +/*
> > + * Handle debug exception notifications.
> > + */
> > +
> > +int __kprobes hw_breakpoint_handler(struct die_args *args)
> > +{
> > +	struct cpu_hw_breakpoint *chbi;
> > +	int i;
> > +	struct hw_breakpoint *bp;
> > +	struct thread_hw_breakpoint *thbi = NULL;
> > +
> > +	/* The DR6 value is stored in args->err */
> > +#define DR6	(args->err)
> 
> that's ugly - what's wrong with an old-fashioned "int db6 = 
> args->err" type of approach?

Yes, it is ugly.  It was a holdover from an earlier version, and in 
fact it's likely to change in the future to become even more ugly.  But 
for now, you're right -- a simple assignment would be better.

> > +++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
> > @@ -0,0 +1,132 @@
> > +#ifndef	_I386_HW_BREAKPOINT_H
> > +#define	_I386_HW_BREAKPOINT_H
> > +
> > +#ifdef	__KERNEL__
> > +#define	__ARCH_HW_BREAKPOINT_H
> > +
> > +struct arch_hw_breakpoint {
> > +	char		*name; /* Contains name of the symbol to set bkpt */
> > +	unsigned long	address;
> > +	u8		len;
> > +	u8		type;
> > +} __attribute__((packed));
> 
> hm, why packed and why u8 ? We dont expose this to user-space, 
> do we? (if yes then 'unsigned long' is wrong and __KERNEL__ is 
> wrong as well)

I can't remember why this was made packed; there doesn't seem to be any 
important reason for it.  The structure is not exposed to userspace.  
The len and type fields are u8 because they contain values no larger 
than 255.

> > +#include <linux/kdebug.h>
> > +#include <asm-generic/hw_breakpoint.h>
> > +
> > +/* HW breakpoint accessor routines */
> > +static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
> > +{
> > +	return (const void *) bp->info.address;
> > +}
> > +
> > +static inline const void __user *hw_breakpoint_get_uaddress
> > +						(struct hw_breakpoint *bp)
> > +{
> > +	return (const void __user *) bp->info.address;
> > +}
> > +
> > +static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
> > +{
> > +	return bp->info.len;
> > +}
> > +
> > +static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
> > +{
> > +	return bp->info.type;
> > +}
> 
> why this redirection, why dont just use the structure as-is? If 
> there's any arch weirdness then that arch should have 
> arch-special accessors - not the generic code.

These _are_ the arch-specific accessors.  Look at the filename:
arch/x86/include/asm/hw_breakpoint.h.

> > +	int			num_installed;	/* Number of installed bps */
> > +	unsigned		gennum;		/* update-generation number */
> 
> i suspect the gennum we can get rid of if we get rid of the 
> notion of priorities, right?

No.  gennum has nothing to do with priorities.

Alan Stern


^ permalink raw reply	[flat|nested] 55+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-05  4:38 ` [patch " prasad
@ 2009-03-10 14:09   ` Ingo Molnar
  2009-03-10 14:59     ` Alan Stern
  2009-03-12  2:46     ` Roland McGrath
  0 siblings, 2 replies; 55+ messages in thread
From: Ingo Molnar @ 2009-03-10 14:09 UTC (permalink / raw)
  To: prasad
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern, Roland McGrath


* prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:

> +/*
> + * Handle debug exception notifications.
> + */
> +
> +int __kprobes hw_breakpoint_handler(struct die_args *args)
> +{
> +	struct cpu_hw_breakpoint *chbi;
> +	int i;
> +	struct hw_breakpoint *bp;
> +	struct thread_hw_breakpoint *thbi = NULL;
> +
> +	/* The DR6 value is stored in args->err */
> +#define DR6	(args->err)

that's ugly - what's wrong with an old-fashioned "int db6 = 
args->err" type of approach?

> +
> +	if (DR6 & DR_STEP)
> +		return NOTIFY_DONE;
> +
> +	chbi = &per_cpu(cpu_bp, get_cpu());
> +
> +	/* Disable all breakpoints so that the callbacks can run without
> +	 * triggering recursive debug exceptions.
> +	 */
> +	set_debugreg(0UL, 7);
> +
> +	/* Assert that local interrupts are disabled
> +	 * Reset the DRn bits in the virtualized register value.
> +	 * The ptrace trigger routine will add in whatever is needed.
> +	 */
> +	current->thread.vdr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
> +
> +	/* Are we a victim of lazy debug-register switching? */
> +	if (!chbi->bp_task)
> +		;
> +	else if (chbi->bp_task != current) {
> +
> +		/* No user breakpoints are valid.  Perform the belated
> +		 * debug-register switch.
> +		 */
> +		switch_to_none_hw_breakpoint();
> +	} else {
> +		thbi = chbi->bp_task->thread.hw_breakpoint_info;
> +	}
> +
> +	/* Handle all the breakpoints that were triggered */
> +	for (i = 0; i < HB_NUM; ++i) {
> +		if (likely(!(DR6 & (DR_TRAP0 << i))))
> +			continue;
> +
> +		/* Find the corresponding hw_breakpoint structure and
> +		 * invoke its triggered callback.
> +		 */
> +		if (i < chbi->cur_kbpdata->num_kbps)
> +			bp = chbi->cur_kbpdata->bps[i];
> +		else if (thbi)
> +			bp = thbi->bps[i];
> +		else		/* False alarm due to lazy DR switching */
> +			continue;
> +		if (bp) {
> +			switch (bp->info.type) {
> +			case HW_BREAKPOINT_WRITE:
> +			case HW_BREAKPOINT_RW:
> +				if (bp->triggered)
> +					(bp->triggered)(bp, args->regs);
> +				/* Re-enable the breakpoints */
> +				set_debugreg(thbi ? thbi->tkdr7 :
> +						chbi->cur_kbpdata->mkdr7, 7);
> +				put_cpu_no_resched();
> +
> +				return NOTIFY_STOP;
> +			/*
> +			 * Presently we allow instruction breakpoints only in
> +			 * user-space when requested through ptrace.
> +			 */
> +			case HW_BREAKPOINT_EXECUTE:
> +				if (arch_check_va_in_userspace(bp->info.address,
> +								current)) {
> +					(bp->triggered)(bp, args->regs);
> +	/* We'll return NOTIFY_DONE, do_debug will take care of the rest */
> +					return NOTIFY_DONE;
> +				}
> +			}

the linebreaks here became so ugly because the whole loop body 
should be moved inside a helper function.

> +++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
> @@ -0,0 +1,132 @@
> +#ifndef	_I386_HW_BREAKPOINT_H
> +#define	_I386_HW_BREAKPOINT_H
> +
> +#ifdef	__KERNEL__
> +#define	__ARCH_HW_BREAKPOINT_H
> +
> +struct arch_hw_breakpoint {
> +	char		*name; /* Contains name of the symbol to set bkpt */
> +	unsigned long	address;
> +	u8		len;
> +	u8		type;
> +} __attribute__((packed));

hm, why packed and why u8 ? We dont expose this to user-space, 
do we? (if yes then 'unsigned long' is wrong and __KERNEL__ is 
wrong as well)

> +#include <linux/kdebug.h>
> +#include <asm-generic/hw_breakpoint.h>
> +
> +/* HW breakpoint accessor routines */
> +static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
> +{
> +	return (const void *) bp->info.address;
> +}
> +
> +static inline const void __user *hw_breakpoint_get_uaddress
> +						(struct hw_breakpoint *bp)
> +{
> +	return (const void __user *) bp->info.address;
> +}
> +
> +static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
> +{
> +	return bp->info.len;
> +}
> +
> +static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
> +{
> +	return bp->info.type;
> +}

why this redirection, why dont just use the structure as-is? If 
there's any arch weirdness then that arch should have 
arch-special accessors - not the generic code.

> +
> +/* Kernel symbol lookup routine for installing Data HW Breakpoint Address */
> +static inline unsigned long hw_breakpoint_lookup_name(const char *name)
> +{
> +	return kallsyms_lookup_name(name);
> +}

A wrapper around kallsyms_lookup_name() is quite pointless - 
pleae us kallsyms_lookup_name() drectly.

> +/* Per-thread HW breakpoint and debug register info */
> +struct thread_hw_breakpoint {
> +
> +	/* utrace support */
> +	struct list_head	node;		/* Entry in thread list */
> +	struct list_head	thread_bps;	/* Thread's breakpoints */
> +	struct hw_breakpoint	*bps[HB_NUM];	/* Highest-priority bps */
> +	unsigned long		tdr[HB_NUM];	/*  and their addresses */

Please rename it to something like ->hw_breakpoint[] and 
->address[] - 'bps' and 'tdr' look quite meaningless.

> +	int			num_installed;	/* Number of installed bps */
> +	unsigned		gennum;		/* update-generation number */

i suspect the gennum we can get rid of if we get rid of the 
notion of priorities, right?

	Ingo

^ permalink raw reply	[flat|nested] 55+ messages in thread

* [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090307045120.039324630@linux.vnet.ibm.com>
@ 2009-03-07  5:05 ` prasad
  0 siblings, 0 replies; 55+ messages in thread
From: prasad @ 2009-03-07  5:05 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 17464 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch introduces two new files named hw_breakpoint.[ch] inside x86 specific
directories. They contain functions which help validate and serve requests for 
using Hardware Breakpoint registers on x86 processors.

[K.Prasad: More declarations in hw_breakpoint.h to independently compile each
           hw_breakpoint.c files. Split-out from the bigger patch and minor
           changes following re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/include/asm/hw_breakpoint.h |  132 ++++++++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  437 +++++++++++++++++++++++++++++++++++
 3 files changed, 570 insertions(+), 1 deletion(-)

Index: linux-2.6-tip/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,437 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+
+static unsigned long		kdr7;		/* Unmasked kernel DR7 value */
+
+/* Masks for the bits in DR7 related to kernel breakpoints, for various
+ * values of num_kbps.  Entry n is the mask for when there are n kernel
+ * breakpoints, in debug registers 0 - (n-1).  The DR_GLOBAL_SLOWDOWN bit
+ * (GE) is handled specially.
+ */
+static const unsigned long	kdr7_masks[HB_NUM + 1] = {
+	0x00000000,
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00ff000f,	/* Same for 0,1 */
+	0x0fff003f,	/* Same for 0,1,2 */
+	0xffff00ff	/* Same for 0,1,2,3 */
+};
+
+/*
+ * Install the kernel breakpoints in their debug registers.
+ */
+void arch_install_chbi(struct cpu_hw_breakpoint *chbi)
+{
+	struct hw_breakpoint **bps;
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+	chbi->cur_kbpdata = rcu_dereference(cur_kbpdata);
+
+	/* Kernel breakpoints are stored starting in DR0 and going up */
+	bps = chbi->cur_kbpdata->bps;
+	switch (chbi->cur_kbpdata->num_kbps) {
+	case 4:
+		set_debugreg(bps[3]->info.address, 3);
+	case 3:
+		set_debugreg(bps[2]->info.address, 2);
+	case 2:
+		set_debugreg(bps[1]->info.address, 1);
+	case 1:
+		set_debugreg(bps[0]->info.address, 0);
+	}
+	/* No need to set DR6 */
+	set_debugreg(chbi->cur_kbpdata->mkdr7, 7);
+}
+
+/*
+ * Update an out-of-date thread hw_breakpoint info structure.
+ */
+void arch_update_thbi(struct thread_hw_breakpoint *thbi,
+			struct kernel_bp_data *thr_kbpdata)
+{
+	int num = thr_kbpdata->num_kbps;
+
+	thbi->tkdr7 = thr_kbpdata->mkdr7 | (thbi->tdr7 & ~kdr7_masks[num]);
+}
+
+/*
+ * Install the thread breakpoints in their debug registers.
+ */
+void arch_install_thbi(struct thread_hw_breakpoint *thbi)
+{
+	/* Install the user breakpoints.  Kernel breakpoints are stored
+	 * starting in DR0 and going up; there are num_kbps of them.
+	 * User breakpoints are stored starting in DR3 and going down,
+	 * as many as we have room for.
+	 */
+	switch (thbi->num_installed) {
+	case 4:
+		set_debugreg(thbi->tdr[0], 0);
+	case 3:
+		set_debugreg(thbi->tdr[1], 1);
+	case 2:
+		set_debugreg(thbi->tdr[2], 2);
+	case 1:
+		set_debugreg(thbi->tdr[3], 3);
+	}
+	/* No need to set DR6 */
+	set_debugreg(thbi->tkdr7, 7);
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void arch_install_none(struct cpu_hw_breakpoint *chbi)
+{
+	set_debugreg(chbi->cur_kbpdata->mkdr7, 7);
+}
+
+/*
+ * Create a new kbpdata entry.
+ */
+void arch_new_kbpdata(struct kernel_bp_data *new_kbpdata)
+{
+	int num = new_kbpdata->num_kbps;
+
+	new_kbpdata->mkdr7 = kdr7 & (kdr7_masks[num] | DR_GLOBAL_SLOWDOWN);
+}
+
+/*
+ * Store a thread breakpoint array entry's address
+ */
+void arch_store_thread_bp_array(struct thread_hw_breakpoint *thbi,
+					struct hw_breakpoint *bp, int i)
+{
+	thbi->tdr[i] = bp->info.address;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
+{
+	return (va < TASK_SIZE);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_va_in_kernelspace(unsigned long va)
+{
+	return (va >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	/*
+	 * User-space requests will always have the address field populated
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (bp->info.address)
+		return;
+	bp->info.address = (unsigned long)kallsyms_lookup_name(bp->info.name);
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/* Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address, tsk)) &&
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+				break;
+	case HW_BREAKPOINT_RW:
+				break;
+	default:
+		return ret;
+	}
+
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+	default:
+		return ret;
+	}
+
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long temp;
+
+	temp = (len | type) & 0xf;
+	temp <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	temp |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+				DR_GLOBAL_SLOWDOWN;
+	return temp;
+}
+
+/*
+ * Calculate the DR7 value for a list of kernel or user breakpoints.
+ */
+static unsigned long calculate_dr7(struct thread_hw_breakpoint *thbi)
+{
+	int is_user;
+	struct list_head *bp_list;
+	struct hw_breakpoint *bp;
+	int i;
+	int drnum;
+	unsigned long dr7;
+
+	if (thbi) {
+		is_user = 1;
+		bp_list = &thbi->thread_bps;
+		drnum = HB_NUM - 1;
+	} else {
+		is_user = 0;
+		bp_list = &kernel_bps;
+		drnum = 0;
+	}
+
+	/* Kernel bps are assigned from DR0 on up, and user bps are assigned
+	 * from DR3 on down.  Accumulate all 4 bps; the kernel DR7 mask will
+	 * select the appropriate bits later.
+	 */
+	dr7 = 0;
+	i = 0;
+	list_for_each_entry(bp, bp_list, node) {
+
+		/* Get the debug register number and accumulate the bits */
+		dr7 |= encode_dr7(drnum, bp->info.len, bp->info.type);
+		if (++i >= HB_NUM)
+			break;
+		if (is_user)
+			--drnum;
+		else
+			++drnum;
+	}
+	return dr7;
+}
+
+/*
+ * Register a new user breakpoint structure.
+ */
+void arch_register_user_hw_breakpoint(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi)
+{
+	thbi->tdr7 = calculate_dr7(thbi);
+
+	/* If this is an execution breakpoint for the current PC address,
+	 * we should clear the task's RF so that the bp will be certain
+	 * to trigger.
+	 *
+	 * FIXME: It's not so easy to get hold of the task's PC as a linear
+	 * address!  ptrace.c does this already...
+	 */
+}
+
+/*
+ * Unregister a user breakpoint structure.
+ */
+void arch_unregister_user_hw_breakpoint(struct hw_breakpoint *bp,
+					struct thread_hw_breakpoint *thbi)
+{
+	thbi->tdr7 = calculate_dr7(thbi);
+}
+
+/*
+ * Register a kernel breakpoint structure.
+ */
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	kdr7 = calculate_dr7(NULL);
+}
+
+/*
+ * Unregister a kernel breakpoint structure.
+ */
+void arch_unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	kdr7 = calculate_dr7(NULL);
+}
+
+
+/* End of arch-specific hook routines */
+
+
+/*
+ * Copy out the debug register information for a core dump.
+ *
+ * tsk must be equal to current.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	int i;
+
+	memset(u_debugreg, 0, sizeof u_debugreg);
+	if (thbi) {
+		for (i = 0; i < HB_NUM; ++i)
+			u_debugreg[i] = thbi->vdr_bps[i].info.address;
+		u_debugreg[7] = thbi->vdr7;
+	}
+	u_debugreg[6] = tsk->thread.vdr6;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	struct cpu_hw_breakpoint *chbi;
+	int i;
+	struct hw_breakpoint *bp;
+	struct thread_hw_breakpoint *thbi = NULL;
+
+	/* The DR6 value is stored in args->err */
+#define DR6	(args->err)
+
+	if (DR6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+
+	/* Disable all breakpoints so that the callbacks can run without
+	 * triggering recursive debug exceptions.
+	 */
+	set_debugreg(0UL, 7);
+
+	/* Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.vdr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Are we a victim of lazy debug-register switching? */
+	if (!chbi->bp_task)
+		;
+	else if (chbi->bp_task != current) {
+
+		/* No user breakpoints are valid.  Perform the belated
+		 * debug-register switch.
+		 */
+		switch_to_none_hw_breakpoint();
+	} else {
+		thbi = chbi->bp_task->thread.hw_breakpoint_info;
+	}
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(DR6 & (DR_TRAP0 << i))))
+			continue;
+
+		/* Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (i < chbi->cur_kbpdata->num_kbps)
+			bp = chbi->cur_kbpdata->bps[i];
+		else if (thbi)
+			bp = thbi->bps[i];
+		else		/* False alarm due to lazy DR switching */
+			continue;
+		if (bp) {
+			switch (bp->info.type) {
+			case HW_BREAKPOINT_WRITE:
+			case HW_BREAKPOINT_RW:
+				if (bp->triggered)
+					(bp->triggered)(bp, args->regs);
+				/* Re-enable the breakpoints */
+				set_debugreg(thbi ? thbi->tkdr7 :
+						chbi->cur_kbpdata->mkdr7, 7);
+				put_cpu_no_resched();
+
+				return NOTIFY_STOP;
+			/*
+			 * Presently we allow instruction breakpoints only in
+			 * user-space when requested through ptrace.
+			 */
+			case HW_BREAKPOINT_EXECUTE:
+				if (arch_check_va_in_userspace(bp->info.address,
+								current)) {
+					(bp->triggered)(bp, args->regs);
+	/* We'll return NOTIFY_DONE, do_debug will take care of the rest */
+					return NOTIFY_DONE;
+				}
+			}
+		}
+	}
+	/* Stop processing further if the exception is a stray one */
+	if (!(DR6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		return NOTIFY_STOP;
+
+	return NOTIFY_DONE;
+#undef DR6
+}
Index: linux-2.6-tip/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,132 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+} __attribute__((packed));
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* HW breakpoint accessor routines */
+static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
+{
+	return (const void *) bp->info.address;
+}
+
+static inline const void __user *hw_breakpoint_get_uaddress
+						(struct hw_breakpoint *bp)
+{
+	return (const void __user *) bp->info.address;
+}
+
+static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
+{
+	return bp->info.len;
+}
+
+static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
+{
+	return bp->info.type;
+}
+
+/* Kernel symbol lookup routine for installing Data HW Breakpoint Address */
+static inline unsigned long hw_breakpoint_lookup_name(const char *name)
+{
+	return kallsyms_lookup_name(name);
+}
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+/* Available HW breakpoint type encodings */
+#define HW_BREAKPOINT_EXECUTE	0x80	/* trigger on instruction execute */
+#define HW_BREAKPOINT_WRITE	0x81	/* trigger on memory write */
+#define HW_BREAKPOINT_RW	0x83	/* trigger on memory read or write */
+
+#define HB_NUM 4 /* Total number of available HW breakpoint registers */
+
+/* Per-thread HW breakpoint and debug register info */
+struct thread_hw_breakpoint {
+
+	/* utrace support */
+	struct list_head	node;		/* Entry in thread list */
+	struct list_head	thread_bps;	/* Thread's breakpoints */
+	struct hw_breakpoint	*bps[HB_NUM];	/* Highest-priority bps */
+	unsigned long		tdr[HB_NUM];	/*  and their addresses */
+	int			num_installed;	/* Number of installed bps */
+	unsigned		gennum;		/* update-generation number */
+
+	/* Only the portions below are arch-specific */
+
+	/* ptrace support -- Note that vdr6 is stored directly in the
+	 * thread_struct so that it is always available.
+	 */
+	unsigned long		vdr7;			/* Virtualized DR7 */
+	struct hw_breakpoint	vdr_bps[HB_NUM];	/* Breakpoints
+			representing virtualized debug registers 0 - 3 */
+	unsigned long		tdr7;		/* Thread's DR7 value */
+	unsigned long		tkdr7;		/* Thread + kernel DR7 value */
+};
+
+/* Kernel-space breakpoint data */
+struct kernel_bp_data {
+	unsigned		gennum;		/* Generation number */
+	int			num_kbps;	/* Number of kernel bps */
+	struct hw_breakpoint	*bps[HB_NUM];	/* Loaded breakpoints */
+
+	/* Only the portions below are arch-specific */
+	unsigned long		mkdr7;		/* Masked kernel DR7 value */
+};
+
+/* Per-CPU debug register info */
+struct cpu_hw_breakpoint {
+	struct kernel_bp_data	*cur_kbpdata;	/* Current kbpdata[] entry */
+	struct task_struct	*bp_task;	/* The thread whose bps
+			are currently loaded in the debug registers */
+};
+
+/*
+ * Ptrace support: breakpoint trigger routine.
+ */
+
+int __register_user_hw_breakpoint(struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+
+void arch_update_thbi(struct thread_hw_breakpoint *thbi,
+				struct kernel_bp_data *thr_kbpdata);
+void arch_install_thbi(struct thread_hw_breakpoint *thbi);
+void arch_install_none(struct cpu_hw_breakpoint *chbi);
+void arch_install_chbi(struct cpu_hw_breakpoint *chbi);
+void arch_new_kbpdata(struct kernel_bp_data *new_kbpdata);
+void arch_store_thread_bp_array(struct thread_hw_breakpoint *thbi,
+				struct hw_breakpoint *bp, int i);
+int arch_check_va_in_userspace(unsigned long va,
+						struct task_struct *tsk);
+int arch_check_va_in_kernelspace(unsigned long va);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(struct hw_breakpoint *bp,
+				struct thread_hw_breakpoint *thbi);
+void arch_unregister_user_hw_breakpoint(struct hw_breakpoint *bp,
+					struct thread_hw_breakpoint *thbi);
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void arch_unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o


^ permalink raw reply	[flat|nested] 55+ messages in thread

* [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
@ 2009-03-05  4:38 ` prasad
  2009-03-10 14:09   ` Ingo Molnar
  0 siblings, 1 reply; 55+ messages in thread
From: prasad @ 2009-03-05  4:38 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 17506 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch introduces two new files named hw_breakpoint.[ch] inside x86 specific
directories. They contain functions which help validate and serve requests for 
using Hardware Breakpoint registers on x86 processors.

[K.Prasad: More declarations in hw_breakpoint.h to independently compile each
           hw_breakpoint.c files. Split-out from the bigger patch and minor
           changes following re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/include/asm/hw_breakpoint.h |  132 ++++++++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  437 +++++++++++++++++++++++++++++++++++
 3 files changed, 570 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,437 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+
+static unsigned long		kdr7;		/* Unmasked kernel DR7 value */
+
+/* Masks for the bits in DR7 related to kernel breakpoints, for various
+ * values of num_kbps.  Entry n is the mask for when there are n kernel
+ * breakpoints, in debug registers 0 - (n-1).  The DR_GLOBAL_SLOWDOWN bit
+ * (GE) is handled specially.
+ */
+static const unsigned long	kdr7_masks[HB_NUM + 1] = {
+	0x00000000,
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00ff000f,	/* Same for 0,1 */
+	0x0fff003f,	/* Same for 0,1,2 */
+	0xffff00ff	/* Same for 0,1,2,3 */
+};
+
+/*
+ * Install the kernel breakpoints in their debug registers.
+ */
+void arch_install_chbi(struct cpu_hw_breakpoint *chbi)
+{
+	struct hw_breakpoint **bps;
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+	chbi->cur_kbpdata = rcu_dereference(cur_kbpdata);
+
+	/* Kernel breakpoints are stored starting in DR0 and going up */
+	bps = chbi->cur_kbpdata->bps;
+	switch (chbi->cur_kbpdata->num_kbps) {
+	case 4:
+		set_debugreg(bps[3]->info.address, 3);
+	case 3:
+		set_debugreg(bps[2]->info.address, 2);
+	case 2:
+		set_debugreg(bps[1]->info.address, 1);
+	case 1:
+		set_debugreg(bps[0]->info.address, 0);
+	}
+	/* No need to set DR6 */
+	set_debugreg(chbi->cur_kbpdata->mkdr7, 7);
+}
+
+/*
+ * Update an out-of-date thread hw_breakpoint info structure.
+ */
+void arch_update_thbi(struct thread_hw_breakpoint *thbi,
+			struct kernel_bp_data *thr_kbpdata)
+{
+	int num = thr_kbpdata->num_kbps;
+
+	thbi->tkdr7 = thr_kbpdata->mkdr7 | (thbi->tdr7 & ~kdr7_masks[num]);
+}
+
+/*
+ * Install the thread breakpoints in their debug registers.
+ */
+void arch_install_thbi(struct thread_hw_breakpoint *thbi)
+{
+	/* Install the user breakpoints.  Kernel breakpoints are stored
+	 * starting in DR0 and going up; there are num_kbps of them.
+	 * User breakpoints are stored starting in DR3 and going down,
+	 * as many as we have room for.
+	 */
+	switch (thbi->num_installed) {
+	case 4:
+		set_debugreg(thbi->tdr[0], 0);
+	case 3:
+		set_debugreg(thbi->tdr[1], 1);
+	case 2:
+		set_debugreg(thbi->tdr[2], 2);
+	case 1:
+		set_debugreg(thbi->tdr[3], 3);
+	}
+	/* No need to set DR6 */
+	set_debugreg(thbi->tkdr7, 7);
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void arch_install_none(struct cpu_hw_breakpoint *chbi)
+{
+	set_debugreg(chbi->cur_kbpdata->mkdr7, 7);
+}
+
+/*
+ * Create a new kbpdata entry.
+ */
+void arch_new_kbpdata(struct kernel_bp_data *new_kbpdata)
+{
+	int num = new_kbpdata->num_kbps;
+
+	new_kbpdata->mkdr7 = kdr7 & (kdr7_masks[num] | DR_GLOBAL_SLOWDOWN);
+}
+
+/*
+ * Store a thread breakpoint array entry's address
+ */
+void arch_store_thread_bp_array(struct thread_hw_breakpoint *thbi,
+					struct hw_breakpoint *bp, int i)
+{
+	thbi->tdr[i] = bp->info.address;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
+{
+	return (va < TASK_SIZE);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_va_in_kernelspace(unsigned long va)
+{
+	return (va >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	/*
+	 * User-space requests will always have the address field populated
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (bp->info.address)
+		return;
+	bp->info.address = (unsigned long)kallsyms_lookup_name(bp->info.name);
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/* Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address, tsk)) &&
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+				break;
+	case HW_BREAKPOINT_RW:
+				break;
+	default:
+		return ret;
+	}
+
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+	default:
+		return ret;
+	}
+
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long temp;
+
+	temp = (len | type) & 0xf;
+	temp <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	temp |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+				DR_GLOBAL_SLOWDOWN;
+	return temp;
+}
+
+/*
+ * Calculate the DR7 value for a list of kernel or user breakpoints.
+ */
+static unsigned long calculate_dr7(struct thread_hw_breakpoint *thbi)
+{
+	int is_user;
+	struct list_head *bp_list;
+	struct hw_breakpoint *bp;
+	int i;
+	int drnum;
+	unsigned long dr7;
+
+	if (thbi) {
+		is_user = 1;
+		bp_list = &thbi->thread_bps;
+		drnum = HB_NUM - 1;
+	} else {
+		is_user = 0;
+		bp_list = &kernel_bps;
+		drnum = 0;
+	}
+
+	/* Kernel bps are assigned from DR0 on up, and user bps are assigned
+	 * from DR3 on down.  Accumulate all 4 bps; the kernel DR7 mask will
+	 * select the appropriate bits later.
+	 */
+	dr7 = 0;
+	i = 0;
+	list_for_each_entry(bp, bp_list, node) {
+
+		/* Get the debug register number and accumulate the bits */
+		dr7 |= encode_dr7(drnum, bp->info.len, bp->info.type);
+		if (++i >= HB_NUM)
+			break;
+		if (is_user)
+			--drnum;
+		else
+			++drnum;
+	}
+	return dr7;
+}
+
+/*
+ * Register a new user breakpoint structure.
+ */
+void arch_register_user_hw_breakpoint(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi)
+{
+	thbi->tdr7 = calculate_dr7(thbi);
+
+	/* If this is an execution breakpoint for the current PC address,
+	 * we should clear the task's RF so that the bp will be certain
+	 * to trigger.
+	 *
+	 * FIXME: It's not so easy to get hold of the task's PC as a linear
+	 * address!  ptrace.c does this already...
+	 */
+}
+
+/*
+ * Unregister a user breakpoint structure.
+ */
+void arch_unregister_user_hw_breakpoint(struct hw_breakpoint *bp,
+					struct thread_hw_breakpoint *thbi)
+{
+	thbi->tdr7 = calculate_dr7(thbi);
+}
+
+/*
+ * Register a kernel breakpoint structure.
+ */
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	kdr7 = calculate_dr7(NULL);
+}
+
+/*
+ * Unregister a kernel breakpoint structure.
+ */
+void arch_unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	kdr7 = calculate_dr7(NULL);
+}
+
+
+/* End of arch-specific hook routines */
+
+
+/*
+ * Copy out the debug register information for a core dump.
+ *
+ * tsk must be equal to current.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	int i;
+
+	memset(u_debugreg, 0, sizeof u_debugreg);
+	if (thbi) {
+		for (i = 0; i < HB_NUM; ++i)
+			u_debugreg[i] = thbi->vdr_bps[i].info.address;
+		u_debugreg[7] = thbi->vdr7;
+	}
+	u_debugreg[6] = tsk->thread.vdr6;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	struct cpu_hw_breakpoint *chbi;
+	int i;
+	struct hw_breakpoint *bp;
+	struct thread_hw_breakpoint *thbi = NULL;
+
+	/* The DR6 value is stored in args->err */
+#define DR6	(args->err)
+
+	if (DR6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+
+	/* Disable all breakpoints so that the callbacks can run without
+	 * triggering recursive debug exceptions.
+	 */
+	set_debugreg(0UL, 7);
+
+	/* Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.vdr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Are we a victim of lazy debug-register switching? */
+	if (!chbi->bp_task)
+		;
+	else if (chbi->bp_task != current) {
+
+		/* No user breakpoints are valid.  Perform the belated
+		 * debug-register switch.
+		 */
+		switch_to_none_hw_breakpoint();
+	} else {
+		thbi = chbi->bp_task->thread.hw_breakpoint_info;
+	}
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(DR6 & (DR_TRAP0 << i))))
+			continue;
+
+		/* Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (i < chbi->cur_kbpdata->num_kbps)
+			bp = chbi->cur_kbpdata->bps[i];
+		else if (thbi)
+			bp = thbi->bps[i];
+		else		/* False alarm due to lazy DR switching */
+			continue;
+		if (bp) {
+			switch (bp->info.type) {
+			case HW_BREAKPOINT_WRITE:
+			case HW_BREAKPOINT_RW:
+				if (bp->triggered)
+					(bp->triggered)(bp, args->regs);
+				/* Re-enable the breakpoints */
+				set_debugreg(thbi ? thbi->tkdr7 :
+						chbi->cur_kbpdata->mkdr7, 7);
+				put_cpu_no_resched();
+
+				return NOTIFY_STOP;
+			/*
+			 * Presently we allow instruction breakpoints only in
+			 * user-space when requested through ptrace.
+			 */
+			case HW_BREAKPOINT_EXECUTE:
+				if (arch_check_va_in_userspace(bp->info.address,
+								current)) {
+					(bp->triggered)(bp, args->regs);
+	/* We'll return NOTIFY_DONE, do_debug will take care of the rest */
+					return NOTIFY_DONE;
+				}
+			}
+		}
+	}
+	/* Stop processing further if the exception is a stray one */
+	if (!(DR6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		return NOTIFY_STOP;
+
+	return NOTIFY_DONE;
+#undef DR6
+}
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,132 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+} __attribute__((packed));
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* HW breakpoint accessor routines */
+static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
+{
+	return (const void *) bp->info.address;
+}
+
+static inline const void __user *hw_breakpoint_get_uaddress
+						(struct hw_breakpoint *bp)
+{
+	return (const void __user *) bp->info.address;
+}
+
+static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
+{
+	return bp->info.len;
+}
+
+static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
+{
+	return bp->info.type;
+}
+
+/* Kernel symbol lookup routine for installing Data HW Breakpoint Address */
+static inline unsigned long hw_breakpoint_lookup_name(const char *name)
+{
+	return kallsyms_lookup_name(name);
+}
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+/* Available HW breakpoint type encodings */
+#define HW_BREAKPOINT_EXECUTE	0x80	/* trigger on instruction execute */
+#define HW_BREAKPOINT_WRITE	0x81	/* trigger on memory write */
+#define HW_BREAKPOINT_RW	0x83	/* trigger on memory read or write */
+
+#define HB_NUM 4 /* Total number of available HW breakpoint registers */
+
+/* Per-thread HW breakpoint and debug register info */
+struct thread_hw_breakpoint {
+
+	/* utrace support */
+	struct list_head	node;		/* Entry in thread list */
+	struct list_head	thread_bps;	/* Thread's breakpoints */
+	struct hw_breakpoint	*bps[HB_NUM];	/* Highest-priority bps */
+	unsigned long		tdr[HB_NUM];	/*  and their addresses */
+	int			num_installed;	/* Number of installed bps */
+	unsigned		gennum;		/* update-generation number */
+
+	/* Only the portions below are arch-specific */
+
+	/* ptrace support -- Note that vdr6 is stored directly in the
+	 * thread_struct so that it is always available.
+	 */
+	unsigned long		vdr7;			/* Virtualized DR7 */
+	struct hw_breakpoint	vdr_bps[HB_NUM];	/* Breakpoints
+			representing virtualized debug registers 0 - 3 */
+	unsigned long		tdr7;		/* Thread's DR7 value */
+	unsigned long		tkdr7;		/* Thread + kernel DR7 value */
+};
+
+/* Kernel-space breakpoint data */
+struct kernel_bp_data {
+	unsigned		gennum;		/* Generation number */
+	int			num_kbps;	/* Number of kernel bps */
+	struct hw_breakpoint	*bps[HB_NUM];	/* Loaded breakpoints */
+
+	/* Only the portions below are arch-specific */
+	unsigned long		mkdr7;		/* Masked kernel DR7 value */
+};
+
+/* Per-CPU debug register info */
+struct cpu_hw_breakpoint {
+	struct kernel_bp_data	*cur_kbpdata;	/* Current kbpdata[] entry */
+	struct task_struct	*bp_task;	/* The thread whose bps
+			are currently loaded in the debug registers */
+};
+
+/*
+ * Ptrace support: breakpoint trigger routine.
+ */
+
+int __register_user_hw_breakpoint(struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+
+void arch_update_thbi(struct thread_hw_breakpoint *thbi,
+				struct kernel_bp_data *thr_kbpdata);
+void arch_install_thbi(struct thread_hw_breakpoint *thbi);
+void arch_install_none(struct cpu_hw_breakpoint *chbi);
+void arch_install_chbi(struct cpu_hw_breakpoint *chbi);
+void arch_new_kbpdata(struct kernel_bp_data *new_kbpdata);
+void arch_store_thread_bp_array(struct thread_hw_breakpoint *thbi,
+				struct hw_breakpoint *bp, int i);
+int arch_check_va_in_userspace(unsigned long va,
+						struct task_struct *tsk);
+int arch_check_va_in_kernelspace(unsigned long va);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(struct hw_breakpoint *bp,
+				struct thread_hw_breakpoint *thbi);
+void arch_unregister_user_hw_breakpoint(struct hw_breakpoint *bp,
+					struct thread_hw_breakpoint *thbi);
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void arch_unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o


^ permalink raw reply	[flat|nested] 55+ messages in thread

end of thread, other threads:[~2009-03-24 15:26 UTC | newest]

Thread overview: 55+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20090319234044.410725944@K.Prasad>
2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
2009-03-20 14:33   ` Alan Stern
2009-03-20 18:30     ` Ingo Molnar
2009-03-21 17:32       ` K.Prasad
2009-03-20 18:32     ` Ingo Molnar
2009-03-21 17:26     ` K.Prasad
2009-03-21 21:39       ` Alan Stern
2009-03-23 19:03         ` K.Prasad
2009-03-23 19:21           ` Alan Stern
2009-03-23 20:42             ` K.Prasad
2009-03-23 21:20               ` Alan Stern
2009-03-19 23:48 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
2009-03-19 23:48 ` [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers K.Prasad
2009-03-19 23:49 ` [Patch 04/11] Introduce user-space " K.Prasad
2009-03-19 23:49 ` [Patch 05/11] Use wrapper routines around debug registers in processor related functions K.Prasad
2009-03-19 23:49 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
2009-03-19 23:49 ` [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints K.Prasad
2009-03-19 23:49 ` [Patch 08/11] Modify Ptrace routines to access breakpoint registers K.Prasad
2009-03-19 23:49 ` [Patch 09/11] Cleanup HW Breakpoint registers before kexec K.Prasad
2009-03-19 23:50 ` [Patch 10/11] Sample HW breakpoint over kernel data address K.Prasad
2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
2009-03-20  9:04   ` Frederic Weisbecker
2009-03-21 16:24     ` K.Prasad
2009-03-21 16:39       ` Steven Rostedt
2009-03-23 19:08         ` K.Prasad
     [not found] <20090324152028.754123712@K.Prasad>
2009-03-24 15:25 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
     [not found] <20090307045120.039324630@linux.vnet.ibm.com>
2009-03-07  5:05 ` prasad
     [not found] <20090305043440.189041194@linux.vnet.ibm.com>
2009-03-05  4:38 ` [patch " prasad
2009-03-10 14:09   ` Ingo Molnar
2009-03-10 14:59     ` Alan Stern
2009-03-10 15:18       ` Ingo Molnar
2009-03-10 17:11         ` Alan Stern
2009-03-10 17:26           ` Ingo Molnar
2009-03-10 20:30             ` Alan Stern
2009-03-11 12:12               ` Ingo Molnar
2009-03-11 12:50                 ` K.Prasad
2009-03-11 13:10                   ` Ingo Molnar
2009-03-14  3:46                     ` Benjamin Herrenschmidt
2009-03-11 16:39                   ` Alan Stern
2009-03-11 16:32                 ` Alan Stern
2009-03-11 17:41                   ` K.Prasad
2009-03-14  3:47                     ` Benjamin Herrenschmidt
2009-03-14  3:43                 ` Benjamin Herrenschmidt
2009-03-14  3:41               ` Benjamin Herrenschmidt
2009-03-14  3:40             ` Benjamin Herrenschmidt
2009-03-12  2:46     ` Roland McGrath
2009-03-13  3:43       ` Ingo Molnar
2009-03-13 14:04         ` Alan Stern
2009-03-13 14:13           ` Ingo Molnar
2009-03-13 19:01             ` K.Prasad
2009-03-13 21:21               ` Alan Stern
2009-03-14 12:24                 ` Ingo Molnar
2009-03-14 16:10                   ` Alan Stern
2009-03-14 16:39                     ` Ingo Molnar
2009-03-14  3:51       ` Benjamin Herrenschmidt

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.