All of lore.kernel.org
 help / color / mirror / Atom feed
* [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
       [not found] <20090319234044.410725944@K.Prasad>
@ 2009-03-19 23:48 ` K.Prasad
  2009-03-20 14:33   ` Alan Stern
  2009-03-19 23:48 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 1 --]
[-- Type: text/plain, Size: 16776 bytes --]

This patch introduces two new files hw_breakpoint.[ch] which defines the 
generic interfaces to use hardware breakpoint infrastructure of the system. 

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/Kconfig                        |    3 
 include/asm-generic/hw_breakpoint.h |  140 +++++++++++++
 kernel/Makefile                     |    1 
 kernel/hw_breakpoint.c              |  361 ++++++++++++++++++++++++++++++++++++
 4 files changed, 505 insertions(+)

Index: linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
@@ -0,0 +1,361 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ *
+ * This file contains the arch-independent routines.  It is not meant
+ * to be compiled as a standalone source file; rather it should be
+ * #include'd by the arch-specific implementation.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Array of kernel-space breakpoint structures */
+struct hw_breakpoint *hbkpt_kernel[HB_NUM];
+/*
+ * Kernel breakpoints grow downwards, starting from HB_NUM
+ * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
+ * kernel-space request
+ */
+unsigned int hbkpt_kernel_pos;
+
+/* An array containing refcount of threads using a given bkpt register */
+unsigned int hbkpt_user_max_refcount[HB_NUM];
+
+/* One higher than the highest counted user-space breakpoint register */
+unsigned int hbkpt_user_max;
+
+struct task_struct *last_debugged_task;
+
+/*
+ * Install the debug register values for a new thread.
+ */
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	/* Set the debug register */
+	arch_install_thread_hbkpt(tsk);
+	last_debugged_task = current;
+
+	put_cpu_no_resched();
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void switch_to_none_hw_breakpoint(void)
+{
+	arch_install_none();
+	put_cpu_no_resched();
+}
+
+/*
+ * Load the debug registers during startup of a CPU.
+ */
+void load_debug_registers(void)
+{
+	int i;
+	unsigned long flags;
+
+	/* Prevent IPIs for new kernel breakpoint updates */
+	local_irq_save(flags);
+
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		if (hbkpt_kernel[i])
+			on_each_cpu(arch_install_kernel_hbkpt,
+				(void *)hbkpt_kernel[i], 0);
+	if (current->thread.dr7)
+		arch_install_thread_hbkpt(current);
+
+	local_irq_restore(flags);
+}
+
+/*
+ * Erase all the hardware breakpoint info associated with a thread.
+ *
+ * If tsk != current then tsk must not be usable (for example, a
+ * child being cleaned up from a failed fork).
+ */
+void flush_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Let the breakpoints know they are being uninstalled */
+
+	/* The thread no longer has any breakpoints associated with it */
+	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	for (i = 0; i < HB_NUM; i++) {
+		if (thread->hbkpt[i]) {
+			hbkpt_user_max_refcount[i]--;
+			if (!hbkpt_user_max_refcount[i])
+				hbkpt_user_max--;
+			kfree(thread->hbkpt[i]);
+			thread->hbkpt[i] = NULL;
+		}
+	}
+	thread->hbkpt_num_installed = 0;
+
+	/* Actually uninstall the breakpoints if necessary */
+	if (tsk == current)
+		switch_to_none_hw_breakpoint();
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/*
+ * Copy the hardware breakpoint info from a thread to its cloned child.
+ */
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags)
+{
+	/* We will assume that breakpoint settings are not inherited
+	 * and the child starts out with no debug registers set.
+	 * But what about CLONE_PTRACE?
+	 */
+	clear_tsk_thread_flag(child, TIF_DEBUG);
+	return 0;
+}
+
+/*
+ * Validate the settings in a hw_breakpoint structure.
+ */
+static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
+{
+	int ret;
+	unsigned int align;
+
+	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
+	if (ret < 0)
+		goto err;
+
+	/* Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (bp->info.address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(bp->info.address, tsk))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(bp->info.address))
+			return -EFAULT;
+	}
+ err:
+	return ret;
+}
+
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int rc;
+
+	/* Do not overcommit. Fail if kernel has used the hbkpt registers */
+	if (pos >= hbkpt_kernel_pos)
+		return -ENOSPC;
+
+	rc = validate_settings(bp, tsk);
+	if (rc)
+		return rc;
+
+	thread->hbkpt[pos] = bp;
+	thread->hbkpt_num_installed++;
+	hbkpt_user_max_refcount[pos]++;
+	/* 'tsk' is the thread that uses max number of hbkpt registers */
+	if (hbkpt_user_max < thread->hbkpt_num_installed)
+		hbkpt_user_max++;
+
+	arch_register_user_hw_breakpoint(pos, bp, tsk);
+
+	/*
+	 * Does it need to be installed right now?
+	 * Otherwise it will get installed the next time tsk runs
+	 */
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	return rc;
+}
+
+/*
+ * Modify the address of a hbkpt register already in use by the task
+ * Do not invoke this in-lieu of a __unregister_user_hw_breakpoint()
+ */
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	int rc;
+	struct thread_struct *thread = &(tsk->thread);
+
+	if ((pos >= hbkpt_kernel_pos) || (validate_settings(bp, tsk)))
+		return -EINVAL;
+
+	thread->hbkpt[pos] = bp;
+
+	/*
+	 * 'pos' must be that of a hbkpt register already used by 'tsk'
+	 * Otherwise arch_modify_user_hw_breakpoint() will fail
+	 */
+	rc = arch_modify_user_hw_breakpoint(pos, bp, tsk);
+	if (rc)
+		return rc;
+
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	return 0;
+}
+
+/*
+ * Actual implementation of unregister_user_hw_breakpoint.
+ */
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+						struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if (!bp)
+		return;
+
+	hbkpt_user_max_refcount[pos]--;
+	if ((hbkpt_user_max == pos + 1) && (hbkpt_user_max_refcount[pos] == 0))
+		hbkpt_user_max--;
+	thread->hbkpt_num_installed--;
+
+	arch_unregister_user_hw_breakpoint(pos, bp, tsk);
+
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	kfree(tsk->thread.hbkpt[pos]);
+	tsk->thread.hbkpt[pos] = NULL;
+}
+
+/**
+ * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to register
+ *
+ * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
+ * @bp->triggered must be set properly before invocation
+ *
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int rc;
+
+	rc = validate_settings(bp, NULL);
+	if (rc)
+		return rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Check if we are over-committing */
+	if (hbkpt_kernel_pos <= hbkpt_user_max) {
+		mutex_unlock(&hw_breakpoint_mutex);
+		return -EINVAL;
+	}
+
+	hbkpt_kernel_pos--;
+	hbkpt_kernel[hbkpt_kernel_pos] = bp;
+	arch_register_kernel_hw_breakpoint(bp);
+
+	mutex_unlock(&hw_breakpoint_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
+
+/**
+ * unregister_kernel_hw_breakpoint - unregister a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int i, j;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		if (bp == hbkpt_kernel[i])
+			break;
+
+	arch_unregister_kernel_hw_breakpoint(i);
+
+	/*
+	 * We'll shift the breakpoints one-level above to accomodate new thread
+	 * requests
+	 */
+	if (i > hbkpt_kernel_pos)
+		for (j = i; j == hbkpt_kernel_pos; j--)
+			hbkpt_kernel[j] = hbkpt_kernel[j-1];
+	hbkpt_kernel_pos++;
+
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
+/*
+ * Handle debug exception notifications.
+ */
+static int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+	return hw_breakpoint_handler(data);
+}
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+	.notifier_call = hw_breakpoint_exceptions_notify,
+	/* we need to be notified first */
+	.priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+	int i;
+
+	hbkpt_kernel_pos = HB_NUM;
+	for (i = 0; i < HB_NUM; i++)
+		hbkpt_user_max_refcount[i] = 0;
+	hbkpt_user_max = 0;
+	load_debug_registers();
+
+	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+
+core_initcall(init_hw_breakpoint);
Index: linux-2.6-tip.hbkpt/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/Makefile
+++ linux-2.6-tip.hbkpt/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
@@ -0,0 +1,140 @@
+#ifndef	_ASM_GENERIC_HW_BREAKPOINT_H
+#define	_ASM_GENERIC_HW_BREAKPOINT_H
+
+#ifndef __ARCH_HW_BREAKPOINT_H
+#error "Please don't include this file directly"
+#endif
+
+#ifdef	__KERNEL__
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+
+/**
+ * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
+ * @triggered: callback invoked after target address access
+ * @info: arch-specific breakpoint info (address, length, and type)
+ *
+ * %hw_breakpoint structures are the kernel's way of representing
+ * hardware breakpoints.  These are data breakpoints
+ * (also known as "watchpoints", triggered on data access), and the breakpoint's
+ * target address can be located in either kernel space or user space.
+ *
+ * The breakpoint's address, length, and type are highly
+ * architecture-specific.  The values are encoded in the @info field; you
+ * specify them when registering the breakpoint.  To examine the encoded
+ * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
+ * below.
+ *
+ * The address is specified as a regular kernel pointer (for kernel-space
+ * breakponts) or as an %__user pointer (for user-space breakpoints).
+ * With register_user_hw_breakpoint(), the address must refer to a
+ * location in user space.  The breakpoint will be active only while the
+ * requested task is running.  Conversely with
+ * register_kernel_hw_breakpoint(), the address must refer to a location
+ * in kernel space, and the breakpoint will be active on all CPUs
+ * regardless of the current task.
+ *
+ * The length is the breakpoint's extent in bytes, which is subject to
+ * certain limitations.  include/asm/hw_breakpoint.h contains macros
+ * defining the available lengths for a specific architecture.  Note that
+ * the address's alignment must match the length.  The breakpoint will
+ * catch accesses to any byte in the range from address to address +
+ * (length - 1).
+ *
+ * The breakpoint's type indicates the sort of access that will cause it
+ * to trigger.  Possible values may include:
+ *
+ * 	%HW_BREAKPOINT_RW (triggered on read or write access),
+ * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
+ * 	%HW_BREAKPOINT_READ (triggered on read access).
+ *
+ * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
+ * possibilities are available on all architectures.  Execute breakpoints
+ * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
+ *
+ * When a breakpoint gets hit, the @triggered callback is
+ * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
+ * processor registers.
+ * Data breakpoints occur after the memory access has taken place.
+ * Breakpoints are disabled during execution @triggered, to avoid
+ * recursive traps and allow unhindered access to breakpointed memory.
+ *
+ * This sample code sets a breakpoint on pid_max and registers a callback
+ * function for writes to that variable.  Note that it is not portable
+ * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * #include <asm/hw_breakpoint.h>
+ *
+ * struct hw_breakpoint my_bp;
+ *
+ * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
+ * 	dump_stack();
+ *  	.......<more debugging output>........
+ * }
+ *
+ * static struct hw_breakpoint my_bp;
+ *
+ * static int init_module(void)
+ * {
+ *	..........<do anything>............
+ *	my_bp.info.type = HW_BREAKPOINT_WRITE;
+ *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
+ *
+ *	my_bp.installed = (void *)my_bp_installed;
+ *
+ *	rc = register_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ *	..........<do anything>............
+ *	unregister_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * ----------------------------------------------------------------------
+ */
+struct hw_breakpoint {
+	void (*triggered)(struct hw_breakpoint *, struct pt_regs *);
+	struct arch_hw_breakpoint info;
+};
+
+/*
+ * len and type values are defined in include/asm/hw_breakpoint.h.
+ * Available values vary according to the architecture.  On i386 the
+ * possibilities are:
+ *
+ *	HW_BREAKPOINT_LEN_1
+ *	HW_BREAKPOINT_LEN_2
+ *	HW_BREAKPOINT_LEN_4
+ *	HW_BREAKPOINT_LEN_EXECUTE
+ *	HW_BREAKPOINT_RW
+ *	HW_BREAKPOINT_READ
+ *	HW_BREAKPOINT_EXECUTE
+ *
+ * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
+ * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
+ * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
+ */
+
+static DEFINE_MUTEX(hw_breakpoint_mutex);	/* Protects everything */
+
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void switch_to_none_hw_breakpoint(void);
+
+extern unsigned int hbkpt_kernel_pos;
+extern unsigned int hbkpt_user_max;
+extern struct task_struct *last_debugged_task;
+
+#endif	/* __KERNEL__ */
+#endif	/* _ASM_GENERIC_HW_BREAKPOINT_H */
Index: linux-2.6-tip.hbkpt/arch/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/Kconfig
+++ linux-2.6-tip.hbkpt/arch/Kconfig
@@ -106,3 +106,6 @@ config HAVE_CLK
 	help
 	  The <linux/clk.h> calls support software clock gating and
 	  thus are a key power management tool on many systems.
+
+config HAVE_HW_BREAKPOINT
+	bool


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090319234044.410725944@K.Prasad>
  2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
@ 2009-03-19 23:48 ` K.Prasad
  2009-03-19 23:48 ` [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers K.Prasad
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 14025 bytes --]

This patch introduces two new files named hw_breakpoint.[ch] inside x86 specific
directories. They contain functions which help validate and serve requests for 
using Hardware Breakpoint registers on x86 processors.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/Kconfig                     |    1 
 arch/x86/include/asm/hw_breakpoint.h |   69 ++++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  384 +++++++++++++++++++++++++++++++++++
 4 files changed, 455 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,384 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Unmasked kernel DR7 value */
+static unsigned long kdr7;
+
+/*
+ * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
+ * Used to clear and verify the status of bits corresponding to DR0 - DR3
+ */
+static const unsigned long	dr7_masks[HB_NUM] = {
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
+	0x0f000030,	/* LEN2, R/W2, G2, L2 */
+	0xf00000c0	/* LEN3, R/W3, G3, L3 */
+};
+
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long temp;
+
+	temp = (len | type) & 0xf;
+	temp <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	temp |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+				DR_GLOBAL_SLOWDOWN;
+	return temp;
+}
+
+/*
+ * Install the kernel breakpoints in their debug registers.
+ */
+void arch_install_kernel_hbkpt(void *bkpt)
+{
+	struct hw_breakpoint *bp;
+	int i;
+	unsigned long dr7;
+
+	bp = (struct hw_breakpoint *)bkpt;
+
+	kdr7 &= ~(dr7_masks[hbkpt_kernel_pos]);
+	kdr7 |= encode_dr7(hbkpt_kernel_pos, bp->info.len, bp->info.type);
+
+	get_debugreg(dr7, 7);
+	/* Clear the bits corresponding to 'pos' register in dr7 */
+	dr7 &= ~(dr7_masks[hbkpt_kernel_pos]);
+	dr7 |= kdr7;
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+
+	/* Kernel hbkpts always begin at 'hbkpt_kernel_pos' and upto HB_NUM */
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		set_debugreg(hbkpt_kernel[i]->info.address, i);
+
+	/* No need to set DR6 */
+	set_debugreg(dr7, 7);
+}
+
+/*
+ * Install the thread breakpoints in their debug registers.
+ */
+void arch_install_thread_hbkpt(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	for (i = 0; i < hbkpt_user_max; i++)
+		if (thread->hbkpt[i])
+			set_debugreg(thread->hbkpt[i]->info.address, i);
+
+	/* No need to set DR6 */
+
+	set_debugreg((kdr7 | thread->dr7), 7);
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void arch_install_none()
+{
+	/* Clear the user-space portion of dr7 by setting only kdr7 */
+	set_debugreg(kdr7, 7);
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
+{
+#ifdef CONFIG_X86_32
+	return (va <= TASK_SIZE - 3);
+#else /* X86_64 */
+	return (va <= TASK_SIZE - 7);
+#endif
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_va_in_kernelspace(unsigned long va)
+{
+	return (va >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	/*
+	 * User-space requests will always have the address field populated
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (bp->info.address)
+		return;
+	if (bp->info.name)
+		bp->info.address = (unsigned long)
+					kallsyms_lookup_name(bp->info.name);
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/* Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address, tsk)) &&
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+				break;
+	case HW_BREAKPOINT_RW:
+				break;
+	default:
+		return ret;
+	}
+
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+	default:
+		return ret;
+	}
+
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Register a new user breakpoint structure.
+ */
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+}
+
+/*
+ * Modify an existing user breakpoint structure.
+ */
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	/* Check if the register to be modified was enabled by the thread */
+	if (!(thread->dr7 & (1 << (pos * DR_ENABLE_SIZE))))
+		return -EINVAL;
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+
+	return 0;
+}
+
+/*
+ * Unregister a user breakpoint structure.
+ */
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if (!thread->hbkpt[pos])
+		return;
+
+	thread->hbkpt[pos]->info.address = 0;
+	thread->dr7 &= ~dr7_masks[pos];
+}
+
+/*
+ * Register a kernel breakpoint structure.
+ */
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	on_each_cpu(arch_install_kernel_hbkpt, (void *)bp, 0);
+}
+
+/*
+ * Unregister a kernel breakpoint structure.
+ */
+void arch_unregister_kernel_hw_breakpoint(int pos)
+{
+	unsigned long dr7;
+
+	kdr7 &= ~(dr7_masks[pos]);
+
+	get_debugreg(dr7, 7);
+	dr7  &= ~(dr7_masks[pos]);
+	set_debugreg(dr7, 7);
+}
+
+/* End of arch-specific hook routines */
+
+/*
+ * Copy out the debug register information for a core dump.
+ *
+ * tsk must be equal to current.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int i;
+
+	memset(u_debugreg, 0, sizeof u_debugreg);
+	for (i = 0; i < thread->hbkpt_num_installed && thread->hbkpt[i]; ++i)
+		u_debugreg[i] = thread->hbkpt[i]->info.address;
+	u_debugreg[7] = thread->dr7;
+	u_debugreg[6] = thread->dr6;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	int i;
+	struct hw_breakpoint *bp;
+	/* The DR6 value is stored in args->err */
+	unsigned long dr7, dr6 = args->err;
+
+	if (dr6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	get_debugreg(dr7, 7);
+
+	/* Disable breakpoints during exception handling */
+	set_debugreg(0UL, 7);
+
+	/* Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.dr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Lazy debug register switching */
+	if (last_debugged_task != current)
+		switch_to_none_hw_breakpoint();
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(dr6 & (DR_TRAP0 << i))))
+			continue;
+
+		/* Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (i < hbkpt_user_max)
+			bp = current->thread.hbkpt[i];
+		else if (i >= hbkpt_kernel_pos)
+			bp = hbkpt_kernel[i];
+		else		/* False alarm due to lazy DR switching */
+			continue;
+		if (!bp)
+			goto ret_path;
+
+		switch (bp->info.type) {
+		case HW_BREAKPOINT_WRITE:
+		case HW_BREAKPOINT_RW:
+			if (bp->triggered)
+				(bp->triggered)(bp, args->regs);
+			/* Re-enable the breakpoints */
+			put_cpu_no_resched();
+			if (arch_check_va_in_userspace(bp->info.address,
+							current))
+				goto ret_notify_done;
+			else
+				goto ret_notify_stop;
+		/*
+		 * Presently we allow instruction breakpoints only in
+		 * user-space when requested through ptrace.
+		 */
+		case HW_BREAKPOINT_EXECUTE:
+			if (arch_check_va_in_userspace(bp->info.address,
+							current)) {
+				(bp->triggered)(bp, args->regs);
+			/*
+			 * do_debug will notify user through a SIGTRAP signal
+			 * So we are not requesting a NOTIFY_STOP here
+			 */
+				goto ret_notify_done;
+			}
+		}
+	}
+
+ret_path:
+	/* Stop processing further if the exception is a stray one */
+	if (!(dr6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		goto ret_notify_stop;
+
+ret_notify_done:
+	set_debugreg(dr7, 7);
+	return NOTIFY_DONE;
+ret_notify_stop:
+	set_debugreg(dr7, 7);
+	return NOTIFY_STOP;
+}
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,69 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+};
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define HW_BREAKPOINT_EXECUTE	0x80
+/* trigger on memory write */
+#define HW_BREAKPOINT_WRITE	0x81
+/* trigger on memory read or write */
+#define HW_BREAKPOINT_RW	0x83
+
+/* Total number of available HW breakpoint registers */
+#define HB_NUM 4
+
+extern struct hw_breakpoint *hbkpt_kernel[HB_NUM];
+extern unsigned int hbkpt_user_max_refcount[HB_NUM];
+
+/*
+ * Ptrace support: breakpoint trigger routine.
+ */
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+void arch_install_thread_hbkpt(struct task_struct *tsk);
+void arch_install_none(void);
+void arch_install_kernel_hbkpt(void *);
+int arch_check_va_in_userspace(unsigned long va,
+						struct task_struct *tsk);
+int arch_check_va_in_kernelspace(unsigned long va);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk);
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void arch_unregister_kernel_hw_breakpoint(int pos);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
Index: linux-2.6-tip.hbkpt/arch/x86/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/Kconfig
+++ linux-2.6-tip.hbkpt/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_HW_BREAKPOINT
 
 config ARCH_DEFCONFIG
 	string


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers
       [not found] <20090319234044.410725944@K.Prasad>
  2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
  2009-03-19 23:48 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
@ 2009-03-19 23:48 ` K.Prasad
  2009-03-19 23:49 ` [Patch 04/11] Introduce user-space " K.Prasad
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 3 --]
[-- Type: text/plain, Size: 3661 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch modifies the breakpoint exception handler code to use the abstract
register names.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/traps.c |   73 ++++++++++++++++--------------------------------
 1 file changed, 25 insertions(+), 48 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/traps.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/traps.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/traps.c
@@ -530,13 +530,14 @@ asmlinkage __kprobes struct pt_regs *syn
 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	unsigned long condition;
+	unsigned long dr6;
 	int si_code;
 
-	get_debugreg(condition, 6);
+	get_debugreg(dr6, 6);
+	set_debugreg(0, 6);	/* DR6 may or may not be cleared by the CPU */
 
 	/* Catch kmemcheck conditions first of all! */
-	if (condition & DR_STEP && kmemcheck_trap(regs))
+	if (dr6 & DR_STEP && kmemcheck_trap(regs))
 		return;
 
 	/*
@@ -545,61 +546,37 @@ dotraplinkage void __kprobes do_debug(st
 	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
 	tsk->thread.debugctlmsr = 0;
 
-	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+	/* Store the virtualized DR6 value */
+	tsk->thread.dr6 = dr6;
+
+	if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code,
 						SIGTRAP) == NOTIFY_STOP)
 		return;
 
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
-	/* Mask out spurious debug traps due to lazy DR7 setting */
-	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-		if (!tsk->thread.debugreg7)
-			goto clear_dr7;
-	}
-
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK)
-		goto debug_vm86;
-#endif
-
-	/* Save debug status register where ptrace can see it */
-	tsk->thread.debugreg6 = condition;
-
-	/*
-	 * Single-stepping through TF: make sure we ignore any events in
-	 * kernel space (but re-enable TF when returning to user mode).
-	 */
-	if (condition & DR_STEP) {
-		if (!user_mode(regs))
-			goto clear_TF_reenable;
+	if (regs->flags & X86_VM_MASK) {
+		handle_vm86_trap((struct kernel_vm86_regs *) regs,
+				error_code, 1);
+		return;
 	}
 
-	si_code = get_si_code(condition);
-	/* Ok, finally something we can handle */
-	send_sigtrap(tsk, regs, error_code, si_code);
-
 	/*
-	 * Disable additional traps. They'll be re-enabled when
-	 * the signal is delivered.
+	 * Single-stepping through system calls: ignore any exceptions in
+	 * kernel space, but re-enable TF when returning to user mode.
+	 *
+	 * We already checked v86 mode above, so we can check for kernel mode
+	 * by just checking the CPL of CS.
 	 */
-clear_dr7:
-	set_debugreg(0, 7);
-	preempt_conditional_cli(regs);
-	return;
-
-#ifdef CONFIG_X86_32
-debug_vm86:
-	/* reenable preemption: handle_vm86_trap() might sleep */
-	dec_preempt_count();
-	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-	conditional_cli(regs);
-	return;
-#endif
-
-clear_TF_reenable:
-	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-	regs->flags &= ~X86_EFLAGS_TF;
+	if ((dr6 & DR_STEP) && !user_mode(regs)) {
+		tsk->thread.dr6 &= ~DR_STEP;
+		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+		regs->flags &= ~X86_EFLAGS_TF;
+	}
+	si_code = get_si_code(dr6);
+	if (tsk->thread.dr6 & (DR_STEP|DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3))
+		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
 	return;
 }


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 04/11] Introduce user-space debug registers
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (2 preceding siblings ...)
  2009-03-19 23:48 ` [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 05/11] Use wrapper routines around debug registers in processor related functions K.Prasad
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 4 --]
[-- Type: text/plain, Size: 2977 bytes --]

This patch introduces virtual debug registers to used by the per-thread
structure and wrapper routines to manage debug registers by process-related
functions.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/include/asm/debugreg.h  |   23 +++++++++++++++++++++++
 arch/x86/include/asm/processor.h |   16 +++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/debugreg.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/include/asm/debugreg.h
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/debugreg.h
@@ -49,6 +49,8 @@
 
 #define DR_LOCAL_ENABLE_SHIFT 0    /* Extra shift to the local enable bit */
 #define DR_GLOBAL_ENABLE_SHIFT 1   /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1)      /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2)     /* Global enable for reg 0 */
 #define DR_ENABLE_SIZE 2           /* 2 enable bits per register */
 
 #define DR_LOCAL_ENABLE_MASK (0x55)  /* Set  local bits for all 4 regs */
@@ -67,4 +69,25 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+/* For process management */
+void flush_thread_hw_breakpoint(struct task_struct *tsk);
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags);
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8]);
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk);
+
+/* For CPU management */
+void load_debug_registers(void);
+static inline void hw_breakpoint_disable(void)
+{
+	set_debugreg(0UL, 7);
+}
+
+#endif	/* __KERNEL__ */
+
 #endif /* _ASM_X86_DEBUGREG_H */
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/processor.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/include/asm/processor.h
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/processor.h
@@ -29,6 +29,7 @@ struct mm_struct;
 #include <linux/threads.h>
 #include <linux/init.h>
 
+#define HB_NUM 4
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter").
@@ -424,13 +425,14 @@ struct thread_struct {
 	unsigned long		ip;
 	unsigned long		fs;
 	unsigned long		gs;
-	/* Hardware debugging registers: */
-	unsigned long		debugreg0;
-	unsigned long		debugreg1;
-	unsigned long		debugreg2;
-	unsigned long		debugreg3;
-	unsigned long		debugreg6;
-	unsigned long		debugreg7;
+	/* Hardware breakpoint info */
+	struct hw_breakpoint	*hbkpt[HB_NUM];
+	unsigned int		hbkpt_num_installed;
+	/* Thread's view of debug reg 6 */
+	unsigned long		dr6;
+	/* Thread's view of debug reg 7 */
+	unsigned long		dr7;
+
 	/* Fault info: */
 	unsigned long		cr2;
 	unsigned long		trap_no;


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 05/11] Use wrapper routines around debug registers in processor related functions
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (3 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 04/11] Introduce user-space " K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 5 --]
[-- Type: text/plain, Size: 3702 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of wrapper routines to access the debug/breakpoint
registers.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/smpboot.c |    3 +++
 arch/x86/power/cpu_32.c   |   16 +++-------------
 arch/x86/power/cpu_64.c   |   15 +++------------
 3 files changed, 9 insertions(+), 25 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/power/cpu_32.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/power/cpu_32.c
+++ linux-2.6-tip.hbkpt/arch/x86/power/cpu_32.c
@@ -12,6 +12,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/xcr.h>
+#include <asm/debugreg.h>
 
 static struct saved_context saved_context;
 
@@ -47,6 +48,7 @@ static void __save_processor_state(struc
 	ctxt->cr2 = read_cr2();
 	ctxt->cr3 = read_cr3();
 	ctxt->cr4 = read_cr4_safe();
+	hw_breakpoint_disable();
 }
 
 /* Needed by apm.c */
@@ -79,19 +81,7 @@ static void fix_processor_context(void)
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
 
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7) {
-		set_debugreg(current->thread.debugreg0, 0);
-		set_debugreg(current->thread.debugreg1, 1);
-		set_debugreg(current->thread.debugreg2, 2);
-		set_debugreg(current->thread.debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-	}
-
+	load_debug_registers();
 }
 
 static void __restore_processor_state(struct saved_context *ctxt)
Index: linux-2.6-tip.hbkpt/arch/x86/power/cpu_64.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/power/cpu_64.c
+++ linux-2.6-tip.hbkpt/arch/x86/power/cpu_64.c
@@ -15,6 +15,7 @@
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
 #include <asm/xcr.h>
+#include <asm/debugreg.h>
 
 static void fix_processor_context(void);
 
@@ -70,6 +71,7 @@ static void __save_processor_state(struc
 	ctxt->cr3 = read_cr3();
 	ctxt->cr4 = read_cr4();
 	ctxt->cr8 = read_cr8();
+	hw_breakpoint_disable();
 }
 
 void save_processor_state(void)
@@ -158,16 +160,5 @@ static void fix_processor_context(void)
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
 
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7){
-                loaddebug(&current->thread, 0);
-                loaddebug(&current->thread, 1);
-                loaddebug(&current->thread, 2);
-                loaddebug(&current->thread, 3);
-                /* no 4 and 5 */
-                loaddebug(&current->thread, 6);
-                loaddebug(&current->thread, 7);
-	}
+	load_debug_registers();
 }
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/smpboot.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/smpboot.c
@@ -63,6 +63,7 @@
 #include <asm/apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
+#include <asm/debugreg.h>
 #include <linux/mc146818rtc.h>
 
 #include <asm/smpboot_hooks.h>
@@ -331,6 +332,7 @@ notrace static void __cpuinit start_seco
 	setup_secondary_clock();
 
 	wmb();
+	load_debug_registers();
 	cpu_idle();
 }
 
@@ -1234,6 +1236,7 @@ void cpu_disable_common(void)
 	remove_cpu_from_maps(cpu);
 	unlock_vector_lock();
 	fixup_irqs();
+	hw_breakpoint_disable();
 }
 
 int native_cpu_disable(void)


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (4 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 05/11] Use wrapper routines around debug registers in processor related functions K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints K.Prasad
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 6.new --]
[-- Type: text/plain, Size: 7362 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of abstract debug registers in
process-handling routines.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/process.c    |   23 ++++++-----------------
 arch/x86/kernel/process_32.c |   31 +++++++++++++++++++++++++++++++
 arch/x86/kernel/process_64.c |   33 +++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 17 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/process.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/process.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/process.c
@@ -14,6 +14,8 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -83,6 +85,8 @@ void exit_thread(void)
 		put_cpu();
 		kfree(bp);
 	}
+	if (unlikely(t->dr7))
+		flush_thread_hw_breakpoint(me);
 
 	ds_exit_thread(current);
 }
@@ -103,14 +107,9 @@ void flush_thread(void)
 	}
 #endif
 
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	if (unlikely(tsk->thread.dr7))
+		flush_thread_hw_breakpoint(tsk);
 
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
@@ -192,16 +191,6 @@ void __switch_to_xtra(struct task_struct
 	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/process_32.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/process_32.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/process_32.c
@@ -59,6 +59,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -263,7 +265,14 @@ int copy_thread(int nr, unsigned long cl
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->thread.io_bitmap_ptr = NULL;
+
 	tsk = current;
+	err = -ENOMEM;
+	if (unlikely(tsk->thread.dr7)) {
+		if (copy_thread_hw_breakpoint(tsk, p, clone_flags))
+			goto out;
+	}
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
@@ -283,10 +292,13 @@ int copy_thread(int nr, unsigned long cl
 		err = do_set_thread_area(p, -1,
 			(struct user_desc __user *)childregs->si, 0);
 
+out:
 	if (err && p->thread.io_bitmap_ptr) {
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
 
 	ds_copy_thread(p, current);
 
@@ -424,6 +436,25 @@ __switch_to(struct task_struct *prev_p, 
 		lazy_load_gs(next->gs);
 
 	percpu_write(current_task, next_p);
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in the chbi
+	 * area, so it will think the task pointer is leftover from an old task
+	 * (lazy switching) and will erase it.  Then until the next context
+	 * switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * chbi->bp_task at the same instant, so there will always be a window
+	 * in which they disagree and a breakpoint might get triggered.  Since
+	 * we use lazy switching, we are forced to assume that a disagreement
+	 * means that current is correct and chbi->bp_task is old.  But if you
+	 * move the code above then you'll create a window in which current is
+	 * old and chbi->bp_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
 
 	return prev_p;
 }
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/process_64.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/process_64.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/process_64.c
@@ -55,6 +55,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -248,6 +250,8 @@ void release_thread(struct task_struct *
 			BUG();
 		}
 	}
+	if (unlikely(dead_task->thread.tdr7))
+		flush_thread_hw_breakpoint(dead_task);
 }
 
 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
@@ -303,12 +307,18 @@ int copy_thread(int nr, unsigned long cl
 
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
+	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
 	savesegment(fs, p->thread.fsindex);
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
+	err = -ENOMEM;
+	if (unlikely(me->thread.tdr7)) {
+		if (copy_thread_hw_breakpoint(me, p, clone_flags))
+			goto out;
+	}
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
@@ -346,6 +356,9 @@ out:
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
+
 	return err;
 }
 
@@ -491,6 +504,26 @@ __switch_to(struct task_struct *prev_p, 
 	 */
 	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
 		math_state_restore();
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in the chbi
+	 * area, so it will think the task pointer is leftover from an old task
+	 * (lazy switching) and will erase it.  Then until the next context
+	 * switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * chbi->bp_task at the same instant, so there will always be a window
+	 * in which they disagree and a breakpoint might get triggered.  Since
+	 * we use lazy switching, we are forced to assume that a disagreement
+	 * means that current is correct and chbi->bp_task is old.  But if you
+	 * move the code above then you'll create a window in which current is
+	 * old and chbi->bp_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
+
 	return prev_p;
 }
 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (5 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 08/11] Modify Ptrace routines to access breakpoint registers K.Prasad
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 7 --]
[-- Type: text/plain, Size: 1144 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch disables re-enabling of Hardware Breakpoint registers through
the  signal handling code. This is now done during
hw_breakpoint_handler().

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/signal.c |    9 ---------
 1 file changed, 9 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/signal.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/signal.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/signal.c
@@ -794,15 +794,6 @@ static void do_signal(struct pt_regs *re
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		/*
-		 * Re-enable any watchpoints before delivering the
-		 * signal to user space. The processor register will
-		 * have been cleared if the watchpoint triggered
-		 * inside the kernel.
-		 */
-		if (current->thread.debugreg7)
-			set_debugreg(current->thread.debugreg7, 7);
-
 		/* Whee! Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
 			/*


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 08/11] Modify Ptrace routines to access breakpoint registers
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (6 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 09/11] Cleanup HW Breakpoint registers before kexec K.Prasad
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 8 --]
[-- Type: text/plain, Size: 8692 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch modifies the ptrace code to use the new wrapper routines around the 
debug/breakpoint registers.

[K.Prasad: Adapted the ptrace routines and to changes post x86/x86_64 merger,
	   split the minor patch from bigger patch. Re-wrote ptrace_write_dr7()
           and ptrace_set_debugreg() functions to use new data-structures]

[K.Prasad: Changed code to suit the simplified HW breakpoint implementation]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/ptrace.c |  229 ++++++++++++++++++++++++++++-------------------
 1 file changed, 138 insertions(+), 91 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/ptrace.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/ptrace.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/ptrace.c
@@ -34,6 +34,7 @@
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
 
 #include "tls.h"
 
@@ -134,11 +135,6 @@ static int set_segment_reg(struct task_s
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-	return TASK_SIZE - 3;
-}
-
 #else  /* CONFIG_X86_64 */
 
 #define FLAG_MASK		(FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -263,15 +259,6 @@ static int set_segment_reg(struct task_s
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(task, TIF_IA32))
-		return IA32_PAGE_OFFSET - 3;
-#endif
-	return TASK_SIZE_MAX - 7;
-}
-
 #endif	/* CONFIG_X86_32 */
 
 static unsigned long get_flags(struct task_struct *task)
@@ -462,95 +449,155 @@ static int genregs_set(struct task_struc
 }
 
 /*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
  */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len,
+		unsigned *type)
 {
-	switch (n) {
-	case 0:		return child->thread.debugreg0;
-	case 1:		return child->thread.debugreg1;
-	case 2:		return child->thread.debugreg2;
-	case 3:		return child->thread.debugreg3;
-	case 6:		return child->thread.debugreg6;
-	case 7:		return child->thread.debugreg7;
-	}
-	return 0;
+	int temp = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+	*len = (temp & 0xc) | 0x40;
+	*type = (temp & 0x3) | 0x80;
+	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
 }
 
-static int ptrace_set_debugreg(struct task_struct *child,
-			       int n, unsigned long data)
+static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
 {
+	struct thread_struct *thread = &(current->thread);
 	int i;
 
-	if (unlikely(n == 4 || n == 5))
-		return -EIO;
+	/* Store in the virtual DR6 register the fact that the breakpoint
+	 * was hit so the thread's debugger will see it.
+	 */
+	for (i = 0; i < hbkpt_user_max; i++)
+		if (bp->info.address == thread->hbkpt[i]->info.address)
+			break;
 
-	if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-		return -EIO;
+	thread->dr6 |= (DR_TRAP0 << i);
+}
 
-	switch (n) {
-	case 0:		child->thread.debugreg0 = data; break;
-	case 1:		child->thread.debugreg1 = data; break;
-	case 2:		child->thread.debugreg2 = data; break;
-	case 3:		child->thread.debugreg3 = data; break;
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+	struct hw_breakpoint *bp;
+	struct thread_struct *thread = &(tsk->thread);
+	int i;
+	int rc = 0;
+	unsigned long old_dr7 = thread->dr7;
 
-	case 6:
-		if ((data & ~0xffffffffUL) != 0)
-			return -EIO;
-		child->thread.debugreg6 = data;
-		break;
+	data &= ~DR_CONTROL_RESERVED;
+	/* Loop through all the hardware breakpoints, making the
+	 * appropriate changes to each.
+	 */
+restore_settings:
+	thread->dr7 = data;
+	for (i = 0; i < HB_NUM; i++) {
+		int enabled;
+		unsigned len, type;
+
+		bp = thread->hbkpt[i];
+		if (!bp)
+			continue;
+
+		enabled = decode_dr7(data, i, &len, &type);
+		if (!enabled) {
+			if (bp->triggered)
+				__unregister_user_hw_breakpoint(i, tsk, bp);
+			continue;
+		}
 
-	case 7:
-		/*
-		 * Sanity-check data. Take one half-byte at once with
-		 * check = (val >> (16 + 4*i)) & 0xf. It contains the
-		 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-		 * 2 and 3 are LENi. Given a list of invalid values,
-		 * we do mask |= 1 << invalid_value, so that
-		 * (mask >> check) & 1 is a correct test for invalid
-		 * values.
-		 *
-		 * R/Wi contains the type of the breakpoint /
-		 * watchpoint, LENi contains the length of the watched
-		 * data in the watchpoint case.
-		 *
-		 * The invalid values are:
-		 * - LENi == 0x10 (undefined), so mask |= 0x0f00.	[32-bit]
-		 * - R/Wi == 0x10 (break on I/O reads or writes), so
-		 *   mask |= 0x4444.
-		 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-		 *   0x1110.
-		 *
-		 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-		 *
-		 * See the Intel Manual "System Programming Guide",
-		 * 15.2.4
-		 *
-		 * Note that LENi == 0x10 is defined on x86_64 in long
-		 * mode (i.e. even for 32-bit userspace software, but
-		 * 64-bit kernel), so the x86_64 mask value is 0x5454.
-		 * See the AMD manual no. 24593 (AMD64 System Programming)
-		 */
-#ifdef CONFIG_X86_32
-#define	DR7_MASK	0x5f54
-#else
-#define	DR7_MASK	0x5554
-#endif
-		data &= ~DR_CONTROL_RESERVED;
-		for (i = 0; i < 4; i++)
-			if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-				return -EIO;
-		child->thread.debugreg7 = data;
-		if (data)
-			set_tsk_thread_flag(child, TIF_DEBUG);
+		if (bp->triggered)
+			rc = __modify_user_hw_breakpoint(i, tsk, bp);
+		else {
+			bp->triggered = ptrace_triggered;
+			bp->info.len = len;
+			bp->info.type = type;
+			rc = __register_user_hw_breakpoint(i, tsk, bp);
+		}
+		if (rc < 0)
+			break;
 		else
-			clear_tsk_thread_flag(child, TIF_DEBUG);
-		break;
+			set_tsk_thread_flag(tsk, TIF_DEBUG);
+	}
+	/* If anything above failed, restore the original settings */
+	if (rc < 0) {
+		data = old_dr7;
+		goto restore_settings;
 	}
+	return rc;
+}
 
-	return 0;
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	unsigned long val = 0;
+
+	mutex_lock(&hw_breakpoint_mutex);
+	if (n < HB_NUM) {
+		if (thread->hbkpt[n])
+			val = thread->hbkpt[n]->info.address;
+	} else if (n == 6) {
+		val = thread->dr6;
+	} else if (n == 7) {
+		val = thread->dr7;
+	}
+	mutex_unlock(&hw_breakpoint_mutex);
+	return val;
+}
+
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int rc = -EIO;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* There are no DR4 or DR5 registers */
+	if (n == 4 || n == 5)
+		goto ret_path;
+
+	/* Writes to DR6 modify the virtualized value */
+	if (n == 6) {
+		tsk->thread.dr6 = val;
+		rc = 0;
+		goto ret_path;
+	}
+
+	/* Writes to DR0 - DR3 change a breakpoint address */
+	rc = 0;
+	if (n < HB_NUM) {
+		if (!val)
+			goto ret_path;
+		if (thread->hbkpt[n]) {
+			thread->hbkpt[n]->info.address = val;
+			rc = __modify_user_hw_breakpoint(n, tsk,
+							  thread->hbkpt[n]);
+			goto ret_path;
+		}
+		thread->hbkpt[n] = kzalloc(sizeof(struct hw_breakpoint),
+								GFP_KERNEL);
+		if (!thread->hbkpt[n]) {
+			rc = -ENOMEM;
+			goto ret_path;
+		} else
+			thread->hbkpt[n]->info.address = val;
+	}
+	/* All that's left is DR7 */
+	if (n == 7)
+		rc = ptrace_write_dr7(tsk, val);
+
+ret_path:
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
 }
 
 /*
@@ -871,7 +918,7 @@ long arch_ptrace(struct task_struct *chi
 		else if (addr >= offsetof(struct user, u_debugreg[0]) &&
 			 addr <= offsetof(struct user, u_debugreg[7])) {
 			addr -= offsetof(struct user, u_debugreg[0]);
-			tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+			tmp = ptrace_get_debugreg(child, addr/sizeof(data));
 		}
 		ret = put_user(tmp, datap);
 		break;
@@ -889,7 +936,7 @@ long arch_ptrace(struct task_struct *chi
 			 addr <= offsetof(struct user, u_debugreg[7])) {
 			addr -= offsetof(struct user, u_debugreg[0]);
 			ret = ptrace_set_debugreg(child,
-						  addr / sizeof(data), data);
+						addr/sizeof(data), data);
 		}
 		break;
 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 09/11] Cleanup HW Breakpoint registers before kexec
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (7 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 08/11] Modify Ptrace routines to access breakpoint registers K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:50 ` [Patch 10/11] Sample HW breakpoint over kernel data address K.Prasad
  2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
  10 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 9 --]
[-- Type: text/plain, Size: 1818 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch disables Hardware breakpoints before doing a 'kexec' on the machine.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/machine_kexec_32.c |    2 ++
 arch/x86/kernel/machine_kexec_64.c |    2 ++
 2 files changed, 4 insertions(+)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_32.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/machine_kexec_32.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
 #include <asm/desc.h>
 #include <asm/system.h>
 #include <asm/cacheflush.h>
+#include <asm/debugreg.h>
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_64.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/machine_kexec_64.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/debugreg.h>
 
 static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
 				unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 10/11] Sample HW breakpoint over kernel data address
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (8 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 09/11] Cleanup HW Breakpoint registers before kexec K.Prasad
@ 2009-03-19 23:50 ` K.Prasad
  2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
  10 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:50 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 10 --]
[-- Type: text/plain, Size: 4654 bytes --]

This patch introduces a sample kernel module to demonstrate the use of Hardware
Breakpoint feature. It places a breakpoint over the kernel variable 'pid_max'
to monitor all write operations and emits a function-backtrace when done.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
---
 samples/Kconfig                         |    6 ++
 samples/Makefile                        |    4 +
 samples/hw_breakpoint/Makefile          |    1 
 samples/hw_breakpoint/data_breakpoint.c |   79 ++++++++++++++++++++++++++++++++
 4 files changed, 89 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/samples/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/samples/Kconfig
+++ linux-2.6-tip.hbkpt/samples/Kconfig
@@ -39,5 +39,11 @@ config SAMPLE_KRETPROBES
 	default m
 	depends on SAMPLE_KPROBES && KRETPROBES
 
+config SAMPLE_HW_BREAKPOINT
+	tristate "Build kernel hardware breakpoint examples -- loadable modules only"
+	depends on HAVE_HW_BREAKPOINT && m
+	help
+	  This builds kernel hardware breakpoint example modules.
+
 endif # SAMPLES
 
Index: linux-2.6-tip.hbkpt/samples/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/samples/Makefile
+++ linux-2.6-tip.hbkpt/samples/Makefile
@@ -1,3 +1,5 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/
+obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/ \
+			   hw_breakpoint/
+
Index: linux-2.6-tip.hbkpt/samples/hw_breakpoint/Makefile
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/samples/hw_breakpoint/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
Index: linux-2.6-tip.hbkpt/samples/hw_breakpoint/data_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/samples/hw_breakpoint/data_breakpoint.c
@@ -0,0 +1,79 @@
+/*
+ * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * This file is a kernel module that places a breakpoint over 'pid_max' kernel
+ * variable using Hardware Breakpoint register. The corresponding handler which
+ * prints a backtrace is invoked everytime a write operation is performed on
+ * that variable.
+ *
+ * After inserting this module, invoke a write operation using
+ * 'echo <desired_value> > /proc/sys/kernel/pid_max'
+ * to find the function-call backtrace.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+#include <linux/module.h>	/* Needed by all modules */
+#include <linux/kernel.h>	/* Needed for KERN_INFO */
+#include <linux/init.h>		/* Needed for the macros */
+
+#include <asm/hw_breakpoint.h>
+
+struct hw_breakpoint pid_max_hbkpt;
+
+void pid_max_hbkpt_handler(struct hw_breakpoint *temp, struct pt_regs
+								*temp_regs)
+{
+	printk(KERN_INFO "pid_max value is changed\n");
+	dump_stack();
+	printk(KERN_INFO "Dump stack from pid_max_hbkpt_handler\n");
+}
+
+static int __init hw_break_module_init(void)
+{
+	int ret;
+
+#ifdef CONFIG_X86
+	pid_max_hbkpt.info.name = "pid_max";
+	pid_max_hbkpt.info.type = HW_BREAKPOINT_WRITE;
+	pid_max_hbkpt.info.len = HW_BREAKPOINT_LEN_4;
+
+	pid_max_hbkpt.triggered = (void *)pid_max_hbkpt_handler;
+#endif /* CONFIG_X86 */
+
+	ret = register_kernel_hw_breakpoint(&pid_max_hbkpt);
+
+	if (ret < 0) {
+		printk(KERN_INFO "Breakpoint registration failed\n");
+		return ret;
+	} else
+		printk(KERN_INFO "HW Breakpoint for pid_max write installed\n");
+
+	return 0;
+}
+
+static void __exit hw_break_module_exit(void)
+{
+	unregister_kernel_hw_breakpoint(&pid_max_hbkpt);
+	printk(KERN_INFO "HW Breakpoint for pid_max write uninstalled\n");
+}
+
+module_init(hw_break_module_init);
+module_exit(hw_break_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("K.Prasad");
+MODULE_DESCRIPTION("pid_max breakpoint");


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (9 preceding siblings ...)
  2009-03-19 23:50 ` [Patch 10/11] Sample HW breakpoint over kernel data address K.Prasad
@ 2009-03-19 23:50 ` K.Prasad
  2009-03-20  9:04   ` Frederic Weisbecker
  10 siblings, 1 reply; 33+ messages in thread
From: K.Prasad @ 2009-03-19 23:50 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: ftrace_hbkpt_12 --]
[-- Type: text/plain, Size: 19482 bytes --]

This patch adds an ftrace plugin to detect and profile memory access over
kernel variables. It uses HW Breakpoint interfaces to 'watch memory
addresses.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
---
 kernel/trace/Kconfig          |   21 +
 kernel/trace/Makefile         |    1 
 kernel/trace/trace.h          |   25 +
 kernel/trace/trace_ksym.c     |  555 ++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_selftest.c |   36 ++
 5 files changed, 638 insertions(+)

Index: linux-2.6-tip.hbkpt/kernel/trace/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/Kconfig
+++ linux-2.6-tip.hbkpt/kernel/trace/Kconfig
@@ -264,6 +264,27 @@ config POWER_TRACER
 	  power management decisions, specifically the C-state and P-state
 	  behavior.
 
+config KSYM_TRACER
+	bool "Trace read and write access on kernel memory locations"
+	depends on HAVE_HW_BREAKPOINT
+	select TRACING
+	help
+	  This tracer helps find read and write operations on any given kernel
+	  symbol i.e. /proc/kallsyms.
+
+config PROFILE_KSYM_TRACER
+	bool "Profile all kernel memory accesses on 'watched' variables"
+	depends on KSYM_TRACER
+	help
+	  This tracer profiles kernel accesses on variables watched through the
+	  ksym tracer ftrace plugin. Depending upon the hardware, all read
+	  and write operations on kernel variables can be monitored for
+	  accesses.
+
+	  The results will be displayed in:
+	  /debugfs/tracing/profile_ksym
+
+	  Say N if unsure.
 
 config STACK_TRACER
 	bool "Trace max stack"
Index: linux-2.6-tip.hbkpt/kernel/trace/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/Makefile
+++ linux-2.6-tip.hbkpt/kernel/trace/Makefile
@@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_even
 obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACER) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 
 libftrace-y := ftrace.o
Index: linux-2.6-tip.hbkpt/kernel/trace/trace.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/trace.h
+++ linux-2.6-tip.hbkpt/kernel/trace/trace.h
@@ -12,6 +12,10 @@
 #include <trace/kmemtrace.h>
 #include <trace/power.h>
 
+#ifdef CONFIG_KSYM_TRACER
+#include <asm/hw_breakpoint.h>
+#endif
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -37,6 +41,7 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
+	TRACE_KSYM,
 
 	__TRACE_LAST_TYPE,
 };
@@ -214,6 +219,23 @@ struct syscall_trace_exit {
 	unsigned long		ret;
 };
 
+#ifdef CONFIG_KSYM_TRACER
+struct trace_ksym {
+	struct trace_entry	ent;
+	struct hw_breakpoint	*ksym_hbkpt;
+	unsigned long		ksym_addr;
+	unsigned long		ip;
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	unsigned long 		counter;
+#endif
+	struct hlist_node	ksym_hlist;
+	char			ksym_name[KSYM_NAME_LEN];
+	char			p_name[TASK_COMM_LEN];
+};
+#else
+struct trace_ksym {
+};
+#endif /* CONFIG_KSYM_TRACER */
 
 /*
  * trace_flag_type is an enumeration that holds different
@@ -332,6 +354,7 @@ extern void __ftrace_bad_type(void);
 			  TRACE_SYSCALL_ENTER);				\
 		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
 			  TRACE_SYSCALL_EXIT);				\
+		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM);	\
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -593,6 +616,8 @@ extern int trace_selftest_startup_syspro
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
+extern int trace_selftest_startup_ksym(struct tracer *trace,
+					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
Index: linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
@@ -0,0 +1,555 @@
+/*
+ * trace_ksym.c - Kernel Symbol Tracer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/jhash.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+#include "trace_stat.h"
+#include "trace.h"
+
+/* For now, let us restrict the no. of symbols traced simultaneously to number
+ * of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HB_NUM
+
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
+
+#ifdef CONFIG_FTRACE_SELFTEST
+
+static int ksym_selftest_dummy;
+#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
+
+#endif /* CONFIG_FTRACE_SELFTEST */
+
+static struct trace_array *ksym_trace_array;
+
+DEFINE_MUTEX(ksym_tracer_mutex);
+
+static unsigned int ksym_filter_entry_count;
+static unsigned int ksym_tracing_enabled;
+
+static HLIST_HEAD(ksym_filter_head);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+
+#define MAX_UL_INT 0xffffffff
+DEFINE_SPINLOCK(ksym_stat_lock);
+
+void ksym_collect_stats(unsigned long hbkpt_hit_addr)
+{
+	struct hlist_node *node;
+	struct trace_ksym *entry;
+
+	spin_lock(&ksym_stat_lock);
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		if ((entry->ksym_addr == hbkpt_hit_addr) &&
+		    (entry->counter <= MAX_UL_INT)) {
+			entry->counter++;
+			break;
+		}
+	}
+	spin_unlock(&ksym_stat_lock);
+}
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
+{
+	struct ring_buffer_event *event;
+	struct trace_array *tr;
+	struct trace_ksym *entry;
+	int pc;
+
+	if (!ksym_tracing_enabled)
+		return;
+
+	tr = ksym_trace_array;
+	pc = preempt_count();
+
+	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
+							sizeof(*entry), 0, pc);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_SYMBOL_LEN);
+	entry->ksym_hbkpt = hbkpt;
+	entry->ip = instruction_pointer(regs);
+	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	ksym_collect_stats(hbkpt->info.address);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+	trace_buffer_unlock_commit(tr, event, 0, pc);
+}
+
+/* Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Break points (Not available yet)
+ *
+ */
+static int ksym_trace_get_access_type(char *access_str)
+{
+	int pos, access = 0;
+
+	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
+		switch (access_str[pos]) {
+		case 'r':
+			access += (pos == 0) ? 4 : -1;
+			break;
+		case 'w':
+			access += (pos == 1) ? 2 : -1;
+			break;
+		case '-':
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	switch (access) {
+	case 6:
+		access = HW_BREAKPOINT_RW;
+		break;
+	case 2:
+		access = HW_BREAKPOINT_WRITE;
+		break;
+	case 0:
+		access = 0;
+	}
+
+	return access;
+}
+
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
+ *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
+ *    <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ */
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+							unsigned long *addr)
+{
+	char *delimiter = ":";
+	int ret;
+
+	ret = -EINVAL;
+	*ksymname = strsep(&input_string, delimiter);
+	*addr = kallsyms_lookup_name(*ksymname);
+
+	/* Check for malformed request: (2), (1) and (5) */
+	if ((!input_string) ||
+		(strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) ||
+			(*addr == 0))
+		goto return_code;
+	ret = ksym_trace_get_access_type(input_string);
+
+return_code:
+	return ret;
+}
+
+static int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
+{
+	struct trace_ksym *entry;
+	int ret;
+
+	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
+		" new requests for tracing can be accepted now.\n",
+			KSYM_TRACER_MAX);
+		return -ENOSPC;
+	}
+
+	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
+	if (!entry->ksym_hbkpt) {
+		kfree(entry);
+		return -ENOMEM;
+	}
+
+	entry->ksym_hbkpt->info.name = ksymname;
+	entry->ksym_hbkpt->info.type = op;
+	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
+	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
+
+	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
+
+	ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
+	if (ret < 0) {
+		printk(KERN_INFO "ksym_tracer request failed. Try again"
+					" later!!\n");
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+		return -EAGAIN;
+	}
+	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
+	ksym_filter_entry_count++;
+
+	return 0;
+}
+
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
+	ssize_t ret, cnt = 0;
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
+				entry->ksym_hbkpt->info.name);
+		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"-w-\n");
+		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"rw-\n");
+	}
+	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
+	mutex_unlock(&ksym_tracer_mutex);
+
+	return ret;
+}
+
+static ssize_t ksym_trace_filter_write(struct file *file,
+					const char __user *buffer,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char *input_string, *ksymname = NULL;
+	unsigned long ksym_addr = 0;
+	int ret, op, changed = 0;
+
+	/* Ignore echo "" > ksym_trace_filter */
+	if (count == 0)
+		return 0;
+
+	input_string = kzalloc(count, GFP_KERNEL);
+	if (!input_string)
+		return -ENOMEM;
+
+	if (copy_from_user(input_string, buffer, count)) {
+		kfree(input_string);
+		return -EFAULT;
+	}
+
+	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+	if (ret < 0) {
+		kfree(input_string);
+		return ret;
+	}
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	ret = -EINVAL;
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		if (entry->ksym_addr == ksym_addr) {
+			/* Check for malformed request: (6) */
+			if (entry->ksym_hbkpt->info.type != op)
+				changed = 1;
+			else
+				goto err_ret;
+			break;
+		}
+	}
+	if (changed) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+		entry->ksym_hbkpt->info.type = op;
+		if (op > 0) {
+			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
+			if (ret == 0) {
+				ret = count;
+				goto unlock_ret_path;
+			}
+		}
+		ksym_filter_entry_count--;
+		hlist_del(&(entry->ksym_hlist));
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+		ret = count;
+		goto err_ret;
+	} else {
+		/* Check for malformed request: (4) */
+		if (op == 0)
+			goto err_ret;
+		ret = process_new_ksym_entry(ksymname, op, ksym_addr);
+		if (ret)
+			goto err_ret;
+	}
+	ret = count;
+	goto unlock_ret_path;
+
+err_ret:
+	kfree(input_string);
+
+unlock_ret_path:
+	mutex_unlock(&ksym_tracer_mutex);
+	return ret;
+}
+
+static const struct file_operations ksym_tracing_fops = {
+	.open		= tracing_open_generic,
+	.read		= ksym_trace_filter_read,
+	.write		= ksym_trace_filter_write,
+};
+
+static void ksym_trace_reset(struct trace_array *tr)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node, *node1;
+
+	ksym_tracing_enabled = 0;
+
+	mutex_lock(&ksym_tracer_mutex);
+	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+								ksym_hlist) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+		ksym_filter_entry_count--;
+		hlist_del(&(entry->ksym_hlist));
+
+		/* Free the 'input_string' only if reset
+		 * after startup self-test
+		 */
+#ifdef CONFIG_FTRACE_SELFTEST
+		if (strncmp(entry->ksym_hbkpt->info.name, KSYM_SELFTEST_ENTRY,
+					strlen(KSYM_SELFTEST_ENTRY)) != 0)
+#endif /* CONFIG_FTRACE_SELFTEST*/
+			kfree(entry->ksym_hbkpt->info.name);
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+	}
+	mutex_unlock(&ksym_tracer_mutex);
+
+}
+
+static int ksym_trace_init(struct trace_array *tr)
+{
+	int cpu, ret = 0;
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+
+	ksym_tracing_enabled = 1;
+	ksym_trace_array = tr;
+
+#ifdef CONFIG_FTRACE_SELFTEST
+	/* Check if we are re-entering self-test code during initialisation */
+	if (ksym_selftest_dummy)
+		goto ret_path;
+
+	ksym_selftest_dummy = 0;
+
+	/* Register the read-write tracing request */
+	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
+					(unsigned long)(&ksym_selftest_dummy));
+
+	if (ret < 0) {
+		printk(KERN_CONT "ksym_trace read-write startup test failed\n");
+		goto ret_path;
+	}
+	/* Perform a read and a write operation over the dummy variable to
+	 * trigger the tracer
+	 */
+	if (ksym_selftest_dummy == 0)
+		ksym_selftest_dummy++;
+
+ret_path:
+#endif /* CONFIG_FTRACE_SELFTEST */
+
+	return ret;
+}
+
+static void ksym_trace_print_header(struct seq_file *m)
+{
+
+	seq_puts(m,
+		 "#       TASK-PID      CPU#      Symbol         Type    "
+		 "Function         \n");
+	seq_puts(m,
+		 "#          |           |          |              |         "
+		 "|            \n");
+}
+
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct trace_ksym *field;
+	char str[KSYM_SYMBOL_LEN];
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
+				entry->pid, iter->cpu, field->ksym_name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	switch (field->ksym_hbkpt->info.type) {
+	case HW_BREAKPOINT_WRITE:
+		ret = trace_seq_printf(s, " W  ");
+		break;
+	case HW_BREAKPOINT_RW:
+		ret = trace_seq_printf(s, " RW ");
+		break;
+	default:
+		return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	sprint_symbol(str, field->ip);
+	ret = trace_seq_printf(s, "%-20s\n", str);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+struct tracer ksym_tracer __read_mostly =
+{
+	.name		= "ksym_tracer",
+	.init		= ksym_trace_init,
+	.reset		= ksym_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_ksym,
+#endif
+	.print_header   = ksym_trace_print_header,
+	.print_line	= ksym_trace_output
+};
+
+__init static int init_ksym_trace(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	ksym_filter_entry_count = 0;
+
+	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
+				    NULL, &ksym_tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ksym_trace_filter' file\n");
+
+	return register_tracer(&ksym_tracer);
+}
+device_initcall(init_ksym_trace);
+
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+static int ksym_tracer_stat_headers(struct seq_file *m)
+{
+	seq_printf(m, "   Access type    ");
+	seq_printf(m, "            Symbol                     Counter     \n");
+	return 0;
+}
+
+static int ksym_tracer_stat_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *stat = v;
+	struct trace_ksym *entry;
+	int access_type = 0;
+	char fn_name[KSYM_NAME_LEN];
+
+	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
+
+	if (entry->ksym_hbkpt)
+		access_type = entry->ksym_hbkpt->info.type;
+
+	switch (access_type) {
+	case HW_BREAKPOINT_WRITE:
+		seq_printf(m, "     W     ");
+		break;
+	case HW_BREAKPOINT_RW:
+		seq_printf(m, "     RW    ");
+		break;
+	default:
+		seq_printf(m, "     NA    ");
+	}
+
+	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
+		seq_printf(m, "               %s                 ", fn_name);
+	else
+		seq_printf(m, "               <NA>                ");
+
+	seq_printf(m, "%15lu\n", entry->counter);
+	return 0;
+}
+
+static void *ksym_tracer_stat_start(void)
+{
+	return &(ksym_filter_head.first);
+}
+
+static void *
+ksym_tracer_stat_next(void *v, int idx)
+{
+	struct hlist_node *stat = v;
+
+	return stat->next;
+}
+
+static struct tracer_stat ksym_tracer_stats = {
+	.name = "ksym_tracer",
+	.stat_start = ksym_tracer_stat_start,
+	.stat_next = ksym_tracer_stat_next,
+	.stat_headers = ksym_tracer_stat_headers,
+	.stat_show = ksym_tracer_stat_show
+};
+
+__init static int ksym_tracer_stat_init(void)
+{
+	int ret;
+
+	ret = register_stat_tracer(&ksym_tracer_stats);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register "
+				    "ksym tracer stats\n");
+		return 1;
+	}
+
+	return 0;
+}
+fs_initcall(ksym_tracer_stat_init);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
Index: linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/trace_selftest.c
+++ linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(stru
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
+	case TRACE_KSYM:
 		return 1;
 	}
 	return 0;
@@ -687,3 +688,38 @@ trace_selftest_startup_branch(struct tra
 	return ret;
 }
 #endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_KSYM_TRACER
+int
+trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tracing_stop();
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+	tracing_start();
+
+	/* read & write operations - one each is performed on the dummy variable
+	 * triggering two entries in the trace buffer
+	 */
+	if (!ret && count != 2) {
+		printk(KERN_CONT "Ksym tracer startup test failed");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_KSYM_TRACER */


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
@ 2009-03-20  9:04   ` Frederic Weisbecker
  2009-03-21 16:24     ` K.Prasad
  0 siblings, 1 reply; 33+ messages in thread
From: Frederic Weisbecker @ 2009-03-20  9:04 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Alan Stern,
	Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 05:20:32AM +0530, K.Prasad wrote:
> This patch adds an ftrace plugin to detect and profile memory access over
> kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> addresses.
> 
> Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> ---
>  kernel/trace/Kconfig          |   21 +
>  kernel/trace/Makefile         |    1 
>  kernel/trace/trace.h          |   25 +
>  kernel/trace/trace_ksym.c     |  555 ++++++++++++++++++++++++++++++++++++++++++
>  kernel/trace/trace_selftest.c |   36 ++
>  5 files changed, 638 insertions(+)
> 
> Index: linux-2.6-tip.hbkpt/kernel/trace/Kconfig
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/Kconfig
> +++ linux-2.6-tip.hbkpt/kernel/trace/Kconfig
> @@ -264,6 +264,27 @@ config POWER_TRACER
>  	  power management decisions, specifically the C-state and P-state
>  	  behavior.
>  
> +config KSYM_TRACER
> +	bool "Trace read and write access on kernel memory locations"
> +	depends on HAVE_HW_BREAKPOINT
> +	select TRACING
> +	help
> +	  This tracer helps find read and write operations on any given kernel
> +	  symbol i.e. /proc/kallsyms.
> +
> +config PROFILE_KSYM_TRACER
> +	bool "Profile all kernel memory accesses on 'watched' variables"
> +	depends on KSYM_TRACER
> +	help
> +	  This tracer profiles kernel accesses on variables watched through the
> +	  ksym tracer ftrace plugin. Depending upon the hardware, all read
> +	  and write operations on kernel variables can be monitored for
> +	  accesses.
> +
> +	  The results will be displayed in:
> +	  /debugfs/tracing/profile_ksym
> +
> +	  Say N if unsure.
>  
>  config STACK_TRACER
>  	bool "Trace max stack"
> Index: linux-2.6-tip.hbkpt/kernel/trace/Makefile
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/Makefile
> +++ linux-2.6-tip.hbkpt/kernel/trace/Makefile
> @@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_even
>  obj-$(CONFIG_EVENT_TRACER) += events.o
>  obj-$(CONFIG_EVENT_TRACER) += trace_export.o
>  obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
> +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
>  
>  libftrace-y := ftrace.o
> Index: linux-2.6-tip.hbkpt/kernel/trace/trace.h
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/trace.h
> +++ linux-2.6-tip.hbkpt/kernel/trace/trace.h
> @@ -12,6 +12,10 @@
>  #include <trace/kmemtrace.h>
>  #include <trace/power.h>
>  
> +#ifdef CONFIG_KSYM_TRACER
> +#include <asm/hw_breakpoint.h>
> +#endif
> +
>  enum trace_type {
>  	__TRACE_FIRST_TYPE = 0,
>  
> @@ -37,6 +41,7 @@ enum trace_type {
>  	TRACE_KMEM_FREE,
>  	TRACE_POWER,
>  	TRACE_BLK,
> +	TRACE_KSYM,
>  
>  	__TRACE_LAST_TYPE,
>  };
> @@ -214,6 +219,23 @@ struct syscall_trace_exit {
>  	unsigned long		ret;
>  };
>  
> +#ifdef CONFIG_KSYM_TRACER
> +struct trace_ksym {
> +	struct trace_entry	ent;
> +	struct hw_breakpoint	*ksym_hbkpt;
> +	unsigned long		ksym_addr;
> +	unsigned long		ip;
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +	unsigned long 		counter;
> +#endif
> +	struct hlist_node	ksym_hlist;
> +	char			ksym_name[KSYM_NAME_LEN];
> +	char			p_name[TASK_COMM_LEN];
> +};
> +#else
> +struct trace_ksym {
> +};
> +#endif /* CONFIG_KSYM_TRACER */
>  
>  /*
>   * trace_flag_type is an enumeration that holds different
> @@ -332,6 +354,7 @@ extern void __ftrace_bad_type(void);
>  			  TRACE_SYSCALL_ENTER);				\
>  		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
>  			  TRACE_SYSCALL_EXIT);				\
> +		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM);	\
>  		__ftrace_bad_type();					\
>  	} while (0)
>  
> @@ -593,6 +616,8 @@ extern int trace_selftest_startup_syspro
>  					       struct trace_array *tr);
>  extern int trace_selftest_startup_branch(struct tracer *trace,
>  					 struct trace_array *tr);
> +extern int trace_selftest_startup_ksym(struct tracer *trace,
> +					 struct trace_array *tr);
>  #endif /* CONFIG_FTRACE_STARTUP_TEST */
>  
>  extern void *head_page(struct trace_array_cpu *data);
> Index: linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
> @@ -0,0 +1,555 @@
> +/*
> + * trace_ksym.c - Kernel Symbol Tracer
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) IBM Corporation, 2009
> + */
> +
> +#include <linux/kallsyms.h>
> +#include <linux/uaccess.h>
> +#include <linux/debugfs.h>
> +#include <linux/ftrace.h>
> +#include <linux/module.h>
> +#include <linux/jhash.h>
> +#include <linux/fs.h>
> +
> +#include "trace_output.h"
> +#include "trace_stat.h"
> +#include "trace.h"
> +
> +/* For now, let us restrict the no. of symbols traced simultaneously to number
> + * of available hardware breakpoint registers.
> + */
> +#define KSYM_TRACER_MAX HB_NUM
> +
> +#define KSYM_TRACER_OP_LEN 3 /* rw- */
> +#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
> +
> +#ifdef CONFIG_FTRACE_SELFTEST
> +
> +static int ksym_selftest_dummy;
> +#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
> +
> +#endif /* CONFIG_FTRACE_SELFTEST */
> +
> +static struct trace_array *ksym_trace_array;
> +
> +DEFINE_MUTEX(ksym_tracer_mutex);
> +
> +static unsigned int ksym_filter_entry_count;
> +static unsigned int ksym_tracing_enabled;
> +
> +static HLIST_HEAD(ksym_filter_head);
> +
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +
> +#define MAX_UL_INT 0xffffffff
> +DEFINE_SPINLOCK(ksym_stat_lock);
> +
> +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> +{
> +	struct hlist_node *node;
> +	struct trace_ksym *entry;
> +
> +	spin_lock(&ksym_stat_lock);


I see that can be called from ksym_hbkpt_handler which in turn
can be called from interrupt context, right?
You can issue a deadlock if you don't disable interrupts here.

Thanks,
Frederic.

> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		if ((entry->ksym_addr == hbkpt_hit_addr) &&
> +		    (entry->counter <= MAX_UL_INT)) {
> +			entry->counter++;
> +			break;
> +		}
> +	}
> +	spin_unlock(&ksym_stat_lock);
> +}
> +#endif /* CONFIG_PROFILE_KSYM_TRACER */
> +
> +void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
> +{
> +	struct ring_buffer_event *event;
> +	struct trace_array *tr;
> +	struct trace_ksym *entry;
> +	int pc;
> +
> +	if (!ksym_tracing_enabled)
> +		return;
> +
> +	tr = ksym_trace_array;
> +	pc = preempt_count();
> +
> +	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
> +							sizeof(*entry), 0, pc);
> +	if (!event)
> +		return;
> +
> +	entry = ring_buffer_event_data(event);
> +	strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_SYMBOL_LEN);
> +	entry->ksym_hbkpt = hbkpt;
> +	entry->ip = instruction_pointer(regs);
> +	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +	ksym_collect_stats(hbkpt->info.address);
> +#endif /* CONFIG_PROFILE_KSYM_TRACER */
> +
> +	trace_buffer_unlock_commit(tr, event, 0, pc);
> +}
> +
> +/* Valid access types are represented as
> + *
> + * rw- : Set Read/Write Access Breakpoint
> + * -w- : Set Write Access Breakpoint
> + * --- : Clear Breakpoints
> + * --x : Set Execution Break points (Not available yet)
> + *
> + */
> +static int ksym_trace_get_access_type(char *access_str)
> +{
> +	int pos, access = 0;
> +
> +	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
> +		switch (access_str[pos]) {
> +		case 'r':
> +			access += (pos == 0) ? 4 : -1;
> +			break;
> +		case 'w':
> +			access += (pos == 1) ? 2 : -1;
> +			break;
> +		case '-':
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	}
> +
> +	switch (access) {
> +	case 6:
> +		access = HW_BREAKPOINT_RW;
> +		break;
> +	case 2:
> +		access = HW_BREAKPOINT_WRITE;
> +		break;
> +	case 0:
> +		access = 0;
> +	}
> +
> +	return access;
> +}
> +
> +/*
> + * There can be several possible malformed requests and we attempt to capture
> + * all of them. We enumerate some of the rules
> + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
> + *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
> + *    <module>:<ksym_name>:<op>.
> + * 2. No delimiter symbol ':' in the input string
> + * 3. Spurious operator symbols or symbols not in their respective positions
> + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
> + * 5. Kernel symbol not a part of /proc/kallsyms
> + * 6. Duplicate requests
> + */
> +static int parse_ksym_trace_str(char *input_string, char **ksymname,
> +							unsigned long *addr)
> +{
> +	char *delimiter = ":";
> +	int ret;
> +
> +	ret = -EINVAL;
> +	*ksymname = strsep(&input_string, delimiter);
> +	*addr = kallsyms_lookup_name(*ksymname);
> +
> +	/* Check for malformed request: (2), (1) and (5) */
> +	if ((!input_string) ||
> +		(strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) ||
> +			(*addr == 0))
> +		goto return_code;
> +	ret = ksym_trace_get_access_type(input_string);
> +
> +return_code:
> +	return ret;
> +}
> +
> +static int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
> +{
> +	struct trace_ksym *entry;
> +	int ret;
> +
> +	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
> +		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
> +		" new requests for tracing can be accepted now.\n",
> +			KSYM_TRACER_MAX);
> +		return -ENOSPC;
> +	}
> +
> +	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
> +	if (!entry)
> +		return -ENOMEM;
> +
> +	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
> +	if (!entry->ksym_hbkpt) {
> +		kfree(entry);
> +		return -ENOMEM;
> +	}
> +
> +	entry->ksym_hbkpt->info.name = ksymname;
> +	entry->ksym_hbkpt->info.type = op;
> +	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
> +	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
> +
> +	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
> +
> +	ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +	if (ret < 0) {
> +		printk(KERN_INFO "ksym_tracer request failed. Try again"
> +					" later!!\n");
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +		return -EAGAIN;
> +	}
> +	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
> +	ksym_filter_entry_count++;
> +
> +	return 0;
> +}
> +
> +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
> +	ssize_t ret, cnt = 0;
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
> +				entry->ksym_hbkpt->info.name);
> +		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"-w-\n");
> +		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"rw-\n");
> +	}
> +	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
> +	mutex_unlock(&ksym_tracer_mutex);
> +
> +	return ret;
> +}
> +
> +static ssize_t ksym_trace_filter_write(struct file *file,
> +					const char __user *buffer,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char *input_string, *ksymname = NULL;
> +	unsigned long ksym_addr = 0;
> +	int ret, op, changed = 0;
> +
> +	/* Ignore echo "" > ksym_trace_filter */
> +	if (count == 0)
> +		return 0;
> +
> +	input_string = kzalloc(count, GFP_KERNEL);
> +	if (!input_string)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(input_string, buffer, count)) {
> +		kfree(input_string);
> +		return -EFAULT;
> +	}
> +
> +	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
> +	if (ret < 0) {
> +		kfree(input_string);
> +		return ret;
> +	}
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	ret = -EINVAL;
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		if (entry->ksym_addr == ksym_addr) {
> +			/* Check for malformed request: (6) */
> +			if (entry->ksym_hbkpt->info.type != op)
> +				changed = 1;
> +			else
> +				goto err_ret;
> +			break;
> +		}
> +	}
> +	if (changed) {
> +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +		entry->ksym_hbkpt->info.type = op;
> +		if (op > 0) {
> +			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +			if (ret == 0) {
> +				ret = count;
> +				goto unlock_ret_path;
> +			}
> +		}
> +		ksym_filter_entry_count--;
> +		hlist_del(&(entry->ksym_hlist));
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +		ret = count;
> +		goto err_ret;
> +	} else {
> +		/* Check for malformed request: (4) */
> +		if (op == 0)
> +			goto err_ret;
> +		ret = process_new_ksym_entry(ksymname, op, ksym_addr);
> +		if (ret)
> +			goto err_ret;
> +	}
> +	ret = count;
> +	goto unlock_ret_path;
> +
> +err_ret:
> +	kfree(input_string);
> +
> +unlock_ret_path:
> +	mutex_unlock(&ksym_tracer_mutex);
> +	return ret;
> +}
> +
> +static const struct file_operations ksym_tracing_fops = {
> +	.open		= tracing_open_generic,
> +	.read		= ksym_trace_filter_read,
> +	.write		= ksym_trace_filter_write,
> +};
> +
> +static void ksym_trace_reset(struct trace_array *tr)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node, *node1;
> +
> +	ksym_tracing_enabled = 0;
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
> +								ksym_hlist) {
> +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +		ksym_filter_entry_count--;
> +		hlist_del(&(entry->ksym_hlist));
> +
> +		/* Free the 'input_string' only if reset
> +		 * after startup self-test
> +		 */
> +#ifdef CONFIG_FTRACE_SELFTEST
> +		if (strncmp(entry->ksym_hbkpt->info.name, KSYM_SELFTEST_ENTRY,
> +					strlen(KSYM_SELFTEST_ENTRY)) != 0)
> +#endif /* CONFIG_FTRACE_SELFTEST*/
> +			kfree(entry->ksym_hbkpt->info.name);
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +	}
> +	mutex_unlock(&ksym_tracer_mutex);
> +
> +}
> +
> +static int ksym_trace_init(struct trace_array *tr)
> +{
> +	int cpu, ret = 0;
> +
> +	for_each_online_cpu(cpu)
> +		tracing_reset(tr, cpu);
> +
> +	ksym_tracing_enabled = 1;
> +	ksym_trace_array = tr;
> +
> +#ifdef CONFIG_FTRACE_SELFTEST
> +	/* Check if we are re-entering self-test code during initialisation */
> +	if (ksym_selftest_dummy)
> +		goto ret_path;
> +
> +	ksym_selftest_dummy = 0;
> +
> +	/* Register the read-write tracing request */
> +	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
> +					(unsigned long)(&ksym_selftest_dummy));
> +
> +	if (ret < 0) {
> +		printk(KERN_CONT "ksym_trace read-write startup test failed\n");
> +		goto ret_path;
> +	}
> +	/* Perform a read and a write operation over the dummy variable to
> +	 * trigger the tracer
> +	 */
> +	if (ksym_selftest_dummy == 0)
> +		ksym_selftest_dummy++;
> +
> +ret_path:
> +#endif /* CONFIG_FTRACE_SELFTEST */
> +
> +	return ret;
> +}
> +
> +static void ksym_trace_print_header(struct seq_file *m)
> +{
> +
> +	seq_puts(m,
> +		 "#       TASK-PID      CPU#      Symbol         Type    "
> +		 "Function         \n");
> +	seq_puts(m,
> +		 "#          |           |          |              |         "
> +		 "|            \n");
> +}
> +
> +static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
> +{
> +	struct trace_entry *entry = iter->ent;
> +	struct trace_seq *s = &iter->seq;
> +	struct trace_ksym *field;
> +	char str[KSYM_SYMBOL_LEN];
> +	int ret;
> +
> +	trace_assign_type(field, entry);
> +
> +	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
> +				entry->pid, iter->cpu, field->ksym_name);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	switch (field->ksym_hbkpt->info.type) {
> +	case HW_BREAKPOINT_WRITE:
> +		ret = trace_seq_printf(s, " W  ");
> +		break;
> +	case HW_BREAKPOINT_RW:
> +		ret = trace_seq_printf(s, " RW ");
> +		break;
> +	default:
> +		return TRACE_TYPE_PARTIAL_LINE;
> +	}
> +
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	sprint_symbol(str, field->ip);
> +	ret = trace_seq_printf(s, "%-20s\n", str);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	return TRACE_TYPE_HANDLED;
> +}
> +
> +struct tracer ksym_tracer __read_mostly =
> +{
> +	.name		= "ksym_tracer",
> +	.init		= ksym_trace_init,
> +	.reset		= ksym_trace_reset,
> +#ifdef CONFIG_FTRACE_SELFTEST
> +	.selftest	= trace_selftest_startup_ksym,
> +#endif
> +	.print_header   = ksym_trace_print_header,
> +	.print_line	= ksym_trace_output
> +};
> +
> +__init static int init_ksym_trace(void)
> +{
> +	struct dentry *d_tracer;
> +	struct dentry *entry;
> +
> +	d_tracer = tracing_init_dentry();
> +	ksym_filter_entry_count = 0;
> +
> +	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
> +				    NULL, &ksym_tracing_fops);
> +	if (!entry)
> +		pr_warning("Could not create debugfs "
> +			   "'ksym_trace_filter' file\n");
> +
> +	return register_tracer(&ksym_tracer);
> +}
> +device_initcall(init_ksym_trace);
> +
> +
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +static int ksym_tracer_stat_headers(struct seq_file *m)
> +{
> +	seq_printf(m, "   Access type    ");
> +	seq_printf(m, "            Symbol                     Counter     \n");
> +	return 0;
> +}
> +
> +static int ksym_tracer_stat_show(struct seq_file *m, void *v)
> +{
> +	struct hlist_node *stat = v;
> +	struct trace_ksym *entry;
> +	int access_type = 0;
> +	char fn_name[KSYM_NAME_LEN];
> +
> +	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
> +
> +	if (entry->ksym_hbkpt)
> +		access_type = entry->ksym_hbkpt->info.type;
> +
> +	switch (access_type) {
> +	case HW_BREAKPOINT_WRITE:
> +		seq_printf(m, "     W     ");
> +		break;
> +	case HW_BREAKPOINT_RW:
> +		seq_printf(m, "     RW    ");
> +		break;
> +	default:
> +		seq_printf(m, "     NA    ");
> +	}
> +
> +	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
> +		seq_printf(m, "               %s                 ", fn_name);
> +	else
> +		seq_printf(m, "               <NA>                ");
> +
> +	seq_printf(m, "%15lu\n", entry->counter);
> +	return 0;
> +}
> +
> +static void *ksym_tracer_stat_start(void)
> +{
> +	return &(ksym_filter_head.first);
> +}
> +
> +static void *
> +ksym_tracer_stat_next(void *v, int idx)
> +{
> +	struct hlist_node *stat = v;
> +
> +	return stat->next;
> +}
> +
> +static struct tracer_stat ksym_tracer_stats = {
> +	.name = "ksym_tracer",
> +	.stat_start = ksym_tracer_stat_start,
> +	.stat_next = ksym_tracer_stat_next,
> +	.stat_headers = ksym_tracer_stat_headers,
> +	.stat_show = ksym_tracer_stat_show
> +};
> +
> +__init static int ksym_tracer_stat_init(void)
> +{
> +	int ret;
> +
> +	ret = register_stat_tracer(&ksym_tracer_stats);
> +	if (!ret) {
> +		printk(KERN_WARNING "Warning: could not register "
> +				    "ksym tracer stats\n");
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +fs_initcall(ksym_tracer_stat_init);
> +#endif /* CONFIG_PROFILE_KSYM_TRACER */
> Index: linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/trace_selftest.c
> +++ linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
> @@ -16,6 +16,7 @@ static inline int trace_valid_entry(stru
>  	case TRACE_BRANCH:
>  	case TRACE_GRAPH_ENT:
>  	case TRACE_GRAPH_RET:
> +	case TRACE_KSYM:
>  		return 1;
>  	}
>  	return 0;
> @@ -687,3 +688,38 @@ trace_selftest_startup_branch(struct tra
>  	return ret;
>  }
>  #endif /* CONFIG_BRANCH_TRACER */
> +
> +#ifdef CONFIG_KSYM_TRACER
> +int
> +trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
> +{
> +	unsigned long count;
> +	int ret;
> +
> +	/* start the tracing */
> +	ret = tracer_init(trace, tr);
> +	if (ret) {
> +		warn_failed_init_tracer(trace, ret);
> +		return ret;
> +	}
> +
> +	/* Sleep for a 1/10 of a second */
> +	msleep(100);
> +	/* stop the tracing. */
> +	tracing_stop();
> +	/* check the trace buffer */
> +	ret = trace_test_buffer(tr, &count);
> +	trace->reset(tr);
> +	tracing_start();
> +
> +	/* read & write operations - one each is performed on the dummy variable
> +	 * triggering two entries in the trace buffer
> +	 */
> +	if (!ret && count != 2) {
> +		printk(KERN_CONT "Ksym tracer startup test failed");
> +		ret = -1;
> +	}
> +
> +	return ret;
> +}
> +#endif /* CONFIG_KSYM_TRACER */
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
@ 2009-03-20 14:33   ` Alan Stern
  2009-03-20 18:30     ` Ingo Molnar
                       ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: Alan Stern @ 2009-03-20 14:33 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, 20 Mar 2009, K.Prasad wrote:

> This patch introduces two new files hw_breakpoint.[ch] which defines the 
> generic interfaces to use hardware breakpoint infrastructure of the system. 

Prasad:

I'm sorry to say this is full of mistakes.  So far I have looked only 
at patch 01/11, but it's not good.

> + * Kernel breakpoints grow downwards, starting from HB_NUM
> + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> + * kernel-space request
> + */
> +unsigned int hbkpt_kernel_pos;

This doesn't make much sense.  All you need to know is which registers
are in use; all others are available.

For example, suppose the kernel allocated breakpoints 3, 2, and 1, and
then deallocated 2.  Then bp 2 would be available for use, even though
2 > 1.

It's also a poor choice of name.  Everywhere else (in my patches,
anyway) the code refers to hardware breakpoints using the abbreviation
"hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
"hbkpt".

> +/* An array containing refcount of threads using a given bkpt register */
> +unsigned int hbkpt_user_max_refcount[HB_NUM];

Why did you put "max" in the name?  Isn't this just a simple refcount?

> +/* One higher than the highest counted user-space breakpoint register */
> +unsigned int hbkpt_user_max;

Likewise, this variable isn't really needed.  It's just one more than
the largest i such that hbkpt_user_max_refcount[i] > 0.

> +/*
> + * Install the debug register values for a new thread.
> + */
> +void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
> +{
> +	/* Set the debug register */

Set _which_ debug register?

> +	arch_install_thread_hbkpt(tsk);
> +	last_debugged_task = current;
> +
> +	put_cpu_no_resched();

What's this line doing here?  It looks like something you forgot to
erase.

> +}
> +
> +/*
> + * Install the debug register values for just the kernel, no thread.
> + */
> +void switch_to_none_hw_breakpoint(void)
> +{
> +	arch_install_none();
> +	put_cpu_no_resched();

Same for this line.

> +}
> +
> +/*
> + * Load the debug registers during startup of a CPU.
> + */
> +void load_debug_registers(void)
> +{
> +	int i;
> +	unsigned long flags;
> +
> +	/* Prevent IPIs for new kernel breakpoint updates */
> +	local_irq_save(flags);
> +
> +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> +		if (hbkpt_kernel[i])
> +			on_each_cpu(arch_install_kernel_hbkpt,
> +				(void *)hbkpt_kernel[i], 0);

This is completely wrong.  First of all, it's dumb to send multiple
IPIs (one for each iteration through the loop).  Second, this routine
shouldn't send any IPIs at all!  It gets invoked when a CPU is
starting up and wants to load its _own_ debug registers -- not tell
another CPU to load anything.

> +	if (current->thread.dr7)
> +		arch_install_thread_hbkpt(current);
> +
> +	local_irq_restore(flags);
> +}
> +
> +/*
> + * Erase all the hardware breakpoint info associated with a thread.
> + *
> + * If tsk != current then tsk must not be usable (for example, a
> + * child being cleaned up from a failed fork).
> + */
> +void flush_thread_hw_breakpoint(struct task_struct *tsk)
> +{
> +	int i;
> +	struct thread_struct *thread = &(tsk->thread);
> +
> +	mutex_lock(&hw_breakpoint_mutex);
> +
> +	/* Let the breakpoints know they are being uninstalled */

This comment looks like a leftover which should have been erased.

> +/*
> + * Validate the settings in a hw_breakpoint structure.
> + */
> +static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
> +{
> +	int ret;
> +	unsigned int align;
> +
> +	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
> +	if (ret < 0)
> +		goto err;
> +
> +	/* Check that the low-order bits of the address are appropriate
> +	 * for the alignment implied by len.
> +	 */
> +	if (bp->info.address & align)
> +		return -EINVAL;
> +
> +	/* Check that the virtual address is in the proper range */
> +	if (tsk) {
> +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> +			return -EFAULT;
> +	} else {
> +		if (!arch_check_va_in_kernelspace(bp->info.address))
> +			return -EFAULT;
> +	}

Roland pointed out that these checks need to take into account the
length of the breakpoint.  For example, in
arch_check_va_in_userspace() it isn't sufficient for the start of the
breakpoint region to be a userspace address; the end of the
breakpoint region must also be in userspace.

> + err:
> +	return ret;
> +}
> +
> +int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
> +					struct hw_breakpoint *bp)
> +{
> +	struct thread_struct *thread = &(tsk->thread);
> +	int rc;
> +
> +	/* Do not overcommit. Fail if kernel has used the hbkpt registers */
> +	if (pos >= hbkpt_kernel_pos)
> +		return -ENOSPC;

In fact you should fail if the debug register is already in use,
regardless of whether it is being used by a kernel breakpoint.  And you 
shouldn't check against hbkpt_kernel_pos; you should check whether 
hbkpt_kernel[pos] is NULL and thread->hbkpt[pos] is NULL.

> +
> +	rc = validate_settings(bp, tsk);
> +	if (rc)
> +		return rc;
> +
> +	thread->hbkpt[pos] = bp;
> +	thread->hbkpt_num_installed++;
> +	hbkpt_user_max_refcount[pos]++;
> +	/* 'tsk' is the thread that uses max number of hbkpt registers */

This is a bad comment.  It sounds like it's saying that "tsk" is
defined as the thread using the maximum number of breakpoints, rather
than being defined as the thread for which the breakpoint is being
registered.

Besides, there's no reason to keep track of which thread uses the max 
number of breakpoints anyway.  Not to mention the fact that you don't 
update hbkpt_user_max when its thread exits.

> +	if (hbkpt_user_max < thread->hbkpt_num_installed)
> +		hbkpt_user_max++;

At this point I got tired of looking, but it seems obvious that the new 
patch series needs a bunch of improvements.

Alan Stern


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 14:33   ` Alan Stern
@ 2009-03-20 18:30     ` Ingo Molnar
  2009-03-21 17:32       ` K.Prasad
  2009-03-20 18:32     ` Ingo Molnar
  2009-03-21 17:26     ` K.Prasad
  2 siblings, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2009-03-20 18:30 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt


* Alan Stern <stern@rowland.harvard.edu> wrote:

> > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > + * kernel-space request
> > + */
> > +unsigned int hbkpt_kernel_pos;
> 
> This doesn't make much sense.  All you need to know is which 
> registers are in use; all others are available.
> 
> For example, suppose the kernel allocated breakpoints 3, 2, and 1, 
> and then deallocated 2.  Then bp 2 would be available for use, 
> even though 2 > 1.

it's a high/low watermark mechanism. Yes, it's not an allocator that 
can allocate into a debug registrs 'hole', but it is a simple one 
that matches current hardware breakpoint usages and enables the 
kernel to utilize them as well - and keeps all the code simple.

	Ingo

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 14:33   ` Alan Stern
  2009-03-20 18:30     ` Ingo Molnar
@ 2009-03-20 18:32     ` Ingo Molnar
  2009-03-21 17:26     ` K.Prasad
  2 siblings, 0 replies; 33+ messages in thread
From: Ingo Molnar @ 2009-03-20 18:32 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt


* Alan Stern <stern@rowland.harvard.edu> wrote:

> > +	/* Check that the virtual address is in the proper range */
> > +	if (tsk) {
> > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > +			return -EFAULT;
> > +	} else {
> > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > +			return -EFAULT;
> > +	}
> 
> Roland pointed out that these checks need to take into account the 
> length of the breakpoint.  For example, in 
> arch_check_va_in_userspace() it isn't sufficient for the start of 
> the breakpoint region to be a userspace address; the end of the 
> breakpoint region must also be in userspace.

i pointed it out - but yes, this needs to be fixed.

	Ingo

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-20  9:04   ` Frederic Weisbecker
@ 2009-03-21 16:24     ` K.Prasad
  2009-03-21 16:39       ` Steven Rostedt
  0 siblings, 1 reply; 33+ messages in thread
From: K.Prasad @ 2009-03-21 16:24 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Ingo Molnar, Linux Kernel Mailing List, Alan Stern,
	Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 10:04:52AM +0100, Frederic Weisbecker wrote:
> On Fri, Mar 20, 2009 at 05:20:32AM +0530, K.Prasad wrote:
> > This patch adds an ftrace plugin to detect and profile memory access over
> > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > addresses.
> > 
> > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > ---
> >  kernel/trace/Kconfig          |   21 +
> >  kernel/trace/Makefile         |    1 
> >  kernel/trace/trace.h          |   25 +
> >  kernel/trace/trace_ksym.c     |  555 ++++++++++++++++++++++++++++++++++++++++++
> >  kernel/trace/trace_selftest.c |   36 ++
> >  5 files changed, 638 insertions(+)
> > 

> > +
> > +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> > +{
> > +	struct hlist_node *node;
> > +	struct trace_ksym *entry;
> > +
> > +	spin_lock(&ksym_stat_lock);
> 
> 
> I see that can be called from ksym_hbkpt_handler which in turn
> can be called from interrupt context, right?
> You can issue a deadlock if you don't disable interrupts here.
> 
> Thanks,
> Frederic.
> 

ksym_collect_stats<--ksym_hbkpt_handler<--hw_breakpoint_handler<--do_debug
invocation happens with interrupts enabled (IF bit is set). I do find
that a few plugins in kernel/trace enclose the
trace_buffer_lock_reserve()--trace_buffer_unlock_commit() invocation
within interrupt-disabled code. Is that a requirement there?

The potential deadlock scenario you foresee isn't obvious to me. Can you
explain?

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-21 16:24     ` K.Prasad
@ 2009-03-21 16:39       ` Steven Rostedt
  2009-03-23 19:08         ` K.Prasad
  0 siblings, 1 reply; 33+ messages in thread
From: Steven Rostedt @ 2009-03-21 16:39 UTC (permalink / raw)
  To: K.Prasad
  Cc: Frederic Weisbecker, Ingo Molnar, Linux Kernel Mailing List,
	Alan Stern, Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath


On Sat, 21 Mar 2009, K.Prasad wrote:
> > > 
> 
> > > +
> > > +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> > > +{
> > > +	struct hlist_node *node;
> > > +	struct trace_ksym *entry;
> > > +
> > > +	spin_lock(&ksym_stat_lock);
> > 
> > 
> > I see that can be called from ksym_hbkpt_handler which in turn
> > can be called from interrupt context, right?
> > You can issue a deadlock if you don't disable interrupts here.
> > 
> > Thanks,
> > Frederic.
> > 
> 
> ksym_collect_stats<--ksym_hbkpt_handler<--hw_breakpoint_handler<--do_debug
> invocation happens with interrupts enabled (IF bit is set). I do find
> that a few plugins in kernel/trace enclose the
> trace_buffer_lock_reserve()--trace_buffer_unlock_commit() invocation
> within interrupt-disabled code. Is that a requirement there?
> 
> The potential deadlock scenario you foresee isn't obvious to me. Can you
> explain?

Can that lock ever be taken in an interrupt? If not, document that (and 
perhaps add a WARN_ON(in_interrupt()); ). Otherwise you have a possible:

	spin_lock(&ksym_stat_lock);

		===> take interrupt ...

			(from interrupt)
			spin_lock(&ksym_stat_lock); <== deadlock.


-- Steve


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 14:33   ` Alan Stern
  2009-03-20 18:30     ` Ingo Molnar
  2009-03-20 18:32     ` Ingo Molnar
@ 2009-03-21 17:26     ` K.Prasad
  2009-03-21 21:39       ` Alan Stern
  2 siblings, 1 reply; 33+ messages in thread
From: K.Prasad @ 2009-03-21 17:26 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 10:33:26AM -0400, Alan Stern wrote:
> On Fri, 20 Mar 2009, K.Prasad wrote:
> 
> > This patch introduces two new files hw_breakpoint.[ch] which defines the 
> > generic interfaces to use hardware breakpoint infrastructure of the system. 
> 
> Prasad:
> 
> I'm sorry to say this is full of mistakes.  So far I have looked only 
> at patch 01/11, but it's not good.
> 

After you pointed out, I realise that the code in load_debug_registers()
is an overkill and unregister_kernel_hw_breakpoint() has an obvious 
error which should have caught my attention. My next revision should 
fix them.

> > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > + * kernel-space request
> > + */
> > +unsigned int hbkpt_kernel_pos;
> 
> This doesn't make much sense.  All you need to know is which registers
> are in use; all others are available.
> 

As explained by Maneesh earlier, we compact the kernel-space requests
into registers (HB_NUM - 1) to hbkpt_kernel_pos. The kernel-space
requests aren't specific to any given register number too, and so
compaction would be suitable for this case (unlike when implemented for
user-space which might need virtualisation of registers).

> For example, suppose the kernel allocated breakpoints 3, 2, and 1, and
> then deallocated 2.  Then bp 2 would be available for use, even though
> 2 > 1.
> 
> It's also a poor choice of name.  Everywhere else (in my patches,
> anyway) the code refers to hardware breakpoints using the abbreviation
> "hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
> "hbkpt".
> 

I began using 'hbkpt' as a shorter naming convention (the longer one
being hw_breakpoint) without being really conscious of the 'hwbkpt'
usage by you (even some of the previous iterations contained them in
samples/hw_breakpoint and ftrace-plugin).

Well, I will rename my shorter name to 'hwbkpt' for uniformity.

> > +/* An array containing refcount of threads using a given bkpt register */
> > +unsigned int hbkpt_user_max_refcount[HB_NUM];
> 
> Why did you put "max" in the name?  Isn't this just a simple refcount?
> 

Ok. It will be hbkpt_user_refcount[].

> > +/* One higher than the highest counted user-space breakpoint register */
> > +unsigned int hbkpt_user_max;
> 
> Likewise, this variable isn't really needed.  It's just one more than
> the largest i such that hbkpt_user_max_refcount[i] > 0.
> 

It acts like a cache for determining the user-space breakpoint boundary.
It is used for sanity checks and in its absence we will have to compute from
hbkpt_user_max_refcount[] everytime.

> > +/*
> > + * Install the debug register values for a new thread.
> > + */
> > +void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
> > +{
> > +	/* Set the debug register */
> 
> Set _which_ debug register?
> 

Will change it to read:
/* Set all debug registers used by 'tsk' */

> > +	arch_install_thread_hbkpt(tsk);
> > +	last_debugged_task = current;
> > +
> > +	put_cpu_no_resched();
> 
> What's this line doing here?  It looks like something you forgot to
> erase.
> 
> > +}
> > +
> > +/*
> > + * Install the debug register values for just the kernel, no thread.
> > + */
> > +void switch_to_none_hw_breakpoint(void)
> > +{
> > +	arch_install_none();
> > +	put_cpu_no_resched();
> 
> Same for this line.
> 

These are carriages from the previous code. They are still invoked from
the same places (such as flush_thread_hw_breakpoint(),
hw_breakpoint_handler()) and hence I didn't analyse it enough to see if
they were to be removed.

However, having found that preempt_count() is already zero at places where
these are called I think they can be removed.

> > +}
> > +
> > +/*
> > + * Load the debug registers during startup of a CPU.
> > + */
> > +void load_debug_registers(void)
> > +{
> > +	int i;
> > +	unsigned long flags;
> > +
> > +	/* Prevent IPIs for new kernel breakpoint updates */
> > +	local_irq_save(flags);
> > +
> > +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> > +		if (hbkpt_kernel[i])
> > +			on_each_cpu(arch_install_kernel_hbkpt,
> > +				(void *)hbkpt_kernel[i], 0);
> 
> This is completely wrong.  First of all, it's dumb to send multiple
> IPIs (one for each iteration through the loop).  Second, this routine
> shouldn't send any IPIs at all!  It gets invoked when a CPU is
> starting up and wants to load its _own_ debug registers -- not tell
> another CPU to load anything.
> 

As I agreed before, it is an overkill (given the design of
arch_install_kernel_hbkpt()). I will create a new
arch_update_kernel_hbkpt(pos, bp) that will install breakpoints only
on the CPU starting up.

> > +	if (current->thread.dr7)
> > +		arch_install_thread_hbkpt(current);
> > +
> > +	local_irq_restore(flags);
> > +}
> > +
> > +/*
> > + * Erase all the hardware breakpoint info associated with a thread.
> > + *
> > + * If tsk != current then tsk must not be usable (for example, a
> > + * child being cleaned up from a failed fork).
> > + */
> > +void flush_thread_hw_breakpoint(struct task_struct *tsk)
> > +{
> > +	int i;
> > +	struct thread_struct *thread = &(tsk->thread);
> > +
> > +	mutex_lock(&hw_breakpoint_mutex);
> > +
> > +	/* Let the breakpoints know they are being uninstalled */
> 
> This comment looks like a leftover which should have been erased.
> 
> > +/*
> > + * Validate the settings in a hw_breakpoint structure.
> > + */
> > +static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
> > +{
> > +	int ret;
> > +	unsigned int align;
> > +
> > +	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
> > +	if (ret < 0)
> > +		goto err;
> > +
> > +	/* Check that the low-order bits of the address are appropriate
> > +	 * for the alignment implied by len.
> > +	 */
> > +	if (bp->info.address & align)
> > +		return -EINVAL;
> > +
> > +	/* Check that the virtual address is in the proper range */
> > +	if (tsk) {
> > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > +			return -EFAULT;
> > +	} else {
> > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > +			return -EFAULT;
> > +	}
> 
> Roland pointed out that these checks need to take into account the
> length of the breakpoint.  For example, in
> arch_check_va_in_userspace() it isn't sufficient for the start of the
> breakpoint region to be a userspace address; the end of the
> breakpoint region must also be in userspace.
> 

Ok. Will do something like:
return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));

> > + err:
> > +	return ret;
> > +}
> > +
> > +int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
> > +					struct hw_breakpoint *bp)
> > +{
> > +	struct thread_struct *thread = &(tsk->thread);
> > +	int rc;
> > +
> > +	/* Do not overcommit. Fail if kernel has used the hbkpt registers */
> > +	if (pos >= hbkpt_kernel_pos)
> > +		return -ENOSPC;
> 
> In fact you should fail if the debug register is already in use,
> regardless of whether it is being used by a kernel breakpoint.  And you 
> shouldn't check against hbkpt_kernel_pos; you should check whether 
> hbkpt_kernel[pos] is NULL and thread->hbkpt[pos] is NULL.
> 

As explained before, the intended design was like this:

ample layout:
hbkpt_kernel_pos = 1
hbkpt_user_max = 1

---------------------------------------------------------------------
|                |                |                |                |
|       DR3      |       DR2      |       DR1      |       DR0      |
|                |                |                |                |
---------------------------------------------------------------------
^                                                  ^                ^
|                                                  |                |
-----------------kernel-space addresses-------------------user-------

After removing breakpoint in say DR2, compaction occurs.
New layout will be:
hbkpt_kernel_pos = 2
hbkpt_user_max = 1

---------------------------------------------------------------------
|                |                |                |                |
|       DR3      |       DR2      |       DR1      |       DR0      |
|                |                |                |                |
---------------------------------------------------------------------
^                                 ^                ^                ^
|                                 |                |                |
-----------------kernel------------------empty-----------user--------

The above design, in my opinion is intuitive, allows re-use of
uninstalled registers and is simple to implement.

What was missing in the sent patch was the updation of dr7 and dr[pos]
register after compaction. I will add them in the next iteration of the
patch.

> > +
> > +	rc = validate_settings(bp, tsk);
> > +	if (rc)
> > +		return rc;
> > +
> > +	thread->hbkpt[pos] = bp;
> > +	thread->hbkpt_num_installed++;
> > +	hbkpt_user_max_refcount[pos]++;
> > +	/* 'tsk' is the thread that uses max number of hbkpt registers */
> 
> This is a bad comment.  It sounds like it's saying that "tsk" is
> defined as the thread using the maximum number of breakpoints, rather
> than being defined as the thread for which the breakpoint is being
> registered.
> 
> Besides, there's no reason to keep track of which thread uses the max 
> number of breakpoints anyway.  Not to mention the fact that you don't 
> update hbkpt_user_max when its thread exits.
> 

We don't keep track of the thread (in the sense the task_struct) but
'hbkpt_user_max' is used for validating requests and book-keeping. As
Maneesh mentioned before flush_thread_hw_breakpoint() updates
'hbkpt_user_max'.

I can change it to read like the one below if it sounds better to you.

/* 
 * 'tsk' uses more number of registers than 'hbkpt_user_max'. Update
 * the same.
 */

> > +	if (hbkpt_user_max < thread->hbkpt_num_installed)
> > +		hbkpt_user_max++;
> 
> At this point I got tired of looking, but it seems obvious that the new 
> patch series needs a bunch of improvements.
> 
> Alan Stern
>

As mentioned before the next iteration would contain the changes I've
discussed above.

Thanks,
K.Prasad
 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 18:30     ` Ingo Molnar
@ 2009-03-21 17:32       ` K.Prasad
  0 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-21 17:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 07:30:58PM +0100, Ingo Molnar wrote:
> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > > + * kernel-space request
> > > + */
> > > +unsigned int hbkpt_kernel_pos;
> > 
> > This doesn't make much sense.  All you need to know is which 
> > registers are in use; all others are available.
> > 
> > For example, suppose the kernel allocated breakpoints 3, 2, and 1, 
> > and then deallocated 2.  Then bp 2 would be available for use, 
> > even though 2 > 1.
> 
> it's a high/low watermark mechanism. Yes, it's not an allocator that 
> can allocate into a debug registrs 'hole', but it is a simple one 
> that matches current hardware breakpoint usages and enables the 
> kernel to utilize them as well - and keeps all the code simple.
> 
> 	Ingo

I've explained the design here: http://lkml.org/lkml/2009/3/21/169 in a
and is slightly different from what you've explained above.

It involves shifting of kernel-space registers by one-level if a
kernel-register is uninstalled. We compact the kernel-space registers
since a)not to leave a 'hole' thereby wasting a register forever during
runtime b)kernel-space requests are not specific to a register number
and can be moved at will (unlike user-space requests).

Hope that the design is acceptable and the resultant code - simple.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-21 17:26     ` K.Prasad
@ 2009-03-21 21:39       ` Alan Stern
  2009-03-23 19:03         ` K.Prasad
  0 siblings, 1 reply; 33+ messages in thread
From: Alan Stern @ 2009-03-21 21:39 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Sat, 21 Mar 2009, K.Prasad wrote:

> > > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > > + * kernel-space request
> > > + */
> > > +unsigned int hbkpt_kernel_pos;
> > 
> > This doesn't make much sense.  All you need to know is which registers
> > are in use; all others are available.
> > 
> 
> As explained by Maneesh earlier, we compact the kernel-space requests
> into registers (HB_NUM - 1) to hbkpt_kernel_pos. The kernel-space
> requests aren't specific to any given register number too, and so
> compaction would be suitable for this case (unlike when implemented for
> user-space which might need virtualisation of registers).

Okay, that makes sense.  Perhaps you could add a short comment here
explaining that the register allocations get compacted when a kernel
breakpoint is unregistered, so they will always be contiguous.

> > It's also a poor choice of name.  Everywhere else (in my patches,
> > anyway) the code refers to hardware breakpoints using the abbreviation
> > "hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
> > "hbkpt".
> > 
> 
> I began using 'hbkpt' as a shorter naming convention (the longer one
> being hw_breakpoint) without being really conscious of the 'hwbkpt'
> usage by you (even some of the previous iterations contained them in
> samples/hw_breakpoint and ftrace-plugin).
> 
> Well, I will rename my shorter name to 'hwbkpt' for uniformity.

My patch never used "hwbkpt".  Besides "hw_breakpoint", it used only 
"bp".  On going back and checking, I found that it didn't even use 
"hwbp".  Some other abbreviations it did use were "kbp" for kernel 
breakpoint, "chbi" for per-CPU hardware breakpoint info, and "thbi" for 
per-thread hardware breakpoint info.

If you're looking for a good short name, and if you want to keep 
hardware breakpoints distinct from software breakpoints, I suggest 
"hbp" instead of "hbkpt".  But it's up to you, and it's worth noticing 
that the code already contains lots of variables named just "bp".

> > > +/* One higher than the highest counted user-space breakpoint register */
> > > +unsigned int hbkpt_user_max;
> > 
> > Likewise, this variable isn't really needed.  It's just one more than
> > the largest i such that hbkpt_user_max_refcount[i] > 0.
> > 
> 
> It acts like a cache for determining the user-space breakpoint boundary.
> It is used for sanity checks and in its absence we will have to compute from
> hbkpt_user_max_refcount[] everytime.

That's right.  Isn't it simpler to check

	kernel_pos > 0 && hbkpt_user_refcount[kernel_pos - 1] == 0

than to check

	kernel_pos - 1 >= hbkpt_user_max

_and_ to keep hbkpt_user_max set to the correct value at all times?

> > > +/*
> > > + * Load the debug registers during startup of a CPU.
> > > + */
> > > +void load_debug_registers(void)
> > > +{
> > > +	int i;
> > > +	unsigned long flags;
> > > +
> > > +	/* Prevent IPIs for new kernel breakpoint updates */
> > > +	local_irq_save(flags);
> > > +
> > > +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> > > +		if (hbkpt_kernel[i])
> > > +			on_each_cpu(arch_install_kernel_hbkpt,
> > > +				(void *)hbkpt_kernel[i], 0);
> > 
> > This is completely wrong.  First of all, it's dumb to send multiple
> > IPIs (one for each iteration through the loop).  Second, this routine
> > shouldn't send any IPIs at all!  It gets invoked when a CPU is
> > starting up and wants to load its _own_ debug registers -- not tell
> > another CPU to load anything.
> > 
> 
> As I agreed before, it is an overkill (given the design of
> arch_install_kernel_hbkpt()). I will create a new
> arch_update_kernel_hbkpt(pos, bp) that will install breakpoints only
> on the CPU starting up.

Doesn't arch_install_kernel_hbkpt() already install breakpoints
on only the current CPU?  So why do you need a new function?

> > > +	/* Check that the virtual address is in the proper range */
> > > +	if (tsk) {
> > > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > > +			return -EFAULT;
> > > +	} else {
> > > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > > +			return -EFAULT;
> > > +	}
> > 
> > Roland pointed out that these checks need to take into account the
> > length of the breakpoint.  For example, in
> > arch_check_va_in_userspace() it isn't sufficient for the start of the
> > breakpoint region to be a userspace address; the end of the
> > breakpoint region must also be in userspace.
> > 
> 
> Ok. Will do something like:
> return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));

What is the purpose of word_size here?  The breakpoint length should be 
specified in bytes, not words.

Don't forget that that in arch_check_va_in_kernelspace() you need to 
check both for values that are too low and values that are too high 
(they overflow and wrap around back to a user address).

> We don't keep track of the thread (in the sense the task_struct) but
> 'hbkpt_user_max' is used for validating requests and book-keeping. As
> Maneesh mentioned before flush_thread_hw_breakpoint() updates
> 'hbkpt_user_max'.
> 
> I can change it to read like the one below if it sounds better to you.
> 
> /* 
>  * 'tsk' uses more number of registers than 'hbkpt_user_max'. Update
>  * the same.
>  */

My preference is simply to eliminate hbkpt_user_max entirely.

Alan Stern


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-21 21:39       ` Alan Stern
@ 2009-03-23 19:03         ` K.Prasad
  2009-03-23 19:21           ` Alan Stern
  0 siblings, 1 reply; 33+ messages in thread
From: K.Prasad @ 2009-03-23 19:03 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Sat, Mar 21, 2009 at 05:39:59PM -0400, Alan Stern wrote:
> On Sat, 21 Mar 2009, K.Prasad wrote:
> 
> > > > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > > > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > > > + * kernel-space request
> > > > + */
> > > > +unsigned int hbkpt_kernel_pos;
> > > 
> > > This doesn't make much sense.  All you need to know is which registers
> > > are in use; all others are available.
> > > 
> > 
> > As explained by Maneesh earlier, we compact the kernel-space requests
> > into registers (HB_NUM - 1) to hbkpt_kernel_pos. The kernel-space
> > requests aren't specific to any given register number too, and so
> > compaction would be suitable for this case (unlike when implemented for
> > user-space which might need virtualisation of registers).
> 
> Okay, that makes sense.  Perhaps you could add a short comment here
> explaining that the register allocations get compacted when a kernel
> breakpoint is unregistered, so they will always be contiguous.
> 
> > > It's also a poor choice of name.  Everywhere else (in my patches,
> > > anyway) the code refers to hardware breakpoints using the abbreviation
> > > "hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
> > > "hbkpt".
> > > 
> > 
> > I began using 'hbkpt' as a shorter naming convention (the longer one
> > being hw_breakpoint) without being really conscious of the 'hwbkpt'
> > usage by you (even some of the previous iterations contained them in
> > samples/hw_breakpoint and ftrace-plugin).
> > 
> > Well, I will rename my shorter name to 'hwbkpt' for uniformity.
> 
> My patch never used "hwbkpt".  Besides "hw_breakpoint", it used only 
> "bp".  On going back and checking, I found that it didn't even use 
> "hwbp".  Some other abbreviations it did use were "kbp" for kernel 
> breakpoint, "chbi" for per-CPU hardware breakpoint info, and "thbi" for 
> per-thread hardware breakpoint info.
> 
> If you're looking for a good short name, and if you want to keep 
> hardware breakpoints distinct from software breakpoints, I suggest 
> "hbp" instead of "hbkpt".  But it's up to you, and it's worth noticing 
> that the code already contains lots of variables named just "bp".
> 

I am renaming all 'hbkpt' strings to 'hbp'.

> > > > +/* One higher than the highest counted user-space breakpoint register */
> > > > +unsigned int hbkpt_user_max;
> > > 
> > > Likewise, this variable isn't really needed.  It's just one more than
> > > the largest i such that hbkpt_user_max_refcount[i] > 0.
> > > 
> > 
> > It acts like a cache for determining the user-space breakpoint boundary.
> > It is used for sanity checks and in its absence we will have to compute from
> > hbkpt_user_max_refcount[] everytime.
> 
> That's right.  Isn't it simpler to check
> 
> 	kernel_pos > 0 && hbkpt_user_refcount[kernel_pos - 1] == 0
> 
> than to check
> 
> 	kernel_pos - 1 >= hbkpt_user_max
> 
> _and_ to keep hbkpt_user_max set to the correct value at all times?
>

Unfortunately the lines of code required to maintain the variable comes
close to the amount of lines it would potentially save. I will change to
code to compute it from hbkpt_user_refcount[] everytime.
 
> > > > +/*
> > > > + * Load the debug registers during startup of a CPU.
> > > > + */
> > > > +void load_debug_registers(void)
> > > > +{
> > > > +	int i;
> > > > +	unsigned long flags;
> > > > +
> > > > +	/* Prevent IPIs for new kernel breakpoint updates */
> > > > +	local_irq_save(flags);
> > > > +
> > > > +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> > > > +		if (hbkpt_kernel[i])
> > > > +			on_each_cpu(arch_install_kernel_hbkpt,
> > > > +				(void *)hbkpt_kernel[i], 0);
> > > 
> > > This is completely wrong.  First of all, it's dumb to send multiple
> > > IPIs (one for each iteration through the loop).  Second, this routine
> > > shouldn't send any IPIs at all!  It gets invoked when a CPU is
> > > starting up and wants to load its _own_ debug registers -- not tell
> > > another CPU to load anything.
> > > 
> > 
> > As I agreed before, it is an overkill (given the design of
> > arch_install_kernel_hbkpt()). I will create a new
> > arch_update_kernel_hbkpt(pos, bp) that will install breakpoints only
> > on the CPU starting up.
> 
> Doesn't arch_install_kernel_hbkpt() already install breakpoints
> on only the current CPU?  So why do you need a new function?
>

There will be a few more changes to arch_install_kernel_hbkpt() along
with this. Please find the changes in the ensuing patchset.
 
> > > > +	/* Check that the virtual address is in the proper range */
> > > > +	if (tsk) {
> > > > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > > > +			return -EFAULT;
> > > > +	} else {
> > > > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > > > +			return -EFAULT;
> > > > +	}
> > > 
> > > Roland pointed out that these checks need to take into account the
> > > length of the breakpoint.  For example, in
> > > arch_check_va_in_userspace() it isn't sufficient for the start of the
> > > breakpoint region to be a userspace address; the end of the
> > > breakpoint region must also be in userspace.
> > > 
> > 
> > Ok. Will do something like:
> > return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));
> 
> What is the purpose of word_size here?  The breakpoint length should be 
> specified in bytes, not words.
> 
> Don't forget that that in arch_check_va_in_kernelspace() you need to 
> check both for values that are too low and values that are too high 
> (they overflow and wrap around back to a user address).
> 

While I understand the user-space checking using the length of the HW
Breakpoint, I don't really see how I can check for an upper-bound for
kernel-space virtual addresses. Most usage in the kernel only checks for
the address >= TASK_SIZE (while they check for add + len if the length
of the memory is known). I will be glad to have any suggestions in this
regard.

> > We don't keep track of the thread (in the sense the task_struct) but
> > 'hbkpt_user_max' is used for validating requests and book-keeping. As
> > Maneesh mentioned before flush_thread_hw_breakpoint() updates
> > 'hbkpt_user_max'.
> > 
> > I can change it to read like the one below if it sounds better to you.
> > 
> > /* 
> >  * 'tsk' uses more number of registers than 'hbkpt_user_max'. Update
> >  * the same.
> >  */
> 
> My preference is simply to eliminate hbkpt_user_max entirely.
> 
> Alan Stern
>

Done.

Thanks,
K.Prasad
 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-21 16:39       ` Steven Rostedt
@ 2009-03-23 19:08         ` K.Prasad
  0 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-23 19:08 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, Ingo Molnar, Linux Kernel Mailing List,
	Alan Stern, Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath

On Sat, Mar 21, 2009 at 12:39:08PM -0400, Steven Rostedt wrote:
> 
> On Sat, 21 Mar 2009, K.Prasad wrote:
> > > > 
> > 
> > > > +
> > > > +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> > > > +{
> > > > +	struct hlist_node *node;
> > > > +	struct trace_ksym *entry;
> > > > +
> > > > +	spin_lock(&ksym_stat_lock);
> > > 
> > > 
> > > I see that can be called from ksym_hbkpt_handler which in turn
> > > can be called from interrupt context, right?
> > > You can issue a deadlock if you don't disable interrupts here.
> > > 
> > > Thanks,
> > > Frederic.
> > > 
> > 
> > ksym_collect_stats<--ksym_hbkpt_handler<--hw_breakpoint_handler<--do_debug
> > invocation happens with interrupts enabled (IF bit is set). I do find
> > that a few plugins in kernel/trace enclose the
> > trace_buffer_lock_reserve()--trace_buffer_unlock_commit() invocation
> > within interrupt-disabled code. Is that a requirement there?
> > 
> > The potential deadlock scenario you foresee isn't obvious to me. Can you
> > explain?
> 
> Can that lock ever be taken in an interrupt? If not, document that (and 
> perhaps add a WARN_ON(in_interrupt()); ). Otherwise you have a possible:
> 
> 	spin_lock(&ksym_stat_lock);
> 
> 		===> take interrupt ...
> 
> 			(from interrupt)
> 			spin_lock(&ksym_stat_lock); <== deadlock.
> 
> 
> -- Steve
>

Given that the function pointed by the trigger() routine is invoked with
breakpoints disabled on that CPU, I don't think we'd enter into a loop
a cyclic dependancy as above.

On the other hand, my observation w.r.t. IF bit being set was misplaced
in the sense that it corresponded to the saved stack and not when inside
the breakpoint handler in which case interrupts were disabled.

So we are safe in either ways.

Thanks,
K.Prasad
 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-23 19:03         ` K.Prasad
@ 2009-03-23 19:21           ` Alan Stern
  2009-03-23 20:42             ` K.Prasad
  0 siblings, 1 reply; 33+ messages in thread
From: Alan Stern @ 2009-03-23 19:21 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Tue, 24 Mar 2009, K.Prasad wrote:

> > > Ok. Will do something like:
> > > return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));
> > 
> > What is the purpose of word_size here?  The breakpoint length should be 
> > specified in bytes, not words.
> > 
> > Don't forget that that in arch_check_va_in_kernelspace() you need to 
> > check both for values that are too low and values that are too high 
> > (they overflow and wrap around back to a user address).
> > 
> 
> While I understand the user-space checking using the length of the HW
> Breakpoint, I don't really see how I can check for an upper-bound for
> kernel-space virtual addresses. Most usage in the kernel only checks for
> the address >= TASK_SIZE (while they check for add + len if the length
> of the memory is known). I will be glad to have any suggestions in this
> regard.

Isn't that exactly the check you need to implement?

	addr >= TASK_SIZE && (addr + len) >= TASK_SIZE,

or perhaps better,

	addr >= TASK_SIZE && (addr + len) >= addr.

In this case you _do_ know the length of the breakpoint.

Alan Stern


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-23 19:21           ` Alan Stern
@ 2009-03-23 20:42             ` K.Prasad
  2009-03-23 21:20               ` Alan Stern
  0 siblings, 1 reply; 33+ messages in thread
From: K.Prasad @ 2009-03-23 20:42 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Mon, Mar 23, 2009 at 03:21:49PM -0400, Alan Stern wrote:
> On Tue, 24 Mar 2009, K.Prasad wrote:
> 
> > > > Ok. Will do something like:
> > > > return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));
> > > 
> > > What is the purpose of word_size here?  The breakpoint length should be 
> > > specified in bytes, not words.
> > > 
> > > Don't forget that that in arch_check_va_in_kernelspace() you need to 
> > > check both for values that are too low and values that are too high 
> > > (they overflow and wrap around back to a user address).
> > > 
> > 
> > While I understand the user-space checking using the length of the HW
> > Breakpoint, I don't really see how I can check for an upper-bound for
> > kernel-space virtual addresses. Most usage in the kernel only checks for
> > the address >= TASK_SIZE (while they check for add + len if the length
> > of the memory is known). I will be glad to have any suggestions in this
> > regard.
> 
> Isn't that exactly the check you need to implement?
> 
> 	addr >= TASK_SIZE && (addr + len) >= TASK_SIZE,
> 
> or perhaps better,
> 
> 	addr >= TASK_SIZE && (addr + len) >= addr.
> 
> In this case you _do_ know the length of the breakpoint.
> 
> Alan Stern
>

Aren't we just checking if len is a positive number through the above
checks? The validation checks in the patchset should take care of
negative lengths. Or am I missing something?

I thought you wanted the code to check for an upper sane limit for addr
in kernel-space, say something like this:

TASK_SIZE <= addr <= (Upper limit for Kernel Virtual Address)

When I referred to 'len' in my previous mail, it meant the length
of the kernel virtual memory area (which can be used to find the upper
bound).

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-23 20:42             ` K.Prasad
@ 2009-03-23 21:20               ` Alan Stern
  0 siblings, 0 replies; 33+ messages in thread
From: Alan Stern @ 2009-03-23 21:20 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Tue, 24 Mar 2009, K.Prasad wrote:

> > Isn't that exactly the check you need to implement?
> > 
> > 	addr >= TASK_SIZE && (addr + len) >= TASK_SIZE,
> > 
> > or perhaps better,
> > 
> > 	addr >= TASK_SIZE && (addr + len) >= addr.
> > 
> > In this case you _do_ know the length of the breakpoint.
> > 
> > Alan Stern
> >
> 
> Aren't we just checking if len is a positive number through the above
> checks? The validation checks in the patchset should take care of
> negative lengths. Or am I missing something?

Well, 0x60000000 is a positive number, and 0xd0000000 is >= TASK_SIZE.  
But their sum is 0x30000000, which lies in userspace.  In other words, 
you are missing the possibility that the addition might overflow and 
wrap around.

> I thought you wanted the code to check for an upper sane limit for addr
> in kernel-space, say something like this:
> 
> TASK_SIZE <= addr <= (Upper limit for Kernel Virtual Address)

No, the test should be

    TASK_SIZE <= addr <= addr + (len-1) <= (Upper limit for Kernel VA)

By the way, is TASK_SIZE the correct lower bound for kernel virtual
addresses on x86-64?

> When I referred to 'len' in my previous mail, it meant the length
> of the kernel virtual memory area (which can be used to find the upper
> bound).

Oh, sorry, I misunderstood.  Isn't that limit always 0xffffffff on 
x86-32?

Alan Stern


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
       [not found] <20090324152028.754123712@K.Prasad>
@ 2009-03-24 15:24 ` K.Prasad
  0 siblings, 0 replies; 33+ messages in thread
From: K.Prasad @ 2009-03-24 15:24 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, maneesh, Roland McGrath, Steven Rostedt,
	K.Prasad

[-- Attachment #1: 1 --]
[-- Type: text/plain, Size: 16537 bytes --]

This patch introduces two new files hw_breakpoint.[ch] which defines the 
generic interfaces to use hardware breakpoint infrastructure of the system. 

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/Kconfig                        |    4 
 include/asm-generic/hw_breakpoint.h |  138 +++++++++++++
 kernel/Makefile                     |    1 
 kernel/hw_breakpoint.c              |  367 ++++++++++++++++++++++++++++++++++++
 4 files changed, 510 insertions(+)

Index: linux-2.6-tip/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/kernel/hw_breakpoint.c
@@ -0,0 +1,367 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ *
+ * This file contains the arch-independent routines.  It is not meant
+ * to be compiled as a standalone source file; rather it should be
+ * #include'd by the arch-specific implementation.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/*
+ * Mutex that protects all (un)register operations over kernel/user-space
+ * breakpoint requests
+ */
+DEFINE_MUTEX(hw_breakpoint_mutex);
+
+/* Array of kernel-space breakpoint structures */
+struct hw_breakpoint *hbp_kernel[HB_NUM];
+
+/*
+ * Kernel breakpoints grow downwards, starting from HB_NUM
+ * 'hbp_kernel_pos' denotes lowest numbered breakpoint register occupied for
+ * kernel-space request
+ */
+unsigned int hbp_kernel_pos;
+
+/*
+ * An array containing refcount of threads using a given bkpt register
+ * Accesses are synchronised by acquiring hw_breakpoint_mutex
+ */
+unsigned int hbp_user_refcount[HB_NUM];
+
+struct task_struct *last_debugged_task;
+
+/*
+ * Install the debug register values for a new thread.
+ */
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	/* Set the debug register */
+	arch_install_thread_hw_breakpoint(tsk);
+	last_debugged_task = current;
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void switch_to_none_hw_breakpoint(void)
+{
+	arch_install_none();
+}
+
+/*
+ * Load the debug registers during startup of a CPU.
+ */
+void load_debug_registers(void)
+{
+	unsigned long flags;
+	struct task_struct *tsk = current;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Prevent IPIs for new kernel breakpoint updates */
+	local_irq_save(flags);
+	arch_load_debug_registers();
+	local_irq_restore(flags);
+
+	if (test_tsk_thread_flag(tsk, TIF_DEBUG))
+		switch_to_thread_hw_breakpoint(tsk);
+
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/*
+ * Erase all the hardware breakpoint info associated with a thread.
+ *
+ * If tsk != current then tsk must not be usable (for example, a
+ * child being cleaned up from a failed fork).
+ */
+void flush_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* The thread no longer has any breakpoints associated with it */
+	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	for (i = 0; i < HB_NUM; i++) {
+		if (thread->hbp[i]) {
+			hbp_user_refcount[i]--;
+			kfree(thread->hbp[i]);
+			thread->hbp[i] = NULL;
+		}
+	}
+	thread->hbp_num_installed = 0;
+
+	/* Actually uninstall the breakpoints if necessary */
+	if (tsk == current)
+		switch_to_none_hw_breakpoint();
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/*
+ * Copy the hardware breakpoint info from a thread to its cloned child.
+ */
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags)
+{
+	/*
+	 * We will assume that breakpoint settings are not inherited
+	 * and the child starts out with no debug registers set.
+	 * But what about CLONE_PTRACE?
+	 */
+	clear_tsk_thread_flag(child, TIF_DEBUG);
+	return 0;
+}
+
+/*
+ * Validate the settings in a hw_breakpoint structure.
+ */
+static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
+{
+	int ret;
+	unsigned int align;
+
+	if (!bp)
+		return -EINVAL;
+
+	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
+	if (ret < 0)
+		goto err;
+
+	/*
+	 * Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (bp->info.address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(bp->info.address, bp->info.len))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(bp->info.address,
+								bp->info.len))
+			return -EFAULT;
+	}
+ err:
+	return ret;
+}
+
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int rc;
+
+	/* Do not overcommit. Fail if kernel has used the hbp registers */
+	if (pos >= hbp_kernel_pos)
+		return -ENOSPC;
+
+	rc = validate_settings(bp, tsk);
+	if (rc)
+		return rc;
+
+	thread->hbp[pos] = bp;
+	thread->hbp_num_installed++;
+	hbp_user_refcount[pos]++;
+
+	arch_register_user_hw_breakpoint(pos, bp, tsk);
+
+	/*
+	 * Does it need to be installed right now?
+	 * Otherwise it will get installed the next time tsk runs
+	 */
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	return rc;
+}
+
+/*
+ * Modify the address of a hbp register already in use by the task
+ * Do not invoke this in-lieu of a __unregister_user_hw_breakpoint()
+ */
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	int rc;
+	struct thread_struct *thread = &(tsk->thread);
+
+	if ((pos >= hbp_kernel_pos) || (validate_settings(bp, tsk)))
+		return -EINVAL;
+
+	thread->hbp[pos] = bp;
+
+	/*
+	 * 'pos' must be that of a hbp register already used by 'tsk'
+	 * Otherwise arch_modify_user_hw_breakpoint() will fail
+	 */
+	rc = arch_modify_user_hw_breakpoint(pos, bp, tsk);
+	if (rc)
+		return rc;
+
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	return 0;
+}
+
+/*
+ * Actual implementation of unregister_user_hw_breakpoint.
+ */
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+						struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if (!bp)
+		return;
+
+	hbp_user_refcount[pos]--;
+	thread->hbp_num_installed--;
+
+	arch_unregister_user_hw_breakpoint(pos, bp, tsk);
+
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	kfree(tsk->thread.hbp[pos]);
+	tsk->thread.hbp[pos] = NULL;
+}
+
+/**
+ * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to register
+ *
+ * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
+ * @bp->triggered must be set properly before invocation
+ *
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int rc;
+
+	rc = validate_settings(bp, NULL);
+	if (rc)
+		return rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	rc = -EINVAL;
+	/* Check if we are over-committing */
+	if ((hbp_kernel_pos > 0) && (!hbp_user_refcount[hbp_kernel_pos-1])) {
+		hbp_kernel_pos--;
+		hbp_kernel[hbp_kernel_pos] = bp;
+		arch_register_kernel_hw_breakpoint(hbp_kernel_pos);
+		rc = 0;
+	}
+
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
+
+/**
+ * unregister_kernel_hw_breakpoint - unregister a HW breakpoint for kernel space
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int i, j;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Find the 'bp' in our list of breakpoints for kernel */
+	for (i = hbp_kernel_pos; i < HB_NUM; i++)
+		if (bp == hbp_kernel[i])
+			break;
+	/*
+	 * We'll shift the breakpoints one-level above to compact if
+	 * unregistration creates a hole
+	 */
+	if (i > hbp_kernel_pos)
+		for (j = i; j == hbp_kernel_pos; j--)
+			hbp_kernel[j] = hbp_kernel[j-1];
+
+	/*
+	 * Delete the last kernel breakpoint entry after compaction and update
+	 * the pointer to the lowest numbered kernel entry after updating its
+	 * corresponding value in kdr7
+	 */
+	hbp_kernel[hbp_kernel_pos] = 0;
+	arch_unregister_kernel_hw_breakpoint();
+	hbp_kernel_pos++;
+
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
+/*
+ * Handle debug exception notifications.
+ */
+static int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+
+	return hw_breakpoint_handler(data);
+}
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+	.notifier_call = hw_breakpoint_exceptions_notify,
+	/* we need to be notified first */
+	.priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+	int i;
+
+	hbp_kernel_pos = HB_NUM;
+	for (i = 0; i < HB_NUM; i++)
+		hbp_user_refcount[i] = 0;
+	load_debug_registers();
+
+	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+
+core_initcall(init_hw_breakpoint);
Index: linux-2.6-tip/kernel/Makefile
===================================================================
--- linux-2.6-tip.orig/kernel/Makefile
+++ linux-2.6-tip/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6-tip/include/asm-generic/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip/include/asm-generic/hw_breakpoint.h
@@ -0,0 +1,138 @@
+#ifndef	_ASM_GENERIC_HW_BREAKPOINT_H
+#define	_ASM_GENERIC_HW_BREAKPOINT_H
+
+#ifndef __ARCH_HW_BREAKPOINT_H
+#error "Please don't include this file directly"
+#endif
+
+#ifdef	__KERNEL__
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+
+/**
+ * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
+ * @triggered: callback invoked after target address access
+ * @info: arch-specific breakpoint info (address, length, and type)
+ *
+ * %hw_breakpoint structures are the kernel's way of representing
+ * hardware breakpoints.  These are data breakpoints
+ * (also known as "watchpoints", triggered on data access), and the breakpoint's
+ * target address can be located in either kernel space or user space.
+ *
+ * The breakpoint's address, length, and type are highly
+ * architecture-specific.  The values are encoded in the @info field; you
+ * specify them when registering the breakpoint.  To examine the encoded
+ * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
+ * below.
+ *
+ * The address is specified as a regular kernel pointer (for kernel-space
+ * breakponts) or as an %__user pointer (for user-space breakpoints).
+ * With register_user_hw_breakpoint(), the address must refer to a
+ * location in user space.  The breakpoint will be active only while the
+ * requested task is running.  Conversely with
+ * register_kernel_hw_breakpoint(), the address must refer to a location
+ * in kernel space, and the breakpoint will be active on all CPUs
+ * regardless of the current task.
+ *
+ * The length is the breakpoint's extent in bytes, which is subject to
+ * certain limitations.  include/asm/hw_breakpoint.h contains macros
+ * defining the available lengths for a specific architecture.  Note that
+ * the address's alignment must match the length.  The breakpoint will
+ * catch accesses to any byte in the range from address to address +
+ * (length - 1).
+ *
+ * The breakpoint's type indicates the sort of access that will cause it
+ * to trigger.  Possible values may include:
+ *
+ * 	%HW_BREAKPOINT_RW (triggered on read or write access),
+ * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
+ * 	%HW_BREAKPOINT_READ (triggered on read access).
+ *
+ * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
+ * possibilities are available on all architectures.  Execute breakpoints
+ * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
+ *
+ * When a breakpoint gets hit, the @triggered callback is
+ * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
+ * processor registers.
+ * Data breakpoints occur after the memory access has taken place.
+ * Breakpoints are disabled during execution @triggered, to avoid
+ * recursive traps and allow unhindered access to breakpointed memory.
+ *
+ * This sample code sets a breakpoint on pid_max and registers a callback
+ * function for writes to that variable.  Note that it is not portable
+ * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * #include <asm/hw_breakpoint.h>
+ *
+ * struct hw_breakpoint my_bp;
+ *
+ * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
+ * 	dump_stack();
+ *  	.......<more debugging output>........
+ * }
+ *
+ * static struct hw_breakpoint my_bp;
+ *
+ * static int init_module(void)
+ * {
+ *	..........<do anything>............
+ *	my_bp.info.type = HW_BREAKPOINT_WRITE;
+ *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
+ *
+ *	my_bp.installed = (void *)my_bp_installed;
+ *
+ *	rc = register_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ *	..........<do anything>............
+ *	unregister_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * ----------------------------------------------------------------------
+ */
+struct hw_breakpoint {
+	void (*triggered)(struct hw_breakpoint *, struct pt_regs *);
+	struct arch_hw_breakpoint info;
+};
+
+/*
+ * len and type values are defined in include/asm/hw_breakpoint.h.
+ * Available values vary according to the architecture.  On i386 the
+ * possibilities are:
+ *
+ *	HW_BREAKPOINT_LEN_1
+ *	HW_BREAKPOINT_LEN_2
+ *	HW_BREAKPOINT_LEN_4
+ *	HW_BREAKPOINT_LEN_EXECUTE
+ *	HW_BREAKPOINT_RW
+ *	HW_BREAKPOINT_READ
+ *	HW_BREAKPOINT_EXECUTE
+ *
+ * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
+ * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
+ * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
+ */
+
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void switch_to_none_hw_breakpoint(void);
+
+extern unsigned int hbp_kernel_pos;
+extern struct task_struct *last_debugged_task;
+extern struct mutex hw_breakpoint_mutex;
+
+#endif	/* __KERNEL__ */
+#endif	/* _ASM_GENERIC_HW_BREAKPOINT_H */
Index: linux-2.6-tip/arch/Kconfig
===================================================================
--- linux-2.6-tip.orig/arch/Kconfig
+++ linux-2.6-tip/arch/Kconfig
@@ -109,3 +109,7 @@ config HAVE_CLK
 
 config HAVE_DMA_API_DEBUG
 	bool
+
+config HAVE_HW_BREAKPOINT
+	bool
+


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-11 12:57         ` K.Prasad
@ 2009-03-11 13:35           ` Ingo Molnar
  0 siblings, 0 replies; 33+ messages in thread
From: Ingo Molnar @ 2009-03-11 13:35 UTC (permalink / raw)
  To: K.Prasad
  Cc: Alan Stern, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* K.Prasad <prasad@linux.vnet.ibm.com> wrote:

> For the benefit of continuing discussion on this topic, here's 
> an extract from an old mail 
> (http://lkml.org/lkml/2007/2/5/465) from Roland, explaining 
> the need for prioritisation of requests. It must have been 
> utrace as a potential user that made him suggest this.
> 
> "I am all in favor of a facility to manage shared use of the 
> debug registers, such as your debugreg.h additions.  I just 
> think it needs to be a little more flexible.  An unobtrusive 
> kernel facility has to get out of the way when user-mode 
> decides to use all its debug registers.  It's not immediately 
> important what it's going to about it when contention arises, 
> but there has to be a way for the user-mode facilities to say 
> they need to allocate debugregs with priority and evict other 
> squatters.  So, something like code allocating a debugreg can 
> supply a callback that's made when its allocation has to taken 
> by something with higher priority.
> 
> Even after utrace, there will always be the possibility of a 
> traditional uncoordinated user of the raw debug registers, if 
> nothing else ptrace compatibility will always be there for old 
> users.  So anything new and fancy needs to be prepared to back 
> out of the way gracefully.  In the case of kwatch, it can just 
> have a handler function given by the caller to start with.  
> It's OK if individual callers can specially declare "I am not 
> well-behaved" and eat debugregs so that well-behaved 
> high-priority users like ptrace just have to lose (breaking 
> compatibility).  But no well-behaved caller of kwatch will do 
> that.
> 
> I certainly intend for later features based on utrace to 
> include higher-level treatment of watchpoints so that user 
> debugging facilities can also become responsive to debugreg 
> allocation pressure.  (Eventually, the user facilities might 
> have easier ways of falling back to other methods and getting 
> out of the way of kernel debugreg consumers, than can be done 
> for the kernel-mode-tracing facilities.)  To that end, I'd 
> like to see a clear and robust interface for debugreg sharing, 
> below the level of kwatch."

This argument ignores the reality of debug registers: 
overcommitted usage of them causes silent failures and
unobvious behavior.

I think the simple reservation scheme i outlined in the
previous mail is the minimum amount of complexity that
still gets kernel-space hw-breakpoints going robustly.
If we add anything more fancy we want it based on actual
need and desire.

	Ingo

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-10 14:50       ` Ingo Molnar
@ 2009-03-11 12:57         ` K.Prasad
  2009-03-11 13:35           ` Ingo Molnar
  0 siblings, 1 reply; 33+ messages in thread
From: K.Prasad @ 2009-03-11 12:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, Mar 10, 2009 at 03:50:36PM +0100, Ingo Molnar wrote:
> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > On Tue, 10 Mar 2009, Ingo Molnar wrote:
> > 
> > > * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> > > 
> > > 
> > > > +static u8			tprio[HB_NUM];	/* Thread bp max priorities */
> > > > +LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
> > > > +static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
> > > > +DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
> > 
> > If nobody minds, I'll answer some of these questions on 
> > Prasad's behalf because they address parts of the code that 
> > were written before he took over the project.
> > 
> > > hm, why do we need the whole 'priority' mechanism? It seems 
> > > very over-designed to me.
> > 
> > This was done at Roland McGrath's express request.  We should 
> > see what he has to say about it.
> > 
> > > The likelyhood of both user-space and kernel-space to use 
> > > hw-breakpoints is very low to begin with. And if they use 
> > > them, the likelyhood of there being more than 4 debugregs 
> > > required in the same context is even lower.
> > 
> > Not all architectures have 4 debug registers.  Most have only 
> > one.
> >
> > > If that happens we shouldnt try to be too smart about them - 
> > > just override user-space ones with kernel space ones and 
> > > that's it. No explicit priorities are needed.
> > 
> > Roland really did not want it done this way.
> 
> Well i guess i'll have to wait for Roland's reply then.
> 
> 	Ingo

For the benefit of continuing discussion on this topic, here's an
extract from an old mail (http://lkml.org/lkml/2007/2/5/465) from
Roland, explaining the need for prioritisation of requests. It must have
been utrace as a potential user that made him suggest this.

"I am all in favor of a facility to manage shared use of the debug
registers, such as your debugreg.h additions.  I just think it needs to be
a little more flexible.  An unobtrusive kernel facility has to get out of
the way when user-mode decides to use all its debug registers.  It's not
immediately important what it's going to about it when contention arises,
but there has to be a way for the user-mode facilities to say they need to
allocate debugregs with priority and evict other squatters.  So, something
like code allocating a debugreg can supply a callback that's made when its
allocation has to taken by something with higher priority.  

Even after utrace, there will always be the possibility of a traditional
uncoordinated user of the raw debug registers, if nothing else ptrace
compatibility will always be there for old users.  So anything new and
fancy needs to be prepared to back out of the way gracefully.  In the case
of kwatch, it can just have a handler function given by the caller to start
with.  It's OK if individual callers can specially declare "I am not
well-behaved" and eat debugregs so that well-behaved high-priority users
like ptrace just have to lose (breaking compatibility).  But no
well-behaved caller of kwatch will do that.

I certainly intend for later features based on utrace to include
higher-level treatment of watchpoints so that user debugging facilities can
also become responsive to debugreg allocation pressure.  (Eventually, the
user facilities might have easier ways of falling back to other methods and
getting out of the way of kernel debugreg consumers, than can be done for
the kernel-mode-tracing facilities.)  To that end, I'd like to see a clear
and robust interface for debugreg sharing, below the level of kwatch."

Thanks,
K.Prasad



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-10 14:19     ` Alan Stern
@ 2009-03-10 14:50       ` Ingo Molnar
  2009-03-11 12:57         ` K.Prasad
  0 siblings, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2009-03-10 14:50 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> > 
> > 
> > > +static u8			tprio[HB_NUM];	/* Thread bp max priorities */
> > > +LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
> > > +static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
> > > +DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
> 
> If nobody minds, I'll answer some of these questions on 
> Prasad's behalf because they address parts of the code that 
> were written before he took over the project.
> 
> > hm, why do we need the whole 'priority' mechanism? It seems 
> > very over-designed to me.
> 
> This was done at Roland McGrath's express request.  We should 
> see what he has to say about it.
> 
> > The likelyhood of both user-space and kernel-space to use 
> > hw-breakpoints is very low to begin with. And if they use 
> > them, the likelyhood of there being more than 4 debugregs 
> > required in the same context is even lower.
> 
> Not all architectures have 4 debug registers.  Most have only 
> one.
>
> > If that happens we shouldnt try to be too smart about them - 
> > just override user-space ones with kernel space ones and 
> > that's it. No explicit priorities are needed.
> 
> Roland really did not want it done this way.

Well i guess i'll have to wait for Roland's reply then.

	Ingo

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-10 13:50   ` Ingo Molnar
@ 2009-03-10 14:19     ` Alan Stern
  2009-03-10 14:50       ` Ingo Molnar
  0 siblings, 1 reply; 33+ messages in thread
From: Alan Stern @ 2009-03-10 14:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> 
> 
> > +static u8			tprio[HB_NUM];	/* Thread bp max priorities */
> > +LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
> > +static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
> > +DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);

If nobody minds, I'll answer some of these questions on Prasad's behalf 
because they address parts of the code that were written before he took 
over the project.

> hm, why do we need the whole 'priority' mechanism? It seems very 
> over-designed to me.

This was done at Roland McGrath's express request.  We should see what 
he has to say about it.

> The likelyhood of both user-space and kernel-space to use 
> hw-breakpoints is very low to begin with. And if they use them, 
> the likelyhood of there being more than 4 debugregs required in 
> the same context is even lower.

Not all architectures have 4 debug registers.  Most have only one.

> If that happens we shouldnt try to be too smart about them - 
> just override user-space ones with kernel space ones and that's 
> it. No explicit priorities are needed.

Roland really did not want it done this way.

Alan Stern


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-05  4:37 ` [patch " prasad
@ 2009-03-10 13:50   ` Ingo Molnar
  2009-03-10 14:19     ` Alan Stern
  0 siblings, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2009-03-10 13:50 UTC (permalink / raw)
  To: prasad
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern, Roland McGrath


* prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:


> +static u8			tprio[HB_NUM];	/* Thread bp max priorities */
> +LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
> +static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
> +DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);

hm, why do we need the whole 'priority' mechanism? It seems very 
over-designed to me.

The likelyhood of both user-space and kernel-space to use 
hw-breakpoints is very low to begin with. And if they use them, 
the likelyhood of there being more than 4 debugregs required in 
the same context is even lower.

If that happens we shouldnt try to be too smart about them - 
just override user-space ones with kernel space ones and that's 
it. No explicit priorities are needed.

	Ingo

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
       [not found] <20090307045120.039324630@linux.vnet.ibm.com>
@ 2009-03-07  5:04 ` prasad
  0 siblings, 0 replies; 33+ messages in thread
From: prasad @ 2009-03-07  5:04 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 1 --]
[-- Type: text/plain, Size: 35307 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch introduces two new files hw_breakpoint.[ch] which defines the 
generic interfaces to use hardware breakpoint infrastructure of the system. 

[K.Prasad: Re-based the original patch to newer kernel base, modified the
           register_<kernel/user>_hw_breakpoint() interfaces and split the
           monolithic patch into smaller ones. Split-out from the bigger patch
           and minor changes following re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 include/asm-generic/hw_breakpoint.h |  243 +++++++++++
 kernel/Makefile                     |    2 
 kernel/hw_breakpoint.c              |  772 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1016 insertions(+), 1 deletion(-)

Index: linux-2.6-tip/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/kernel/hw_breakpoint.c
@@ -0,0 +1,772 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ *
+ * This file contains the arch-independent routines.  It is not meant
+ * to be compiled as a standalone source file; rather it should be
+ * #include'd by the arch-specific implementation.
+ */
+
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+
+/* Global info */
+struct kernel_bp_data	kbpdata[2];	/* Old and new settings */
+int			cur_kbpindex;	/* Alternates 0, 1, ... */
+struct kernel_bp_data	*cur_kbpdata = &kbpdata[0];
+			/* Always equal to &kbpdata[cur_kbpindex] */
+
+static u8			tprio[HB_NUM];	/* Thread bp max priorities */
+LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
+static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
+DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
+
+/*
+ * Install the debug register values for a new thread.
+ */
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	struct cpu_hw_breakpoint *chbi;
+	struct kernel_bp_data *thr_kbpdata;
+
+	/* This routine is on the hot path; it gets called for every
+	 * context switch into a task with active breakpoints.  We
+	 * must make sure that the common case executes as quickly as
+	 * possible.
+	 */
+	chbi = &per_cpu(cpu_bp, get_cpu());
+	chbi->bp_task = tsk;
+
+	/* Use RCU to synchronize with external updates */
+	rcu_read_lock();
+
+	/* Other CPUs might be making updates to the list of kernel
+	 * breakpoints at this time.  If they are, they will modify
+	 * the other entry in kbpdata[] -- the one not pointed to
+	 * by chbi->cur_kbpdata.  So the update itself won't affect
+	 * us directly.
+	 *
+	 * However when the update is finished, an IPI will arrive
+	 * telling this CPU to change chbi->cur_kbpdata.  We need
+	 * to use a single consistent kbpdata[] entry, the present one.
+	 * So we'll copy the pointer to a local variable, thr_kbpdata,
+	 * and we must prevent the compiler from aliasing the two
+	 * pointers.  Only a compiler barrier is required, not a full
+	 * memory barrier, because everything takes place on a single CPU.
+	 */
+ restart:
+	thr_kbpdata = ACCESS_ONCE(chbi->cur_kbpdata);
+
+	/* Normally we can keep the same debug register settings as the
+	 * last time this task ran.  But if the kernel breakpoints have
+	 * changed or any user breakpoints have been registered or
+	 * unregistered, we need to handle the updates and possibly
+	 * send out some notifications.
+	 */
+	if (unlikely(thbi->gennum != thr_kbpdata->gennum)) {
+		struct hw_breakpoint *bp;
+		int i;
+		int num;
+
+		thbi->gennum = thr_kbpdata->gennum;
+		arch_update_thbi(thbi, thr_kbpdata);
+		num = thr_kbpdata->num_kbps;
+
+		/* This code can be invoked while a debugger is actively
+		 * updating the thread's breakpoint list. We use RCU to
+		 * protect our access to the list pointers. */
+		thbi->num_installed = 0;
+		i = HB_NUM;
+		list_for_each_entry_rcu(bp, &thbi->thread_bps, node) {
+
+			/* If this register is allocated for kernel bps,
+			 * don't install.  Otherwise do. */
+			if (--i < num) {
+				if (bp->status == HW_BREAKPOINT_INSTALLED) {
+					if (bp->uninstalled)
+						(bp->uninstalled)(bp);
+					bp->status = HW_BREAKPOINT_REGISTERED;
+				}
+			} else {
+				++thbi->num_installed;
+				if (bp->status != HW_BREAKPOINT_INSTALLED) {
+					bp->status = HW_BREAKPOINT_INSTALLED;
+					if (bp->installed)
+						(bp->installed)(bp);
+				}
+			}
+		}
+	}
+
+	/* Set the debug register */
+	arch_install_thbi(thbi);
+
+	/* Were there any kernel breakpoint changes while we were running? */
+	if (unlikely(chbi->cur_kbpdata != thr_kbpdata)) {
+
+		/* Some debug registers now be assigned to kernel bps and
+		 * we might have messed them up.  Reload all the kernel bps
+		 * and then reload the thread bps.
+		 */
+		arch_install_chbi(chbi);
+		goto restart;
+	}
+
+	rcu_read_unlock();
+	put_cpu_no_resched();
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void switch_to_none_hw_breakpoint(void)
+{
+	struct cpu_hw_breakpoint *chbi;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+	chbi->bp_task = NULL;
+
+	/* This routine gets called from only two places.  In one
+	 * the caller holds the hw_breakpoint_mutex; in the other
+	 * interrupts are disabled.  In either case, no kernel
+	 * breakpoint updates can arrive while the routine runs.
+	 * So we don't need to use RCU.
+	 */
+	arch_install_none(chbi);
+	put_cpu_no_resched();
+}
+
+/*
+ * Update the debug registers on this CPU.
+ */
+static void update_this_cpu(void *unused)
+{
+	struct cpu_hw_breakpoint *chbi;
+	struct task_struct *tsk = current;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+
+	rcu_read_lock();
+	/* Install both the kernel and the user breakpoints */
+	arch_install_chbi(chbi);
+	if (test_tsk_thread_flag(tsk, TIF_DEBUG))
+		switch_to_thread_hw_breakpoint(tsk);
+
+	rcu_read_unlock();
+	put_cpu_no_resched();
+}
+
+/*
+ * Tell all CPUs to update their debug registers.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+static void update_all_cpus(void)
+{
+	/* We don't need to use any sort of memory barrier.  The IPI
+	 * carried out by on_each_cpu() includes its own barriers.
+	 */
+	on_each_cpu(update_this_cpu, NULL, 0);
+	synchronize_rcu();
+}
+
+/*
+ * Load the debug registers during startup of a CPU.
+ */
+void load_debug_registers(void)
+{
+	unsigned long flags;
+
+	/* Prevent IPIs for new kernel breakpoint updates */
+	local_irq_save(flags);
+	update_this_cpu(NULL);
+	local_irq_restore(flags);
+}
+
+/*
+ * Take the 4 highest-priority breakpoints in a thread and accumulate
+ * their priorities in tprio.  Highest-priority entry is in tprio[3].
+ */
+static void accum_thread_tprio(struct thread_hw_breakpoint *thbi)
+{
+	int i;
+
+	for (i = HB_NUM - 1; i >= 0 && thbi->bps[i]; --i)
+		tprio[i] = max(tprio[i], thbi->bps[i]->priority);
+}
+
+/*
+ * Recalculate the value of the tprio array, the maximum priority levels
+ * requested by user breakpoints in all threads.
+ *
+ * Each thread has a list of registered breakpoints, kept in order of
+ * decreasing priority.  We'll set tprio[0] to the maximum priority of
+ * the first entries in all the lists, tprio[1] to the maximum priority
+ * of the second entries in all the lists, etc.  In the end, we'll know
+ * that no thread requires breakpoints with priorities higher than the
+ * values in tprio.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+static void recalc_tprio(void)
+{
+	struct thread_hw_breakpoint *thbi;
+
+	memset(tprio, 0, sizeof tprio);
+
+	/* Loop through all threads having registered breakpoints
+	 * and accumulate the maximum priority levels in tprio.
+	 */
+	list_for_each_entry(thbi, &thread_list, node)
+		accum_thread_tprio(thbi);
+}
+
+/*
+ * Decide how many debug registers will be allocated to kernel breakpoints
+ * and consequently, how many remain available for user breakpoints.
+ *
+ * The priorities of the entries in the list of registered kernel bps
+ * are compared against the priorities stored in tprio[].  The 4 highest
+ * winners overall get to be installed in a debug register; num_kpbs
+ * keeps track of how many of those winners come from the kernel list.
+ *
+ * If num_kbps changes, or if a kernel bp changes its installation status,
+ * then call update_all_cpus() so that the debug registers will be set
+ * correctly on every CPU.  If neither condition holds then the set of
+ * kernel bps hasn't changed, and nothing more needs to be done.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+static void balance_kernel_vs_user(void)
+{
+	int k, u;
+	int changed = 0;
+	struct hw_breakpoint *bp;
+	struct kernel_bp_data *new_kbpdata;
+
+	/* Determine how many debug registers are available for kernel
+	 * breakpoints as opposed to user breakpoints, based on the
+	 * priorities.  Ties are resolved in favor of user bps.
+	 */
+	k = 0;			/* Next kernel bp to allocate */
+	u = HB_NUM - 1;		/* Next user bp to allocate */
+
+	bp = list_entry(kernel_bps.next, struct hw_breakpoint, node);
+	while (k <= u) {
+		if (&bp->node == &kernel_bps || tprio[u] >= bp->priority)
+			--u;		/* User bps win a slot */
+		else {
+			++k;		/* Kernel bp wins a slot */
+			if (bp->status != HW_BREAKPOINT_INSTALLED)
+				changed = 1;
+			bp = list_entry(bp->node.next, struct hw_breakpoint,
+					node);
+		}
+	}
+	if (k != cur_kbpdata->num_kbps)
+		changed = 1;
+
+	/* Notify the remaining kernel breakpoints that they are about
+	 * to be uninstalled.
+	 */
+	list_for_each_entry_from(bp, &kernel_bps, node) {
+		if (bp->status == HW_BREAKPOINT_INSTALLED) {
+			if (bp->uninstalled)
+				(bp->uninstalled)(bp);
+			bp->status = HW_BREAKPOINT_REGISTERED;
+			changed = 1;
+		}
+	}
+
+	if (changed) {
+		cur_kbpindex ^= 1;
+		new_kbpdata = &kbpdata[cur_kbpindex];
+		new_kbpdata->gennum = cur_kbpdata->gennum + 1;
+		new_kbpdata->num_kbps = k;
+		arch_new_kbpdata(new_kbpdata);
+		u = 0;
+		list_for_each_entry(bp, &kernel_bps, node) {
+			if (u >= k)
+				break;
+			new_kbpdata->bps[u] = bp;
+			++u;
+		}
+		rcu_assign_pointer(cur_kbpdata, new_kbpdata);
+
+		/* Tell all the CPUs to update their debug registers */
+		update_all_cpus();
+
+		/* Notify the breakpoints that just got installed */
+		for (u = 0; u < k; ++u) {
+			bp = new_kbpdata->bps[u];
+			if (bp->status != HW_BREAKPOINT_INSTALLED) {
+				bp->status = HW_BREAKPOINT_INSTALLED;
+				if (bp->installed)
+					(bp->installed)(bp);
+			}
+		}
+	}
+}
+
+/*
+ * Return the pointer to a thread's hw_breakpoint info area,
+ * and try to allocate one if it doesn't exist.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+struct thread_hw_breakpoint *alloc_thread_hw_breakpoint(
+		struct task_struct *tsk)
+{
+	if (!tsk->thread.hw_breakpoint_info && !(tsk->flags & PF_EXITING)) {
+		struct thread_hw_breakpoint *thbi;
+
+		thbi = kzalloc(sizeof(struct thread_hw_breakpoint),
+				GFP_KERNEL);
+		if (thbi) {
+			INIT_LIST_HEAD(&thbi->node);
+			INIT_LIST_HEAD(&thbi->thread_bps);
+
+			/* Force an update the next time tsk runs */
+			thbi->gennum = cur_kbpdata->gennum - 2;
+			tsk->thread.hw_breakpoint_info = thbi;
+		}
+	}
+	return tsk->thread.hw_breakpoint_info;
+}
+
+/*
+ * Erase all the hardware breakpoint info associated with a thread.
+ *
+ * If tsk != current then tsk must not be usable (for example, a
+ * child being cleaned up from a failed fork).
+ */
+void flush_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	struct hw_breakpoint *bp;
+
+	if (!thbi)
+		return;
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Let the breakpoints know they are being uninstalled */
+	list_for_each_entry(bp, &thbi->thread_bps, node) {
+		if (bp->status == HW_BREAKPOINT_INSTALLED && bp->uninstalled)
+			(bp->uninstalled)(bp);
+		bp->status = 0;
+	}
+
+	/* Remove tsk from the list of all threads with registered bps */
+	list_del(&thbi->node);
+
+	/* The thread no longer has any breakpoints associated with it */
+	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	tsk->thread.hw_breakpoint_info = NULL;
+	kfree(thbi);
+
+	/* Recalculate and rebalance the kernel-vs-user priorities */
+	recalc_tprio();
+	balance_kernel_vs_user();
+
+	/* Actually uninstall the breakpoints if necessary */
+	if (tsk == current)
+		switch_to_none_hw_breakpoint();
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/*
+ * Copy the hardware breakpoint info from a thread to its cloned child.
+ */
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags)
+{
+	/* We will assume that breakpoint settings are not inherited
+	 * and the child starts out with no debug registers set.
+	 * But what about CLONE_PTRACE?
+	 */
+	clear_tsk_thread_flag(child, TIF_DEBUG);
+	return 0;
+}
+
+/*
+ * Store the highest-priority thread breakpoint entries in an array.
+ */
+static void store_thread_bp_array(struct thread_hw_breakpoint *thbi)
+{
+	struct hw_breakpoint *bp;
+	int i;
+
+	i = HB_NUM - 1;
+	list_for_each_entry(bp, &thbi->thread_bps, node) {
+		thbi->bps[i] = bp;
+		arch_store_thread_bp_array(thbi, bp, i);
+		if (--i < 0)
+			break;
+	}
+	while (i >= 0)
+		thbi->bps[i--] = NULL;
+
+	/* Force an update the next time this task runs */
+	thbi->gennum = cur_kbpdata->gennum - 2;
+}
+
+/*
+ * Insert a new breakpoint in a priority-sorted list.
+ * Return the bp's index in the list.
+ *
+ * Thread invariants:
+ *	tsk_thread_flag(tsk, TIF_DEBUG) set implies
+ *		tsk->thread.hw_breakpoint_info is not NULL.
+ *	tsk_thread_flag(tsk, TIF_DEBUG) set iff thbi->thread_bps is non-empty
+ *		iff thbi->node is on thread_list.
+ */
+static int insert_bp_in_list(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi, struct task_struct *tsk)
+{
+	struct list_head *head;
+	int pos;
+	struct hw_breakpoint *temp_bp;
+
+	/* tsk and thbi are NULL for kernel bps, non-NULL for user bps */
+	if (tsk)
+		head = &thbi->thread_bps;
+	else
+		head = &kernel_bps;
+
+	/* Equal-priority breakpoints get listed first-come-first-served */
+	pos = 0;
+	list_for_each_entry(temp_bp, head, node) {
+		if (bp->priority > temp_bp->priority)
+			break;
+		++pos;
+	}
+	bp->status = HW_BREAKPOINT_REGISTERED;
+	list_add_tail(&bp->node, &temp_bp->node);
+
+	if (tsk) {
+		store_thread_bp_array(thbi);
+
+		/* Is this the thread's first registered breakpoint? */
+		if (list_empty(&thbi->node)) {
+			set_tsk_thread_flag(tsk, TIF_DEBUG);
+			list_add(&thbi->node, &thread_list);
+		}
+	}
+	return pos;
+}
+
+/*
+ * Remove a breakpoint from its priority-sorted list.
+ *
+ * See the invariants mentioned above.
+ */
+static void remove_bp_from_list(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi, struct task_struct *tsk)
+{
+	/* Remove bp from the thread's/kernel's list.  If the list is now
+	 * empty we must clear the TIF_DEBUG flag.  But keep the
+	 * thread_hw_breakpoint structure, so that the virtualized debug
+	 * register values will remain valid.
+	 */
+	list_del(&bp->node);
+	if (tsk) {
+		store_thread_bp_array(thbi);
+
+		if (list_empty(&thbi->thread_bps)) {
+			list_del_init(&thbi->node);
+			clear_tsk_thread_flag(tsk, TIF_DEBUG);
+		}
+	}
+
+	/* Tell the breakpoint it is being uninstalled */
+	if (bp->status == HW_BREAKPOINT_INSTALLED && bp->uninstalled)
+		(bp->uninstalled)(bp);
+	bp->status = 0;
+}
+
+/*
+ * Validate the settings in a hw_breakpoint structure.
+ */
+static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
+{
+	int ret;
+	unsigned int align;
+
+	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
+	if (ret < 0)
+		goto err;
+
+	/* Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (bp->info.address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(bp->info.address, tsk))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(bp->info.address))
+			return -EFAULT;
+	}
+ err:
+	return ret;
+}
+
+/*
+ * Actual implementation of register_user_hw_breakpoint.
+ */
+int __register_user_hw_breakpoint(struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	int rc;
+	struct thread_hw_breakpoint *thbi;
+	int pos;
+
+	bp->status = 0;
+	rc = validate_settings(bp, tsk);
+	if (rc)
+		return rc;
+
+	thbi = alloc_thread_hw_breakpoint(tsk);
+	if (!thbi)
+		return -ENOMEM;
+
+	/* Insert bp in the thread's list */
+	pos = insert_bp_in_list(bp, thbi, tsk);
+	arch_register_user_hw_breakpoint(bp, thbi);
+
+	/* Update and rebalance the priorities.  We don't need to go through
+	 * the list of all threads; adding a breakpoint can only cause the
+	 * priorities for this thread to increase.
+	 */
+	accum_thread_tprio(thbi);
+	balance_kernel_vs_user();
+
+	/* Did bp get allocated to a debug register?  We can tell from its
+	 * position in the list.  The number of registers allocated to
+	 * kernel breakpoints is num_kbps; all the others are available for
+	 * user breakpoints.  If bp's position in the priority-ordered list
+	 * is low enough, it will get a register.
+	 */
+	if (pos < HB_NUM - cur_kbpdata->num_kbps) {
+		rc = 1;
+
+		/* Does it need to be installed right now? */
+		if (tsk == current)
+			switch_to_thread_hw_breakpoint(tsk);
+		/* Otherwise it will get installed the next time tsk runs */
+	}
+
+	return rc;
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @tsk: the task in whose memory space the breakpoint will be set
+ * @bp: the breakpoint structure to register
+ * @address: location (virtual address) of the breakpoint
+ * @len: encoded extent of the breakpoint address (1, 2, 4, or 8 bytes)
+ * @type: breakpoint type (read-only, write-only, read-write, or execute)
+ *
+ * This routine registers a breakpoint to be associated with @tsk's
+ * memory space and active only while @tsk is running.  It does not
+ * guarantee that the breakpoint will be allocated to a debug register
+ * immediately; there may be other higher-priority breakpoints registered
+ * which require the use of all the debug registers.
+ *
+ * @tsk will normally be a process being debugged by the current process,
+ * but it may also be the current process.
+ *
+ * @bp->address, @bp->len, @bp->type, @bp->triggered and @bp->priority must be
+ * set properly before invocation
+ *
+ * Returns 1 if @bp is allocated to a debug register, 0 if @bp is
+ * registered but not allowed to be installed, otherwise a negative error
+ * code.
+ */
+int register_user_hw_breakpoint(struct task_struct *tsk,
+				 struct hw_breakpoint *bp)
+{
+	int rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+	rc = __register_user_hw_breakpoint(tsk, bp);
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
+}
+
+/*
+ * Actual implementation of unregister_user_hw_breakpoint.
+ */
+void __unregister_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp)
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+
+	if (!bp->status)
+		return;		/* Not registered */
+
+	/* Remove bp from the thread's list */
+	remove_bp_from_list(bp, thbi, tsk);
+	arch_unregister_user_hw_breakpoint(bp, thbi);
+
+	/* Recalculate and rebalance the kernel-vs-user priorities,
+	 * and actually uninstall bp if necessary.
+	 */
+	recalc_tprio();
+	balance_kernel_vs_user();
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+}
+
+/**
+ * unregister_user_hw_breakpoint - unregister a hardware breakpoint for user space
+ * @tsk: the task in whose memory space the breakpoint is registered
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp)
+{
+	mutex_lock(&hw_breakpoint_mutex);
+	__unregister_user_hw_breakpoint(tsk, bp);
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/**
+ * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to register
+ *
+ * This routine registers a breakpoint to be active at all times.  It
+ * does not guarantee that the breakpoint will be allocated to a debug
+ * register immediately; there may be other higher-priority breakpoints
+ * registered which require the use of all the debug registers.
+ *
+ * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type,
+ * @bp->triggered and @bp->priority must be set properly before invocation
+ *
+ * Returns 1 if @bp is allocated to a debug register, 0 if @bp is
+ * registered but not allowed to be installed, otherwise a negative error
+ * code.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int rc;
+	int pos;
+
+	bp->status = 0;
+	rc = validate_settings(bp, NULL);
+	if (rc)
+		return rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Insert bp in the kernel's list */
+	pos = insert_bp_in_list(bp, NULL, NULL);
+	arch_register_kernel_hw_breakpoint(bp);
+
+	/* Rebalance the priorities.  This will install bp if it
+	 * was allocated a debug register.
+	 */
+	balance_kernel_vs_user();
+
+	/* Did bp get allocated to a debug register?  We can tell from its
+	 * position in the list.  The number of registers allocated to
+	 * kernel breakpoints is num_kbps; all the others are available for
+	 * user breakpoints.  If bp's position in the priority-ordered list
+	 * is low enough, it will get a register.
+	 */
+	if (pos < cur_kbpdata->num_kbps)
+		rc = 1;
+
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
+
+/**
+ * unregister_kernel_hw_breakpoint - unregister a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	if (!bp->status)
+		return;		/* Not registered */
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Remove bp from the kernel's list */
+	remove_bp_from_list(bp, NULL, NULL);
+	arch_unregister_kernel_hw_breakpoint(bp);
+
+	/* Rebalance the priorities.  This will uninstall bp if it
+	 * was allocated a debug register.
+	 */
+	balance_kernel_vs_user();
+
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
+/*
+ * Handle debug exception notifications.
+ */
+static int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+	return hw_breakpoint_handler(data);
+}
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+	.notifier_call = hw_breakpoint_exceptions_notify,
+	.priority = 0x7fffffff /* we need to be notified first */
+};
+
+static int __init init_hw_breakpoint(void)
+{
+	load_debug_registers();
+	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+
+core_initcall(init_hw_breakpoint);
Index: linux-2.6-tip/kernel/Makefile
===================================================================
--- linux-2.6-tip.orig/kernel/Makefile
+++ linux-2.6-tip/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o
+	    async.o hw_breakpoint.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
Index: linux-2.6-tip/include/asm-generic/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip/include/asm-generic/hw_breakpoint.h
@@ -0,0 +1,243 @@
+#ifndef	_ASM_GENERIC_HW_BREAKPOINT_H
+#define	_ASM_GENERIC_HW_BREAKPOINT_H
+
+#ifndef __ARCH_HW_BREAKPOINT_H
+#error "Please don't include this file directly"
+#endif
+
+#ifdef	__KERNEL__
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+
+/**
+ * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
+ * @node: internal linked-list management
+ * @triggered: callback invoked after target address access
+ * @installed: callback invoked when the breakpoint is installed
+ * @uninstalled: callback invoked when the breakpoint is uninstalled
+ * @info: arch-specific breakpoint info (address, length, and type)
+ * @priority: requested priority level
+ * @status: current registration/installation status
+ *
+ * %hw_breakpoint structures are the kernel's way of representing
+ * hardware breakpoints.  These are data breakpoints
+ * (also known as "watchpoints", triggered on data access), and the breakpoint's
+ * target address can be located in either kernel space or user space.
+ *
+ * The breakpoint's address, length, and type are highly
+ * architecture-specific.  The values are encoded in the @info field; you
+ * specify them when registering the breakpoint.  To examine the encoded
+ * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
+ * below.
+ *
+ * The address is specified as a regular kernel pointer (for kernel-space
+ * breakponts) or as an %__user pointer (for user-space breakpoints).
+ * With register_user_hw_breakpoint(), the address must refer to a
+ * location in user space.  The breakpoint will be active only while the
+ * requested task is running.  Conversely with
+ * register_kernel_hw_breakpoint(), the address must refer to a location
+ * in kernel space, and the breakpoint will be active on all CPUs
+ * regardless of the current task.
+ *
+ * The length is the breakpoint's extent in bytes, which is subject to
+ * certain limitations.  include/asm/hw_breakpoint.h contains macros
+ * defining the available lengths for a specific architecture.  Note that
+ * the address's alignment must match the length.  The breakpoint will
+ * catch accesses to any byte in the range from address to address +
+ * (length - 1).
+ *
+ * The breakpoint's type indicates the sort of access that will cause it
+ * to trigger.  Possible values may include:
+ *
+ * 	%HW_BREAKPOINT_RW (triggered on read or write access),
+ * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
+ * 	%HW_BREAKPOINT_READ (triggered on read access).
+ *
+ * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
+ * possibilities are available on all architectures.  Execute breakpoints
+ * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
+ *
+ * When a breakpoint gets hit, the @triggered callback is
+ * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
+ * processor registers.
+ * Data breakpoints occur after the memory access has taken place.
+ * Breakpoints are disabled during execution @triggered, to avoid
+ * recursive traps and allow unhindered access to breakpointed memory.
+ *
+ * Hardware breakpoints are implemented using the CPU's debug registers,
+ * which are a limited hardware resource.  Requests to register a
+ * breakpoint will always succeed provided the parameters are valid,
+ * but the breakpoint may not be installed in a debug register right
+ * away.  Physical debug registers are allocated based on the priority
+ * level stored in @priority (higher values indicate higher priority).
+ * User-space breakpoints within a single thread compete with one
+ * another, and all user-space breakpoints compete with all kernel-space
+ * breakpoints; however user-space breakpoints in different threads do
+ * not compete.  %HW_BREAKPOINT_PRIO_PTRACE is the level used for ptrace
+ * requests; an unobtrusive kernel-space breakpoint will use
+ * %HW_BREAKPOINT_PRIO_NORMAL to avoid disturbing user programs.  A
+ * kernel-space breakpoint that always wants to be installed and doesn't
+ * care about disrupting user debugging sessions can specify
+ * %HW_BREAKPOINT_PRIO_HIGH.
+ *
+ * A particular breakpoint may be allocated (installed in) a debug
+ * register or deallocated (uninstalled) from its debug register at any
+ * time, as other breakpoints are registered and unregistered.  The
+ * @installed and @uninstalled callbacks are invoked in_atomic when these
+ * events occur.  It is legal for @installed or @uninstalled to be %NULL. Note
+ * that it is not possible to register or unregister a user-space breakpoint
+ * from within a callback routine, since doing so requires a process context.
+ * Note that for user breakpoints, while in @installed and @uninstalled the
+ * thread may be context switched. Hence it may not be safe to call printk().
+ *
+ * For kernel-space breakpoints, @installed is invoked after the
+ * breakpoint is actually installed and @uninstalled is invoked before
+ * the breakpoint is actually uninstalled.  As a result the @triggered routine
+ * may be invoked when not expected, but this way you will know that during the
+ * time interval from @installed to @uninstalled, all events are faithfully
+ * reported.  (It is not possible to do any better than this in general, because
+ * on SMP systems there is no way to set a debug register simultaneously on all
+ * CPUs.)  The same isn't always true with user-space breakpoints, but the
+ * differences should not be visible to a user process.
+ *
+ * If you need to know whether your kernel-space breakpoint was installed
+ * immediately upon registration, you can check the return value from
+ * register_kernel_hw_breakpoint().  If the value is not > 0, you can
+ * give up and unregister the breakpoint right away.
+ *
+ * @node and @status are intended for internal use.  However @status
+ * may be read to determine whether or not the breakpoint is currently
+ * installed.  (The value is not reliable unless local interrupts are
+ * disabled.)
+ *
+ * This sample code sets a breakpoint on pid_max and registers a callback
+ * function for writes to that variable.  Note that it is not portable
+ * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * #include <asm/hw_breakpoint.h>
+ *
+ * struct hw_breakpoint my_bp;
+ *
+ * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
+ * 	dump_stack();
+ *  	.......<more debugging output>........
+ * }
+ *
+ * static struct hw_breakpoint my_bp;
+ *
+ * static int init_module(void)
+ * {
+ *	..........<do anything>............
+ *	my_bp.info.type = HW_BREAKPOINT_WRITE;
+ *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
+ *	my_bp.info.priority = HW_BREAKPOINT_PRIO_NORMAL;
+ *
+ *	my_bp.installed = (void *)my_bp_installed;
+ *	my_bp.uninstalled = (void *)my_bp_uninstalled;
+ *	my_bp.triggered = (void *)my_triggered;
+ *
+ *	rc = register_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ *	..........<do anything>............
+ *	unregister_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * ----------------------------------------------------------------------
+ */
+struct hw_breakpoint {
+	struct list_head	node;
+	void		(*installed)(struct hw_breakpoint *);
+	void		(*uninstalled)(struct hw_breakpoint *);
+	void		(*triggered)(struct hw_breakpoint *,
+							struct pt_regs *);
+	struct arch_hw_breakpoint	info;
+	u8		priority;
+	u8		status;
+};
+
+struct kernel_bp_data;
+struct cpu_hw_breakpoint;
+
+/*
+ * Inline accessor routines to retrieve the arch-specific parts of
+ * a breakpoint structure:
+ */
+static const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp);
+static const void __user *hw_breakpoint_get_uaddress(struct hw_breakpoint *bp);
+static unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp);
+static unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp);
+
+/*
+ * len and type values are defined in include/asm/hw_breakpoint.h.
+ * Available values vary according to the architecture.  On i386 the
+ * possibilities are:
+ *
+ *	HW_BREAKPOINT_LEN_1
+ *	HW_BREAKPOINT_LEN_2
+ *	HW_BREAKPOINT_LEN_4
+ *	HW_BREAKPOINT_LEN_EXECUTE
+ *	HW_BREAKPOINT_RW
+ *	HW_BREAKPOINT_READ
+ *	HW_BREAKPOINT_EXECUTE
+ *
+ * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
+ * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
+ * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
+ */
+
+/* Standard HW breakpoint priority levels (higher value = higher priority) */
+#define HW_BREAKPOINT_PRIO_NORMAL	25
+#define HW_BREAKPOINT_PRIO_PTRACE	50
+#define HW_BREAKPOINT_PRIO_HIGH		75
+
+/* HW breakpoint status values (0 = not registered) */
+#define HW_BREAKPOINT_REGISTERED	1
+#define HW_BREAKPOINT_INSTALLED		2
+
+static DEFINE_MUTEX(hw_breakpoint_mutex);	/* Protects everything */
+
+/*
+ * The following two routines are meant to be called only from within
+ * the ptrace or utrace subsystems.  The tsk argument will usually be a
+ * process being debugged by the current task, although it is also legal
+ * for tsk to be the current task.  In any case it must be guaranteed
+ * that tsk will not start running in user mode while its breakpoints are
+ * being modified.
+ */
+int register_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp);
+void unregister_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp);
+
+/*
+ * Declare arch-specific data structures here. They are defined in
+ * arch/x86/include/asm/hw_breakpoint.h
+ */
+
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void switch_to_none_hw_breakpoint(void);
+
+struct thread_hw_breakpoint *alloc_thread_hw_breakpoint(
+						struct task_struct *tsk);
+
+extern struct kernel_bp_data		*cur_kbpdata;
+extern int			cur_kbpindex;	/* Alternates 0, 1, ... */
+extern struct list_head kernel_bps;		/* Kernel breakpoint list */
+DECLARE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
+
+#endif	/* __KERNEL__ */
+#endif	/* _ASM_GENERIC_HW_BREAKPOINT_H */


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [patch 01/11] Introducing generic hardware breakpoint handler interfaces
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
@ 2009-03-05  4:37 ` prasad
  2009-03-10 13:50   ` Ingo Molnar
  0 siblings, 1 reply; 33+ messages in thread
From: prasad @ 2009-03-05  4:37 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 1 --]
[-- Type: text/plain, Size: 35349 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch introduces two new files hw_breakpoint.[ch] which defines the 
generic interfaces to use hardware breakpoint infrastructure of the system. 

[K.Prasad: Re-based the original patch to newer kernel base, modified the
           register_<kernel/user>_hw_breakpoint() interfaces and split the
           monolithic patch into smaller ones. Split-out from the bigger patch
           and minor changes following re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 include/asm-generic/hw_breakpoint.h |  243 +++++++++++
 kernel/Makefile                     |    2 
 kernel/hw_breakpoint.c              |  772 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1016 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
@@ -0,0 +1,772 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ *
+ * This file contains the arch-independent routines.  It is not meant
+ * to be compiled as a standalone source file; rather it should be
+ * #include'd by the arch-specific implementation.
+ */
+
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+
+/* Global info */
+struct kernel_bp_data	kbpdata[2];	/* Old and new settings */
+int			cur_kbpindex;	/* Alternates 0, 1, ... */
+struct kernel_bp_data	*cur_kbpdata = &kbpdata[0];
+			/* Always equal to &kbpdata[cur_kbpindex] */
+
+static u8			tprio[HB_NUM];	/* Thread bp max priorities */
+LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
+static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
+DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
+
+/*
+ * Install the debug register values for a new thread.
+ */
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	struct cpu_hw_breakpoint *chbi;
+	struct kernel_bp_data *thr_kbpdata;
+
+	/* This routine is on the hot path; it gets called for every
+	 * context switch into a task with active breakpoints.  We
+	 * must make sure that the common case executes as quickly as
+	 * possible.
+	 */
+	chbi = &per_cpu(cpu_bp, get_cpu());
+	chbi->bp_task = tsk;
+
+	/* Use RCU to synchronize with external updates */
+	rcu_read_lock();
+
+	/* Other CPUs might be making updates to the list of kernel
+	 * breakpoints at this time.  If they are, they will modify
+	 * the other entry in kbpdata[] -- the one not pointed to
+	 * by chbi->cur_kbpdata.  So the update itself won't affect
+	 * us directly.
+	 *
+	 * However when the update is finished, an IPI will arrive
+	 * telling this CPU to change chbi->cur_kbpdata.  We need
+	 * to use a single consistent kbpdata[] entry, the present one.
+	 * So we'll copy the pointer to a local variable, thr_kbpdata,
+	 * and we must prevent the compiler from aliasing the two
+	 * pointers.  Only a compiler barrier is required, not a full
+	 * memory barrier, because everything takes place on a single CPU.
+	 */
+ restart:
+	thr_kbpdata = ACCESS_ONCE(chbi->cur_kbpdata);
+
+	/* Normally we can keep the same debug register settings as the
+	 * last time this task ran.  But if the kernel breakpoints have
+	 * changed or any user breakpoints have been registered or
+	 * unregistered, we need to handle the updates and possibly
+	 * send out some notifications.
+	 */
+	if (unlikely(thbi->gennum != thr_kbpdata->gennum)) {
+		struct hw_breakpoint *bp;
+		int i;
+		int num;
+
+		thbi->gennum = thr_kbpdata->gennum;
+		arch_update_thbi(thbi, thr_kbpdata);
+		num = thr_kbpdata->num_kbps;
+
+		/* This code can be invoked while a debugger is actively
+		 * updating the thread's breakpoint list. We use RCU to
+		 * protect our access to the list pointers. */
+		thbi->num_installed = 0;
+		i = HB_NUM;
+		list_for_each_entry_rcu(bp, &thbi->thread_bps, node) {
+
+			/* If this register is allocated for kernel bps,
+			 * don't install.  Otherwise do. */
+			if (--i < num) {
+				if (bp->status == HW_BREAKPOINT_INSTALLED) {
+					if (bp->uninstalled)
+						(bp->uninstalled)(bp);
+					bp->status = HW_BREAKPOINT_REGISTERED;
+				}
+			} else {
+				++thbi->num_installed;
+				if (bp->status != HW_BREAKPOINT_INSTALLED) {
+					bp->status = HW_BREAKPOINT_INSTALLED;
+					if (bp->installed)
+						(bp->installed)(bp);
+				}
+			}
+		}
+	}
+
+	/* Set the debug register */
+	arch_install_thbi(thbi);
+
+	/* Were there any kernel breakpoint changes while we were running? */
+	if (unlikely(chbi->cur_kbpdata != thr_kbpdata)) {
+
+		/* Some debug registers now be assigned to kernel bps and
+		 * we might have messed them up.  Reload all the kernel bps
+		 * and then reload the thread bps.
+		 */
+		arch_install_chbi(chbi);
+		goto restart;
+	}
+
+	rcu_read_unlock();
+	put_cpu_no_resched();
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void switch_to_none_hw_breakpoint(void)
+{
+	struct cpu_hw_breakpoint *chbi;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+	chbi->bp_task = NULL;
+
+	/* This routine gets called from only two places.  In one
+	 * the caller holds the hw_breakpoint_mutex; in the other
+	 * interrupts are disabled.  In either case, no kernel
+	 * breakpoint updates can arrive while the routine runs.
+	 * So we don't need to use RCU.
+	 */
+	arch_install_none(chbi);
+	put_cpu_no_resched();
+}
+
+/*
+ * Update the debug registers on this CPU.
+ */
+static void update_this_cpu(void *unused)
+{
+	struct cpu_hw_breakpoint *chbi;
+	struct task_struct *tsk = current;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+
+	rcu_read_lock();
+	/* Install both the kernel and the user breakpoints */
+	arch_install_chbi(chbi);
+	if (test_tsk_thread_flag(tsk, TIF_DEBUG))
+		switch_to_thread_hw_breakpoint(tsk);
+
+	rcu_read_unlock();
+	put_cpu_no_resched();
+}
+
+/*
+ * Tell all CPUs to update their debug registers.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+static void update_all_cpus(void)
+{
+	/* We don't need to use any sort of memory barrier.  The IPI
+	 * carried out by on_each_cpu() includes its own barriers.
+	 */
+	on_each_cpu(update_this_cpu, NULL, 0);
+	synchronize_rcu();
+}
+
+/*
+ * Load the debug registers during startup of a CPU.
+ */
+void load_debug_registers(void)
+{
+	unsigned long flags;
+
+	/* Prevent IPIs for new kernel breakpoint updates */
+	local_irq_save(flags);
+	update_this_cpu(NULL);
+	local_irq_restore(flags);
+}
+
+/*
+ * Take the 4 highest-priority breakpoints in a thread and accumulate
+ * their priorities in tprio.  Highest-priority entry is in tprio[3].
+ */
+static void accum_thread_tprio(struct thread_hw_breakpoint *thbi)
+{
+	int i;
+
+	for (i = HB_NUM - 1; i >= 0 && thbi->bps[i]; --i)
+		tprio[i] = max(tprio[i], thbi->bps[i]->priority);
+}
+
+/*
+ * Recalculate the value of the tprio array, the maximum priority levels
+ * requested by user breakpoints in all threads.
+ *
+ * Each thread has a list of registered breakpoints, kept in order of
+ * decreasing priority.  We'll set tprio[0] to the maximum priority of
+ * the first entries in all the lists, tprio[1] to the maximum priority
+ * of the second entries in all the lists, etc.  In the end, we'll know
+ * that no thread requires breakpoints with priorities higher than the
+ * values in tprio.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+static void recalc_tprio(void)
+{
+	struct thread_hw_breakpoint *thbi;
+
+	memset(tprio, 0, sizeof tprio);
+
+	/* Loop through all threads having registered breakpoints
+	 * and accumulate the maximum priority levels in tprio.
+	 */
+	list_for_each_entry(thbi, &thread_list, node)
+		accum_thread_tprio(thbi);
+}
+
+/*
+ * Decide how many debug registers will be allocated to kernel breakpoints
+ * and consequently, how many remain available for user breakpoints.
+ *
+ * The priorities of the entries in the list of registered kernel bps
+ * are compared against the priorities stored in tprio[].  The 4 highest
+ * winners overall get to be installed in a debug register; num_kpbs
+ * keeps track of how many of those winners come from the kernel list.
+ *
+ * If num_kbps changes, or if a kernel bp changes its installation status,
+ * then call update_all_cpus() so that the debug registers will be set
+ * correctly on every CPU.  If neither condition holds then the set of
+ * kernel bps hasn't changed, and nothing more needs to be done.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+static void balance_kernel_vs_user(void)
+{
+	int k, u;
+	int changed = 0;
+	struct hw_breakpoint *bp;
+	struct kernel_bp_data *new_kbpdata;
+
+	/* Determine how many debug registers are available for kernel
+	 * breakpoints as opposed to user breakpoints, based on the
+	 * priorities.  Ties are resolved in favor of user bps.
+	 */
+	k = 0;			/* Next kernel bp to allocate */
+	u = HB_NUM - 1;		/* Next user bp to allocate */
+
+	bp = list_entry(kernel_bps.next, struct hw_breakpoint, node);
+	while (k <= u) {
+		if (&bp->node == &kernel_bps || tprio[u] >= bp->priority)
+			--u;		/* User bps win a slot */
+		else {
+			++k;		/* Kernel bp wins a slot */
+			if (bp->status != HW_BREAKPOINT_INSTALLED)
+				changed = 1;
+			bp = list_entry(bp->node.next, struct hw_breakpoint,
+					node);
+		}
+	}
+	if (k != cur_kbpdata->num_kbps)
+		changed = 1;
+
+	/* Notify the remaining kernel breakpoints that they are about
+	 * to be uninstalled.
+	 */
+	list_for_each_entry_from(bp, &kernel_bps, node) {
+		if (bp->status == HW_BREAKPOINT_INSTALLED) {
+			if (bp->uninstalled)
+				(bp->uninstalled)(bp);
+			bp->status = HW_BREAKPOINT_REGISTERED;
+			changed = 1;
+		}
+	}
+
+	if (changed) {
+		cur_kbpindex ^= 1;
+		new_kbpdata = &kbpdata[cur_kbpindex];
+		new_kbpdata->gennum = cur_kbpdata->gennum + 1;
+		new_kbpdata->num_kbps = k;
+		arch_new_kbpdata(new_kbpdata);
+		u = 0;
+		list_for_each_entry(bp, &kernel_bps, node) {
+			if (u >= k)
+				break;
+			new_kbpdata->bps[u] = bp;
+			++u;
+		}
+		rcu_assign_pointer(cur_kbpdata, new_kbpdata);
+
+		/* Tell all the CPUs to update their debug registers */
+		update_all_cpus();
+
+		/* Notify the breakpoints that just got installed */
+		for (u = 0; u < k; ++u) {
+			bp = new_kbpdata->bps[u];
+			if (bp->status != HW_BREAKPOINT_INSTALLED) {
+				bp->status = HW_BREAKPOINT_INSTALLED;
+				if (bp->installed)
+					(bp->installed)(bp);
+			}
+		}
+	}
+}
+
+/*
+ * Return the pointer to a thread's hw_breakpoint info area,
+ * and try to allocate one if it doesn't exist.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+struct thread_hw_breakpoint *alloc_thread_hw_breakpoint(
+		struct task_struct *tsk)
+{
+	if (!tsk->thread.hw_breakpoint_info && !(tsk->flags & PF_EXITING)) {
+		struct thread_hw_breakpoint *thbi;
+
+		thbi = kzalloc(sizeof(struct thread_hw_breakpoint),
+				GFP_KERNEL);
+		if (thbi) {
+			INIT_LIST_HEAD(&thbi->node);
+			INIT_LIST_HEAD(&thbi->thread_bps);
+
+			/* Force an update the next time tsk runs */
+			thbi->gennum = cur_kbpdata->gennum - 2;
+			tsk->thread.hw_breakpoint_info = thbi;
+		}
+	}
+	return tsk->thread.hw_breakpoint_info;
+}
+
+/*
+ * Erase all the hardware breakpoint info associated with a thread.
+ *
+ * If tsk != current then tsk must not be usable (for example, a
+ * child being cleaned up from a failed fork).
+ */
+void flush_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	struct hw_breakpoint *bp;
+
+	if (!thbi)
+		return;
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Let the breakpoints know they are being uninstalled */
+	list_for_each_entry(bp, &thbi->thread_bps, node) {
+		if (bp->status == HW_BREAKPOINT_INSTALLED && bp->uninstalled)
+			(bp->uninstalled)(bp);
+		bp->status = 0;
+	}
+
+	/* Remove tsk from the list of all threads with registered bps */
+	list_del(&thbi->node);
+
+	/* The thread no longer has any breakpoints associated with it */
+	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	tsk->thread.hw_breakpoint_info = NULL;
+	kfree(thbi);
+
+	/* Recalculate and rebalance the kernel-vs-user priorities */
+	recalc_tprio();
+	balance_kernel_vs_user();
+
+	/* Actually uninstall the breakpoints if necessary */
+	if (tsk == current)
+		switch_to_none_hw_breakpoint();
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/*
+ * Copy the hardware breakpoint info from a thread to its cloned child.
+ */
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags)
+{
+	/* We will assume that breakpoint settings are not inherited
+	 * and the child starts out with no debug registers set.
+	 * But what about CLONE_PTRACE?
+	 */
+	clear_tsk_thread_flag(child, TIF_DEBUG);
+	return 0;
+}
+
+/*
+ * Store the highest-priority thread breakpoint entries in an array.
+ */
+static void store_thread_bp_array(struct thread_hw_breakpoint *thbi)
+{
+	struct hw_breakpoint *bp;
+	int i;
+
+	i = HB_NUM - 1;
+	list_for_each_entry(bp, &thbi->thread_bps, node) {
+		thbi->bps[i] = bp;
+		arch_store_thread_bp_array(thbi, bp, i);
+		if (--i < 0)
+			break;
+	}
+	while (i >= 0)
+		thbi->bps[i--] = NULL;
+
+	/* Force an update the next time this task runs */
+	thbi->gennum = cur_kbpdata->gennum - 2;
+}
+
+/*
+ * Insert a new breakpoint in a priority-sorted list.
+ * Return the bp's index in the list.
+ *
+ * Thread invariants:
+ *	tsk_thread_flag(tsk, TIF_DEBUG) set implies
+ *		tsk->thread.hw_breakpoint_info is not NULL.
+ *	tsk_thread_flag(tsk, TIF_DEBUG) set iff thbi->thread_bps is non-empty
+ *		iff thbi->node is on thread_list.
+ */
+static int insert_bp_in_list(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi, struct task_struct *tsk)
+{
+	struct list_head *head;
+	int pos;
+	struct hw_breakpoint *temp_bp;
+
+	/* tsk and thbi are NULL for kernel bps, non-NULL for user bps */
+	if (tsk)
+		head = &thbi->thread_bps;
+	else
+		head = &kernel_bps;
+
+	/* Equal-priority breakpoints get listed first-come-first-served */
+	pos = 0;
+	list_for_each_entry(temp_bp, head, node) {
+		if (bp->priority > temp_bp->priority)
+			break;
+		++pos;
+	}
+	bp->status = HW_BREAKPOINT_REGISTERED;
+	list_add_tail(&bp->node, &temp_bp->node);
+
+	if (tsk) {
+		store_thread_bp_array(thbi);
+
+		/* Is this the thread's first registered breakpoint? */
+		if (list_empty(&thbi->node)) {
+			set_tsk_thread_flag(tsk, TIF_DEBUG);
+			list_add(&thbi->node, &thread_list);
+		}
+	}
+	return pos;
+}
+
+/*
+ * Remove a breakpoint from its priority-sorted list.
+ *
+ * See the invariants mentioned above.
+ */
+static void remove_bp_from_list(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi, struct task_struct *tsk)
+{
+	/* Remove bp from the thread's/kernel's list.  If the list is now
+	 * empty we must clear the TIF_DEBUG flag.  But keep the
+	 * thread_hw_breakpoint structure, so that the virtualized debug
+	 * register values will remain valid.
+	 */
+	list_del(&bp->node);
+	if (tsk) {
+		store_thread_bp_array(thbi);
+
+		if (list_empty(&thbi->thread_bps)) {
+			list_del_init(&thbi->node);
+			clear_tsk_thread_flag(tsk, TIF_DEBUG);
+		}
+	}
+
+	/* Tell the breakpoint it is being uninstalled */
+	if (bp->status == HW_BREAKPOINT_INSTALLED && bp->uninstalled)
+		(bp->uninstalled)(bp);
+	bp->status = 0;
+}
+
+/*
+ * Validate the settings in a hw_breakpoint structure.
+ */
+static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
+{
+	int ret;
+	unsigned int align;
+
+	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
+	if (ret < 0)
+		goto err;
+
+	/* Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (bp->info.address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(bp->info.address, tsk))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(bp->info.address))
+			return -EFAULT;
+	}
+ err:
+	return ret;
+}
+
+/*
+ * Actual implementation of register_user_hw_breakpoint.
+ */
+int __register_user_hw_breakpoint(struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	int rc;
+	struct thread_hw_breakpoint *thbi;
+	int pos;
+
+	bp->status = 0;
+	rc = validate_settings(bp, tsk);
+	if (rc)
+		return rc;
+
+	thbi = alloc_thread_hw_breakpoint(tsk);
+	if (!thbi)
+		return -ENOMEM;
+
+	/* Insert bp in the thread's list */
+	pos = insert_bp_in_list(bp, thbi, tsk);
+	arch_register_user_hw_breakpoint(bp, thbi);
+
+	/* Update and rebalance the priorities.  We don't need to go through
+	 * the list of all threads; adding a breakpoint can only cause the
+	 * priorities for this thread to increase.
+	 */
+	accum_thread_tprio(thbi);
+	balance_kernel_vs_user();
+
+	/* Did bp get allocated to a debug register?  We can tell from its
+	 * position in the list.  The number of registers allocated to
+	 * kernel breakpoints is num_kbps; all the others are available for
+	 * user breakpoints.  If bp's position in the priority-ordered list
+	 * is low enough, it will get a register.
+	 */
+	if (pos < HB_NUM - cur_kbpdata->num_kbps) {
+		rc = 1;
+
+		/* Does it need to be installed right now? */
+		if (tsk == current)
+			switch_to_thread_hw_breakpoint(tsk);
+		/* Otherwise it will get installed the next time tsk runs */
+	}
+
+	return rc;
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @tsk: the task in whose memory space the breakpoint will be set
+ * @bp: the breakpoint structure to register
+ * @address: location (virtual address) of the breakpoint
+ * @len: encoded extent of the breakpoint address (1, 2, 4, or 8 bytes)
+ * @type: breakpoint type (read-only, write-only, read-write, or execute)
+ *
+ * This routine registers a breakpoint to be associated with @tsk's
+ * memory space and active only while @tsk is running.  It does not
+ * guarantee that the breakpoint will be allocated to a debug register
+ * immediately; there may be other higher-priority breakpoints registered
+ * which require the use of all the debug registers.
+ *
+ * @tsk will normally be a process being debugged by the current process,
+ * but it may also be the current process.
+ *
+ * @bp->address, @bp->len, @bp->type, @bp->triggered and @bp->priority must be
+ * set properly before invocation
+ *
+ * Returns 1 if @bp is allocated to a debug register, 0 if @bp is
+ * registered but not allowed to be installed, otherwise a negative error
+ * code.
+ */
+int register_user_hw_breakpoint(struct task_struct *tsk,
+				 struct hw_breakpoint *bp)
+{
+	int rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+	rc = __register_user_hw_breakpoint(tsk, bp);
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
+}
+
+/*
+ * Actual implementation of unregister_user_hw_breakpoint.
+ */
+void __unregister_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp)
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+
+	if (!bp->status)
+		return;		/* Not registered */
+
+	/* Remove bp from the thread's list */
+	remove_bp_from_list(bp, thbi, tsk);
+	arch_unregister_user_hw_breakpoint(bp, thbi);
+
+	/* Recalculate and rebalance the kernel-vs-user priorities,
+	 * and actually uninstall bp if necessary.
+	 */
+	recalc_tprio();
+	balance_kernel_vs_user();
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+}
+
+/**
+ * unregister_user_hw_breakpoint - unregister a hardware breakpoint for user space
+ * @tsk: the task in whose memory space the breakpoint is registered
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp)
+{
+	mutex_lock(&hw_breakpoint_mutex);
+	__unregister_user_hw_breakpoint(tsk, bp);
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/**
+ * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to register
+ *
+ * This routine registers a breakpoint to be active at all times.  It
+ * does not guarantee that the breakpoint will be allocated to a debug
+ * register immediately; there may be other higher-priority breakpoints
+ * registered which require the use of all the debug registers.
+ *
+ * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type,
+ * @bp->triggered and @bp->priority must be set properly before invocation
+ *
+ * Returns 1 if @bp is allocated to a debug register, 0 if @bp is
+ * registered but not allowed to be installed, otherwise a negative error
+ * code.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int rc;
+	int pos;
+
+	bp->status = 0;
+	rc = validate_settings(bp, NULL);
+	if (rc)
+		return rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Insert bp in the kernel's list */
+	pos = insert_bp_in_list(bp, NULL, NULL);
+	arch_register_kernel_hw_breakpoint(bp);
+
+	/* Rebalance the priorities.  This will install bp if it
+	 * was allocated a debug register.
+	 */
+	balance_kernel_vs_user();
+
+	/* Did bp get allocated to a debug register?  We can tell from its
+	 * position in the list.  The number of registers allocated to
+	 * kernel breakpoints is num_kbps; all the others are available for
+	 * user breakpoints.  If bp's position in the priority-ordered list
+	 * is low enough, it will get a register.
+	 */
+	if (pos < cur_kbpdata->num_kbps)
+		rc = 1;
+
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
+
+/**
+ * unregister_kernel_hw_breakpoint - unregister a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	if (!bp->status)
+		return;		/* Not registered */
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Remove bp from the kernel's list */
+	remove_bp_from_list(bp, NULL, NULL);
+	arch_unregister_kernel_hw_breakpoint(bp);
+
+	/* Rebalance the priorities.  This will uninstall bp if it
+	 * was allocated a debug register.
+	 */
+	balance_kernel_vs_user();
+
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
+/*
+ * Handle debug exception notifications.
+ */
+static int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+	return hw_breakpoint_handler(data);
+}
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+	.notifier_call = hw_breakpoint_exceptions_notify,
+	.priority = 0x7fffffff /* we need to be notified first */
+};
+
+static int __init init_hw_breakpoint(void)
+{
+	load_debug_registers();
+	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+
+core_initcall(init_hw_breakpoint);
Index: linux-2.6-tip.hbkpt/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/Makefile
+++ linux-2.6-tip.hbkpt/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o
+	    async.o hw_breakpoint.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
Index: linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
@@ -0,0 +1,243 @@
+#ifndef	_ASM_GENERIC_HW_BREAKPOINT_H
+#define	_ASM_GENERIC_HW_BREAKPOINT_H
+
+#ifndef __ARCH_HW_BREAKPOINT_H
+#error "Please don't include this file directly"
+#endif
+
+#ifdef	__KERNEL__
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+
+/**
+ * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
+ * @node: internal linked-list management
+ * @triggered: callback invoked after target address access
+ * @installed: callback invoked when the breakpoint is installed
+ * @uninstalled: callback invoked when the breakpoint is uninstalled
+ * @info: arch-specific breakpoint info (address, length, and type)
+ * @priority: requested priority level
+ * @status: current registration/installation status
+ *
+ * %hw_breakpoint structures are the kernel's way of representing
+ * hardware breakpoints.  These are data breakpoints
+ * (also known as "watchpoints", triggered on data access), and the breakpoint's
+ * target address can be located in either kernel space or user space.
+ *
+ * The breakpoint's address, length, and type are highly
+ * architecture-specific.  The values are encoded in the @info field; you
+ * specify them when registering the breakpoint.  To examine the encoded
+ * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
+ * below.
+ *
+ * The address is specified as a regular kernel pointer (for kernel-space
+ * breakponts) or as an %__user pointer (for user-space breakpoints).
+ * With register_user_hw_breakpoint(), the address must refer to a
+ * location in user space.  The breakpoint will be active only while the
+ * requested task is running.  Conversely with
+ * register_kernel_hw_breakpoint(), the address must refer to a location
+ * in kernel space, and the breakpoint will be active on all CPUs
+ * regardless of the current task.
+ *
+ * The length is the breakpoint's extent in bytes, which is subject to
+ * certain limitations.  include/asm/hw_breakpoint.h contains macros
+ * defining the available lengths for a specific architecture.  Note that
+ * the address's alignment must match the length.  The breakpoint will
+ * catch accesses to any byte in the range from address to address +
+ * (length - 1).
+ *
+ * The breakpoint's type indicates the sort of access that will cause it
+ * to trigger.  Possible values may include:
+ *
+ * 	%HW_BREAKPOINT_RW (triggered on read or write access),
+ * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
+ * 	%HW_BREAKPOINT_READ (triggered on read access).
+ *
+ * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
+ * possibilities are available on all architectures.  Execute breakpoints
+ * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
+ *
+ * When a breakpoint gets hit, the @triggered callback is
+ * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
+ * processor registers.
+ * Data breakpoints occur after the memory access has taken place.
+ * Breakpoints are disabled during execution @triggered, to avoid
+ * recursive traps and allow unhindered access to breakpointed memory.
+ *
+ * Hardware breakpoints are implemented using the CPU's debug registers,
+ * which are a limited hardware resource.  Requests to register a
+ * breakpoint will always succeed provided the parameters are valid,
+ * but the breakpoint may not be installed in a debug register right
+ * away.  Physical debug registers are allocated based on the priority
+ * level stored in @priority (higher values indicate higher priority).
+ * User-space breakpoints within a single thread compete with one
+ * another, and all user-space breakpoints compete with all kernel-space
+ * breakpoints; however user-space breakpoints in different threads do
+ * not compete.  %HW_BREAKPOINT_PRIO_PTRACE is the level used for ptrace
+ * requests; an unobtrusive kernel-space breakpoint will use
+ * %HW_BREAKPOINT_PRIO_NORMAL to avoid disturbing user programs.  A
+ * kernel-space breakpoint that always wants to be installed and doesn't
+ * care about disrupting user debugging sessions can specify
+ * %HW_BREAKPOINT_PRIO_HIGH.
+ *
+ * A particular breakpoint may be allocated (installed in) a debug
+ * register or deallocated (uninstalled) from its debug register at any
+ * time, as other breakpoints are registered and unregistered.  The
+ * @installed and @uninstalled callbacks are invoked in_atomic when these
+ * events occur.  It is legal for @installed or @uninstalled to be %NULL. Note
+ * that it is not possible to register or unregister a user-space breakpoint
+ * from within a callback routine, since doing so requires a process context.
+ * Note that for user breakpoints, while in @installed and @uninstalled the
+ * thread may be context switched. Hence it may not be safe to call printk().
+ *
+ * For kernel-space breakpoints, @installed is invoked after the
+ * breakpoint is actually installed and @uninstalled is invoked before
+ * the breakpoint is actually uninstalled.  As a result the @triggered routine
+ * may be invoked when not expected, but this way you will know that during the
+ * time interval from @installed to @uninstalled, all events are faithfully
+ * reported.  (It is not possible to do any better than this in general, because
+ * on SMP systems there is no way to set a debug register simultaneously on all
+ * CPUs.)  The same isn't always true with user-space breakpoints, but the
+ * differences should not be visible to a user process.
+ *
+ * If you need to know whether your kernel-space breakpoint was installed
+ * immediately upon registration, you can check the return value from
+ * register_kernel_hw_breakpoint().  If the value is not > 0, you can
+ * give up and unregister the breakpoint right away.
+ *
+ * @node and @status are intended for internal use.  However @status
+ * may be read to determine whether or not the breakpoint is currently
+ * installed.  (The value is not reliable unless local interrupts are
+ * disabled.)
+ *
+ * This sample code sets a breakpoint on pid_max and registers a callback
+ * function for writes to that variable.  Note that it is not portable
+ * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * #include <asm/hw_breakpoint.h>
+ *
+ * struct hw_breakpoint my_bp;
+ *
+ * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
+ * 	dump_stack();
+ *  	.......<more debugging output>........
+ * }
+ *
+ * static struct hw_breakpoint my_bp;
+ *
+ * static int init_module(void)
+ * {
+ *	..........<do anything>............
+ *	my_bp.info.type = HW_BREAKPOINT_WRITE;
+ *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
+ *	my_bp.info.priority = HW_BREAKPOINT_PRIO_NORMAL;
+ *
+ *	my_bp.installed = (void *)my_bp_installed;
+ *	my_bp.uninstalled = (void *)my_bp_uninstalled;
+ *	my_bp.triggered = (void *)my_triggered;
+ *
+ *	rc = register_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ *	..........<do anything>............
+ *	unregister_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * ----------------------------------------------------------------------
+ */
+struct hw_breakpoint {
+	struct list_head	node;
+	void		(*installed)(struct hw_breakpoint *);
+	void		(*uninstalled)(struct hw_breakpoint *);
+	void		(*triggered)(struct hw_breakpoint *,
+							struct pt_regs *);
+	struct arch_hw_breakpoint	info;
+	u8		priority;
+	u8		status;
+};
+
+struct kernel_bp_data;
+struct cpu_hw_breakpoint;
+
+/*
+ * Inline accessor routines to retrieve the arch-specific parts of
+ * a breakpoint structure:
+ */
+static const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp);
+static const void __user *hw_breakpoint_get_uaddress(struct hw_breakpoint *bp);
+static unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp);
+static unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp);
+
+/*
+ * len and type values are defined in include/asm/hw_breakpoint.h.
+ * Available values vary according to the architecture.  On i386 the
+ * possibilities are:
+ *
+ *	HW_BREAKPOINT_LEN_1
+ *	HW_BREAKPOINT_LEN_2
+ *	HW_BREAKPOINT_LEN_4
+ *	HW_BREAKPOINT_LEN_EXECUTE
+ *	HW_BREAKPOINT_RW
+ *	HW_BREAKPOINT_READ
+ *	HW_BREAKPOINT_EXECUTE
+ *
+ * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
+ * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
+ * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
+ */
+
+/* Standard HW breakpoint priority levels (higher value = higher priority) */
+#define HW_BREAKPOINT_PRIO_NORMAL	25
+#define HW_BREAKPOINT_PRIO_PTRACE	50
+#define HW_BREAKPOINT_PRIO_HIGH		75
+
+/* HW breakpoint status values (0 = not registered) */
+#define HW_BREAKPOINT_REGISTERED	1
+#define HW_BREAKPOINT_INSTALLED		2
+
+static DEFINE_MUTEX(hw_breakpoint_mutex);	/* Protects everything */
+
+/*
+ * The following two routines are meant to be called only from within
+ * the ptrace or utrace subsystems.  The tsk argument will usually be a
+ * process being debugged by the current task, although it is also legal
+ * for tsk to be the current task.  In any case it must be guaranteed
+ * that tsk will not start running in user mode while its breakpoints are
+ * being modified.
+ */
+int register_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp);
+void unregister_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp);
+
+/*
+ * Declare arch-specific data structures here. They are defined in
+ * arch/x86/include/asm/hw_breakpoint.h
+ */
+
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void switch_to_none_hw_breakpoint(void);
+
+struct thread_hw_breakpoint *alloc_thread_hw_breakpoint(
+						struct task_struct *tsk);
+
+extern struct kernel_bp_data		*cur_kbpdata;
+extern int			cur_kbpindex;	/* Alternates 0, 1, ... */
+extern struct list_head kernel_bps;		/* Kernel breakpoint list */
+DECLARE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
+
+#endif	/* __KERNEL__ */
+#endif	/* _ASM_GENERIC_HW_BREAKPOINT_H */


^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2009-03-24 15:25 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20090319234044.410725944@K.Prasad>
2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
2009-03-20 14:33   ` Alan Stern
2009-03-20 18:30     ` Ingo Molnar
2009-03-21 17:32       ` K.Prasad
2009-03-20 18:32     ` Ingo Molnar
2009-03-21 17:26     ` K.Prasad
2009-03-21 21:39       ` Alan Stern
2009-03-23 19:03         ` K.Prasad
2009-03-23 19:21           ` Alan Stern
2009-03-23 20:42             ` K.Prasad
2009-03-23 21:20               ` Alan Stern
2009-03-19 23:48 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
2009-03-19 23:48 ` [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers K.Prasad
2009-03-19 23:49 ` [Patch 04/11] Introduce user-space " K.Prasad
2009-03-19 23:49 ` [Patch 05/11] Use wrapper routines around debug registers in processor related functions K.Prasad
2009-03-19 23:49 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
2009-03-19 23:49 ` [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints K.Prasad
2009-03-19 23:49 ` [Patch 08/11] Modify Ptrace routines to access breakpoint registers K.Prasad
2009-03-19 23:49 ` [Patch 09/11] Cleanup HW Breakpoint registers before kexec K.Prasad
2009-03-19 23:50 ` [Patch 10/11] Sample HW breakpoint over kernel data address K.Prasad
2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
2009-03-20  9:04   ` Frederic Weisbecker
2009-03-21 16:24     ` K.Prasad
2009-03-21 16:39       ` Steven Rostedt
2009-03-23 19:08         ` K.Prasad
     [not found] <20090324152028.754123712@K.Prasad>
2009-03-24 15:24 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
     [not found] <20090307045120.039324630@linux.vnet.ibm.com>
2009-03-07  5:04 ` prasad
     [not found] <20090305043440.189041194@linux.vnet.ibm.com>
2009-03-05  4:37 ` [patch " prasad
2009-03-10 13:50   ` Ingo Molnar
2009-03-10 14:19     ` Alan Stern
2009-03-10 14:50       ` Ingo Molnar
2009-03-11 12:57         ` K.Prasad
2009-03-11 13:35           ` Ingo Molnar

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.