All of lore.kernel.org
 help / color / mirror / Atom feed
* [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
       [not found] <20090319234044.410725944@K.Prasad>
@ 2009-03-19 23:48 ` K.Prasad
  2009-03-20 14:33   ` Alan Stern
  2009-03-19 23:48 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 1 --]
[-- Type: text/plain, Size: 16776 bytes --]

This patch introduces two new files hw_breakpoint.[ch] which defines the 
generic interfaces to use hardware breakpoint infrastructure of the system. 

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/Kconfig                        |    3 
 include/asm-generic/hw_breakpoint.h |  140 +++++++++++++
 kernel/Makefile                     |    1 
 kernel/hw_breakpoint.c              |  361 ++++++++++++++++++++++++++++++++++++
 4 files changed, 505 insertions(+)

Index: linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
@@ -0,0 +1,361 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ *
+ * This file contains the arch-independent routines.  It is not meant
+ * to be compiled as a standalone source file; rather it should be
+ * #include'd by the arch-specific implementation.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Array of kernel-space breakpoint structures */
+struct hw_breakpoint *hbkpt_kernel[HB_NUM];
+/*
+ * Kernel breakpoints grow downwards, starting from HB_NUM
+ * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
+ * kernel-space request
+ */
+unsigned int hbkpt_kernel_pos;
+
+/* An array containing refcount of threads using a given bkpt register */
+unsigned int hbkpt_user_max_refcount[HB_NUM];
+
+/* One higher than the highest counted user-space breakpoint register */
+unsigned int hbkpt_user_max;
+
+struct task_struct *last_debugged_task;
+
+/*
+ * Install the debug register values for a new thread.
+ */
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	/* Set the debug register */
+	arch_install_thread_hbkpt(tsk);
+	last_debugged_task = current;
+
+	put_cpu_no_resched();
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void switch_to_none_hw_breakpoint(void)
+{
+	arch_install_none();
+	put_cpu_no_resched();
+}
+
+/*
+ * Load the debug registers during startup of a CPU.
+ */
+void load_debug_registers(void)
+{
+	int i;
+	unsigned long flags;
+
+	/* Prevent IPIs for new kernel breakpoint updates */
+	local_irq_save(flags);
+
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		if (hbkpt_kernel[i])
+			on_each_cpu(arch_install_kernel_hbkpt,
+				(void *)hbkpt_kernel[i], 0);
+	if (current->thread.dr7)
+		arch_install_thread_hbkpt(current);
+
+	local_irq_restore(flags);
+}
+
+/*
+ * Erase all the hardware breakpoint info associated with a thread.
+ *
+ * If tsk != current then tsk must not be usable (for example, a
+ * child being cleaned up from a failed fork).
+ */
+void flush_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Let the breakpoints know they are being uninstalled */
+
+	/* The thread no longer has any breakpoints associated with it */
+	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	for (i = 0; i < HB_NUM; i++) {
+		if (thread->hbkpt[i]) {
+			hbkpt_user_max_refcount[i]--;
+			if (!hbkpt_user_max_refcount[i])
+				hbkpt_user_max--;
+			kfree(thread->hbkpt[i]);
+			thread->hbkpt[i] = NULL;
+		}
+	}
+	thread->hbkpt_num_installed = 0;
+
+	/* Actually uninstall the breakpoints if necessary */
+	if (tsk == current)
+		switch_to_none_hw_breakpoint();
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/*
+ * Copy the hardware breakpoint info from a thread to its cloned child.
+ */
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags)
+{
+	/* We will assume that breakpoint settings are not inherited
+	 * and the child starts out with no debug registers set.
+	 * But what about CLONE_PTRACE?
+	 */
+	clear_tsk_thread_flag(child, TIF_DEBUG);
+	return 0;
+}
+
+/*
+ * Validate the settings in a hw_breakpoint structure.
+ */
+static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
+{
+	int ret;
+	unsigned int align;
+
+	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
+	if (ret < 0)
+		goto err;
+
+	/* Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (bp->info.address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(bp->info.address, tsk))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(bp->info.address))
+			return -EFAULT;
+	}
+ err:
+	return ret;
+}
+
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int rc;
+
+	/* Do not overcommit. Fail if kernel has used the hbkpt registers */
+	if (pos >= hbkpt_kernel_pos)
+		return -ENOSPC;
+
+	rc = validate_settings(bp, tsk);
+	if (rc)
+		return rc;
+
+	thread->hbkpt[pos] = bp;
+	thread->hbkpt_num_installed++;
+	hbkpt_user_max_refcount[pos]++;
+	/* 'tsk' is the thread that uses max number of hbkpt registers */
+	if (hbkpt_user_max < thread->hbkpt_num_installed)
+		hbkpt_user_max++;
+
+	arch_register_user_hw_breakpoint(pos, bp, tsk);
+
+	/*
+	 * Does it need to be installed right now?
+	 * Otherwise it will get installed the next time tsk runs
+	 */
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	return rc;
+}
+
+/*
+ * Modify the address of a hbkpt register already in use by the task
+ * Do not invoke this in-lieu of a __unregister_user_hw_breakpoint()
+ */
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	int rc;
+	struct thread_struct *thread = &(tsk->thread);
+
+	if ((pos >= hbkpt_kernel_pos) || (validate_settings(bp, tsk)))
+		return -EINVAL;
+
+	thread->hbkpt[pos] = bp;
+
+	/*
+	 * 'pos' must be that of a hbkpt register already used by 'tsk'
+	 * Otherwise arch_modify_user_hw_breakpoint() will fail
+	 */
+	rc = arch_modify_user_hw_breakpoint(pos, bp, tsk);
+	if (rc)
+		return rc;
+
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	return 0;
+}
+
+/*
+ * Actual implementation of unregister_user_hw_breakpoint.
+ */
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+						struct hw_breakpoint *bp)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if (!bp)
+		return;
+
+	hbkpt_user_max_refcount[pos]--;
+	if ((hbkpt_user_max == pos + 1) && (hbkpt_user_max_refcount[pos] == 0))
+		hbkpt_user_max--;
+	thread->hbkpt_num_installed--;
+
+	arch_unregister_user_hw_breakpoint(pos, bp, tsk);
+
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+	kfree(tsk->thread.hbkpt[pos]);
+	tsk->thread.hbkpt[pos] = NULL;
+}
+
+/**
+ * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to register
+ *
+ * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
+ * @bp->triggered must be set properly before invocation
+ *
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int rc;
+
+	rc = validate_settings(bp, NULL);
+	if (rc)
+		return rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Check if we are over-committing */
+	if (hbkpt_kernel_pos <= hbkpt_user_max) {
+		mutex_unlock(&hw_breakpoint_mutex);
+		return -EINVAL;
+	}
+
+	hbkpt_kernel_pos--;
+	hbkpt_kernel[hbkpt_kernel_pos] = bp;
+	arch_register_kernel_hw_breakpoint(bp);
+
+	mutex_unlock(&hw_breakpoint_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
+
+/**
+ * unregister_kernel_hw_breakpoint - unregister a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int i, j;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		if (bp == hbkpt_kernel[i])
+			break;
+
+	arch_unregister_kernel_hw_breakpoint(i);
+
+	/*
+	 * We'll shift the breakpoints one-level above to accomodate new thread
+	 * requests
+	 */
+	if (i > hbkpt_kernel_pos)
+		for (j = i; j == hbkpt_kernel_pos; j--)
+			hbkpt_kernel[j] = hbkpt_kernel[j-1];
+	hbkpt_kernel_pos++;
+
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
+/*
+ * Handle debug exception notifications.
+ */
+static int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+	return hw_breakpoint_handler(data);
+}
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+	.notifier_call = hw_breakpoint_exceptions_notify,
+	/* we need to be notified first */
+	.priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+	int i;
+
+	hbkpt_kernel_pos = HB_NUM;
+	for (i = 0; i < HB_NUM; i++)
+		hbkpt_user_max_refcount[i] = 0;
+	hbkpt_user_max = 0;
+	load_debug_registers();
+
+	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+
+core_initcall(init_hw_breakpoint);
Index: linux-2.6-tip.hbkpt/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/Makefile
+++ linux-2.6-tip.hbkpt/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
@@ -0,0 +1,140 @@
+#ifndef	_ASM_GENERIC_HW_BREAKPOINT_H
+#define	_ASM_GENERIC_HW_BREAKPOINT_H
+
+#ifndef __ARCH_HW_BREAKPOINT_H
+#error "Please don't include this file directly"
+#endif
+
+#ifdef	__KERNEL__
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+
+/**
+ * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
+ * @triggered: callback invoked after target address access
+ * @info: arch-specific breakpoint info (address, length, and type)
+ *
+ * %hw_breakpoint structures are the kernel's way of representing
+ * hardware breakpoints.  These are data breakpoints
+ * (also known as "watchpoints", triggered on data access), and the breakpoint's
+ * target address can be located in either kernel space or user space.
+ *
+ * The breakpoint's address, length, and type are highly
+ * architecture-specific.  The values are encoded in the @info field; you
+ * specify them when registering the breakpoint.  To examine the encoded
+ * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
+ * below.
+ *
+ * The address is specified as a regular kernel pointer (for kernel-space
+ * breakponts) or as an %__user pointer (for user-space breakpoints).
+ * With register_user_hw_breakpoint(), the address must refer to a
+ * location in user space.  The breakpoint will be active only while the
+ * requested task is running.  Conversely with
+ * register_kernel_hw_breakpoint(), the address must refer to a location
+ * in kernel space, and the breakpoint will be active on all CPUs
+ * regardless of the current task.
+ *
+ * The length is the breakpoint's extent in bytes, which is subject to
+ * certain limitations.  include/asm/hw_breakpoint.h contains macros
+ * defining the available lengths for a specific architecture.  Note that
+ * the address's alignment must match the length.  The breakpoint will
+ * catch accesses to any byte in the range from address to address +
+ * (length - 1).
+ *
+ * The breakpoint's type indicates the sort of access that will cause it
+ * to trigger.  Possible values may include:
+ *
+ * 	%HW_BREAKPOINT_RW (triggered on read or write access),
+ * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
+ * 	%HW_BREAKPOINT_READ (triggered on read access).
+ *
+ * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
+ * possibilities are available on all architectures.  Execute breakpoints
+ * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
+ *
+ * When a breakpoint gets hit, the @triggered callback is
+ * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
+ * processor registers.
+ * Data breakpoints occur after the memory access has taken place.
+ * Breakpoints are disabled during execution @triggered, to avoid
+ * recursive traps and allow unhindered access to breakpointed memory.
+ *
+ * This sample code sets a breakpoint on pid_max and registers a callback
+ * function for writes to that variable.  Note that it is not portable
+ * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * #include <asm/hw_breakpoint.h>
+ *
+ * struct hw_breakpoint my_bp;
+ *
+ * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
+ * 	dump_stack();
+ *  	.......<more debugging output>........
+ * }
+ *
+ * static struct hw_breakpoint my_bp;
+ *
+ * static int init_module(void)
+ * {
+ *	..........<do anything>............
+ *	my_bp.info.type = HW_BREAKPOINT_WRITE;
+ *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
+ *
+ *	my_bp.installed = (void *)my_bp_installed;
+ *
+ *	rc = register_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ *	..........<do anything>............
+ *	unregister_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * ----------------------------------------------------------------------
+ */
+struct hw_breakpoint {
+	void (*triggered)(struct hw_breakpoint *, struct pt_regs *);
+	struct arch_hw_breakpoint info;
+};
+
+/*
+ * len and type values are defined in include/asm/hw_breakpoint.h.
+ * Available values vary according to the architecture.  On i386 the
+ * possibilities are:
+ *
+ *	HW_BREAKPOINT_LEN_1
+ *	HW_BREAKPOINT_LEN_2
+ *	HW_BREAKPOINT_LEN_4
+ *	HW_BREAKPOINT_LEN_EXECUTE
+ *	HW_BREAKPOINT_RW
+ *	HW_BREAKPOINT_READ
+ *	HW_BREAKPOINT_EXECUTE
+ *
+ * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
+ * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
+ * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
+ */
+
+static DEFINE_MUTEX(hw_breakpoint_mutex);	/* Protects everything */
+
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void switch_to_none_hw_breakpoint(void);
+
+extern unsigned int hbkpt_kernel_pos;
+extern unsigned int hbkpt_user_max;
+extern struct task_struct *last_debugged_task;
+
+#endif	/* __KERNEL__ */
+#endif	/* _ASM_GENERIC_HW_BREAKPOINT_H */
Index: linux-2.6-tip.hbkpt/arch/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/Kconfig
+++ linux-2.6-tip.hbkpt/arch/Kconfig
@@ -106,3 +106,6 @@ config HAVE_CLK
 	help
 	  The <linux/clk.h> calls support software clock gating and
 	  thus are a key power management tool on many systems.
+
+config HAVE_HW_BREAKPOINT
+	bool


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090319234044.410725944@K.Prasad>
  2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
@ 2009-03-19 23:48 ` K.Prasad
  2009-03-19 23:48 ` [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers K.Prasad
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 14025 bytes --]

This patch introduces two new files named hw_breakpoint.[ch] inside x86 specific
directories. They contain functions which help validate and serve requests for 
using Hardware Breakpoint registers on x86 processors.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/Kconfig                     |    1 
 arch/x86/include/asm/hw_breakpoint.h |   69 ++++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  384 +++++++++++++++++++++++++++++++++++
 4 files changed, 455 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,384 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Unmasked kernel DR7 value */
+static unsigned long kdr7;
+
+/*
+ * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
+ * Used to clear and verify the status of bits corresponding to DR0 - DR3
+ */
+static const unsigned long	dr7_masks[HB_NUM] = {
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
+	0x0f000030,	/* LEN2, R/W2, G2, L2 */
+	0xf00000c0	/* LEN3, R/W3, G3, L3 */
+};
+
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long temp;
+
+	temp = (len | type) & 0xf;
+	temp <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	temp |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+				DR_GLOBAL_SLOWDOWN;
+	return temp;
+}
+
+/*
+ * Install the kernel breakpoints in their debug registers.
+ */
+void arch_install_kernel_hbkpt(void *bkpt)
+{
+	struct hw_breakpoint *bp;
+	int i;
+	unsigned long dr7;
+
+	bp = (struct hw_breakpoint *)bkpt;
+
+	kdr7 &= ~(dr7_masks[hbkpt_kernel_pos]);
+	kdr7 |= encode_dr7(hbkpt_kernel_pos, bp->info.len, bp->info.type);
+
+	get_debugreg(dr7, 7);
+	/* Clear the bits corresponding to 'pos' register in dr7 */
+	dr7 &= ~(dr7_masks[hbkpt_kernel_pos]);
+	dr7 |= kdr7;
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+
+	/* Kernel hbkpts always begin at 'hbkpt_kernel_pos' and upto HB_NUM */
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		set_debugreg(hbkpt_kernel[i]->info.address, i);
+
+	/* No need to set DR6 */
+	set_debugreg(dr7, 7);
+}
+
+/*
+ * Install the thread breakpoints in their debug registers.
+ */
+void arch_install_thread_hbkpt(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	for (i = 0; i < hbkpt_user_max; i++)
+		if (thread->hbkpt[i])
+			set_debugreg(thread->hbkpt[i]->info.address, i);
+
+	/* No need to set DR6 */
+
+	set_debugreg((kdr7 | thread->dr7), 7);
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void arch_install_none()
+{
+	/* Clear the user-space portion of dr7 by setting only kdr7 */
+	set_debugreg(kdr7, 7);
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
+{
+#ifdef CONFIG_X86_32
+	return (va <= TASK_SIZE - 3);
+#else /* X86_64 */
+	return (va <= TASK_SIZE - 7);
+#endif
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_va_in_kernelspace(unsigned long va)
+{
+	return (va >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	/*
+	 * User-space requests will always have the address field populated
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (bp->info.address)
+		return;
+	if (bp->info.name)
+		bp->info.address = (unsigned long)
+					kallsyms_lookup_name(bp->info.name);
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/* Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address, tsk)) &&
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+				break;
+	case HW_BREAKPOINT_RW:
+				break;
+	default:
+		return ret;
+	}
+
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+	default:
+		return ret;
+	}
+
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Register a new user breakpoint structure.
+ */
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+}
+
+/*
+ * Modify an existing user breakpoint structure.
+ */
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	/* Check if the register to be modified was enabled by the thread */
+	if (!(thread->dr7 & (1 << (pos * DR_ENABLE_SIZE))))
+		return -EINVAL;
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+
+	return 0;
+}
+
+/*
+ * Unregister a user breakpoint structure.
+ */
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if (!thread->hbkpt[pos])
+		return;
+
+	thread->hbkpt[pos]->info.address = 0;
+	thread->dr7 &= ~dr7_masks[pos];
+}
+
+/*
+ * Register a kernel breakpoint structure.
+ */
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	on_each_cpu(arch_install_kernel_hbkpt, (void *)bp, 0);
+}
+
+/*
+ * Unregister a kernel breakpoint structure.
+ */
+void arch_unregister_kernel_hw_breakpoint(int pos)
+{
+	unsigned long dr7;
+
+	kdr7 &= ~(dr7_masks[pos]);
+
+	get_debugreg(dr7, 7);
+	dr7  &= ~(dr7_masks[pos]);
+	set_debugreg(dr7, 7);
+}
+
+/* End of arch-specific hook routines */
+
+/*
+ * Copy out the debug register information for a core dump.
+ *
+ * tsk must be equal to current.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int i;
+
+	memset(u_debugreg, 0, sizeof u_debugreg);
+	for (i = 0; i < thread->hbkpt_num_installed && thread->hbkpt[i]; ++i)
+		u_debugreg[i] = thread->hbkpt[i]->info.address;
+	u_debugreg[7] = thread->dr7;
+	u_debugreg[6] = thread->dr6;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	int i;
+	struct hw_breakpoint *bp;
+	/* The DR6 value is stored in args->err */
+	unsigned long dr7, dr6 = args->err;
+
+	if (dr6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	get_debugreg(dr7, 7);
+
+	/* Disable breakpoints during exception handling */
+	set_debugreg(0UL, 7);
+
+	/* Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.dr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Lazy debug register switching */
+	if (last_debugged_task != current)
+		switch_to_none_hw_breakpoint();
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(dr6 & (DR_TRAP0 << i))))
+			continue;
+
+		/* Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (i < hbkpt_user_max)
+			bp = current->thread.hbkpt[i];
+		else if (i >= hbkpt_kernel_pos)
+			bp = hbkpt_kernel[i];
+		else		/* False alarm due to lazy DR switching */
+			continue;
+		if (!bp)
+			goto ret_path;
+
+		switch (bp->info.type) {
+		case HW_BREAKPOINT_WRITE:
+		case HW_BREAKPOINT_RW:
+			if (bp->triggered)
+				(bp->triggered)(bp, args->regs);
+			/* Re-enable the breakpoints */
+			put_cpu_no_resched();
+			if (arch_check_va_in_userspace(bp->info.address,
+							current))
+				goto ret_notify_done;
+			else
+				goto ret_notify_stop;
+		/*
+		 * Presently we allow instruction breakpoints only in
+		 * user-space when requested through ptrace.
+		 */
+		case HW_BREAKPOINT_EXECUTE:
+			if (arch_check_va_in_userspace(bp->info.address,
+							current)) {
+				(bp->triggered)(bp, args->regs);
+			/*
+			 * do_debug will notify user through a SIGTRAP signal
+			 * So we are not requesting a NOTIFY_STOP here
+			 */
+				goto ret_notify_done;
+			}
+		}
+	}
+
+ret_path:
+	/* Stop processing further if the exception is a stray one */
+	if (!(dr6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		goto ret_notify_stop;
+
+ret_notify_done:
+	set_debugreg(dr7, 7);
+	return NOTIFY_DONE;
+ret_notify_stop:
+	set_debugreg(dr7, 7);
+	return NOTIFY_STOP;
+}
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,69 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+};
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define HW_BREAKPOINT_EXECUTE	0x80
+/* trigger on memory write */
+#define HW_BREAKPOINT_WRITE	0x81
+/* trigger on memory read or write */
+#define HW_BREAKPOINT_RW	0x83
+
+/* Total number of available HW breakpoint registers */
+#define HB_NUM 4
+
+extern struct hw_breakpoint *hbkpt_kernel[HB_NUM];
+extern unsigned int hbkpt_user_max_refcount[HB_NUM];
+
+/*
+ * Ptrace support: breakpoint trigger routine.
+ */
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+void arch_install_thread_hbkpt(struct task_struct *tsk);
+void arch_install_none(void);
+void arch_install_kernel_hbkpt(void *);
+int arch_check_va_in_userspace(unsigned long va,
+						struct task_struct *tsk);
+int arch_check_va_in_kernelspace(unsigned long va);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk);
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void arch_unregister_kernel_hw_breakpoint(int pos);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
Index: linux-2.6-tip.hbkpt/arch/x86/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/Kconfig
+++ linux-2.6-tip.hbkpt/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_HW_BREAKPOINT
 
 config ARCH_DEFCONFIG
 	string


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers
       [not found] <20090319234044.410725944@K.Prasad>
  2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
  2009-03-19 23:48 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
@ 2009-03-19 23:48 ` K.Prasad
  2009-03-19 23:49 ` [Patch 04/11] Introduce user-space " K.Prasad
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 3 --]
[-- Type: text/plain, Size: 3661 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch modifies the breakpoint exception handler code to use the abstract
register names.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/traps.c |   73 ++++++++++++++++--------------------------------
 1 file changed, 25 insertions(+), 48 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/traps.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/traps.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/traps.c
@@ -530,13 +530,14 @@ asmlinkage __kprobes struct pt_regs *syn
 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	unsigned long condition;
+	unsigned long dr6;
 	int si_code;
 
-	get_debugreg(condition, 6);
+	get_debugreg(dr6, 6);
+	set_debugreg(0, 6);	/* DR6 may or may not be cleared by the CPU */
 
 	/* Catch kmemcheck conditions first of all! */
-	if (condition & DR_STEP && kmemcheck_trap(regs))
+	if (dr6 & DR_STEP && kmemcheck_trap(regs))
 		return;
 
 	/*
@@ -545,61 +546,37 @@ dotraplinkage void __kprobes do_debug(st
 	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
 	tsk->thread.debugctlmsr = 0;
 
-	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+	/* Store the virtualized DR6 value */
+	tsk->thread.dr6 = dr6;
+
+	if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code,
 						SIGTRAP) == NOTIFY_STOP)
 		return;
 
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
-	/* Mask out spurious debug traps due to lazy DR7 setting */
-	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-		if (!tsk->thread.debugreg7)
-			goto clear_dr7;
-	}
-
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK)
-		goto debug_vm86;
-#endif
-
-	/* Save debug status register where ptrace can see it */
-	tsk->thread.debugreg6 = condition;
-
-	/*
-	 * Single-stepping through TF: make sure we ignore any events in
-	 * kernel space (but re-enable TF when returning to user mode).
-	 */
-	if (condition & DR_STEP) {
-		if (!user_mode(regs))
-			goto clear_TF_reenable;
+	if (regs->flags & X86_VM_MASK) {
+		handle_vm86_trap((struct kernel_vm86_regs *) regs,
+				error_code, 1);
+		return;
 	}
 
-	si_code = get_si_code(condition);
-	/* Ok, finally something we can handle */
-	send_sigtrap(tsk, regs, error_code, si_code);
-
 	/*
-	 * Disable additional traps. They'll be re-enabled when
-	 * the signal is delivered.
+	 * Single-stepping through system calls: ignore any exceptions in
+	 * kernel space, but re-enable TF when returning to user mode.
+	 *
+	 * We already checked v86 mode above, so we can check for kernel mode
+	 * by just checking the CPL of CS.
 	 */
-clear_dr7:
-	set_debugreg(0, 7);
-	preempt_conditional_cli(regs);
-	return;
-
-#ifdef CONFIG_X86_32
-debug_vm86:
-	/* reenable preemption: handle_vm86_trap() might sleep */
-	dec_preempt_count();
-	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-	conditional_cli(regs);
-	return;
-#endif
-
-clear_TF_reenable:
-	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-	regs->flags &= ~X86_EFLAGS_TF;
+	if ((dr6 & DR_STEP) && !user_mode(regs)) {
+		tsk->thread.dr6 &= ~DR_STEP;
+		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+		regs->flags &= ~X86_EFLAGS_TF;
+	}
+	si_code = get_si_code(dr6);
+	if (tsk->thread.dr6 & (DR_STEP|DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3))
+		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
 	return;
 }


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 04/11] Introduce user-space debug registers
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (2 preceding siblings ...)
  2009-03-19 23:48 ` [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 05/11] Use wrapper routines around debug registers in processor related functions K.Prasad
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 4 --]
[-- Type: text/plain, Size: 2977 bytes --]

This patch introduces virtual debug registers to used by the per-thread
structure and wrapper routines to manage debug registers by process-related
functions.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/include/asm/debugreg.h  |   23 +++++++++++++++++++++++
 arch/x86/include/asm/processor.h |   16 +++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/debugreg.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/include/asm/debugreg.h
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/debugreg.h
@@ -49,6 +49,8 @@
 
 #define DR_LOCAL_ENABLE_SHIFT 0    /* Extra shift to the local enable bit */
 #define DR_GLOBAL_ENABLE_SHIFT 1   /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1)      /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2)     /* Global enable for reg 0 */
 #define DR_ENABLE_SIZE 2           /* 2 enable bits per register */
 
 #define DR_LOCAL_ENABLE_MASK (0x55)  /* Set  local bits for all 4 regs */
@@ -67,4 +69,25 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+/* For process management */
+void flush_thread_hw_breakpoint(struct task_struct *tsk);
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags);
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8]);
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk);
+
+/* For CPU management */
+void load_debug_registers(void);
+static inline void hw_breakpoint_disable(void)
+{
+	set_debugreg(0UL, 7);
+}
+
+#endif	/* __KERNEL__ */
+
 #endif /* _ASM_X86_DEBUGREG_H */
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/processor.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/include/asm/processor.h
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/processor.h
@@ -29,6 +29,7 @@ struct mm_struct;
 #include <linux/threads.h>
 #include <linux/init.h>
 
+#define HB_NUM 4
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter").
@@ -424,13 +425,14 @@ struct thread_struct {
 	unsigned long		ip;
 	unsigned long		fs;
 	unsigned long		gs;
-	/* Hardware debugging registers: */
-	unsigned long		debugreg0;
-	unsigned long		debugreg1;
-	unsigned long		debugreg2;
-	unsigned long		debugreg3;
-	unsigned long		debugreg6;
-	unsigned long		debugreg7;
+	/* Hardware breakpoint info */
+	struct hw_breakpoint	*hbkpt[HB_NUM];
+	unsigned int		hbkpt_num_installed;
+	/* Thread's view of debug reg 6 */
+	unsigned long		dr6;
+	/* Thread's view of debug reg 7 */
+	unsigned long		dr7;
+
 	/* Fault info: */
 	unsigned long		cr2;
 	unsigned long		trap_no;


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 05/11] Use wrapper routines around debug registers in processor related functions
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (3 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 04/11] Introduce user-space " K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 5 --]
[-- Type: text/plain, Size: 3702 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of wrapper routines to access the debug/breakpoint
registers.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/smpboot.c |    3 +++
 arch/x86/power/cpu_32.c   |   16 +++-------------
 arch/x86/power/cpu_64.c   |   15 +++------------
 3 files changed, 9 insertions(+), 25 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/power/cpu_32.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/power/cpu_32.c
+++ linux-2.6-tip.hbkpt/arch/x86/power/cpu_32.c
@@ -12,6 +12,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/xcr.h>
+#include <asm/debugreg.h>
 
 static struct saved_context saved_context;
 
@@ -47,6 +48,7 @@ static void __save_processor_state(struc
 	ctxt->cr2 = read_cr2();
 	ctxt->cr3 = read_cr3();
 	ctxt->cr4 = read_cr4_safe();
+	hw_breakpoint_disable();
 }
 
 /* Needed by apm.c */
@@ -79,19 +81,7 @@ static void fix_processor_context(void)
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
 
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7) {
-		set_debugreg(current->thread.debugreg0, 0);
-		set_debugreg(current->thread.debugreg1, 1);
-		set_debugreg(current->thread.debugreg2, 2);
-		set_debugreg(current->thread.debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-	}
-
+	load_debug_registers();
 }
 
 static void __restore_processor_state(struct saved_context *ctxt)
Index: linux-2.6-tip.hbkpt/arch/x86/power/cpu_64.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/power/cpu_64.c
+++ linux-2.6-tip.hbkpt/arch/x86/power/cpu_64.c
@@ -15,6 +15,7 @@
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
 #include <asm/xcr.h>
+#include <asm/debugreg.h>
 
 static void fix_processor_context(void);
 
@@ -70,6 +71,7 @@ static void __save_processor_state(struc
 	ctxt->cr3 = read_cr3();
 	ctxt->cr4 = read_cr4();
 	ctxt->cr8 = read_cr8();
+	hw_breakpoint_disable();
 }
 
 void save_processor_state(void)
@@ -158,16 +160,5 @@ static void fix_processor_context(void)
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
 
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7){
-                loaddebug(&current->thread, 0);
-                loaddebug(&current->thread, 1);
-                loaddebug(&current->thread, 2);
-                loaddebug(&current->thread, 3);
-                /* no 4 and 5 */
-                loaddebug(&current->thread, 6);
-                loaddebug(&current->thread, 7);
-	}
+	load_debug_registers();
 }
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/smpboot.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/smpboot.c
@@ -63,6 +63,7 @@
 #include <asm/apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
+#include <asm/debugreg.h>
 #include <linux/mc146818rtc.h>
 
 #include <asm/smpboot_hooks.h>
@@ -331,6 +332,7 @@ notrace static void __cpuinit start_seco
 	setup_secondary_clock();
 
 	wmb();
+	load_debug_registers();
 	cpu_idle();
 }
 
@@ -1234,6 +1236,7 @@ void cpu_disable_common(void)
 	remove_cpu_from_maps(cpu);
 	unlock_vector_lock();
 	fixup_irqs();
+	hw_breakpoint_disable();
 }
 
 int native_cpu_disable(void)


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (4 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 05/11] Use wrapper routines around debug registers in processor related functions K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints K.Prasad
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 6.new --]
[-- Type: text/plain, Size: 7362 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of abstract debug registers in
process-handling routines.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/process.c    |   23 ++++++-----------------
 arch/x86/kernel/process_32.c |   31 +++++++++++++++++++++++++++++++
 arch/x86/kernel/process_64.c |   33 +++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 17 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/process.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/process.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/process.c
@@ -14,6 +14,8 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -83,6 +85,8 @@ void exit_thread(void)
 		put_cpu();
 		kfree(bp);
 	}
+	if (unlikely(t->dr7))
+		flush_thread_hw_breakpoint(me);
 
 	ds_exit_thread(current);
 }
@@ -103,14 +107,9 @@ void flush_thread(void)
 	}
 #endif
 
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	if (unlikely(tsk->thread.dr7))
+		flush_thread_hw_breakpoint(tsk);
 
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
@@ -192,16 +191,6 @@ void __switch_to_xtra(struct task_struct
 	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/process_32.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/process_32.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/process_32.c
@@ -59,6 +59,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -263,7 +265,14 @@ int copy_thread(int nr, unsigned long cl
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->thread.io_bitmap_ptr = NULL;
+
 	tsk = current;
+	err = -ENOMEM;
+	if (unlikely(tsk->thread.dr7)) {
+		if (copy_thread_hw_breakpoint(tsk, p, clone_flags))
+			goto out;
+	}
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
@@ -283,10 +292,13 @@ int copy_thread(int nr, unsigned long cl
 		err = do_set_thread_area(p, -1,
 			(struct user_desc __user *)childregs->si, 0);
 
+out:
 	if (err && p->thread.io_bitmap_ptr) {
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
 
 	ds_copy_thread(p, current);
 
@@ -424,6 +436,25 @@ __switch_to(struct task_struct *prev_p, 
 		lazy_load_gs(next->gs);
 
 	percpu_write(current_task, next_p);
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in the chbi
+	 * area, so it will think the task pointer is leftover from an old task
+	 * (lazy switching) and will erase it.  Then until the next context
+	 * switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * chbi->bp_task at the same instant, so there will always be a window
+	 * in which they disagree and a breakpoint might get triggered.  Since
+	 * we use lazy switching, we are forced to assume that a disagreement
+	 * means that current is correct and chbi->bp_task is old.  But if you
+	 * move the code above then you'll create a window in which current is
+	 * old and chbi->bp_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
 
 	return prev_p;
 }
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/process_64.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/process_64.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/process_64.c
@@ -55,6 +55,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -248,6 +250,8 @@ void release_thread(struct task_struct *
 			BUG();
 		}
 	}
+	if (unlikely(dead_task->thread.tdr7))
+		flush_thread_hw_breakpoint(dead_task);
 }
 
 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
@@ -303,12 +307,18 @@ int copy_thread(int nr, unsigned long cl
 
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
+	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
 	savesegment(fs, p->thread.fsindex);
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
+	err = -ENOMEM;
+	if (unlikely(me->thread.tdr7)) {
+		if (copy_thread_hw_breakpoint(me, p, clone_flags))
+			goto out;
+	}
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
@@ -346,6 +356,9 @@ out:
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
+
 	return err;
 }
 
@@ -491,6 +504,26 @@ __switch_to(struct task_struct *prev_p, 
 	 */
 	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
 		math_state_restore();
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in the chbi
+	 * area, so it will think the task pointer is leftover from an old task
+	 * (lazy switching) and will erase it.  Then until the next context
+	 * switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * chbi->bp_task at the same instant, so there will always be a window
+	 * in which they disagree and a breakpoint might get triggered.  Since
+	 * we use lazy switching, we are forced to assume that a disagreement
+	 * means that current is correct and chbi->bp_task is old.  But if you
+	 * move the code above then you'll create a window in which current is
+	 * old and chbi->bp_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
+
 	return prev_p;
 }
 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (5 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 08/11] Modify Ptrace routines to access breakpoint registers K.Prasad
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 7 --]
[-- Type: text/plain, Size: 1144 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch disables re-enabling of Hardware Breakpoint registers through
the  signal handling code. This is now done during
hw_breakpoint_handler().

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/signal.c |    9 ---------
 1 file changed, 9 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/signal.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/signal.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/signal.c
@@ -794,15 +794,6 @@ static void do_signal(struct pt_regs *re
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		/*
-		 * Re-enable any watchpoints before delivering the
-		 * signal to user space. The processor register will
-		 * have been cleared if the watchpoint triggered
-		 * inside the kernel.
-		 */
-		if (current->thread.debugreg7)
-			set_debugreg(current->thread.debugreg7, 7);
-
 		/* Whee! Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
 			/*


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 08/11] Modify Ptrace routines to access breakpoint registers
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (6 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:49 ` [Patch 09/11] Cleanup HW Breakpoint registers before kexec K.Prasad
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 8 --]
[-- Type: text/plain, Size: 8692 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch modifies the ptrace code to use the new wrapper routines around the 
debug/breakpoint registers.

[K.Prasad: Adapted the ptrace routines and to changes post x86/x86_64 merger,
	   split the minor patch from bigger patch. Re-wrote ptrace_write_dr7()
           and ptrace_set_debugreg() functions to use new data-structures]

[K.Prasad: Changed code to suit the simplified HW breakpoint implementation]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/ptrace.c |  229 ++++++++++++++++++++++++++++-------------------
 1 file changed, 138 insertions(+), 91 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/ptrace.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/ptrace.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/ptrace.c
@@ -34,6 +34,7 @@
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
 
 #include "tls.h"
 
@@ -134,11 +135,6 @@ static int set_segment_reg(struct task_s
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-	return TASK_SIZE - 3;
-}
-
 #else  /* CONFIG_X86_64 */
 
 #define FLAG_MASK		(FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -263,15 +259,6 @@ static int set_segment_reg(struct task_s
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(task, TIF_IA32))
-		return IA32_PAGE_OFFSET - 3;
-#endif
-	return TASK_SIZE_MAX - 7;
-}
-
 #endif	/* CONFIG_X86_32 */
 
 static unsigned long get_flags(struct task_struct *task)
@@ -462,95 +449,155 @@ static int genregs_set(struct task_struc
 }
 
 /*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
  */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len,
+		unsigned *type)
 {
-	switch (n) {
-	case 0:		return child->thread.debugreg0;
-	case 1:		return child->thread.debugreg1;
-	case 2:		return child->thread.debugreg2;
-	case 3:		return child->thread.debugreg3;
-	case 6:		return child->thread.debugreg6;
-	case 7:		return child->thread.debugreg7;
-	}
-	return 0;
+	int temp = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+	*len = (temp & 0xc) | 0x40;
+	*type = (temp & 0x3) | 0x80;
+	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
 }
 
-static int ptrace_set_debugreg(struct task_struct *child,
-			       int n, unsigned long data)
+static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
 {
+	struct thread_struct *thread = &(current->thread);
 	int i;
 
-	if (unlikely(n == 4 || n == 5))
-		return -EIO;
+	/* Store in the virtual DR6 register the fact that the breakpoint
+	 * was hit so the thread's debugger will see it.
+	 */
+	for (i = 0; i < hbkpt_user_max; i++)
+		if (bp->info.address == thread->hbkpt[i]->info.address)
+			break;
 
-	if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-		return -EIO;
+	thread->dr6 |= (DR_TRAP0 << i);
+}
 
-	switch (n) {
-	case 0:		child->thread.debugreg0 = data; break;
-	case 1:		child->thread.debugreg1 = data; break;
-	case 2:		child->thread.debugreg2 = data; break;
-	case 3:		child->thread.debugreg3 = data; break;
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+	struct hw_breakpoint *bp;
+	struct thread_struct *thread = &(tsk->thread);
+	int i;
+	int rc = 0;
+	unsigned long old_dr7 = thread->dr7;
 
-	case 6:
-		if ((data & ~0xffffffffUL) != 0)
-			return -EIO;
-		child->thread.debugreg6 = data;
-		break;
+	data &= ~DR_CONTROL_RESERVED;
+	/* Loop through all the hardware breakpoints, making the
+	 * appropriate changes to each.
+	 */
+restore_settings:
+	thread->dr7 = data;
+	for (i = 0; i < HB_NUM; i++) {
+		int enabled;
+		unsigned len, type;
+
+		bp = thread->hbkpt[i];
+		if (!bp)
+			continue;
+
+		enabled = decode_dr7(data, i, &len, &type);
+		if (!enabled) {
+			if (bp->triggered)
+				__unregister_user_hw_breakpoint(i, tsk, bp);
+			continue;
+		}
 
-	case 7:
-		/*
-		 * Sanity-check data. Take one half-byte at once with
-		 * check = (val >> (16 + 4*i)) & 0xf. It contains the
-		 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-		 * 2 and 3 are LENi. Given a list of invalid values,
-		 * we do mask |= 1 << invalid_value, so that
-		 * (mask >> check) & 1 is a correct test for invalid
-		 * values.
-		 *
-		 * R/Wi contains the type of the breakpoint /
-		 * watchpoint, LENi contains the length of the watched
-		 * data in the watchpoint case.
-		 *
-		 * The invalid values are:
-		 * - LENi == 0x10 (undefined), so mask |= 0x0f00.	[32-bit]
-		 * - R/Wi == 0x10 (break on I/O reads or writes), so
-		 *   mask |= 0x4444.
-		 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-		 *   0x1110.
-		 *
-		 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-		 *
-		 * See the Intel Manual "System Programming Guide",
-		 * 15.2.4
-		 *
-		 * Note that LENi == 0x10 is defined on x86_64 in long
-		 * mode (i.e. even for 32-bit userspace software, but
-		 * 64-bit kernel), so the x86_64 mask value is 0x5454.
-		 * See the AMD manual no. 24593 (AMD64 System Programming)
-		 */
-#ifdef CONFIG_X86_32
-#define	DR7_MASK	0x5f54
-#else
-#define	DR7_MASK	0x5554
-#endif
-		data &= ~DR_CONTROL_RESERVED;
-		for (i = 0; i < 4; i++)
-			if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-				return -EIO;
-		child->thread.debugreg7 = data;
-		if (data)
-			set_tsk_thread_flag(child, TIF_DEBUG);
+		if (bp->triggered)
+			rc = __modify_user_hw_breakpoint(i, tsk, bp);
+		else {
+			bp->triggered = ptrace_triggered;
+			bp->info.len = len;
+			bp->info.type = type;
+			rc = __register_user_hw_breakpoint(i, tsk, bp);
+		}
+		if (rc < 0)
+			break;
 		else
-			clear_tsk_thread_flag(child, TIF_DEBUG);
-		break;
+			set_tsk_thread_flag(tsk, TIF_DEBUG);
+	}
+	/* If anything above failed, restore the original settings */
+	if (rc < 0) {
+		data = old_dr7;
+		goto restore_settings;
 	}
+	return rc;
+}
 
-	return 0;
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	unsigned long val = 0;
+
+	mutex_lock(&hw_breakpoint_mutex);
+	if (n < HB_NUM) {
+		if (thread->hbkpt[n])
+			val = thread->hbkpt[n]->info.address;
+	} else if (n == 6) {
+		val = thread->dr6;
+	} else if (n == 7) {
+		val = thread->dr7;
+	}
+	mutex_unlock(&hw_breakpoint_mutex);
+	return val;
+}
+
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int rc = -EIO;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* There are no DR4 or DR5 registers */
+	if (n == 4 || n == 5)
+		goto ret_path;
+
+	/* Writes to DR6 modify the virtualized value */
+	if (n == 6) {
+		tsk->thread.dr6 = val;
+		rc = 0;
+		goto ret_path;
+	}
+
+	/* Writes to DR0 - DR3 change a breakpoint address */
+	rc = 0;
+	if (n < HB_NUM) {
+		if (!val)
+			goto ret_path;
+		if (thread->hbkpt[n]) {
+			thread->hbkpt[n]->info.address = val;
+			rc = __modify_user_hw_breakpoint(n, tsk,
+							  thread->hbkpt[n]);
+			goto ret_path;
+		}
+		thread->hbkpt[n] = kzalloc(sizeof(struct hw_breakpoint),
+								GFP_KERNEL);
+		if (!thread->hbkpt[n]) {
+			rc = -ENOMEM;
+			goto ret_path;
+		} else
+			thread->hbkpt[n]->info.address = val;
+	}
+	/* All that's left is DR7 */
+	if (n == 7)
+		rc = ptrace_write_dr7(tsk, val);
+
+ret_path:
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
 }
 
 /*
@@ -871,7 +918,7 @@ long arch_ptrace(struct task_struct *chi
 		else if (addr >= offsetof(struct user, u_debugreg[0]) &&
 			 addr <= offsetof(struct user, u_debugreg[7])) {
 			addr -= offsetof(struct user, u_debugreg[0]);
-			tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+			tmp = ptrace_get_debugreg(child, addr/sizeof(data));
 		}
 		ret = put_user(tmp, datap);
 		break;
@@ -889,7 +936,7 @@ long arch_ptrace(struct task_struct *chi
 			 addr <= offsetof(struct user, u_debugreg[7])) {
 			addr -= offsetof(struct user, u_debugreg[0]);
 			ret = ptrace_set_debugreg(child,
-						  addr / sizeof(data), data);
+						addr/sizeof(data), data);
 		}
 		break;
 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 09/11] Cleanup HW Breakpoint registers before kexec
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (7 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 08/11] Modify Ptrace routines to access breakpoint registers K.Prasad
@ 2009-03-19 23:49 ` K.Prasad
  2009-03-19 23:50 ` [Patch 10/11] Sample HW breakpoint over kernel data address K.Prasad
  2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
  10 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:49 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 9 --]
[-- Type: text/plain, Size: 1818 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch disables Hardware breakpoints before doing a 'kexec' on the machine.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/machine_kexec_32.c |    2 ++
 arch/x86/kernel/machine_kexec_64.c |    2 ++
 2 files changed, 4 insertions(+)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_32.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/machine_kexec_32.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
 #include <asm/desc.h>
 #include <asm/system.h>
 #include <asm/cacheflush.h>
+#include <asm/debugreg.h>
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_64.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/machine_kexec_64.c
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/debugreg.h>
 
 static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
 				unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 10/11] Sample HW breakpoint over kernel data address
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (8 preceding siblings ...)
  2009-03-19 23:49 ` [Patch 09/11] Cleanup HW Breakpoint registers before kexec K.Prasad
@ 2009-03-19 23:50 ` K.Prasad
  2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
  10 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:50 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 10 --]
[-- Type: text/plain, Size: 4654 bytes --]

This patch introduces a sample kernel module to demonstrate the use of Hardware
Breakpoint feature. It places a breakpoint over the kernel variable 'pid_max'
to monitor all write operations and emits a function-backtrace when done.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
---
 samples/Kconfig                         |    6 ++
 samples/Makefile                        |    4 +
 samples/hw_breakpoint/Makefile          |    1 
 samples/hw_breakpoint/data_breakpoint.c |   79 ++++++++++++++++++++++++++++++++
 4 files changed, 89 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/samples/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/samples/Kconfig
+++ linux-2.6-tip.hbkpt/samples/Kconfig
@@ -39,5 +39,11 @@ config SAMPLE_KRETPROBES
 	default m
 	depends on SAMPLE_KPROBES && KRETPROBES
 
+config SAMPLE_HW_BREAKPOINT
+	tristate "Build kernel hardware breakpoint examples -- loadable modules only"
+	depends on HAVE_HW_BREAKPOINT && m
+	help
+	  This builds kernel hardware breakpoint example modules.
+
 endif # SAMPLES
 
Index: linux-2.6-tip.hbkpt/samples/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/samples/Makefile
+++ linux-2.6-tip.hbkpt/samples/Makefile
@@ -1,3 +1,5 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/
+obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/ \
+			   hw_breakpoint/
+
Index: linux-2.6-tip.hbkpt/samples/hw_breakpoint/Makefile
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/samples/hw_breakpoint/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
Index: linux-2.6-tip.hbkpt/samples/hw_breakpoint/data_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/samples/hw_breakpoint/data_breakpoint.c
@@ -0,0 +1,79 @@
+/*
+ * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * This file is a kernel module that places a breakpoint over 'pid_max' kernel
+ * variable using Hardware Breakpoint register. The corresponding handler which
+ * prints a backtrace is invoked everytime a write operation is performed on
+ * that variable.
+ *
+ * After inserting this module, invoke a write operation using
+ * 'echo <desired_value> > /proc/sys/kernel/pid_max'
+ * to find the function-call backtrace.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+#include <linux/module.h>	/* Needed by all modules */
+#include <linux/kernel.h>	/* Needed for KERN_INFO */
+#include <linux/init.h>		/* Needed for the macros */
+
+#include <asm/hw_breakpoint.h>
+
+struct hw_breakpoint pid_max_hbkpt;
+
+void pid_max_hbkpt_handler(struct hw_breakpoint *temp, struct pt_regs
+								*temp_regs)
+{
+	printk(KERN_INFO "pid_max value is changed\n");
+	dump_stack();
+	printk(KERN_INFO "Dump stack from pid_max_hbkpt_handler\n");
+}
+
+static int __init hw_break_module_init(void)
+{
+	int ret;
+
+#ifdef CONFIG_X86
+	pid_max_hbkpt.info.name = "pid_max";
+	pid_max_hbkpt.info.type = HW_BREAKPOINT_WRITE;
+	pid_max_hbkpt.info.len = HW_BREAKPOINT_LEN_4;
+
+	pid_max_hbkpt.triggered = (void *)pid_max_hbkpt_handler;
+#endif /* CONFIG_X86 */
+
+	ret = register_kernel_hw_breakpoint(&pid_max_hbkpt);
+
+	if (ret < 0) {
+		printk(KERN_INFO "Breakpoint registration failed\n");
+		return ret;
+	} else
+		printk(KERN_INFO "HW Breakpoint for pid_max write installed\n");
+
+	return 0;
+}
+
+static void __exit hw_break_module_exit(void)
+{
+	unregister_kernel_hw_breakpoint(&pid_max_hbkpt);
+	printk(KERN_INFO "HW Breakpoint for pid_max write uninstalled\n");
+}
+
+module_init(hw_break_module_init);
+module_exit(hw_break_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("K.Prasad");
+MODULE_DESCRIPTION("pid_max breakpoint");


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
       [not found] <20090319234044.410725944@K.Prasad>
                   ` (9 preceding siblings ...)
  2009-03-19 23:50 ` [Patch 10/11] Sample HW breakpoint over kernel data address K.Prasad
@ 2009-03-19 23:50 ` K.Prasad
  2009-03-20  9:04   ` Frederic Weisbecker
  10 siblings, 1 reply; 27+ messages in thread
From: K.Prasad @ 2009-03-19 23:50 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: ftrace_hbkpt_12 --]
[-- Type: text/plain, Size: 19482 bytes --]

This patch adds an ftrace plugin to detect and profile memory access over
kernel variables. It uses HW Breakpoint interfaces to 'watch memory
addresses.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
---
 kernel/trace/Kconfig          |   21 +
 kernel/trace/Makefile         |    1 
 kernel/trace/trace.h          |   25 +
 kernel/trace/trace_ksym.c     |  555 ++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_selftest.c |   36 ++
 5 files changed, 638 insertions(+)

Index: linux-2.6-tip.hbkpt/kernel/trace/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/Kconfig
+++ linux-2.6-tip.hbkpt/kernel/trace/Kconfig
@@ -264,6 +264,27 @@ config POWER_TRACER
 	  power management decisions, specifically the C-state and P-state
 	  behavior.
 
+config KSYM_TRACER
+	bool "Trace read and write access on kernel memory locations"
+	depends on HAVE_HW_BREAKPOINT
+	select TRACING
+	help
+	  This tracer helps find read and write operations on any given kernel
+	  symbol i.e. /proc/kallsyms.
+
+config PROFILE_KSYM_TRACER
+	bool "Profile all kernel memory accesses on 'watched' variables"
+	depends on KSYM_TRACER
+	help
+	  This tracer profiles kernel accesses on variables watched through the
+	  ksym tracer ftrace plugin. Depending upon the hardware, all read
+	  and write operations on kernel variables can be monitored for
+	  accesses.
+
+	  The results will be displayed in:
+	  /debugfs/tracing/profile_ksym
+
+	  Say N if unsure.
 
 config STACK_TRACER
 	bool "Trace max stack"
Index: linux-2.6-tip.hbkpt/kernel/trace/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/Makefile
+++ linux-2.6-tip.hbkpt/kernel/trace/Makefile
@@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_even
 obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACER) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 
 libftrace-y := ftrace.o
Index: linux-2.6-tip.hbkpt/kernel/trace/trace.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/trace.h
+++ linux-2.6-tip.hbkpt/kernel/trace/trace.h
@@ -12,6 +12,10 @@
 #include <trace/kmemtrace.h>
 #include <trace/power.h>
 
+#ifdef CONFIG_KSYM_TRACER
+#include <asm/hw_breakpoint.h>
+#endif
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -37,6 +41,7 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
+	TRACE_KSYM,
 
 	__TRACE_LAST_TYPE,
 };
@@ -214,6 +219,23 @@ struct syscall_trace_exit {
 	unsigned long		ret;
 };
 
+#ifdef CONFIG_KSYM_TRACER
+struct trace_ksym {
+	struct trace_entry	ent;
+	struct hw_breakpoint	*ksym_hbkpt;
+	unsigned long		ksym_addr;
+	unsigned long		ip;
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	unsigned long 		counter;
+#endif
+	struct hlist_node	ksym_hlist;
+	char			ksym_name[KSYM_NAME_LEN];
+	char			p_name[TASK_COMM_LEN];
+};
+#else
+struct trace_ksym {
+};
+#endif /* CONFIG_KSYM_TRACER */
 
 /*
  * trace_flag_type is an enumeration that holds different
@@ -332,6 +354,7 @@ extern void __ftrace_bad_type(void);
 			  TRACE_SYSCALL_ENTER);				\
 		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
 			  TRACE_SYSCALL_EXIT);				\
+		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM);	\
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -593,6 +616,8 @@ extern int trace_selftest_startup_syspro
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
+extern int trace_selftest_startup_ksym(struct tracer *trace,
+					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
Index: linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
@@ -0,0 +1,555 @@
+/*
+ * trace_ksym.c - Kernel Symbol Tracer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/jhash.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+#include "trace_stat.h"
+#include "trace.h"
+
+/* For now, let us restrict the no. of symbols traced simultaneously to number
+ * of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HB_NUM
+
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
+
+#ifdef CONFIG_FTRACE_SELFTEST
+
+static int ksym_selftest_dummy;
+#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
+
+#endif /* CONFIG_FTRACE_SELFTEST */
+
+static struct trace_array *ksym_trace_array;
+
+DEFINE_MUTEX(ksym_tracer_mutex);
+
+static unsigned int ksym_filter_entry_count;
+static unsigned int ksym_tracing_enabled;
+
+static HLIST_HEAD(ksym_filter_head);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+
+#define MAX_UL_INT 0xffffffff
+DEFINE_SPINLOCK(ksym_stat_lock);
+
+void ksym_collect_stats(unsigned long hbkpt_hit_addr)
+{
+	struct hlist_node *node;
+	struct trace_ksym *entry;
+
+	spin_lock(&ksym_stat_lock);
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		if ((entry->ksym_addr == hbkpt_hit_addr) &&
+		    (entry->counter <= MAX_UL_INT)) {
+			entry->counter++;
+			break;
+		}
+	}
+	spin_unlock(&ksym_stat_lock);
+}
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
+{
+	struct ring_buffer_event *event;
+	struct trace_array *tr;
+	struct trace_ksym *entry;
+	int pc;
+
+	if (!ksym_tracing_enabled)
+		return;
+
+	tr = ksym_trace_array;
+	pc = preempt_count();
+
+	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
+							sizeof(*entry), 0, pc);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_SYMBOL_LEN);
+	entry->ksym_hbkpt = hbkpt;
+	entry->ip = instruction_pointer(regs);
+	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	ksym_collect_stats(hbkpt->info.address);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+	trace_buffer_unlock_commit(tr, event, 0, pc);
+}
+
+/* Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Break points (Not available yet)
+ *
+ */
+static int ksym_trace_get_access_type(char *access_str)
+{
+	int pos, access = 0;
+
+	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
+		switch (access_str[pos]) {
+		case 'r':
+			access += (pos == 0) ? 4 : -1;
+			break;
+		case 'w':
+			access += (pos == 1) ? 2 : -1;
+			break;
+		case '-':
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	switch (access) {
+	case 6:
+		access = HW_BREAKPOINT_RW;
+		break;
+	case 2:
+		access = HW_BREAKPOINT_WRITE;
+		break;
+	case 0:
+		access = 0;
+	}
+
+	return access;
+}
+
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
+ *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
+ *    <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ */
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+							unsigned long *addr)
+{
+	char *delimiter = ":";
+	int ret;
+
+	ret = -EINVAL;
+	*ksymname = strsep(&input_string, delimiter);
+	*addr = kallsyms_lookup_name(*ksymname);
+
+	/* Check for malformed request: (2), (1) and (5) */
+	if ((!input_string) ||
+		(strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) ||
+			(*addr == 0))
+		goto return_code;
+	ret = ksym_trace_get_access_type(input_string);
+
+return_code:
+	return ret;
+}
+
+static int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
+{
+	struct trace_ksym *entry;
+	int ret;
+
+	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
+		" new requests for tracing can be accepted now.\n",
+			KSYM_TRACER_MAX);
+		return -ENOSPC;
+	}
+
+	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
+	if (!entry->ksym_hbkpt) {
+		kfree(entry);
+		return -ENOMEM;
+	}
+
+	entry->ksym_hbkpt->info.name = ksymname;
+	entry->ksym_hbkpt->info.type = op;
+	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
+	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
+
+	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
+
+	ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
+	if (ret < 0) {
+		printk(KERN_INFO "ksym_tracer request failed. Try again"
+					" later!!\n");
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+		return -EAGAIN;
+	}
+	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
+	ksym_filter_entry_count++;
+
+	return 0;
+}
+
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
+	ssize_t ret, cnt = 0;
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
+				entry->ksym_hbkpt->info.name);
+		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"-w-\n");
+		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"rw-\n");
+	}
+	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
+	mutex_unlock(&ksym_tracer_mutex);
+
+	return ret;
+}
+
+static ssize_t ksym_trace_filter_write(struct file *file,
+					const char __user *buffer,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char *input_string, *ksymname = NULL;
+	unsigned long ksym_addr = 0;
+	int ret, op, changed = 0;
+
+	/* Ignore echo "" > ksym_trace_filter */
+	if (count == 0)
+		return 0;
+
+	input_string = kzalloc(count, GFP_KERNEL);
+	if (!input_string)
+		return -ENOMEM;
+
+	if (copy_from_user(input_string, buffer, count)) {
+		kfree(input_string);
+		return -EFAULT;
+	}
+
+	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+	if (ret < 0) {
+		kfree(input_string);
+		return ret;
+	}
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	ret = -EINVAL;
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		if (entry->ksym_addr == ksym_addr) {
+			/* Check for malformed request: (6) */
+			if (entry->ksym_hbkpt->info.type != op)
+				changed = 1;
+			else
+				goto err_ret;
+			break;
+		}
+	}
+	if (changed) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+		entry->ksym_hbkpt->info.type = op;
+		if (op > 0) {
+			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
+			if (ret == 0) {
+				ret = count;
+				goto unlock_ret_path;
+			}
+		}
+		ksym_filter_entry_count--;
+		hlist_del(&(entry->ksym_hlist));
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+		ret = count;
+		goto err_ret;
+	} else {
+		/* Check for malformed request: (4) */
+		if (op == 0)
+			goto err_ret;
+		ret = process_new_ksym_entry(ksymname, op, ksym_addr);
+		if (ret)
+			goto err_ret;
+	}
+	ret = count;
+	goto unlock_ret_path;
+
+err_ret:
+	kfree(input_string);
+
+unlock_ret_path:
+	mutex_unlock(&ksym_tracer_mutex);
+	return ret;
+}
+
+static const struct file_operations ksym_tracing_fops = {
+	.open		= tracing_open_generic,
+	.read		= ksym_trace_filter_read,
+	.write		= ksym_trace_filter_write,
+};
+
+static void ksym_trace_reset(struct trace_array *tr)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node, *node1;
+
+	ksym_tracing_enabled = 0;
+
+	mutex_lock(&ksym_tracer_mutex);
+	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+								ksym_hlist) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+		ksym_filter_entry_count--;
+		hlist_del(&(entry->ksym_hlist));
+
+		/* Free the 'input_string' only if reset
+		 * after startup self-test
+		 */
+#ifdef CONFIG_FTRACE_SELFTEST
+		if (strncmp(entry->ksym_hbkpt->info.name, KSYM_SELFTEST_ENTRY,
+					strlen(KSYM_SELFTEST_ENTRY)) != 0)
+#endif /* CONFIG_FTRACE_SELFTEST*/
+			kfree(entry->ksym_hbkpt->info.name);
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+	}
+	mutex_unlock(&ksym_tracer_mutex);
+
+}
+
+static int ksym_trace_init(struct trace_array *tr)
+{
+	int cpu, ret = 0;
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+
+	ksym_tracing_enabled = 1;
+	ksym_trace_array = tr;
+
+#ifdef CONFIG_FTRACE_SELFTEST
+	/* Check if we are re-entering self-test code during initialisation */
+	if (ksym_selftest_dummy)
+		goto ret_path;
+
+	ksym_selftest_dummy = 0;
+
+	/* Register the read-write tracing request */
+	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
+					(unsigned long)(&ksym_selftest_dummy));
+
+	if (ret < 0) {
+		printk(KERN_CONT "ksym_trace read-write startup test failed\n");
+		goto ret_path;
+	}
+	/* Perform a read and a write operation over the dummy variable to
+	 * trigger the tracer
+	 */
+	if (ksym_selftest_dummy == 0)
+		ksym_selftest_dummy++;
+
+ret_path:
+#endif /* CONFIG_FTRACE_SELFTEST */
+
+	return ret;
+}
+
+static void ksym_trace_print_header(struct seq_file *m)
+{
+
+	seq_puts(m,
+		 "#       TASK-PID      CPU#      Symbol         Type    "
+		 "Function         \n");
+	seq_puts(m,
+		 "#          |           |          |              |         "
+		 "|            \n");
+}
+
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct trace_ksym *field;
+	char str[KSYM_SYMBOL_LEN];
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
+				entry->pid, iter->cpu, field->ksym_name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	switch (field->ksym_hbkpt->info.type) {
+	case HW_BREAKPOINT_WRITE:
+		ret = trace_seq_printf(s, " W  ");
+		break;
+	case HW_BREAKPOINT_RW:
+		ret = trace_seq_printf(s, " RW ");
+		break;
+	default:
+		return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	sprint_symbol(str, field->ip);
+	ret = trace_seq_printf(s, "%-20s\n", str);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+struct tracer ksym_tracer __read_mostly =
+{
+	.name		= "ksym_tracer",
+	.init		= ksym_trace_init,
+	.reset		= ksym_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_ksym,
+#endif
+	.print_header   = ksym_trace_print_header,
+	.print_line	= ksym_trace_output
+};
+
+__init static int init_ksym_trace(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	ksym_filter_entry_count = 0;
+
+	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
+				    NULL, &ksym_tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ksym_trace_filter' file\n");
+
+	return register_tracer(&ksym_tracer);
+}
+device_initcall(init_ksym_trace);
+
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+static int ksym_tracer_stat_headers(struct seq_file *m)
+{
+	seq_printf(m, "   Access type    ");
+	seq_printf(m, "            Symbol                     Counter     \n");
+	return 0;
+}
+
+static int ksym_tracer_stat_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *stat = v;
+	struct trace_ksym *entry;
+	int access_type = 0;
+	char fn_name[KSYM_NAME_LEN];
+
+	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
+
+	if (entry->ksym_hbkpt)
+		access_type = entry->ksym_hbkpt->info.type;
+
+	switch (access_type) {
+	case HW_BREAKPOINT_WRITE:
+		seq_printf(m, "     W     ");
+		break;
+	case HW_BREAKPOINT_RW:
+		seq_printf(m, "     RW    ");
+		break;
+	default:
+		seq_printf(m, "     NA    ");
+	}
+
+	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
+		seq_printf(m, "               %s                 ", fn_name);
+	else
+		seq_printf(m, "               <NA>                ");
+
+	seq_printf(m, "%15lu\n", entry->counter);
+	return 0;
+}
+
+static void *ksym_tracer_stat_start(void)
+{
+	return &(ksym_filter_head.first);
+}
+
+static void *
+ksym_tracer_stat_next(void *v, int idx)
+{
+	struct hlist_node *stat = v;
+
+	return stat->next;
+}
+
+static struct tracer_stat ksym_tracer_stats = {
+	.name = "ksym_tracer",
+	.stat_start = ksym_tracer_stat_start,
+	.stat_next = ksym_tracer_stat_next,
+	.stat_headers = ksym_tracer_stat_headers,
+	.stat_show = ksym_tracer_stat_show
+};
+
+__init static int ksym_tracer_stat_init(void)
+{
+	int ret;
+
+	ret = register_stat_tracer(&ksym_tracer_stats);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register "
+				    "ksym tracer stats\n");
+		return 1;
+	}
+
+	return 0;
+}
+fs_initcall(ksym_tracer_stat_init);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
Index: linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/trace/trace_selftest.c
+++ linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(stru
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
+	case TRACE_KSYM:
 		return 1;
 	}
 	return 0;
@@ -687,3 +688,38 @@ trace_selftest_startup_branch(struct tra
 	return ret;
 }
 #endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_KSYM_TRACER
+int
+trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tracing_stop();
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+	tracing_start();
+
+	/* read & write operations - one each is performed on the dummy variable
+	 * triggering two entries in the trace buffer
+	 */
+	if (!ret && count != 2) {
+		printk(KERN_CONT "Ksym tracer startup test failed");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_KSYM_TRACER */


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
@ 2009-03-20  9:04   ` Frederic Weisbecker
  2009-03-21 16:24     ` K.Prasad
  0 siblings, 1 reply; 27+ messages in thread
From: Frederic Weisbecker @ 2009-03-20  9:04 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Alan Stern,
	Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 05:20:32AM +0530, K.Prasad wrote:
> This patch adds an ftrace plugin to detect and profile memory access over
> kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> addresses.
> 
> Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> ---
>  kernel/trace/Kconfig          |   21 +
>  kernel/trace/Makefile         |    1 
>  kernel/trace/trace.h          |   25 +
>  kernel/trace/trace_ksym.c     |  555 ++++++++++++++++++++++++++++++++++++++++++
>  kernel/trace/trace_selftest.c |   36 ++
>  5 files changed, 638 insertions(+)
> 
> Index: linux-2.6-tip.hbkpt/kernel/trace/Kconfig
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/Kconfig
> +++ linux-2.6-tip.hbkpt/kernel/trace/Kconfig
> @@ -264,6 +264,27 @@ config POWER_TRACER
>  	  power management decisions, specifically the C-state and P-state
>  	  behavior.
>  
> +config KSYM_TRACER
> +	bool "Trace read and write access on kernel memory locations"
> +	depends on HAVE_HW_BREAKPOINT
> +	select TRACING
> +	help
> +	  This tracer helps find read and write operations on any given kernel
> +	  symbol i.e. /proc/kallsyms.
> +
> +config PROFILE_KSYM_TRACER
> +	bool "Profile all kernel memory accesses on 'watched' variables"
> +	depends on KSYM_TRACER
> +	help
> +	  This tracer profiles kernel accesses on variables watched through the
> +	  ksym tracer ftrace plugin. Depending upon the hardware, all read
> +	  and write operations on kernel variables can be monitored for
> +	  accesses.
> +
> +	  The results will be displayed in:
> +	  /debugfs/tracing/profile_ksym
> +
> +	  Say N if unsure.
>  
>  config STACK_TRACER
>  	bool "Trace max stack"
> Index: linux-2.6-tip.hbkpt/kernel/trace/Makefile
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/Makefile
> +++ linux-2.6-tip.hbkpt/kernel/trace/Makefile
> @@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_even
>  obj-$(CONFIG_EVENT_TRACER) += events.o
>  obj-$(CONFIG_EVENT_TRACER) += trace_export.o
>  obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
> +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
>  
>  libftrace-y := ftrace.o
> Index: linux-2.6-tip.hbkpt/kernel/trace/trace.h
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/trace.h
> +++ linux-2.6-tip.hbkpt/kernel/trace/trace.h
> @@ -12,6 +12,10 @@
>  #include <trace/kmemtrace.h>
>  #include <trace/power.h>
>  
> +#ifdef CONFIG_KSYM_TRACER
> +#include <asm/hw_breakpoint.h>
> +#endif
> +
>  enum trace_type {
>  	__TRACE_FIRST_TYPE = 0,
>  
> @@ -37,6 +41,7 @@ enum trace_type {
>  	TRACE_KMEM_FREE,
>  	TRACE_POWER,
>  	TRACE_BLK,
> +	TRACE_KSYM,
>  
>  	__TRACE_LAST_TYPE,
>  };
> @@ -214,6 +219,23 @@ struct syscall_trace_exit {
>  	unsigned long		ret;
>  };
>  
> +#ifdef CONFIG_KSYM_TRACER
> +struct trace_ksym {
> +	struct trace_entry	ent;
> +	struct hw_breakpoint	*ksym_hbkpt;
> +	unsigned long		ksym_addr;
> +	unsigned long		ip;
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +	unsigned long 		counter;
> +#endif
> +	struct hlist_node	ksym_hlist;
> +	char			ksym_name[KSYM_NAME_LEN];
> +	char			p_name[TASK_COMM_LEN];
> +};
> +#else
> +struct trace_ksym {
> +};
> +#endif /* CONFIG_KSYM_TRACER */
>  
>  /*
>   * trace_flag_type is an enumeration that holds different
> @@ -332,6 +354,7 @@ extern void __ftrace_bad_type(void);
>  			  TRACE_SYSCALL_ENTER);				\
>  		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
>  			  TRACE_SYSCALL_EXIT);				\
> +		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM);	\
>  		__ftrace_bad_type();					\
>  	} while (0)
>  
> @@ -593,6 +616,8 @@ extern int trace_selftest_startup_syspro
>  					       struct trace_array *tr);
>  extern int trace_selftest_startup_branch(struct tracer *trace,
>  					 struct trace_array *tr);
> +extern int trace_selftest_startup_ksym(struct tracer *trace,
> +					 struct trace_array *tr);
>  #endif /* CONFIG_FTRACE_STARTUP_TEST */
>  
>  extern void *head_page(struct trace_array_cpu *data);
> Index: linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6-tip.hbkpt/kernel/trace/trace_ksym.c
> @@ -0,0 +1,555 @@
> +/*
> + * trace_ksym.c - Kernel Symbol Tracer
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) IBM Corporation, 2009
> + */
> +
> +#include <linux/kallsyms.h>
> +#include <linux/uaccess.h>
> +#include <linux/debugfs.h>
> +#include <linux/ftrace.h>
> +#include <linux/module.h>
> +#include <linux/jhash.h>
> +#include <linux/fs.h>
> +
> +#include "trace_output.h"
> +#include "trace_stat.h"
> +#include "trace.h"
> +
> +/* For now, let us restrict the no. of symbols traced simultaneously to number
> + * of available hardware breakpoint registers.
> + */
> +#define KSYM_TRACER_MAX HB_NUM
> +
> +#define KSYM_TRACER_OP_LEN 3 /* rw- */
> +#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
> +
> +#ifdef CONFIG_FTRACE_SELFTEST
> +
> +static int ksym_selftest_dummy;
> +#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
> +
> +#endif /* CONFIG_FTRACE_SELFTEST */
> +
> +static struct trace_array *ksym_trace_array;
> +
> +DEFINE_MUTEX(ksym_tracer_mutex);
> +
> +static unsigned int ksym_filter_entry_count;
> +static unsigned int ksym_tracing_enabled;
> +
> +static HLIST_HEAD(ksym_filter_head);
> +
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +
> +#define MAX_UL_INT 0xffffffff
> +DEFINE_SPINLOCK(ksym_stat_lock);
> +
> +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> +{
> +	struct hlist_node *node;
> +	struct trace_ksym *entry;
> +
> +	spin_lock(&ksym_stat_lock);


I see that can be called from ksym_hbkpt_handler which in turn
can be called from interrupt context, right?
You can issue a deadlock if you don't disable interrupts here.

Thanks,
Frederic.

> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		if ((entry->ksym_addr == hbkpt_hit_addr) &&
> +		    (entry->counter <= MAX_UL_INT)) {
> +			entry->counter++;
> +			break;
> +		}
> +	}
> +	spin_unlock(&ksym_stat_lock);
> +}
> +#endif /* CONFIG_PROFILE_KSYM_TRACER */
> +
> +void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
> +{
> +	struct ring_buffer_event *event;
> +	struct trace_array *tr;
> +	struct trace_ksym *entry;
> +	int pc;
> +
> +	if (!ksym_tracing_enabled)
> +		return;
> +
> +	tr = ksym_trace_array;
> +	pc = preempt_count();
> +
> +	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
> +							sizeof(*entry), 0, pc);
> +	if (!event)
> +		return;
> +
> +	entry = ring_buffer_event_data(event);
> +	strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_SYMBOL_LEN);
> +	entry->ksym_hbkpt = hbkpt;
> +	entry->ip = instruction_pointer(regs);
> +	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +	ksym_collect_stats(hbkpt->info.address);
> +#endif /* CONFIG_PROFILE_KSYM_TRACER */
> +
> +	trace_buffer_unlock_commit(tr, event, 0, pc);
> +}
> +
> +/* Valid access types are represented as
> + *
> + * rw- : Set Read/Write Access Breakpoint
> + * -w- : Set Write Access Breakpoint
> + * --- : Clear Breakpoints
> + * --x : Set Execution Break points (Not available yet)
> + *
> + */
> +static int ksym_trace_get_access_type(char *access_str)
> +{
> +	int pos, access = 0;
> +
> +	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
> +		switch (access_str[pos]) {
> +		case 'r':
> +			access += (pos == 0) ? 4 : -1;
> +			break;
> +		case 'w':
> +			access += (pos == 1) ? 2 : -1;
> +			break;
> +		case '-':
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	}
> +
> +	switch (access) {
> +	case 6:
> +		access = HW_BREAKPOINT_RW;
> +		break;
> +	case 2:
> +		access = HW_BREAKPOINT_WRITE;
> +		break;
> +	case 0:
> +		access = 0;
> +	}
> +
> +	return access;
> +}
> +
> +/*
> + * There can be several possible malformed requests and we attempt to capture
> + * all of them. We enumerate some of the rules
> + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
> + *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
> + *    <module>:<ksym_name>:<op>.
> + * 2. No delimiter symbol ':' in the input string
> + * 3. Spurious operator symbols or symbols not in their respective positions
> + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
> + * 5. Kernel symbol not a part of /proc/kallsyms
> + * 6. Duplicate requests
> + */
> +static int parse_ksym_trace_str(char *input_string, char **ksymname,
> +							unsigned long *addr)
> +{
> +	char *delimiter = ":";
> +	int ret;
> +
> +	ret = -EINVAL;
> +	*ksymname = strsep(&input_string, delimiter);
> +	*addr = kallsyms_lookup_name(*ksymname);
> +
> +	/* Check for malformed request: (2), (1) and (5) */
> +	if ((!input_string) ||
> +		(strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) ||
> +			(*addr == 0))
> +		goto return_code;
> +	ret = ksym_trace_get_access_type(input_string);
> +
> +return_code:
> +	return ret;
> +}
> +
> +static int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
> +{
> +	struct trace_ksym *entry;
> +	int ret;
> +
> +	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
> +		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
> +		" new requests for tracing can be accepted now.\n",
> +			KSYM_TRACER_MAX);
> +		return -ENOSPC;
> +	}
> +
> +	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
> +	if (!entry)
> +		return -ENOMEM;
> +
> +	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
> +	if (!entry->ksym_hbkpt) {
> +		kfree(entry);
> +		return -ENOMEM;
> +	}
> +
> +	entry->ksym_hbkpt->info.name = ksymname;
> +	entry->ksym_hbkpt->info.type = op;
> +	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
> +	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
> +
> +	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
> +
> +	ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +	if (ret < 0) {
> +		printk(KERN_INFO "ksym_tracer request failed. Try again"
> +					" later!!\n");
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +		return -EAGAIN;
> +	}
> +	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
> +	ksym_filter_entry_count++;
> +
> +	return 0;
> +}
> +
> +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
> +	ssize_t ret, cnt = 0;
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
> +				entry->ksym_hbkpt->info.name);
> +		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"-w-\n");
> +		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"rw-\n");
> +	}
> +	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
> +	mutex_unlock(&ksym_tracer_mutex);
> +
> +	return ret;
> +}
> +
> +static ssize_t ksym_trace_filter_write(struct file *file,
> +					const char __user *buffer,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char *input_string, *ksymname = NULL;
> +	unsigned long ksym_addr = 0;
> +	int ret, op, changed = 0;
> +
> +	/* Ignore echo "" > ksym_trace_filter */
> +	if (count == 0)
> +		return 0;
> +
> +	input_string = kzalloc(count, GFP_KERNEL);
> +	if (!input_string)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(input_string, buffer, count)) {
> +		kfree(input_string);
> +		return -EFAULT;
> +	}
> +
> +	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
> +	if (ret < 0) {
> +		kfree(input_string);
> +		return ret;
> +	}
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	ret = -EINVAL;
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		if (entry->ksym_addr == ksym_addr) {
> +			/* Check for malformed request: (6) */
> +			if (entry->ksym_hbkpt->info.type != op)
> +				changed = 1;
> +			else
> +				goto err_ret;
> +			break;
> +		}
> +	}
> +	if (changed) {
> +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +		entry->ksym_hbkpt->info.type = op;
> +		if (op > 0) {
> +			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +			if (ret == 0) {
> +				ret = count;
> +				goto unlock_ret_path;
> +			}
> +		}
> +		ksym_filter_entry_count--;
> +		hlist_del(&(entry->ksym_hlist));
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +		ret = count;
> +		goto err_ret;
> +	} else {
> +		/* Check for malformed request: (4) */
> +		if (op == 0)
> +			goto err_ret;
> +		ret = process_new_ksym_entry(ksymname, op, ksym_addr);
> +		if (ret)
> +			goto err_ret;
> +	}
> +	ret = count;
> +	goto unlock_ret_path;
> +
> +err_ret:
> +	kfree(input_string);
> +
> +unlock_ret_path:
> +	mutex_unlock(&ksym_tracer_mutex);
> +	return ret;
> +}
> +
> +static const struct file_operations ksym_tracing_fops = {
> +	.open		= tracing_open_generic,
> +	.read		= ksym_trace_filter_read,
> +	.write		= ksym_trace_filter_write,
> +};
> +
> +static void ksym_trace_reset(struct trace_array *tr)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node, *node1;
> +
> +	ksym_tracing_enabled = 0;
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
> +								ksym_hlist) {
> +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +		ksym_filter_entry_count--;
> +		hlist_del(&(entry->ksym_hlist));
> +
> +		/* Free the 'input_string' only if reset
> +		 * after startup self-test
> +		 */
> +#ifdef CONFIG_FTRACE_SELFTEST
> +		if (strncmp(entry->ksym_hbkpt->info.name, KSYM_SELFTEST_ENTRY,
> +					strlen(KSYM_SELFTEST_ENTRY)) != 0)
> +#endif /* CONFIG_FTRACE_SELFTEST*/
> +			kfree(entry->ksym_hbkpt->info.name);
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +	}
> +	mutex_unlock(&ksym_tracer_mutex);
> +
> +}
> +
> +static int ksym_trace_init(struct trace_array *tr)
> +{
> +	int cpu, ret = 0;
> +
> +	for_each_online_cpu(cpu)
> +		tracing_reset(tr, cpu);
> +
> +	ksym_tracing_enabled = 1;
> +	ksym_trace_array = tr;
> +
> +#ifdef CONFIG_FTRACE_SELFTEST
> +	/* Check if we are re-entering self-test code during initialisation */
> +	if (ksym_selftest_dummy)
> +		goto ret_path;
> +
> +	ksym_selftest_dummy = 0;
> +
> +	/* Register the read-write tracing request */
> +	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
> +					(unsigned long)(&ksym_selftest_dummy));
> +
> +	if (ret < 0) {
> +		printk(KERN_CONT "ksym_trace read-write startup test failed\n");
> +		goto ret_path;
> +	}
> +	/* Perform a read and a write operation over the dummy variable to
> +	 * trigger the tracer
> +	 */
> +	if (ksym_selftest_dummy == 0)
> +		ksym_selftest_dummy++;
> +
> +ret_path:
> +#endif /* CONFIG_FTRACE_SELFTEST */
> +
> +	return ret;
> +}
> +
> +static void ksym_trace_print_header(struct seq_file *m)
> +{
> +
> +	seq_puts(m,
> +		 "#       TASK-PID      CPU#      Symbol         Type    "
> +		 "Function         \n");
> +	seq_puts(m,
> +		 "#          |           |          |              |         "
> +		 "|            \n");
> +}
> +
> +static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
> +{
> +	struct trace_entry *entry = iter->ent;
> +	struct trace_seq *s = &iter->seq;
> +	struct trace_ksym *field;
> +	char str[KSYM_SYMBOL_LEN];
> +	int ret;
> +
> +	trace_assign_type(field, entry);
> +
> +	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
> +				entry->pid, iter->cpu, field->ksym_name);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	switch (field->ksym_hbkpt->info.type) {
> +	case HW_BREAKPOINT_WRITE:
> +		ret = trace_seq_printf(s, " W  ");
> +		break;
> +	case HW_BREAKPOINT_RW:
> +		ret = trace_seq_printf(s, " RW ");
> +		break;
> +	default:
> +		return TRACE_TYPE_PARTIAL_LINE;
> +	}
> +
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	sprint_symbol(str, field->ip);
> +	ret = trace_seq_printf(s, "%-20s\n", str);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	return TRACE_TYPE_HANDLED;
> +}
> +
> +struct tracer ksym_tracer __read_mostly =
> +{
> +	.name		= "ksym_tracer",
> +	.init		= ksym_trace_init,
> +	.reset		= ksym_trace_reset,
> +#ifdef CONFIG_FTRACE_SELFTEST
> +	.selftest	= trace_selftest_startup_ksym,
> +#endif
> +	.print_header   = ksym_trace_print_header,
> +	.print_line	= ksym_trace_output
> +};
> +
> +__init static int init_ksym_trace(void)
> +{
> +	struct dentry *d_tracer;
> +	struct dentry *entry;
> +
> +	d_tracer = tracing_init_dentry();
> +	ksym_filter_entry_count = 0;
> +
> +	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
> +				    NULL, &ksym_tracing_fops);
> +	if (!entry)
> +		pr_warning("Could not create debugfs "
> +			   "'ksym_trace_filter' file\n");
> +
> +	return register_tracer(&ksym_tracer);
> +}
> +device_initcall(init_ksym_trace);
> +
> +
> +#ifdef CONFIG_PROFILE_KSYM_TRACER
> +static int ksym_tracer_stat_headers(struct seq_file *m)
> +{
> +	seq_printf(m, "   Access type    ");
> +	seq_printf(m, "            Symbol                     Counter     \n");
> +	return 0;
> +}
> +
> +static int ksym_tracer_stat_show(struct seq_file *m, void *v)
> +{
> +	struct hlist_node *stat = v;
> +	struct trace_ksym *entry;
> +	int access_type = 0;
> +	char fn_name[KSYM_NAME_LEN];
> +
> +	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
> +
> +	if (entry->ksym_hbkpt)
> +		access_type = entry->ksym_hbkpt->info.type;
> +
> +	switch (access_type) {
> +	case HW_BREAKPOINT_WRITE:
> +		seq_printf(m, "     W     ");
> +		break;
> +	case HW_BREAKPOINT_RW:
> +		seq_printf(m, "     RW    ");
> +		break;
> +	default:
> +		seq_printf(m, "     NA    ");
> +	}
> +
> +	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
> +		seq_printf(m, "               %s                 ", fn_name);
> +	else
> +		seq_printf(m, "               <NA>                ");
> +
> +	seq_printf(m, "%15lu\n", entry->counter);
> +	return 0;
> +}
> +
> +static void *ksym_tracer_stat_start(void)
> +{
> +	return &(ksym_filter_head.first);
> +}
> +
> +static void *
> +ksym_tracer_stat_next(void *v, int idx)
> +{
> +	struct hlist_node *stat = v;
> +
> +	return stat->next;
> +}
> +
> +static struct tracer_stat ksym_tracer_stats = {
> +	.name = "ksym_tracer",
> +	.stat_start = ksym_tracer_stat_start,
> +	.stat_next = ksym_tracer_stat_next,
> +	.stat_headers = ksym_tracer_stat_headers,
> +	.stat_show = ksym_tracer_stat_show
> +};
> +
> +__init static int ksym_tracer_stat_init(void)
> +{
> +	int ret;
> +
> +	ret = register_stat_tracer(&ksym_tracer_stats);
> +	if (!ret) {
> +		printk(KERN_WARNING "Warning: could not register "
> +				    "ksym tracer stats\n");
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +fs_initcall(ksym_tracer_stat_init);
> +#endif /* CONFIG_PROFILE_KSYM_TRACER */
> Index: linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
> ===================================================================
> --- linux-2.6-tip.hbkpt.orig/kernel/trace/trace_selftest.c
> +++ linux-2.6-tip.hbkpt/kernel/trace/trace_selftest.c
> @@ -16,6 +16,7 @@ static inline int trace_valid_entry(stru
>  	case TRACE_BRANCH:
>  	case TRACE_GRAPH_ENT:
>  	case TRACE_GRAPH_RET:
> +	case TRACE_KSYM:
>  		return 1;
>  	}
>  	return 0;
> @@ -687,3 +688,38 @@ trace_selftest_startup_branch(struct tra
>  	return ret;
>  }
>  #endif /* CONFIG_BRANCH_TRACER */
> +
> +#ifdef CONFIG_KSYM_TRACER
> +int
> +trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
> +{
> +	unsigned long count;
> +	int ret;
> +
> +	/* start the tracing */
> +	ret = tracer_init(trace, tr);
> +	if (ret) {
> +		warn_failed_init_tracer(trace, ret);
> +		return ret;
> +	}
> +
> +	/* Sleep for a 1/10 of a second */
> +	msleep(100);
> +	/* stop the tracing. */
> +	tracing_stop();
> +	/* check the trace buffer */
> +	ret = trace_test_buffer(tr, &count);
> +	trace->reset(tr);
> +	tracing_start();
> +
> +	/* read & write operations - one each is performed on the dummy variable
> +	 * triggering two entries in the trace buffer
> +	 */
> +	if (!ret && count != 2) {
> +		printk(KERN_CONT "Ksym tracer startup test failed");
> +		ret = -1;
> +	}
> +
> +	return ret;
> +}
> +#endif /* CONFIG_KSYM_TRACER */
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
@ 2009-03-20 14:33   ` Alan Stern
  2009-03-20 18:30     ` Ingo Molnar
                       ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Alan Stern @ 2009-03-20 14:33 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, 20 Mar 2009, K.Prasad wrote:

> This patch introduces two new files hw_breakpoint.[ch] which defines the 
> generic interfaces to use hardware breakpoint infrastructure of the system. 

Prasad:

I'm sorry to say this is full of mistakes.  So far I have looked only 
at patch 01/11, but it's not good.

> + * Kernel breakpoints grow downwards, starting from HB_NUM
> + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> + * kernel-space request
> + */
> +unsigned int hbkpt_kernel_pos;

This doesn't make much sense.  All you need to know is which registers
are in use; all others are available.

For example, suppose the kernel allocated breakpoints 3, 2, and 1, and
then deallocated 2.  Then bp 2 would be available for use, even though
2 > 1.

It's also a poor choice of name.  Everywhere else (in my patches,
anyway) the code refers to hardware breakpoints using the abbreviation
"hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
"hbkpt".

> +/* An array containing refcount of threads using a given bkpt register */
> +unsigned int hbkpt_user_max_refcount[HB_NUM];

Why did you put "max" in the name?  Isn't this just a simple refcount?

> +/* One higher than the highest counted user-space breakpoint register */
> +unsigned int hbkpt_user_max;

Likewise, this variable isn't really needed.  It's just one more than
the largest i such that hbkpt_user_max_refcount[i] > 0.

> +/*
> + * Install the debug register values for a new thread.
> + */
> +void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
> +{
> +	/* Set the debug register */

Set _which_ debug register?

> +	arch_install_thread_hbkpt(tsk);
> +	last_debugged_task = current;
> +
> +	put_cpu_no_resched();

What's this line doing here?  It looks like something you forgot to
erase.

> +}
> +
> +/*
> + * Install the debug register values for just the kernel, no thread.
> + */
> +void switch_to_none_hw_breakpoint(void)
> +{
> +	arch_install_none();
> +	put_cpu_no_resched();

Same for this line.

> +}
> +
> +/*
> + * Load the debug registers during startup of a CPU.
> + */
> +void load_debug_registers(void)
> +{
> +	int i;
> +	unsigned long flags;
> +
> +	/* Prevent IPIs for new kernel breakpoint updates */
> +	local_irq_save(flags);
> +
> +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> +		if (hbkpt_kernel[i])
> +			on_each_cpu(arch_install_kernel_hbkpt,
> +				(void *)hbkpt_kernel[i], 0);

This is completely wrong.  First of all, it's dumb to send multiple
IPIs (one for each iteration through the loop).  Second, this routine
shouldn't send any IPIs at all!  It gets invoked when a CPU is
starting up and wants to load its _own_ debug registers -- not tell
another CPU to load anything.

> +	if (current->thread.dr7)
> +		arch_install_thread_hbkpt(current);
> +
> +	local_irq_restore(flags);
> +}
> +
> +/*
> + * Erase all the hardware breakpoint info associated with a thread.
> + *
> + * If tsk != current then tsk must not be usable (for example, a
> + * child being cleaned up from a failed fork).
> + */
> +void flush_thread_hw_breakpoint(struct task_struct *tsk)
> +{
> +	int i;
> +	struct thread_struct *thread = &(tsk->thread);
> +
> +	mutex_lock(&hw_breakpoint_mutex);
> +
> +	/* Let the breakpoints know they are being uninstalled */

This comment looks like a leftover which should have been erased.

> +/*
> + * Validate the settings in a hw_breakpoint structure.
> + */
> +static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
> +{
> +	int ret;
> +	unsigned int align;
> +
> +	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
> +	if (ret < 0)
> +		goto err;
> +
> +	/* Check that the low-order bits of the address are appropriate
> +	 * for the alignment implied by len.
> +	 */
> +	if (bp->info.address & align)
> +		return -EINVAL;
> +
> +	/* Check that the virtual address is in the proper range */
> +	if (tsk) {
> +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> +			return -EFAULT;
> +	} else {
> +		if (!arch_check_va_in_kernelspace(bp->info.address))
> +			return -EFAULT;
> +	}

Roland pointed out that these checks need to take into account the
length of the breakpoint.  For example, in
arch_check_va_in_userspace() it isn't sufficient for the start of the
breakpoint region to be a userspace address; the end of the
breakpoint region must also be in userspace.

> + err:
> +	return ret;
> +}
> +
> +int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
> +					struct hw_breakpoint *bp)
> +{
> +	struct thread_struct *thread = &(tsk->thread);
> +	int rc;
> +
> +	/* Do not overcommit. Fail if kernel has used the hbkpt registers */
> +	if (pos >= hbkpt_kernel_pos)
> +		return -ENOSPC;

In fact you should fail if the debug register is already in use,
regardless of whether it is being used by a kernel breakpoint.  And you 
shouldn't check against hbkpt_kernel_pos; you should check whether 
hbkpt_kernel[pos] is NULL and thread->hbkpt[pos] is NULL.

> +
> +	rc = validate_settings(bp, tsk);
> +	if (rc)
> +		return rc;
> +
> +	thread->hbkpt[pos] = bp;
> +	thread->hbkpt_num_installed++;
> +	hbkpt_user_max_refcount[pos]++;
> +	/* 'tsk' is the thread that uses max number of hbkpt registers */

This is a bad comment.  It sounds like it's saying that "tsk" is
defined as the thread using the maximum number of breakpoints, rather
than being defined as the thread for which the breakpoint is being
registered.

Besides, there's no reason to keep track of which thread uses the max 
number of breakpoints anyway.  Not to mention the fact that you don't 
update hbkpt_user_max when its thread exits.

> +	if (hbkpt_user_max < thread->hbkpt_num_installed)
> +		hbkpt_user_max++;

At this point I got tired of looking, but it seems obvious that the new 
patch series needs a bunch of improvements.

Alan Stern


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 14:33   ` Alan Stern
@ 2009-03-20 18:30     ` Ingo Molnar
  2009-03-21 17:32       ` K.Prasad
  2009-03-20 18:32     ` Ingo Molnar
  2009-03-21 17:26     ` K.Prasad
  2 siblings, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2009-03-20 18:30 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt


* Alan Stern <stern@rowland.harvard.edu> wrote:

> > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > + * kernel-space request
> > + */
> > +unsigned int hbkpt_kernel_pos;
> 
> This doesn't make much sense.  All you need to know is which 
> registers are in use; all others are available.
> 
> For example, suppose the kernel allocated breakpoints 3, 2, and 1, 
> and then deallocated 2.  Then bp 2 would be available for use, 
> even though 2 > 1.

it's a high/low watermark mechanism. Yes, it's not an allocator that 
can allocate into a debug registrs 'hole', but it is a simple one 
that matches current hardware breakpoint usages and enables the 
kernel to utilize them as well - and keeps all the code simple.

	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 14:33   ` Alan Stern
  2009-03-20 18:30     ` Ingo Molnar
@ 2009-03-20 18:32     ` Ingo Molnar
  2009-03-21 17:26     ` K.Prasad
  2 siblings, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2009-03-20 18:32 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt


* Alan Stern <stern@rowland.harvard.edu> wrote:

> > +	/* Check that the virtual address is in the proper range */
> > +	if (tsk) {
> > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > +			return -EFAULT;
> > +	} else {
> > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > +			return -EFAULT;
> > +	}
> 
> Roland pointed out that these checks need to take into account the 
> length of the breakpoint.  For example, in 
> arch_check_va_in_userspace() it isn't sufficient for the start of 
> the breakpoint region to be a userspace address; the end of the 
> breakpoint region must also be in userspace.

i pointed it out - but yes, this needs to be fixed.

	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-20  9:04   ` Frederic Weisbecker
@ 2009-03-21 16:24     ` K.Prasad
  2009-03-21 16:39       ` Steven Rostedt
  0 siblings, 1 reply; 27+ messages in thread
From: K.Prasad @ 2009-03-21 16:24 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Ingo Molnar, Linux Kernel Mailing List, Alan Stern,
	Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 10:04:52AM +0100, Frederic Weisbecker wrote:
> On Fri, Mar 20, 2009 at 05:20:32AM +0530, K.Prasad wrote:
> > This patch adds an ftrace plugin to detect and profile memory access over
> > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > addresses.
> > 
> > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > ---
> >  kernel/trace/Kconfig          |   21 +
> >  kernel/trace/Makefile         |    1 
> >  kernel/trace/trace.h          |   25 +
> >  kernel/trace/trace_ksym.c     |  555 ++++++++++++++++++++++++++++++++++++++++++
> >  kernel/trace/trace_selftest.c |   36 ++
> >  5 files changed, 638 insertions(+)
> > 

> > +
> > +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> > +{
> > +	struct hlist_node *node;
> > +	struct trace_ksym *entry;
> > +
> > +	spin_lock(&ksym_stat_lock);
> 
> 
> I see that can be called from ksym_hbkpt_handler which in turn
> can be called from interrupt context, right?
> You can issue a deadlock if you don't disable interrupts here.
> 
> Thanks,
> Frederic.
> 

ksym_collect_stats<--ksym_hbkpt_handler<--hw_breakpoint_handler<--do_debug
invocation happens with interrupts enabled (IF bit is set). I do find
that a few plugins in kernel/trace enclose the
trace_buffer_lock_reserve()--trace_buffer_unlock_commit() invocation
within interrupt-disabled code. Is that a requirement there?

The potential deadlock scenario you foresee isn't obvious to me. Can you
explain?

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-21 16:24     ` K.Prasad
@ 2009-03-21 16:39       ` Steven Rostedt
  2009-03-23 19:08         ` K.Prasad
  0 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2009-03-21 16:39 UTC (permalink / raw)
  To: K.Prasad
  Cc: Frederic Weisbecker, Ingo Molnar, Linux Kernel Mailing List,
	Alan Stern, Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath


On Sat, 21 Mar 2009, K.Prasad wrote:
> > > 
> 
> > > +
> > > +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> > > +{
> > > +	struct hlist_node *node;
> > > +	struct trace_ksym *entry;
> > > +
> > > +	spin_lock(&ksym_stat_lock);
> > 
> > 
> > I see that can be called from ksym_hbkpt_handler which in turn
> > can be called from interrupt context, right?
> > You can issue a deadlock if you don't disable interrupts here.
> > 
> > Thanks,
> > Frederic.
> > 
> 
> ksym_collect_stats<--ksym_hbkpt_handler<--hw_breakpoint_handler<--do_debug
> invocation happens with interrupts enabled (IF bit is set). I do find
> that a few plugins in kernel/trace enclose the
> trace_buffer_lock_reserve()--trace_buffer_unlock_commit() invocation
> within interrupt-disabled code. Is that a requirement there?
> 
> The potential deadlock scenario you foresee isn't obvious to me. Can you
> explain?

Can that lock ever be taken in an interrupt? If not, document that (and 
perhaps add a WARN_ON(in_interrupt()); ). Otherwise you have a possible:

	spin_lock(&ksym_stat_lock);

		===> take interrupt ...

			(from interrupt)
			spin_lock(&ksym_stat_lock); <== deadlock.


-- Steve


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 14:33   ` Alan Stern
  2009-03-20 18:30     ` Ingo Molnar
  2009-03-20 18:32     ` Ingo Molnar
@ 2009-03-21 17:26     ` K.Prasad
  2009-03-21 21:39       ` Alan Stern
  2 siblings, 1 reply; 27+ messages in thread
From: K.Prasad @ 2009-03-21 17:26 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 10:33:26AM -0400, Alan Stern wrote:
> On Fri, 20 Mar 2009, K.Prasad wrote:
> 
> > This patch introduces two new files hw_breakpoint.[ch] which defines the 
> > generic interfaces to use hardware breakpoint infrastructure of the system. 
> 
> Prasad:
> 
> I'm sorry to say this is full of mistakes.  So far I have looked only 
> at patch 01/11, but it's not good.
> 

After you pointed out, I realise that the code in load_debug_registers()
is an overkill and unregister_kernel_hw_breakpoint() has an obvious 
error which should have caught my attention. My next revision should 
fix them.

> > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > + * kernel-space request
> > + */
> > +unsigned int hbkpt_kernel_pos;
> 
> This doesn't make much sense.  All you need to know is which registers
> are in use; all others are available.
> 

As explained by Maneesh earlier, we compact the kernel-space requests
into registers (HB_NUM - 1) to hbkpt_kernel_pos. The kernel-space
requests aren't specific to any given register number too, and so
compaction would be suitable for this case (unlike when implemented for
user-space which might need virtualisation of registers).

> For example, suppose the kernel allocated breakpoints 3, 2, and 1, and
> then deallocated 2.  Then bp 2 would be available for use, even though
> 2 > 1.
> 
> It's also a poor choice of name.  Everywhere else (in my patches,
> anyway) the code refers to hardware breakpoints using the abbreviation
> "hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
> "hbkpt".
> 

I began using 'hbkpt' as a shorter naming convention (the longer one
being hw_breakpoint) without being really conscious of the 'hwbkpt'
usage by you (even some of the previous iterations contained them in
samples/hw_breakpoint and ftrace-plugin).

Well, I will rename my shorter name to 'hwbkpt' for uniformity.

> > +/* An array containing refcount of threads using a given bkpt register */
> > +unsigned int hbkpt_user_max_refcount[HB_NUM];
> 
> Why did you put "max" in the name?  Isn't this just a simple refcount?
> 

Ok. It will be hbkpt_user_refcount[].

> > +/* One higher than the highest counted user-space breakpoint register */
> > +unsigned int hbkpt_user_max;
> 
> Likewise, this variable isn't really needed.  It's just one more than
> the largest i such that hbkpt_user_max_refcount[i] > 0.
> 

It acts like a cache for determining the user-space breakpoint boundary.
It is used for sanity checks and in its absence we will have to compute from
hbkpt_user_max_refcount[] everytime.

> > +/*
> > + * Install the debug register values for a new thread.
> > + */
> > +void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
> > +{
> > +	/* Set the debug register */
> 
> Set _which_ debug register?
> 

Will change it to read:
/* Set all debug registers used by 'tsk' */

> > +	arch_install_thread_hbkpt(tsk);
> > +	last_debugged_task = current;
> > +
> > +	put_cpu_no_resched();
> 
> What's this line doing here?  It looks like something you forgot to
> erase.
> 
> > +}
> > +
> > +/*
> > + * Install the debug register values for just the kernel, no thread.
> > + */
> > +void switch_to_none_hw_breakpoint(void)
> > +{
> > +	arch_install_none();
> > +	put_cpu_no_resched();
> 
> Same for this line.
> 

These are carriages from the previous code. They are still invoked from
the same places (such as flush_thread_hw_breakpoint(),
hw_breakpoint_handler()) and hence I didn't analyse it enough to see if
they were to be removed.

However, having found that preempt_count() is already zero at places where
these are called I think they can be removed.

> > +}
> > +
> > +/*
> > + * Load the debug registers during startup of a CPU.
> > + */
> > +void load_debug_registers(void)
> > +{
> > +	int i;
> > +	unsigned long flags;
> > +
> > +	/* Prevent IPIs for new kernel breakpoint updates */
> > +	local_irq_save(flags);
> > +
> > +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> > +		if (hbkpt_kernel[i])
> > +			on_each_cpu(arch_install_kernel_hbkpt,
> > +				(void *)hbkpt_kernel[i], 0);
> 
> This is completely wrong.  First of all, it's dumb to send multiple
> IPIs (one for each iteration through the loop).  Second, this routine
> shouldn't send any IPIs at all!  It gets invoked when a CPU is
> starting up and wants to load its _own_ debug registers -- not tell
> another CPU to load anything.
> 

As I agreed before, it is an overkill (given the design of
arch_install_kernel_hbkpt()). I will create a new
arch_update_kernel_hbkpt(pos, bp) that will install breakpoints only
on the CPU starting up.

> > +	if (current->thread.dr7)
> > +		arch_install_thread_hbkpt(current);
> > +
> > +	local_irq_restore(flags);
> > +}
> > +
> > +/*
> > + * Erase all the hardware breakpoint info associated with a thread.
> > + *
> > + * If tsk != current then tsk must not be usable (for example, a
> > + * child being cleaned up from a failed fork).
> > + */
> > +void flush_thread_hw_breakpoint(struct task_struct *tsk)
> > +{
> > +	int i;
> > +	struct thread_struct *thread = &(tsk->thread);
> > +
> > +	mutex_lock(&hw_breakpoint_mutex);
> > +
> > +	/* Let the breakpoints know they are being uninstalled */
> 
> This comment looks like a leftover which should have been erased.
> 
> > +/*
> > + * Validate the settings in a hw_breakpoint structure.
> > + */
> > +static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
> > +{
> > +	int ret;
> > +	unsigned int align;
> > +
> > +	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
> > +	if (ret < 0)
> > +		goto err;
> > +
> > +	/* Check that the low-order bits of the address are appropriate
> > +	 * for the alignment implied by len.
> > +	 */
> > +	if (bp->info.address & align)
> > +		return -EINVAL;
> > +
> > +	/* Check that the virtual address is in the proper range */
> > +	if (tsk) {
> > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > +			return -EFAULT;
> > +	} else {
> > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > +			return -EFAULT;
> > +	}
> 
> Roland pointed out that these checks need to take into account the
> length of the breakpoint.  For example, in
> arch_check_va_in_userspace() it isn't sufficient for the start of the
> breakpoint region to be a userspace address; the end of the
> breakpoint region must also be in userspace.
> 

Ok. Will do something like:
return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));

> > + err:
> > +	return ret;
> > +}
> > +
> > +int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
> > +					struct hw_breakpoint *bp)
> > +{
> > +	struct thread_struct *thread = &(tsk->thread);
> > +	int rc;
> > +
> > +	/* Do not overcommit. Fail if kernel has used the hbkpt registers */
> > +	if (pos >= hbkpt_kernel_pos)
> > +		return -ENOSPC;
> 
> In fact you should fail if the debug register is already in use,
> regardless of whether it is being used by a kernel breakpoint.  And you 
> shouldn't check against hbkpt_kernel_pos; you should check whether 
> hbkpt_kernel[pos] is NULL and thread->hbkpt[pos] is NULL.
> 

As explained before, the intended design was like this:

ample layout:
hbkpt_kernel_pos = 1
hbkpt_user_max = 1

---------------------------------------------------------------------
|                |                |                |                |
|       DR3      |       DR2      |       DR1      |       DR0      |
|                |                |                |                |
---------------------------------------------------------------------
^                                                  ^                ^
|                                                  |                |
-----------------kernel-space addresses-------------------user-------

After removing breakpoint in say DR2, compaction occurs.
New layout will be:
hbkpt_kernel_pos = 2
hbkpt_user_max = 1

---------------------------------------------------------------------
|                |                |                |                |
|       DR3      |       DR2      |       DR1      |       DR0      |
|                |                |                |                |
---------------------------------------------------------------------
^                                 ^                ^                ^
|                                 |                |                |
-----------------kernel------------------empty-----------user--------

The above design, in my opinion is intuitive, allows re-use of
uninstalled registers and is simple to implement.

What was missing in the sent patch was the updation of dr7 and dr[pos]
register after compaction. I will add them in the next iteration of the
patch.

> > +
> > +	rc = validate_settings(bp, tsk);
> > +	if (rc)
> > +		return rc;
> > +
> > +	thread->hbkpt[pos] = bp;
> > +	thread->hbkpt_num_installed++;
> > +	hbkpt_user_max_refcount[pos]++;
> > +	/* 'tsk' is the thread that uses max number of hbkpt registers */
> 
> This is a bad comment.  It sounds like it's saying that "tsk" is
> defined as the thread using the maximum number of breakpoints, rather
> than being defined as the thread for which the breakpoint is being
> registered.
> 
> Besides, there's no reason to keep track of which thread uses the max 
> number of breakpoints anyway.  Not to mention the fact that you don't 
> update hbkpt_user_max when its thread exits.
> 

We don't keep track of the thread (in the sense the task_struct) but
'hbkpt_user_max' is used for validating requests and book-keeping. As
Maneesh mentioned before flush_thread_hw_breakpoint() updates
'hbkpt_user_max'.

I can change it to read like the one below if it sounds better to you.

/* 
 * 'tsk' uses more number of registers than 'hbkpt_user_max'. Update
 * the same.
 */

> > +	if (hbkpt_user_max < thread->hbkpt_num_installed)
> > +		hbkpt_user_max++;
> 
> At this point I got tired of looking, but it seems obvious that the new 
> patch series needs a bunch of improvements.
> 
> Alan Stern
>

As mentioned before the next iteration would contain the changes I've
discussed above.

Thanks,
K.Prasad
 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-20 18:30     ` Ingo Molnar
@ 2009-03-21 17:32       ` K.Prasad
  0 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-21 17:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Fri, Mar 20, 2009 at 07:30:58PM +0100, Ingo Molnar wrote:
> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > > + * kernel-space request
> > > + */
> > > +unsigned int hbkpt_kernel_pos;
> > 
> > This doesn't make much sense.  All you need to know is which 
> > registers are in use; all others are available.
> > 
> > For example, suppose the kernel allocated breakpoints 3, 2, and 1, 
> > and then deallocated 2.  Then bp 2 would be available for use, 
> > even though 2 > 1.
> 
> it's a high/low watermark mechanism. Yes, it's not an allocator that 
> can allocate into a debug registrs 'hole', but it is a simple one 
> that matches current hardware breakpoint usages and enables the 
> kernel to utilize them as well - and keeps all the code simple.
> 
> 	Ingo

I've explained the design here: http://lkml.org/lkml/2009/3/21/169 in a
and is slightly different from what you've explained above.

It involves shifting of kernel-space registers by one-level if a
kernel-register is uninstalled. We compact the kernel-space registers
since a)not to leave a 'hole' thereby wasting a register forever during
runtime b)kernel-space requests are not specific to a register number
and can be moved at will (unlike user-space requests).

Hope that the design is acceptable and the resultant code - simple.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-21 17:26     ` K.Prasad
@ 2009-03-21 21:39       ` Alan Stern
  2009-03-23 19:03         ` K.Prasad
  0 siblings, 1 reply; 27+ messages in thread
From: Alan Stern @ 2009-03-21 21:39 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Sat, 21 Mar 2009, K.Prasad wrote:

> > > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > > + * kernel-space request
> > > + */
> > > +unsigned int hbkpt_kernel_pos;
> > 
> > This doesn't make much sense.  All you need to know is which registers
> > are in use; all others are available.
> > 
> 
> As explained by Maneesh earlier, we compact the kernel-space requests
> into registers (HB_NUM - 1) to hbkpt_kernel_pos. The kernel-space
> requests aren't specific to any given register number too, and so
> compaction would be suitable for this case (unlike when implemented for
> user-space which might need virtualisation of registers).

Okay, that makes sense.  Perhaps you could add a short comment here
explaining that the register allocations get compacted when a kernel
breakpoint is unregistered, so they will always be contiguous.

> > It's also a poor choice of name.  Everywhere else (in my patches,
> > anyway) the code refers to hardware breakpoints using the abbreviation
> > "hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
> > "hbkpt".
> > 
> 
> I began using 'hbkpt' as a shorter naming convention (the longer one
> being hw_breakpoint) without being really conscious of the 'hwbkpt'
> usage by you (even some of the previous iterations contained them in
> samples/hw_breakpoint and ftrace-plugin).
> 
> Well, I will rename my shorter name to 'hwbkpt' for uniformity.

My patch never used "hwbkpt".  Besides "hw_breakpoint", it used only 
"bp".  On going back and checking, I found that it didn't even use 
"hwbp".  Some other abbreviations it did use were "kbp" for kernel 
breakpoint, "chbi" for per-CPU hardware breakpoint info, and "thbi" for 
per-thread hardware breakpoint info.

If you're looking for a good short name, and if you want to keep 
hardware breakpoints distinct from software breakpoints, I suggest 
"hbp" instead of "hbkpt".  But it's up to you, and it's worth noticing 
that the code already contains lots of variables named just "bp".

> > > +/* One higher than the highest counted user-space breakpoint register */
> > > +unsigned int hbkpt_user_max;
> > 
> > Likewise, this variable isn't really needed.  It's just one more than
> > the largest i such that hbkpt_user_max_refcount[i] > 0.
> > 
> 
> It acts like a cache for determining the user-space breakpoint boundary.
> It is used for sanity checks and in its absence we will have to compute from
> hbkpt_user_max_refcount[] everytime.

That's right.  Isn't it simpler to check

	kernel_pos > 0 && hbkpt_user_refcount[kernel_pos - 1] == 0

than to check

	kernel_pos - 1 >= hbkpt_user_max

_and_ to keep hbkpt_user_max set to the correct value at all times?

> > > +/*
> > > + * Load the debug registers during startup of a CPU.
> > > + */
> > > +void load_debug_registers(void)
> > > +{
> > > +	int i;
> > > +	unsigned long flags;
> > > +
> > > +	/* Prevent IPIs for new kernel breakpoint updates */
> > > +	local_irq_save(flags);
> > > +
> > > +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> > > +		if (hbkpt_kernel[i])
> > > +			on_each_cpu(arch_install_kernel_hbkpt,
> > > +				(void *)hbkpt_kernel[i], 0);
> > 
> > This is completely wrong.  First of all, it's dumb to send multiple
> > IPIs (one for each iteration through the loop).  Second, this routine
> > shouldn't send any IPIs at all!  It gets invoked when a CPU is
> > starting up and wants to load its _own_ debug registers -- not tell
> > another CPU to load anything.
> > 
> 
> As I agreed before, it is an overkill (given the design of
> arch_install_kernel_hbkpt()). I will create a new
> arch_update_kernel_hbkpt(pos, bp) that will install breakpoints only
> on the CPU starting up.

Doesn't arch_install_kernel_hbkpt() already install breakpoints
on only the current CPU?  So why do you need a new function?

> > > +	/* Check that the virtual address is in the proper range */
> > > +	if (tsk) {
> > > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > > +			return -EFAULT;
> > > +	} else {
> > > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > > +			return -EFAULT;
> > > +	}
> > 
> > Roland pointed out that these checks need to take into account the
> > length of the breakpoint.  For example, in
> > arch_check_va_in_userspace() it isn't sufficient for the start of the
> > breakpoint region to be a userspace address; the end of the
> > breakpoint region must also be in userspace.
> > 
> 
> Ok. Will do something like:
> return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));

What is the purpose of word_size here?  The breakpoint length should be 
specified in bytes, not words.

Don't forget that that in arch_check_va_in_kernelspace() you need to 
check both for values that are too low and values that are too high 
(they overflow and wrap around back to a user address).

> We don't keep track of the thread (in the sense the task_struct) but
> 'hbkpt_user_max' is used for validating requests and book-keeping. As
> Maneesh mentioned before flush_thread_hw_breakpoint() updates
> 'hbkpt_user_max'.
> 
> I can change it to read like the one below if it sounds better to you.
> 
> /* 
>  * 'tsk' uses more number of registers than 'hbkpt_user_max'. Update
>  * the same.
>  */

My preference is simply to eliminate hbkpt_user_max entirely.

Alan Stern


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-21 21:39       ` Alan Stern
@ 2009-03-23 19:03         ` K.Prasad
  2009-03-23 19:21           ` Alan Stern
  0 siblings, 1 reply; 27+ messages in thread
From: K.Prasad @ 2009-03-23 19:03 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Sat, Mar 21, 2009 at 05:39:59PM -0400, Alan Stern wrote:
> On Sat, 21 Mar 2009, K.Prasad wrote:
> 
> > > > + * Kernel breakpoints grow downwards, starting from HB_NUM
> > > > + * 'hbkpt_kernel_pos' denotes lowest numbered breakpoint register occupied for
> > > > + * kernel-space request
> > > > + */
> > > > +unsigned int hbkpt_kernel_pos;
> > > 
> > > This doesn't make much sense.  All you need to know is which registers
> > > are in use; all others are available.
> > > 
> > 
> > As explained by Maneesh earlier, we compact the kernel-space requests
> > into registers (HB_NUM - 1) to hbkpt_kernel_pos. The kernel-space
> > requests aren't specific to any given register number too, and so
> > compaction would be suitable for this case (unlike when implemented for
> > user-space which might need virtualisation of registers).
> 
> Okay, that makes sense.  Perhaps you could add a short comment here
> explaining that the register allocations get compacted when a kernel
> breakpoint is unregistered, so they will always be contiguous.
> 
> > > It's also a poor choice of name.  Everywhere else (in my patches,
> > > anyway) the code refers to hardware breakpoints using the abbreviation
> > > "hwbp" or "hw_breakpoint".  There's no reason suddenly to start using
> > > "hbkpt".
> > > 
> > 
> > I began using 'hbkpt' as a shorter naming convention (the longer one
> > being hw_breakpoint) without being really conscious of the 'hwbkpt'
> > usage by you (even some of the previous iterations contained them in
> > samples/hw_breakpoint and ftrace-plugin).
> > 
> > Well, I will rename my shorter name to 'hwbkpt' for uniformity.
> 
> My patch never used "hwbkpt".  Besides "hw_breakpoint", it used only 
> "bp".  On going back and checking, I found that it didn't even use 
> "hwbp".  Some other abbreviations it did use were "kbp" for kernel 
> breakpoint, "chbi" for per-CPU hardware breakpoint info, and "thbi" for 
> per-thread hardware breakpoint info.
> 
> If you're looking for a good short name, and if you want to keep 
> hardware breakpoints distinct from software breakpoints, I suggest 
> "hbp" instead of "hbkpt".  But it's up to you, and it's worth noticing 
> that the code already contains lots of variables named just "bp".
> 

I am renaming all 'hbkpt' strings to 'hbp'.

> > > > +/* One higher than the highest counted user-space breakpoint register */
> > > > +unsigned int hbkpt_user_max;
> > > 
> > > Likewise, this variable isn't really needed.  It's just one more than
> > > the largest i such that hbkpt_user_max_refcount[i] > 0.
> > > 
> > 
> > It acts like a cache for determining the user-space breakpoint boundary.
> > It is used for sanity checks and in its absence we will have to compute from
> > hbkpt_user_max_refcount[] everytime.
> 
> That's right.  Isn't it simpler to check
> 
> 	kernel_pos > 0 && hbkpt_user_refcount[kernel_pos - 1] == 0
> 
> than to check
> 
> 	kernel_pos - 1 >= hbkpt_user_max
> 
> _and_ to keep hbkpt_user_max set to the correct value at all times?
>

Unfortunately the lines of code required to maintain the variable comes
close to the amount of lines it would potentially save. I will change to
code to compute it from hbkpt_user_refcount[] everytime.
 
> > > > +/*
> > > > + * Load the debug registers during startup of a CPU.
> > > > + */
> > > > +void load_debug_registers(void)
> > > > +{
> > > > +	int i;
> > > > +	unsigned long flags;
> > > > +
> > > > +	/* Prevent IPIs for new kernel breakpoint updates */
> > > > +	local_irq_save(flags);
> > > > +
> > > > +	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
> > > > +		if (hbkpt_kernel[i])
> > > > +			on_each_cpu(arch_install_kernel_hbkpt,
> > > > +				(void *)hbkpt_kernel[i], 0);
> > > 
> > > This is completely wrong.  First of all, it's dumb to send multiple
> > > IPIs (one for each iteration through the loop).  Second, this routine
> > > shouldn't send any IPIs at all!  It gets invoked when a CPU is
> > > starting up and wants to load its _own_ debug registers -- not tell
> > > another CPU to load anything.
> > > 
> > 
> > As I agreed before, it is an overkill (given the design of
> > arch_install_kernel_hbkpt()). I will create a new
> > arch_update_kernel_hbkpt(pos, bp) that will install breakpoints only
> > on the CPU starting up.
> 
> Doesn't arch_install_kernel_hbkpt() already install breakpoints
> on only the current CPU?  So why do you need a new function?
>

There will be a few more changes to arch_install_kernel_hbkpt() along
with this. Please find the changes in the ensuing patchset.
 
> > > > +	/* Check that the virtual address is in the proper range */
> > > > +	if (tsk) {
> > > > +		if (!arch_check_va_in_userspace(bp->info.address, tsk))
> > > > +			return -EFAULT;
> > > > +	} else {
> > > > +		if (!arch_check_va_in_kernelspace(bp->info.address))
> > > > +			return -EFAULT;
> > > > +	}
> > > 
> > > Roland pointed out that these checks need to take into account the
> > > length of the breakpoint.  For example, in
> > > arch_check_va_in_userspace() it isn't sufficient for the start of the
> > > breakpoint region to be a userspace address; the end of the
> > > breakpoint region must also be in userspace.
> > > 
> > 
> > Ok. Will do something like:
> > return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));
> 
> What is the purpose of word_size here?  The breakpoint length should be 
> specified in bytes, not words.
> 
> Don't forget that that in arch_check_va_in_kernelspace() you need to 
> check both for values that are too low and values that are too high 
> (they overflow and wrap around back to a user address).
> 

While I understand the user-space checking using the length of the HW
Breakpoint, I don't really see how I can check for an upper-bound for
kernel-space virtual addresses. Most usage in the kernel only checks for
the address >= TASK_SIZE (while they check for add + len if the length
of the memory is known). I will be glad to have any suggestions in this
regard.

> > We don't keep track of the thread (in the sense the task_struct) but
> > 'hbkpt_user_max' is used for validating requests and book-keeping. As
> > Maneesh mentioned before flush_thread_hw_breakpoint() updates
> > 'hbkpt_user_max'.
> > 
> > I can change it to read like the one below if it sounds better to you.
> > 
> > /* 
> >  * 'tsk' uses more number of registers than 'hbkpt_user_max'. Update
> >  * the same.
> >  */
> 
> My preference is simply to eliminate hbkpt_user_max entirely.
> 
> Alan Stern
>

Done.

Thanks,
K.Prasad
 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2
  2009-03-21 16:39       ` Steven Rostedt
@ 2009-03-23 19:08         ` K.Prasad
  0 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-23 19:08 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, Ingo Molnar, Linux Kernel Mailing List,
	Alan Stern, Andrew Morton, Benjamin Herrenschmidt, Maneesh Soni,
	Roland McGrath

On Sat, Mar 21, 2009 at 12:39:08PM -0400, Steven Rostedt wrote:
> 
> On Sat, 21 Mar 2009, K.Prasad wrote:
> > > > 
> > 
> > > > +
> > > > +void ksym_collect_stats(unsigned long hbkpt_hit_addr)
> > > > +{
> > > > +	struct hlist_node *node;
> > > > +	struct trace_ksym *entry;
> > > > +
> > > > +	spin_lock(&ksym_stat_lock);
> > > 
> > > 
> > > I see that can be called from ksym_hbkpt_handler which in turn
> > > can be called from interrupt context, right?
> > > You can issue a deadlock if you don't disable interrupts here.
> > > 
> > > Thanks,
> > > Frederic.
> > > 
> > 
> > ksym_collect_stats<--ksym_hbkpt_handler<--hw_breakpoint_handler<--do_debug
> > invocation happens with interrupts enabled (IF bit is set). I do find
> > that a few plugins in kernel/trace enclose the
> > trace_buffer_lock_reserve()--trace_buffer_unlock_commit() invocation
> > within interrupt-disabled code. Is that a requirement there?
> > 
> > The potential deadlock scenario you foresee isn't obvious to me. Can you
> > explain?
> 
> Can that lock ever be taken in an interrupt? If not, document that (and 
> perhaps add a WARN_ON(in_interrupt()); ). Otherwise you have a possible:
> 
> 	spin_lock(&ksym_stat_lock);
> 
> 		===> take interrupt ...
> 
> 			(from interrupt)
> 			spin_lock(&ksym_stat_lock); <== deadlock.
> 
> 
> -- Steve
>

Given that the function pointed by the trigger() routine is invoked with
breakpoints disabled on that CPU, I don't think we'd enter into a loop
a cyclic dependancy as above.

On the other hand, my observation w.r.t. IF bit being set was misplaced
in the sense that it corresponded to the saved stack and not when inside
the breakpoint handler in which case interrupts were disabled.

So we are safe in either ways.

Thanks,
K.Prasad
 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-23 19:03         ` K.Prasad
@ 2009-03-23 19:21           ` Alan Stern
  2009-03-23 20:42             ` K.Prasad
  0 siblings, 1 reply; 27+ messages in thread
From: Alan Stern @ 2009-03-23 19:21 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Tue, 24 Mar 2009, K.Prasad wrote:

> > > Ok. Will do something like:
> > > return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));
> > 
> > What is the purpose of word_size here?  The breakpoint length should be 
> > specified in bytes, not words.
> > 
> > Don't forget that that in arch_check_va_in_kernelspace() you need to 
> > check both for values that are too low and values that are too high 
> > (they overflow and wrap around back to a user address).
> > 
> 
> While I understand the user-space checking using the length of the HW
> Breakpoint, I don't really see how I can check for an upper-bound for
> kernel-space virtual addresses. Most usage in the kernel only checks for
> the address >= TASK_SIZE (while they check for add + len if the length
> of the memory is known). I will be glad to have any suggestions in this
> regard.

Isn't that exactly the check you need to implement?

	addr >= TASK_SIZE && (addr + len) >= TASK_SIZE,

or perhaps better,

	addr >= TASK_SIZE && (addr + len) >= addr.

In this case you _do_ know the length of the breakpoint.

Alan Stern


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-23 19:21           ` Alan Stern
@ 2009-03-23 20:42             ` K.Prasad
  2009-03-23 21:20               ` Alan Stern
  0 siblings, 1 reply; 27+ messages in thread
From: K.Prasad @ 2009-03-23 20:42 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Mon, Mar 23, 2009 at 03:21:49PM -0400, Alan Stern wrote:
> On Tue, 24 Mar 2009, K.Prasad wrote:
> 
> > > > Ok. Will do something like:
> > > > return (va <= (TASK_SIZE - (hw_breakpoint_length * word_size)));
> > > 
> > > What is the purpose of word_size here?  The breakpoint length should be 
> > > specified in bytes, not words.
> > > 
> > > Don't forget that that in arch_check_va_in_kernelspace() you need to 
> > > check both for values that are too low and values that are too high 
> > > (they overflow and wrap around back to a user address).
> > > 
> > 
> > While I understand the user-space checking using the length of the HW
> > Breakpoint, I don't really see how I can check for an upper-bound for
> > kernel-space virtual addresses. Most usage in the kernel only checks for
> > the address >= TASK_SIZE (while they check for add + len if the length
> > of the memory is known). I will be glad to have any suggestions in this
> > regard.
> 
> Isn't that exactly the check you need to implement?
> 
> 	addr >= TASK_SIZE && (addr + len) >= TASK_SIZE,
> 
> or perhaps better,
> 
> 	addr >= TASK_SIZE && (addr + len) >= addr.
> 
> In this case you _do_ know the length of the breakpoint.
> 
> Alan Stern
>

Aren't we just checking if len is a positive number through the above
checks? The validation checks in the patchset should take care of
negative lengths. Or am I missing something?

I thought you wanted the code to check for an upper sane limit for addr
in kernel-space, say something like this:

TASK_SIZE <= addr <= (Upper limit for Kernel Virtual Address)

When I referred to 'len' in my previous mail, it meant the length
of the kernel virtual memory area (which can be used to find the upper
bound).

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-23 20:42             ` K.Prasad
@ 2009-03-23 21:20               ` Alan Stern
  0 siblings, 0 replies; 27+ messages in thread
From: Alan Stern @ 2009-03-23 21:20 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, Maneesh Soni,
	Roland McGrath, Steven Rostedt

On Tue, 24 Mar 2009, K.Prasad wrote:

> > Isn't that exactly the check you need to implement?
> > 
> > 	addr >= TASK_SIZE && (addr + len) >= TASK_SIZE,
> > 
> > or perhaps better,
> > 
> > 	addr >= TASK_SIZE && (addr + len) >= addr.
> > 
> > In this case you _do_ know the length of the breakpoint.
> > 
> > Alan Stern
> >
> 
> Aren't we just checking if len is a positive number through the above
> checks? The validation checks in the patchset should take care of
> negative lengths. Or am I missing something?

Well, 0x60000000 is a positive number, and 0xd0000000 is >= TASK_SIZE.  
But their sum is 0x30000000, which lies in userspace.  In other words, 
you are missing the possibility that the addition might overflow and 
wrap around.

> I thought you wanted the code to check for an upper sane limit for addr
> in kernel-space, say something like this:
> 
> TASK_SIZE <= addr <= (Upper limit for Kernel Virtual Address)

No, the test should be

    TASK_SIZE <= addr <= addr + (len-1) <= (Upper limit for Kernel VA)

By the way, is TASK_SIZE the correct lower bound for kernel virtual
addresses on x86-64?

> When I referred to 'len' in my previous mail, it meant the length
> of the kernel virtual memory area (which can be used to find the upper
> bound).

Oh, sorry, I misunderstood.  Isn't that limit always 0xffffffff on 
x86-32?

Alan Stern


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code
       [not found] <20090407063058.301701787@prasadkr_t60p.in.ibm.com>
@ 2009-04-07  6:36 ` K.Prasad
  0 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-04-07  6:36 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Linux Kernel Mailing List, Andrew Morton,
	Benjamin Herrenschmidt, Frederic Weisbecker, maneesh,
	Roland McGrath, Steven Rostedt, K.Prasad

[-- Attachment #1: modify_process_related_06 --]
[-- Type: text/plain, Size: 7781 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of abstract debug registers in
process-handling routines.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/include/asm/processor.h |    4 ---
 arch/x86/kernel/process.c        |   23 ++++-------------
 arch/x86/kernel/process_32.c     |   31 +++++++++++++++++++++++
 arch/x86/kernel/process_64.c     |   33 +++++++++++++++++++++++++
 4 files changed, 70 insertions(+), 21 deletions(-)

Index: arch/x86/kernel/process.c
===================================================================
--- arch/x86/kernel/process.c.orig	2009-04-01 20:53:43.000000000 +0530
+++ arch/x86/kernel/process.c	2009-04-01 20:54:39.000000000 +0530
@@ -14,6 +14,8 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -83,6 +85,8 @@
 		put_cpu();
 		kfree(bp);
 	}
+	if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG)))
+		flush_thread_hw_breakpoint(me);
 
 	ds_exit_thread(current);
 }
@@ -103,14 +107,9 @@
 	}
 #endif
 
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG)))
+		flush_thread_hw_breakpoint(tsk);
 
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
@@ -192,16 +191,6 @@
 	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
Index: arch/x86/kernel/process_32.c
===================================================================
--- arch/x86/kernel/process_32.c.orig	2009-04-01 20:53:43.000000000 +0530
+++ arch/x86/kernel/process_32.c	2009-04-01 20:54:39.000000000 +0530
@@ -61,6 +61,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -265,7 +267,14 @@
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->thread.io_bitmap_ptr = NULL;
+
 	tsk = current;
+	err = -ENOMEM;
+	if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG)))
+		if (copy_thread_hw_breakpoint(tsk, p, clone_flags))
+			goto out;
+
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
@@ -285,10 +294,13 @@
 		err = do_set_thread_area(p, -1,
 			(struct user_desc __user *)childregs->si, 0);
 
+out:
 	if (err && p->thread.io_bitmap_ptr) {
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
 
 	ds_copy_thread(p, current);
 
@@ -426,6 +438,25 @@
 		lazy_load_gs(next->gs);
 
 	percpu_write(current_task, next_p);
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in
+	 * last_debugged_task, so it will think the task pointer is leftover
+	 * from an old task (lazy switching) and will erase it. Then until the
+	 * next context switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * last_debugged_task at the same instant, so there will always be a
+	 * window in which they disagree and a breakpoint might get triggered.
+	 * Since we use lazy switching, we are forced to assume that a
+	 * disagreement means that current is correct and last_debugged_task is
+	 * old.  But if you move the code above then you'll create a window in
+	 * which current is old and last_debugged_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
 
 	return prev_p;
 }
Index: arch/x86/kernel/process_64.c
===================================================================
--- arch/x86/kernel/process_64.c.orig	2009-04-01 20:53:43.000000000 +0530
+++ arch/x86/kernel/process_64.c	2009-04-01 20:54:39.000000000 +0530
@@ -55,6 +55,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -248,6 +250,8 @@
 			BUG();
 		}
 	}
+	if (unlikely(dead_task->thread.debugreg7))
+		flush_thread_hw_breakpoint(dead_task);
 }
 
 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
@@ -303,12 +307,18 @@
 
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
+	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
 	savesegment(fs, p->thread.fsindex);
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
+	err = -ENOMEM;
+	if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG)))
+		if (copy_thread_hw_breakpoint(me, p, clone_flags))
+			goto out;
+
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
@@ -346,6 +356,9 @@
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
+
 	return err;
 }
 
@@ -491,6 +504,26 @@
 	 */
 	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
 		math_state_restore();
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in
+	 * last_debugged_task, so it will think the task pointer is leftover
+	 * from an old task (lazy switching) and will erase it. Then until the
+	 * next context switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * last_debugged_task at the same instant, so there will always be a
+	 * window in which they disagree and a breakpoint might get triggered.
+	 * Since we use lazy switching, we are forced to assume that a
+	 * disagreement means that current is correct and last_debugged_task is
+	 * old.  But if you move the code above then you'll create a window in
+	 * which current is old and last_debugged_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
+
 	return prev_p;
 }
 
Index: arch/x86/include/asm/processor.h
===================================================================
--- arch/x86/include/asm/processor.h.orig	2009-04-01 20:53:46.000000000 +0530
+++ arch/x86/include/asm/processor.h	2009-04-01 20:54:39.000000000 +0530
@@ -426,10 +426,6 @@
 	unsigned long		fs;
 	unsigned long		gs;
 	/* Hardware debugging registers: */
-	unsigned long		debugreg0;
-	unsigned long		debugreg1;
-	unsigned long		debugreg2;
-	unsigned long		debugreg3;
 	unsigned long		debugreg[HB_NUM];
 	unsigned long		debugreg6;
 	unsigned long		debugreg7;


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code
       [not found] <20090324152028.754123712@K.Prasad>
@ 2009-03-24 15:26 ` K.Prasad
  0 siblings, 0 replies; 27+ messages in thread
From: K.Prasad @ 2009-03-24 15:26 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, maneesh, Roland McGrath, Steven Rostedt,
	K.Prasad

[-- Attachment #1: 6.new --]
[-- Type: text/plain, Size: 7306 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of abstract debug registers in
process-handling routines.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/process.c    |   23 ++++++-----------------
 arch/x86/kernel/process_32.c |   31 +++++++++++++++++++++++++++++++
 arch/x86/kernel/process_64.c |   33 +++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 17 deletions(-)

Index: linux-2.6-tip/arch/x86/kernel/process.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/process.c
+++ linux-2.6-tip/arch/x86/kernel/process.c
@@ -14,6 +14,8 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -83,6 +85,8 @@ void exit_thread(void)
 		put_cpu();
 		kfree(bp);
 	}
+	if (unlikely(t->dr7))
+		flush_thread_hw_breakpoint(me);
 
 	ds_exit_thread(current);
 }
@@ -103,14 +107,9 @@ void flush_thread(void)
 	}
 #endif
 
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	if (unlikely(tsk->thread.dr7))
+		flush_thread_hw_breakpoint(tsk);
 
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
@@ -192,16 +191,6 @@ void __switch_to_xtra(struct task_struct
 	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
Index: linux-2.6-tip/arch/x86/kernel/process_32.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/process_32.c
+++ linux-2.6-tip/arch/x86/kernel/process_32.c
@@ -61,6 +61,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -265,7 +267,14 @@ int copy_thread(int nr, unsigned long cl
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->thread.io_bitmap_ptr = NULL;
+
 	tsk = current;
+	err = -ENOMEM;
+	if (unlikely(tsk->thread.dr7)) {
+		if (copy_thread_hw_breakpoint(tsk, p, clone_flags))
+			goto out;
+	}
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
@@ -285,10 +294,13 @@ int copy_thread(int nr, unsigned long cl
 		err = do_set_thread_area(p, -1,
 			(struct user_desc __user *)childregs->si, 0);
 
+out:
 	if (err && p->thread.io_bitmap_ptr) {
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
 
 	ds_copy_thread(p, current);
 
@@ -426,6 +438,25 @@ __switch_to(struct task_struct *prev_p, 
 		lazy_load_gs(next->gs);
 
 	percpu_write(current_task, next_p);
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in the chbi
+	 * area, so it will think the task pointer is leftover from an old task
+	 * (lazy switching) and will erase it.  Then until the next context
+	 * switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * chbi->bp_task at the same instant, so there will always be a window
+	 * in which they disagree and a breakpoint might get triggered.  Since
+	 * we use lazy switching, we are forced to assume that a disagreement
+	 * means that current is correct and chbi->bp_task is old.  But if you
+	 * move the code above then you'll create a window in which current is
+	 * old and chbi->bp_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
 
 	return prev_p;
 }
Index: linux-2.6-tip/arch/x86/kernel/process_64.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/process_64.c
+++ linux-2.6-tip/arch/x86/kernel/process_64.c
@@ -55,6 +55,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -248,6 +250,8 @@ void release_thread(struct task_struct *
 			BUG();
 		}
 	}
+	if (unlikely(dead_task->thread.dr7))
+		flush_thread_hw_breakpoint(dead_task);
 }
 
 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
@@ -303,12 +307,18 @@ int copy_thread(int nr, unsigned long cl
 
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
+	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
 	savesegment(fs, p->thread.fsindex);
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
+	err = -ENOMEM;
+	if (unlikely(me->thread.dr7)) {
+		if (copy_thread_hw_breakpoint(me, p, clone_flags))
+			goto out;
+	}
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
@@ -346,6 +356,9 @@ out:
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
+
 	return err;
 }
 
@@ -491,6 +504,26 @@ __switch_to(struct task_struct *prev_p, 
 	 */
 	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
 		math_state_restore();
+	/*
+	 * There's a problem with moving the switch_to_thread_hw_breakpoint()
+	 * call before current is updated.  Suppose a kernel breakpoint is
+	 * triggered in between the two.  The hw-breakpoint handler will see
+	 * that current is different from the task pointer stored in the chbi
+	 * area, so it will think the task pointer is leftover from an old task
+	 * (lazy switching) and will erase it.  Then until the next context
+	 * switch, no user-breakpoints will be installed.
+	 *
+	 * The real problem is that it's impossible to update both current and
+	 * chbi->bp_task at the same instant, so there will always be a window
+	 * in which they disagree and a breakpoint might get triggered.  Since
+	 * we use lazy switching, we are forced to assume that a disagreement
+	 * means that current is correct and chbi->bp_task is old.  But if you
+	 * move the code above then you'll create a window in which current is
+	 * old and chbi->bp_task is correct.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
+
 	return prev_p;
 }
 


^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2009-04-07  6:37 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20090319234044.410725944@K.Prasad>
2009-03-19 23:48 ` [Patch 01/11] Introducing generic hardware breakpoint handler interfaces K.Prasad
2009-03-20 14:33   ` Alan Stern
2009-03-20 18:30     ` Ingo Molnar
2009-03-21 17:32       ` K.Prasad
2009-03-20 18:32     ` Ingo Molnar
2009-03-21 17:26     ` K.Prasad
2009-03-21 21:39       ` Alan Stern
2009-03-23 19:03         ` K.Prasad
2009-03-23 19:21           ` Alan Stern
2009-03-23 20:42             ` K.Prasad
2009-03-23 21:20               ` Alan Stern
2009-03-19 23:48 ` [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces K.Prasad
2009-03-19 23:48 ` [Patch 03/11] Modifying generic debug exception to use thread-specific debug registers K.Prasad
2009-03-19 23:49 ` [Patch 04/11] Introduce user-space " K.Prasad
2009-03-19 23:49 ` [Patch 05/11] Use wrapper routines around debug registers in processor related functions K.Prasad
2009-03-19 23:49 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
2009-03-19 23:49 ` [Patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints K.Prasad
2009-03-19 23:49 ` [Patch 08/11] Modify Ptrace routines to access breakpoint registers K.Prasad
2009-03-19 23:49 ` [Patch 09/11] Cleanup HW Breakpoint registers before kexec K.Prasad
2009-03-19 23:50 ` [Patch 10/11] Sample HW breakpoint over kernel data address K.Prasad
2009-03-19 23:50 ` [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2 K.Prasad
2009-03-20  9:04   ` Frederic Weisbecker
2009-03-21 16:24     ` K.Prasad
2009-03-21 16:39       ` Steven Rostedt
2009-03-23 19:08         ` K.Prasad
     [not found] <20090324152028.754123712@K.Prasad>
2009-03-24 15:26 ` [Patch 06/11] Use the new wrapper routines to access debug registers in process/thread code K.Prasad
     [not found] <20090407063058.301701787@prasadkr_t60p.in.ibm.com>
2009-04-07  6:36 ` K.Prasad

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.