All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 01/11] Introducing generic hardware breakpoint handler interfaces
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
@ 2009-03-05  4:37 ` prasad
  2009-03-10 13:50   ` Ingo Molnar
  2009-03-05  4:38 ` [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces prasad
                   ` (10 subsequent siblings)
  11 siblings, 1 reply; 71+ messages in thread
From: prasad @ 2009-03-05  4:37 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 1 --]
[-- Type: text/plain, Size: 35349 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch introduces two new files, hw_breakpoint.[ch], which define the
generic interfaces for using the hardware breakpoint infrastructure of the system.

[K.Prasad: Re-based the original patch to newer kernel base, modified the
           register_<kernel/user>_hw_breakpoint() interfaces and split the
           monolithic patch into smaller ones. Split-out from the bigger patch
           and minor changes following re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 include/asm-generic/hw_breakpoint.h |  243 +++++++++++
 kernel/Makefile                     |    2 
 kernel/hw_breakpoint.c              |  772 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1016 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/kernel/hw_breakpoint.c
@@ -0,0 +1,772 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ *
+ * This file contains the arch-independent routines.  It is not meant
+ * to be compiled as a standalone source file; rather it should be
+ * #include'd by the arch-specific implementation.
+ */
+
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+
+/* Global info */
+struct kernel_bp_data	kbpdata[2];	/* Old and new settings */
+int			cur_kbpindex;	/* Alternates 0, 1, ... */
+struct kernel_bp_data	*cur_kbpdata = &kbpdata[0];
+			/* Always equal to &kbpdata[cur_kbpindex] */
+
+static u8			tprio[HB_NUM];	/* Thread bp max priorities */
+LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
+static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
+DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
+
+/*
+ * Install the debug register values for a new thread.
+ */
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	struct cpu_hw_breakpoint *chbi;
+	struct kernel_bp_data *thr_kbpdata;
+
+	/* This routine is on the hot path; it gets called for every
+	 * context switch into a task with active breakpoints.  We
+	 * must make sure that the common case executes as quickly as
+	 * possible.
+	 */
+	chbi = &per_cpu(cpu_bp, get_cpu());
+	chbi->bp_task = tsk;
+
+	/* Use RCU to synchronize with external updates */
+	rcu_read_lock();
+
+	/* Other CPUs might be making updates to the list of kernel
+	 * breakpoints at this time.  If they are, they will modify
+	 * the other entry in kbpdata[] -- the one not pointed to
+	 * by chbi->cur_kbpdata.  So the update itself won't affect
+	 * us directly.
+	 *
+	 * However when the update is finished, an IPI will arrive
+	 * telling this CPU to change chbi->cur_kbpdata.  We need
+	 * to use a single consistent kbpdata[] entry, the present one.
+	 * So we'll copy the pointer to a local variable, thr_kbpdata,
+	 * and we must prevent the compiler from aliasing the two
+	 * pointers.  Only a compiler barrier is required, not a full
+	 * memory barrier, because everything takes place on a single CPU.
+	 */
+ restart:
+	thr_kbpdata = ACCESS_ONCE(chbi->cur_kbpdata);
+
+	/* Normally we can keep the same debug register settings as the
+	 * last time this task ran.  But if the kernel breakpoints have
+	 * changed or any user breakpoints have been registered or
+	 * unregistered, we need to handle the updates and possibly
+	 * send out some notifications.
+	 */
+	if (unlikely(thbi->gennum != thr_kbpdata->gennum)) {
+		struct hw_breakpoint *bp;
+		int i;
+		int num;
+
+		thbi->gennum = thr_kbpdata->gennum;
+		arch_update_thbi(thbi, thr_kbpdata);
+		num = thr_kbpdata->num_kbps;
+
+		/* This code can be invoked while a debugger is actively
+		 * updating the thread's breakpoint list. We use RCU to
+		 * protect our access to the list pointers. */
+		thbi->num_installed = 0;
+		i = HB_NUM;
+		list_for_each_entry_rcu(bp, &thbi->thread_bps, node) {
+
+			/* If this register is allocated for kernel bps,
+			 * don't install.  Otherwise do. */
+			if (--i < num) {
+				if (bp->status == HW_BREAKPOINT_INSTALLED) {
+					if (bp->uninstalled)
+						(bp->uninstalled)(bp);
+					bp->status = HW_BREAKPOINT_REGISTERED;
+				}
+			} else {
+				++thbi->num_installed;
+				if (bp->status != HW_BREAKPOINT_INSTALLED) {
+					bp->status = HW_BREAKPOINT_INSTALLED;
+					if (bp->installed)
+						(bp->installed)(bp);
+				}
+			}
+		}
+	}
+
+	/* Set the debug register */
+	arch_install_thbi(thbi);
+
+	/* Were there any kernel breakpoint changes while we were running? */
+	if (unlikely(chbi->cur_kbpdata != thr_kbpdata)) {
+
+		/* Some debug registers may now be assigned to kernel bps and
+		 * we might have messed them up.  Reload all the kernel bps
+		 * and then reload the thread bps.
+		 */
+		arch_install_chbi(chbi);
+		goto restart;
+	}
+
+	rcu_read_unlock();
+	put_cpu_no_resched();
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void switch_to_none_hw_breakpoint(void)
+{
+	struct cpu_hw_breakpoint *chbi;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+	chbi->bp_task = NULL;
+
+	/* This routine gets called from only two places.  In one
+	 * the caller holds the hw_breakpoint_mutex; in the other
+	 * interrupts are disabled.  In either case, no kernel
+	 * breakpoint updates can arrive while the routine runs.
+	 * So we don't need to use RCU.
+	 */
+	arch_install_none(chbi);
+	put_cpu_no_resched();
+}
+
+/*
+ * Update the debug registers on this CPU.
+ */
+static void update_this_cpu(void *unused)
+{
+	struct cpu_hw_breakpoint *chbi;
+	struct task_struct *tsk = current;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+
+	rcu_read_lock();
+	/* Install both the kernel and the user breakpoints */
+	arch_install_chbi(chbi);
+	if (test_tsk_thread_flag(tsk, TIF_DEBUG))
+		switch_to_thread_hw_breakpoint(tsk);
+
+	rcu_read_unlock();
+	put_cpu_no_resched();
+}
+
+/*
+ * Tell all CPUs to update their debug registers.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+static void update_all_cpus(void)
+{
+	/* We don't need to use any sort of memory barrier.  The IPI
+	 * carried out by on_each_cpu() includes its own barriers.
+	 */
+	on_each_cpu(update_this_cpu, NULL, 0);
+	synchronize_rcu();
+}
+
+/*
+ * Load the debug registers during startup of a CPU.
+ */
+void load_debug_registers(void)
+{
+	unsigned long flags;
+
+	/* Prevent IPIs for new kernel breakpoint updates */
+	local_irq_save(flags);
+	update_this_cpu(NULL);
+	local_irq_restore(flags);
+}
+
+/*
+ * Take the HB_NUM highest-priority breakpoints in a thread and accumulate
+ * their priorities in tprio.  Highest-priority entry is in tprio[HB_NUM - 1].
+ */
+static void accum_thread_tprio(struct thread_hw_breakpoint *thbi)
+{
+	int i;
+
+	for (i = HB_NUM - 1; i >= 0 && thbi->bps[i]; --i)
+		tprio[i] = max(tprio[i], thbi->bps[i]->priority);
+}
+
+/*
+ * Recalculate the value of the tprio array, the maximum priority levels
+ * requested by user breakpoints in all threads.
+ *
+ * Each thread has a list of registered breakpoints, kept in order of
+ * decreasing priority.  We'll set tprio[HB_NUM - 1] to the maximum
+ * priority of the first entries in all the lists, tprio[HB_NUM - 2] to
+ * the maximum priority of the second entries in all the lists, etc.  In
+ * the end, we'll know that no thread requires breakpoints with priorities
+ * higher than the values in tprio.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+static void recalc_tprio(void)
+{
+	struct thread_hw_breakpoint *thbi;
+
+	memset(tprio, 0, sizeof tprio);
+
+	/* Loop through all threads having registered breakpoints
+	 * and accumulate the maximum priority levels in tprio.
+	 */
+	list_for_each_entry(thbi, &thread_list, node)
+		accum_thread_tprio(thbi);
+}
+
+/*
+ * Decide how many debug registers will be allocated to kernel breakpoints
+ * and consequently, how many remain available for user breakpoints.
+ *
+ * The priorities of the entries in the list of registered kernel bps
+ * are compared against the priorities stored in tprio[].  The HB_NUM
+ * highest winners overall get to be installed in a debug register; num_kbps
+ * keeps track of how many of those winners come from the kernel list.
+ *
+ * If num_kbps changes, or if a kernel bp changes its installation status,
+ * then call update_all_cpus() so that the debug registers will be set
+ * correctly on every CPU.  If neither condition holds then the set of
+ * kernel bps hasn't changed, and nothing more needs to be done.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+static void balance_kernel_vs_user(void)
+{
+	int k, u;
+	int changed = 0;
+	struct hw_breakpoint *bp;
+	struct kernel_bp_data *new_kbpdata;
+
+	/* Determine how many debug registers are available for kernel
+	 * breakpoints as opposed to user breakpoints, based on the
+	 * priorities.  Ties are resolved in favor of user bps.
+	 */
+	k = 0;			/* Next kernel bp to allocate */
+	u = HB_NUM - 1;		/* Next user bp to allocate */
+
+	bp = list_entry(kernel_bps.next, struct hw_breakpoint, node);
+	while (k <= u) {
+		if (&bp->node == &kernel_bps || tprio[u] >= bp->priority)
+			--u;		/* User bps win a slot */
+		else {
+			++k;		/* Kernel bp wins a slot */
+			if (bp->status != HW_BREAKPOINT_INSTALLED)
+				changed = 1;
+			bp = list_entry(bp->node.next, struct hw_breakpoint,
+					node);
+		}
+	}
+	if (k != cur_kbpdata->num_kbps)
+		changed = 1;
+
+	/* Notify the remaining kernel breakpoints that they are about
+	 * to be uninstalled.
+	 */
+	list_for_each_entry_from(bp, &kernel_bps, node) {
+		if (bp->status == HW_BREAKPOINT_INSTALLED) {
+			if (bp->uninstalled)
+				(bp->uninstalled)(bp);
+			bp->status = HW_BREAKPOINT_REGISTERED;
+			changed = 1;
+		}
+	}
+
+	if (changed) {
+		cur_kbpindex ^= 1;
+		new_kbpdata = &kbpdata[cur_kbpindex];
+		new_kbpdata->gennum = cur_kbpdata->gennum + 1;
+		new_kbpdata->num_kbps = k;
+		arch_new_kbpdata(new_kbpdata);
+		u = 0;
+		list_for_each_entry(bp, &kernel_bps, node) {
+			if (u >= k)
+				break;
+			new_kbpdata->bps[u] = bp;
+			++u;
+		}
+		rcu_assign_pointer(cur_kbpdata, new_kbpdata);
+
+		/* Tell all the CPUs to update their debug registers */
+		update_all_cpus();
+
+		/* Notify the breakpoints that just got installed */
+		for (u = 0; u < k; ++u) {
+			bp = new_kbpdata->bps[u];
+			if (bp->status != HW_BREAKPOINT_INSTALLED) {
+				bp->status = HW_BREAKPOINT_INSTALLED;
+				if (bp->installed)
+					(bp->installed)(bp);
+			}
+		}
+	}
+}
+
+/*
+ * Return the pointer to a thread's hw_breakpoint info area,
+ * and try to allocate one if it doesn't exist.
+ *
+ * The caller must hold hw_breakpoint_mutex.
+ */
+struct thread_hw_breakpoint *alloc_thread_hw_breakpoint(
+		struct task_struct *tsk)
+{
+	if (!tsk->thread.hw_breakpoint_info && !(tsk->flags & PF_EXITING)) {
+		struct thread_hw_breakpoint *thbi;
+
+		thbi = kzalloc(sizeof(struct thread_hw_breakpoint),
+				GFP_KERNEL);
+		if (thbi) {
+			INIT_LIST_HEAD(&thbi->node);
+			INIT_LIST_HEAD(&thbi->thread_bps);
+
+			/* Force an update the next time tsk runs */
+			thbi->gennum = cur_kbpdata->gennum - 2;
+			tsk->thread.hw_breakpoint_info = thbi;
+		}
+	}
+	return tsk->thread.hw_breakpoint_info;
+}
+
+/*
+ * Erase all the hardware breakpoint info associated with a thread.
+ *
+ * If tsk != current then tsk must not be usable (for example, a
+ * child being cleaned up from a failed fork).
+ */
+void flush_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	struct hw_breakpoint *bp;
+
+	if (!thbi)
+		return;
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Let the breakpoints know they are being uninstalled */
+	list_for_each_entry(bp, &thbi->thread_bps, node) {
+		if (bp->status == HW_BREAKPOINT_INSTALLED && bp->uninstalled)
+			(bp->uninstalled)(bp);
+		bp->status = 0;
+	}
+
+	/* Remove tsk from the list of all threads with registered bps */
+	list_del(&thbi->node);
+
+	/* The thread no longer has any breakpoints associated with it */
+	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	tsk->thread.hw_breakpoint_info = NULL;
+	kfree(thbi);
+
+	/* Recalculate and rebalance the kernel-vs-user priorities */
+	recalc_tprio();
+	balance_kernel_vs_user();
+
+	/* Actually uninstall the breakpoints if necessary */
+	if (tsk == current)
+		switch_to_none_hw_breakpoint();
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/*
+ * Copy the hardware breakpoint info from a thread to its cloned child.
+ */
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags)
+{
+	/* We will assume that breakpoint settings are not inherited
+	 * and the child starts out with no debug registers set.
+	 * But what about CLONE_PTRACE?
+	 */
+	clear_tsk_thread_flag(child, TIF_DEBUG);
+	return 0;
+}
+
+/*
+ * Store the highest-priority thread breakpoint entries in an array.
+ */
+static void store_thread_bp_array(struct thread_hw_breakpoint *thbi)
+{
+	struct hw_breakpoint *bp;
+	int i;
+
+	i = HB_NUM - 1;
+	list_for_each_entry(bp, &thbi->thread_bps, node) {
+		thbi->bps[i] = bp;
+		arch_store_thread_bp_array(thbi, bp, i);
+		if (--i < 0)
+			break;
+	}
+	while (i >= 0)
+		thbi->bps[i--] = NULL;
+
+	/* Force an update the next time this task runs */
+	thbi->gennum = cur_kbpdata->gennum - 2;
+}
+
+/*
+ * Insert a new breakpoint in a priority-sorted list.
+ * Return the bp's index in the list.
+ *
+ * Thread invariants:
+ *	tsk_thread_flag(tsk, TIF_DEBUG) set implies
+ *		tsk->thread.hw_breakpoint_info is not NULL.
+ *	tsk_thread_flag(tsk, TIF_DEBUG) set iff thbi->thread_bps is non-empty
+ *		iff thbi->node is on thread_list.
+ */
+static int insert_bp_in_list(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi, struct task_struct *tsk)
+{
+	struct list_head *head;
+	int pos;
+	struct hw_breakpoint *temp_bp;
+
+	/* tsk and thbi are NULL for kernel bps, non-NULL for user bps */
+	if (tsk)
+		head = &thbi->thread_bps;
+	else
+		head = &kernel_bps;
+
+	/* Equal-priority breakpoints get listed first-come-first-served */
+	pos = 0;
+	list_for_each_entry(temp_bp, head, node) {
+		if (bp->priority > temp_bp->priority)
+			break;
+		++pos;
+	}
+	bp->status = HW_BREAKPOINT_REGISTERED;
+	/* A context switch may be walking this list under RCU */
+	list_add_tail_rcu(&bp->node, &temp_bp->node);
+	if (tsk) {
+		store_thread_bp_array(thbi);
+
+		/* Is this the thread's first registered breakpoint? */
+		if (list_empty(&thbi->node)) {
+			set_tsk_thread_flag(tsk, TIF_DEBUG);
+			list_add(&thbi->node, &thread_list);
+		}
+	}
+	return pos;
+}
+
+/*
+ * Remove a breakpoint from its priority-sorted list.
+ *
+ * See the invariants mentioned above.
+ */
+static void remove_bp_from_list(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi, struct task_struct *tsk)
+{
+	/* Unlink bp with the RCU variant, since a context switch may be
+	 * walking a thread's list.  If the list is now empty we must clear
+	 * the TIF_DEBUG flag.  But keep the thread_hw_breakpoint structure,
+	 * so that the virtualized debug register values will remain valid.
+	 */
+	list_del_rcu(&bp->node);
+	if (tsk) {
+		store_thread_bp_array(thbi);
+
+		if (list_empty(&thbi->thread_bps)) {
+			list_del_init(&thbi->node);
+			clear_tsk_thread_flag(tsk, TIF_DEBUG);
+		}
+	}
+
+	/* Tell the breakpoint it is being uninstalled */
+	if (bp->status == HW_BREAKPOINT_INSTALLED && bp->uninstalled)
+		(bp->uninstalled)(bp);
+	bp->status = 0;
+}
+
+/*
+ * Validate the settings in a hw_breakpoint structure.
+ */
+static int validate_settings(struct hw_breakpoint *bp, struct task_struct *tsk)
+{
+	int ret;
+	unsigned int align;
+
+	ret = arch_validate_hwbkpt_settings(bp, &align, tsk);
+	if (ret < 0)
+		goto err;
+
+	/* Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (bp->info.address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(bp->info.address, tsk))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(bp->info.address))
+			return -EFAULT;
+	}
+ err:
+	return ret;
+}
+
+/*
+ * Actual implementation of register_user_hw_breakpoint.
+ */
+int __register_user_hw_breakpoint(struct task_struct *tsk,
+					struct hw_breakpoint *bp)
+{
+	int rc;
+	struct thread_hw_breakpoint *thbi;
+	int pos;
+
+	bp->status = 0;
+	rc = validate_settings(bp, tsk);
+	if (rc)
+		return rc;
+
+	thbi = alloc_thread_hw_breakpoint(tsk);
+	if (!thbi)
+		return -ENOMEM;
+
+	/* Insert bp in the thread's list */
+	pos = insert_bp_in_list(bp, thbi, tsk);
+	arch_register_user_hw_breakpoint(bp, thbi);
+
+	/* Update and rebalance the priorities.  We don't need to go through
+	 * the list of all threads; adding a breakpoint can only cause the
+	 * priorities for this thread to increase.
+	 */
+	accum_thread_tprio(thbi);
+	balance_kernel_vs_user();
+
+	/* Did bp get allocated to a debug register?  We can tell from its
+	 * position in the list.  The number of registers allocated to
+	 * kernel breakpoints is num_kbps; all the others are available for
+	 * user breakpoints.  If bp's position in the priority-ordered list
+	 * is low enough, it will get a register.
+	 */
+	if (pos < HB_NUM - cur_kbpdata->num_kbps) {
+		rc = 1;
+
+		/* Does it need to be installed right now? */
+		if (tsk == current)
+			switch_to_thread_hw_breakpoint(tsk);
+		/* Otherwise it will get installed the next time tsk runs */
+	}
+
+	return rc;
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @tsk: the task in whose memory space the breakpoint will be set
+ * @bp: the breakpoint structure to register
+ * @address: location (virtual address) of the breakpoint
+ * @len: encoded extent of the breakpoint address (1, 2, 4, or 8 bytes)
+ * @type: breakpoint type (read-only, write-only, read-write, or execute)
+ *
+ * This routine registers a breakpoint to be associated with @tsk's
+ * memory space and active only while @tsk is running.  It does not
+ * guarantee that the breakpoint will be allocated to a debug register
+ * immediately; there may be other higher-priority breakpoints registered
+ * which require the use of all the debug registers.
+ *
+ * @tsk will normally be a process being debugged by the current process,
+ * but it may also be the current process.
+ *
+ * @bp->info.address, @bp->info.len, @bp->info.type, @bp->triggered and
+ * @bp->priority must be set properly before invocation.
+ *
+ * Returns 1 if @bp is allocated to a debug register, 0 if @bp is
+ * registered but not allowed to be installed, otherwise a negative error
+ * code.
+ */
+int register_user_hw_breakpoint(struct task_struct *tsk,
+				 struct hw_breakpoint *bp)
+{
+	int rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+	rc = __register_user_hw_breakpoint(tsk, bp);
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
+}
+
+/*
+ * Actual implementation of unregister_user_hw_breakpoint.
+ */
+void __unregister_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp)
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+
+	if (!bp->status)
+		return;		/* Not registered */
+
+	/* Remove bp from the thread's list */
+	remove_bp_from_list(bp, thbi, tsk);
+	arch_unregister_user_hw_breakpoint(bp, thbi);
+
+	/* Recalculate and rebalance the kernel-vs-user priorities,
+	 * and actually uninstall bp if necessary.
+	 */
+	recalc_tprio();
+	balance_kernel_vs_user();
+	if (tsk == current)
+		switch_to_thread_hw_breakpoint(tsk);
+}
+
+/**
+ * unregister_user_hw_breakpoint - unregister a hardware breakpoint for user space
+ * @tsk: the task in whose memory space the breakpoint is registered
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp)
+{
+	mutex_lock(&hw_breakpoint_mutex);
+	__unregister_user_hw_breakpoint(tsk, bp);
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+
+/**
+ * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to register
+ *
+ * This routine registers a breakpoint to be active at all times.  It
+ * does not guarantee that the breakpoint will be allocated to a debug
+ * register immediately; there may be other higher-priority breakpoints
+ * registered which require the use of all the debug registers.
+ *
+ * @bp->info.name or @bp->info.address, @bp->info.len, @bp->info.type,
+ * @bp->triggered and @bp->priority must be set properly before invocation.
+ *
+ * Returns 1 if @bp is allocated to a debug register, 0 if @bp is
+ * registered but not allowed to be installed, otherwise a negative error
+ * code.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	int rc;
+	int pos;
+
+	bp->status = 0;
+	rc = validate_settings(bp, NULL);
+	if (rc)
+		return rc;
+
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Insert bp in the kernel's list */
+	pos = insert_bp_in_list(bp, NULL, NULL);
+	arch_register_kernel_hw_breakpoint(bp);
+
+	/* Rebalance the priorities.  This will install bp if it
+	 * was allocated a debug register.
+	 */
+	balance_kernel_vs_user();
+
+	/* Did bp get allocated to a debug register?  We can tell from its
+	 * position in the list.  The number of registers allocated to
+	 * kernel breakpoints is num_kbps; all the others are available for
+	 * user breakpoints.  If bp's position in the priority-ordered list
+	 * is low enough, it will get a register.
+	 */
+	if (pos < cur_kbpdata->num_kbps)
+		rc = 1;
+
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
+
+/**
+ * unregister_kernel_hw_breakpoint - unregister a hardware breakpoint for kernel space
+ * @bp: the breakpoint structure to unregister
+ *
+ * Uninstalls and unregisters @bp.
+ */
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	if (!bp->status)
+		return;		/* Not registered */
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* Remove bp from the kernel's list */
+	remove_bp_from_list(bp, NULL, NULL);
+	arch_unregister_kernel_hw_breakpoint(bp);
+
+	/* Rebalance the priorities.  This will uninstall bp if it
+	 * was allocated a debug register.
+	 */
+	balance_kernel_vs_user();
+
+	mutex_unlock(&hw_breakpoint_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
+/*
+ * Handle debug exception notifications.
+ */
+static int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+	return hw_breakpoint_handler(data);
+}
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+	.notifier_call = hw_breakpoint_exceptions_notify,
+	.priority = 0x7fffffff /* we need to be notified first */
+};
+
+static int __init init_hw_breakpoint(void)
+{
+	load_debug_registers();
+	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+
+core_initcall(init_hw_breakpoint);
Index: linux-2.6-tip.hbkpt/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/kernel/Makefile
+++ linux-2.6-tip.hbkpt/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o
+	    async.o hw_breakpoint.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
Index: linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/include/asm-generic/hw_breakpoint.h
@@ -0,0 +1,243 @@
+#ifndef	_ASM_GENERIC_HW_BREAKPOINT_H
+#define	_ASM_GENERIC_HW_BREAKPOINT_H
+
+#ifndef __ARCH_HW_BREAKPOINT_H
+#error "Please don't include this file directly"
+#endif
+
+#ifdef	__KERNEL__
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+
+/**
+ * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
+ * @node: internal linked-list management
+ * @triggered: callback invoked after target address access
+ * @installed: callback invoked when the breakpoint is installed
+ * @uninstalled: callback invoked when the breakpoint is uninstalled
+ * @info: arch-specific breakpoint info (address, length, and type)
+ * @priority: requested priority level
+ * @status: current registration/installation status
+ *
+ * %hw_breakpoint structures are the kernel's way of representing
+ * hardware breakpoints.  These are data breakpoints
+ * (also known as "watchpoints", triggered on data access), and the breakpoint's
+ * target address can be located in either kernel space or user space.
+ *
+ * The breakpoint's address, length, and type are highly
+ * architecture-specific.  The values are encoded in the @info field; you
+ * specify them when registering the breakpoint.  To examine the encoded
+ * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
+ * below.
+ *
+ * The address is specified as a regular kernel pointer (for kernel-space
+ * breakpoints) or as an %__user pointer (for user-space breakpoints).
+ * With register_user_hw_breakpoint(), the address must refer to a
+ * location in user space.  The breakpoint will be active only while the
+ * requested task is running.  Conversely with
+ * register_kernel_hw_breakpoint(), the address must refer to a location
+ * in kernel space, and the breakpoint will be active on all CPUs
+ * regardless of the current task.
+ *
+ * The length is the breakpoint's extent in bytes, which is subject to
+ * certain limitations.  include/asm/hw_breakpoint.h contains macros
+ * defining the available lengths for a specific architecture.  Note that
+ * the address's alignment must match the length.  The breakpoint will
+ * catch accesses to any byte in the range from address to address +
+ * (length - 1).
+ *
+ * The breakpoint's type indicates the sort of access that will cause it
+ * to trigger.  Possible values may include:
+ *
+ * 	%HW_BREAKPOINT_RW (triggered on read or write access),
+ * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
+ * 	%HW_BREAKPOINT_READ (triggered on read access).
+ *
+ * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
+ * possibilities are available on all architectures.  Execute breakpoints
+ * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
+ *
+ * When a breakpoint gets hit, the @triggered callback is
+ * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
+ * processor registers.
+ * Data breakpoints occur after the memory access has taken place.
+ * Breakpoints are disabled during execution of @triggered, to avoid
+ * recursive traps and allow unhindered access to breakpointed memory.
+ *
+ * Hardware breakpoints are implemented using the CPU's debug registers,
+ * which are a limited hardware resource.  Requests to register a
+ * breakpoint will always succeed provided the parameters are valid,
+ * but the breakpoint may not be installed in a debug register right
+ * away.  Physical debug registers are allocated based on the priority
+ * level stored in @priority (higher values indicate higher priority).
+ * User-space breakpoints within a single thread compete with one
+ * another, and all user-space breakpoints compete with all kernel-space
+ * breakpoints; however user-space breakpoints in different threads do
+ * not compete.  %HW_BREAKPOINT_PRIO_PTRACE is the level used for ptrace
+ * requests; an unobtrusive kernel-space breakpoint will use
+ * %HW_BREAKPOINT_PRIO_NORMAL to avoid disturbing user programs.  A
+ * kernel-space breakpoint that always wants to be installed and doesn't
+ * care about disrupting user debugging sessions can specify
+ * %HW_BREAKPOINT_PRIO_HIGH.
+ *
+ * A particular breakpoint may be allocated (installed in) a debug
+ * register or deallocated (uninstalled) from its debug register at any
+ * time, as other breakpoints are registered and unregistered.  The
+ * @installed and @uninstalled callbacks are invoked in_atomic when these
+ * events occur.  It is legal for @installed or @uninstalled to be %NULL. Note
+ * that it is not possible to register or unregister a user-space breakpoint
+ * from within a callback routine, since doing so requires a process context.
+ * Note that for user breakpoints, while in @installed and @uninstalled the
+ * thread may be context switched. Hence it may not be safe to call printk().
+ *
+ * For kernel-space breakpoints, @installed is invoked after the
+ * breakpoint is actually installed and @uninstalled is invoked before
+ * the breakpoint is actually uninstalled.  As a result the @triggered routine
+ * may be invoked when not expected, but this way you will know that during the
+ * time interval from @installed to @uninstalled, all events are faithfully
+ * reported.  (It is not possible to do any better than this in general, because
+ * on SMP systems there is no way to set a debug register simultaneously on all
+ * CPUs.)  The same isn't always true with user-space breakpoints, but the
+ * differences should not be visible to a user process.
+ *
+ * If you need to know whether your kernel-space breakpoint was installed
+ * immediately upon registration, you can check the return value from
+ * register_kernel_hw_breakpoint().  If the value is not > 0, you can
+ * give up and unregister the breakpoint right away.
+ *
+ * @node and @status are intended for internal use.  However @status
+ * may be read to determine whether or not the breakpoint is currently
+ * installed.  (The value is not reliable unless local interrupts are
+ * disabled.)
+ *
+ * This sample code sets a breakpoint on pid_max and registers a callback
+ * function for writes to that variable.  Note that it is not portable
+ * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * #include <asm/hw_breakpoint.h>
+ *
+ * struct hw_breakpoint my_bp;
+ *
+ * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
+ * 	dump_stack();
+ *  	.......<more debugging output>........
+ * }
+ *
+ * static struct hw_breakpoint my_bp;
+ *
+ * static int init_module(void)
+ * {
+ *	..........<do anything>............
+ *	my_bp.info.type = HW_BREAKPOINT_WRITE;
+ *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
+ *	my_bp.priority = HW_BREAKPOINT_PRIO_NORMAL;
+ *
+ *	my_bp.installed = (void *)my_bp_installed;
+ *	my_bp.uninstalled = (void *)my_bp_uninstalled;
+ *	my_bp.triggered = (void *)my_triggered;
+ *
+ *	rc = register_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ *	..........<do anything>............
+ *	unregister_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * ----------------------------------------------------------------------
+ */
+struct hw_breakpoint {
+	struct list_head	node;
+	void		(*installed)(struct hw_breakpoint *);
+	void		(*uninstalled)(struct hw_breakpoint *);
+	void		(*triggered)(struct hw_breakpoint *,
+							struct pt_regs *);
+	struct arch_hw_breakpoint	info;
+	u8		priority;
+	u8		status;
+};
+
+struct kernel_bp_data;
+struct cpu_hw_breakpoint;
+
+/*
+ * Inline accessor routines to retrieve the arch-specific parts of
+ * a breakpoint structure:
+ */
+static const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp);
+static const void __user *hw_breakpoint_get_uaddress(struct hw_breakpoint *bp);
+static unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp);
+static unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp);
+
+/*
+ * len and type values are defined in include/asm/hw_breakpoint.h.
+ * Available values vary according to the architecture.  On i386 the
+ * possibilities are:
+ *
+ *	HW_BREAKPOINT_LEN_1
+ *	HW_BREAKPOINT_LEN_2
+ *	HW_BREAKPOINT_LEN_4
+ *	HW_BREAKPOINT_LEN_EXECUTE
+ *	HW_BREAKPOINT_RW
+ *	HW_BREAKPOINT_WRITE
+ *	HW_BREAKPOINT_EXECUTE
+ *
+ * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
+ * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
+ * HW_BREAKPOINT_READ.  You can use #ifdef to check at compile time.
+ */
+
+/* Standard HW breakpoint priority levels (higher value = higher priority) */
+#define HW_BREAKPOINT_PRIO_NORMAL	25
+#define HW_BREAKPOINT_PRIO_PTRACE	50
+#define HW_BREAKPOINT_PRIO_HIGH		75
+
+/* HW breakpoint status values (0 = not registered) */
+#define HW_BREAKPOINT_REGISTERED	1
+#define HW_BREAKPOINT_INSTALLED		2
+/* NOTE(review): static in a header => each including .c gets its own mutex */
+static DEFINE_MUTEX(hw_breakpoint_mutex);	/* Protects everything */
+
+/*
+ * The following two routines are meant to be called only from within
+ * the ptrace or utrace subsystems.  The tsk argument will usually be a
+ * process being debugged by the current task, although it is also legal
+ * for tsk to be the current task.  In any case it must be guaranteed
+ * that tsk will not start running in user mode while its breakpoints are
+ * being modified.
+ */
+int register_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp);
+void unregister_user_hw_breakpoint(struct task_struct *tsk,
+		struct hw_breakpoint *bp);
+
+/*
+ * Declare arch-specific data structures here. They are defined in
+ * arch/x86/include/asm/hw_breakpoint.h
+ */
+
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void switch_to_none_hw_breakpoint(void);
+
+struct thread_hw_breakpoint *alloc_thread_hw_breakpoint(
+						struct task_struct *tsk);
+
+extern struct kernel_bp_data		*cur_kbpdata;
+extern int			cur_kbpindex;	/* Alternates 0, 1, ... */
+extern struct list_head kernel_bps;		/* Kernel breakpoint list */
+DECLARE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
+
+#endif	/* __KERNEL__ */
+#endif	/* _ASM_GENERIC_HW_BREAKPOINT_H */


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
  2009-03-05  4:37 ` [patch 01/11] Introducing generic hardware breakpoint handler interfaces prasad
@ 2009-03-05  4:38 ` prasad
  2009-03-10 14:09   ` Ingo Molnar
  2009-03-05  4:38 ` [patch 03/11] Modifying generic debug exception to use virtual debug registers prasad
                   ` (9 subsequent siblings)
  11 siblings, 1 reply; 71+ messages in thread
From: prasad @ 2009-03-05  4:38 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 17506 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch introduces two new files named hw_breakpoint.[ch] inside x86 specific
directories. They contain functions which help validate and serve requests for 
using Hardware Breakpoint registers on x86 processors.

[K.Prasad: More declarations in hw_breakpoint.h to independently compile each
           hw_breakpoint.c files. Split-out from the bigger patch and minor
           changes following re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/include/asm/hw_breakpoint.h |  132 ++++++++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  437 +++++++++++++++++++++++++++++++++++
 3 files changed, 570 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,437 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+
+static unsigned long		kdr7;		/* Unmasked kernel DR7 value */
+
+/* Masks for the bits in DR7 related to kernel breakpoints, for various
+ * values of num_kbps.  Entry n is the mask for when there are n kernel
+ * breakpoints, in debug registers 0 - (n-1).  The DR_GLOBAL_SLOWDOWN bit
+ * (GE) is handled specially.
+ */
+static const unsigned long	kdr7_masks[HB_NUM + 1] = {
+	0x00000000,	/* No kernel breakpoints */
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00ff000f,	/* Same for 0,1 */
+	0x0fff003f,	/* Same for 0,1,2 */
+	0xffff00ff	/* Same for 0,1,2,3 */
+};
+
+/* Install the kernel breakpoints in their debug registers: the addresses
+ * from the current kbpdata go into DR0 upward, then DR7 gets mkdr7.
+ */
+void arch_install_chbi(struct cpu_hw_breakpoint *chbi)
+{
+	struct hw_breakpoint **bps;
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+	chbi->cur_kbpdata = rcu_dereference(cur_kbpdata);
+
+	/* Kernel breakpoints are stored starting in DR0 and going up */
+	bps = chbi->cur_kbpdata->bps;
+	switch (chbi->cur_kbpdata->num_kbps) {
+	case 4:
+		set_debugreg(bps[3]->info.address, 3);	/* fall through */
+	case 3:
+		set_debugreg(bps[2]->info.address, 2);	/* fall through */
+	case 2:
+		set_debugreg(bps[1]->info.address, 1);	/* fall through */
+	case 1:
+		set_debugreg(bps[0]->info.address, 0);
+	}
+	/* No need to set DR6 */
+	set_debugreg(chbi->cur_kbpdata->mkdr7, 7);
+}
+
+/*
+ * Refresh a stale thread hw_breakpoint info structure: recompute tkdr7.
+ */
+void arch_update_thbi(struct thread_hw_breakpoint *thbi,
+			struct kernel_bp_data *thr_kbpdata)
+{
+	unsigned long udr7 = thbi->tdr7 & ~kdr7_masks[thr_kbpdata->num_kbps];
+
+	thbi->tkdr7 = thr_kbpdata->mkdr7 | udr7;
+}
+
+/* Install the thread breakpoints in their debug registers, then load the
+ * combined thread + kernel DR7 value (tkdr7).
+ */
+void arch_install_thbi(struct thread_hw_breakpoint *thbi)
+{
+	/* Install the user breakpoints.  Kernel breakpoints are stored
+	 * starting in DR0 and going up; there are num_kbps of them.
+	 * User breakpoints are stored starting in DR3 and going down,
+	 * as many as we have room for.
+	 */
+	switch (thbi->num_installed) {
+	case 4:
+		set_debugreg(thbi->tdr[0], 0);	/* fall through */
+	case 3:
+		set_debugreg(thbi->tdr[1], 1);	/* fall through */
+	case 2:
+		set_debugreg(thbi->tdr[2], 2);	/* fall through */
+	case 1:
+		set_debugreg(thbi->tdr[3], 3);
+	}
+	/* No need to set DR6 */
+	set_debugreg(thbi->tkdr7, 7);
+}
+
+/* Install the debug register values for just the kernel, no thread:
+ * only DR7 is written, with the masked kernel value (mkdr7).
+ */
+void arch_install_none(struct cpu_hw_breakpoint *chbi)
+{
+	set_debugreg(chbi->cur_kbpdata->mkdr7, 7);
+}
+
+/* Fill in the arch part of a new kbpdata entry: mask the kernel DR7 value
+ * down to the bits of the registers in use, plus the GE bit.
+ */
+void arch_new_kbpdata(struct kernel_bp_data *new_kbpdata)
+{
+	unsigned long keep = kdr7_masks[new_kbpdata->num_kbps];
+
+	new_kbpdata->mkdr7 = kdr7 & (keep | DR_GLOBAL_SLOWDOWN);
+}
+
+/* Store a thread breakpoint array entry's address: copy @bp's address
+ * into slot @i of the thread's tdr[] array.
+ */
+void arch_store_thread_bp_array(struct thread_hw_breakpoint *thbi,
+					struct hw_breakpoint *bp, int i)
+{
+	thbi->tdr[i] = bp->info.address;
+}
+
+/* Check for virtual address in user space, i.e. below TASK_SIZE.
+ * @tsk is not consulted by this implementation.
+ */
+int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
+{
+	return (va < TASK_SIZE);
+}
+
+/*
+ * Check for virtual address in kernel space, i.e. at or above TASK_SIZE.
+ */
+int arch_check_va_in_kernelspace(unsigned long va)
+{
+	return (va >= TASK_SIZE);
+}
+
+/*
+ * Fill in bp->info.address from the symbol name when no address was given.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	/*
+	 * User-space requests will always have the address field populated.
+	 * For kernel addresses, either the address or symbol name can be
+	 * specified; with neither, leave the address as 0 (no NULL lookup).
+	 */
+	if (bp->info.address || !bp->info.name)
+		return;
+	bp->info.address = (unsigned long)kallsyms_lookup_name(bp->info.name);
+}
+
+/* Validate the arch-specific HW Breakpoint register settings.  Returns 0 and
+ * sets *align to the address alignment mask, or -EINVAL for a bad request.
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/* Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses, and only with the special execute length
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address, tsk)) ||
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+		break;
+	case HW_BREAKPOINT_RW:
+		break;
+	default:
+		return ret;
+	}
+
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+	default:
+		return ret;
+	}
+
+	/* A breakpoint without a triggered callback is rejected */
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Build the DR7 bits for debug register @drnum: its 4-bit length/type
+ * field and its global-enable bit, plus the DR_GLOBAL_SLOWDOWN (GE) bit.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long ctrl = (len | type) & 0xf;
+	unsigned long enable = DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE);
+
+	/* Move the length/type nibble into this register's control field */
+	ctrl <<= DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE;
+
+	return ctrl | enable | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Calculate the DR7 value for a list of kernel or user breakpoints:
+ * the thread's list when @thbi is non-NULL, else the global kernel list.
+ * At most HB_NUM breakpoints from the head of the list are encoded.
+ */
+static unsigned long calculate_dr7(struct thread_hw_breakpoint *thbi)
+{
+	struct list_head *head = thbi ? &thbi->thread_bps : &kernel_bps;
+	struct hw_breakpoint *bp;
+	unsigned long dr7 = 0;
+	int count = 0;
+	int drnum;
+	int step;
+
+	/* Kernel bps are assigned from DR0 on up, and user bps are assigned
+	 * from DR3 on down, so the two kinds walk the registers in
+	 * opposite directions.
+	 */
+	if (thbi) {
+		drnum = HB_NUM - 1;
+		step = -1;
+	} else {
+		drnum = 0;
+		step = 1;
+	}
+
+	/* Accumulate all 4 bps; the kernel DR7 mask will select the
+	 * appropriate bits later.
+	 */
+	list_for_each_entry(bp, head, node) {
+		/* Accumulate the bits for the current debug register */
+		dr7 |= encode_dr7(drnum, bp->info.len, bp->info.type);
+
+		/* Stop once every debug register has been used */
+		if (++count >= HB_NUM)
+			break;
+		drnum += step;
+	}
+
+	return dr7;
+}
+
+/* Register a new user breakpoint structure: recompute the thread's DR7
+ * value (tdr7) from its breakpoint list.  @bp itself is not examined.
+ */
+void arch_register_user_hw_breakpoint(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi)
+{
+	thbi->tdr7 = calculate_dr7(thbi);
+
+	/* If this is an execution breakpoint for the current PC address,
+	 * we should clear the task's RF so that the bp will be certain
+	 * to trigger.
+	 *
+	 * FIXME: It's not so easy to get hold of the task's PC as a linear
+	 * address!  ptrace.c does this already...
+	 */
+}
+
+/* Unregister a user breakpoint structure: recompute the thread's DR7
+ * value (tdr7) from its breakpoint list.  @bp itself is not examined.
+ */
+void arch_unregister_user_hw_breakpoint(struct hw_breakpoint *bp,
+					struct thread_hw_breakpoint *thbi)
+{
+	thbi->tdr7 = calculate_dr7(thbi);
+}
+
+/* Register a kernel breakpoint structure: recompute the unmasked kernel
+ * DR7 value (kdr7) from the kernel breakpoint list.  @bp is not examined.
+ */
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	kdr7 = calculate_dr7(NULL);
+}
+
+/* Unregister a kernel breakpoint structure: recompute the unmasked kernel
+ * DR7 value (kdr7) from the kernel breakpoint list.  @bp is not examined.
+ */
+void arch_unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	kdr7 = calculate_dr7(NULL);
+}
+
+
+/* End of arch-specific hook routines */
+
+
+/*
+ * Copy out the debug register information for a core dump.  All 8 slots are
+ * cleared first; the array parameter is a pointer, so size it explicitly.
+ * tsk must be equal to current.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	int i;
+
+	memset(u_debugreg, 0, 8 * sizeof(u_debugreg[0]));
+	if (thbi) {
+		for (i = 0; i < HB_NUM; ++i)
+			u_debugreg[i] = thbi->vdr_bps[i].info.address;
+		u_debugreg[7] = thbi->vdr7;
+	}
+	u_debugreg[6] = tsk->thread.vdr6;
+}
+
+/*
+ * Handle debug exception notifications.  get_cpu() disables preemption;
+ * every return after it goes through "out" so that it is re-enabled.
+ */
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	struct cpu_hw_breakpoint *chbi;
+	int i;
+	int rc = NOTIFY_DONE;
+	struct hw_breakpoint *bp;
+	struct thread_hw_breakpoint *thbi = NULL;
+
+	/* The DR6 value is stored in args->err */
+#define DR6	(args->err)
+
+	if (DR6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+
+	/* Disable all breakpoints so that the callbacks can run without
+	 * triggering recursive debug exceptions.
+	 */
+	set_debugreg(0UL, 7);
+
+	/* Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.vdr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Are we a victim of lazy debug-register switching? */
+	if (!chbi->bp_task)
+		;
+	else if (chbi->bp_task != current) {
+		/* No user breakpoints are valid.  Perform the belated
+		 * debug-register switch.
+		 */
+		switch_to_none_hw_breakpoint();
+	} else {
+		thbi = chbi->bp_task->thread.hw_breakpoint_info;
+	}
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(DR6 & (DR_TRAP0 << i))))
+			continue;
+
+		/* Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (i < chbi->cur_kbpdata->num_kbps)
+			bp = chbi->cur_kbpdata->bps[i];
+		else if (thbi)
+			bp = thbi->bps[i];
+		else		/* False alarm due to lazy DR switching */
+			continue;
+		if (bp) {
+			switch (bp->info.type) {
+			case HW_BREAKPOINT_WRITE:
+			case HW_BREAKPOINT_RW:
+				if (bp->triggered)
+					(bp->triggered)(bp, args->regs);
+				/* Re-enable the breakpoints */
+				set_debugreg(thbi ? thbi->tkdr7 :
+						chbi->cur_kbpdata->mkdr7, 7);
+				rc = NOTIFY_STOP;
+				goto out;
+			/*
+			 * Presently we allow instruction breakpoints only in
+			 * user-space when requested through ptrace.
+			 */
+			case HW_BREAKPOINT_EXECUTE:
+				if (arch_check_va_in_userspace(bp->info.address,
+								current)) {
+					(bp->triggered)(bp, args->regs);
+					/* NOTIFY_DONE: do_debug finishes up */
+					goto out;
+				}
+			}
+		}
+	}
+	/* Stop processing further if the exception is a stray one */
+	if (!(DR6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		rc = NOTIFY_STOP;
+out:
+	put_cpu_no_resched();
+	return rc;
+#undef DR6
+}
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,132 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+} __attribute__((packed));
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* HW breakpoint accessor routines: each reads one field of bp->info */
+static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
+{
+	return (const void *) bp->info.address;
+}
+
+static inline const void __user *hw_breakpoint_get_uaddress
+						(struct hw_breakpoint *bp)
+{
+	return (const void __user *) bp->info.address;
+}
+
+static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
+{
+	return bp->info.len;
+}
+
+static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
+{
+	return bp->info.type;
+}
+
+/* Look up a kernel symbol's address by name (kallsyms_lookup_name wrapper) */
+static inline unsigned long hw_breakpoint_lookup_name(const char *name)
+{
+	return kallsyms_lookup_name(name);
+}
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+/* Available HW breakpoint type encodings */
+#define HW_BREAKPOINT_EXECUTE	0x80	/* trigger on instruction execute */
+#define HW_BREAKPOINT_WRITE	0x81	/* trigger on memory write */
+#define HW_BREAKPOINT_RW	0x83	/* trigger on memory read or write */
+
+#define HB_NUM 4 /* Total number of available HW breakpoint registers */
+
+/* Per-thread HW breakpoint and debug register info */
+struct thread_hw_breakpoint {
+
+	/* utrace support */
+	struct list_head	node;		/* Entry in thread list */
+	struct list_head	thread_bps;	/* Thread's breakpoints */
+	struct hw_breakpoint	*bps[HB_NUM];	/* Highest-priority bps */
+	unsigned long		tdr[HB_NUM];	/*  and their addresses */
+	int			num_installed;	/* Number of installed bps */
+	unsigned		gennum;		/* update-generation number */
+
+	/* Only the portions below are arch-specific */
+
+	/* ptrace support -- Note that vdr6 is stored directly in the
+	 * thread_struct so that it is always available.
+	 */
+	unsigned long		vdr7;			/* Virtualized DR7 */
+	struct hw_breakpoint	vdr_bps[HB_NUM];	/* Breakpoints
+			representing virtualized debug registers 0 - 3 */
+	unsigned long		tdr7;		/* Thread's DR7 value */
+	unsigned long		tkdr7;		/* Thread + kernel DR7 value */
+};
+
+/* Kernel-space breakpoint data */
+struct kernel_bp_data {
+	unsigned		gennum;		/* Generation number */
+	int			num_kbps;	/* Number of kernel bps */
+	struct hw_breakpoint	*bps[HB_NUM];	/* Loaded breakpoints */
+
+	/* Only the portions below are arch-specific */
+	unsigned long		mkdr7;		/* Masked kernel DR7 value */
+};
+
+/* Per-CPU debug register info */
+struct cpu_hw_breakpoint {
+	struct kernel_bp_data	*cur_kbpdata;	/* Current kbpdata[] entry */
+	struct task_struct	*bp_task;	/* The thread whose bps
+			are currently loaded in the debug registers */
+};
+
+/*
+ * Ptrace support: breakpoint trigger routine.
+ */
+
+int __register_user_hw_breakpoint(struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+
+void arch_update_thbi(struct thread_hw_breakpoint *thbi,
+				struct kernel_bp_data *thr_kbpdata);
+void arch_install_thbi(struct thread_hw_breakpoint *thbi);
+void arch_install_none(struct cpu_hw_breakpoint *chbi);
+void arch_install_chbi(struct cpu_hw_breakpoint *chbi);
+void arch_new_kbpdata(struct kernel_bp_data *new_kbpdata);
+void arch_store_thread_bp_array(struct thread_hw_breakpoint *thbi,
+				struct hw_breakpoint *bp, int i);
+int arch_check_va_in_userspace(unsigned long va,
+						struct task_struct *tsk);
+int arch_check_va_in_kernelspace(unsigned long va);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(struct hw_breakpoint *bp,
+				struct thread_hw_breakpoint *thbi);
+void arch_unregister_user_hw_breakpoint(struct hw_breakpoint *bp,
+					struct thread_hw_breakpoint *thbi);
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void arch_unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 03/11] Modifying generic debug exception to use virtual debug registers
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
  2009-03-05  4:37 ` [patch 01/11] Introducing generic hardware breakpoint handler interfaces prasad
  2009-03-05  4:38 ` [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces prasad
@ 2009-03-05  4:38 ` prasad
  2009-03-05  4:38 ` [patch 04/11] Introduce virtual debug register in thread_struct and wrapper-routines around process related functions prasad
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 71+ messages in thread
From: prasad @ 2009-03-05  4:38 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 3 --]
[-- Type: text/plain, Size: 3646 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch modifies the breakpoint exception handler code to use the abstract
register names.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/traps.c |   73 ++++++++++++++++--------------------------------
 1 file changed, 25 insertions(+), 48 deletions(-)

Index: linux-2.6-tip/arch/x86/kernel/traps.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/traps.c
+++ linux-2.6-tip/arch/x86/kernel/traps.c
@@ -576,13 +576,14 @@ asmlinkage __kprobes struct pt_regs *syn
 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	unsigned long condition;
+	unsigned long dr6;
 	int si_code;
 
-	get_debugreg(condition, 6);
+	get_debugreg(dr6, 6);
+	set_debugreg(0, 6);	/* DR6 may or may not be cleared by the CPU */
 
 	/* Catch kmemcheck conditions first of all! */
-	if (condition & DR_STEP && kmemcheck_trap(regs))
+	if (dr6 & DR_STEP && kmemcheck_trap(regs))
 		return;
 
 	/*
@@ -591,61 +592,37 @@ dotraplinkage void __kprobes do_debug(st
 	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
 	tsk->thread.debugctlmsr = 0;
 
-	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+	/* Store the virtualized DR6 value */
+	tsk->thread.vdr6 = dr6;
+
+	if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code,
 						SIGTRAP) == NOTIFY_STOP)
 		return;
 
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
-	/* Mask out spurious debug traps due to lazy DR7 setting */
-	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-		if (!tsk->thread.debugreg7)
-			goto clear_dr7;
-	}
-
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK)
-		goto debug_vm86;
-#endif
-
-	/* Save debug status register where ptrace can see it */
-	tsk->thread.debugreg6 = condition;
-
-	/*
-	 * Single-stepping through TF: make sure we ignore any events in
-	 * kernel space (but re-enable TF when returning to user mode).
-	 */
-	if (condition & DR_STEP) {
-		if (!user_mode(regs))
-			goto clear_TF_reenable;
+	if (regs->flags & X86_VM_MASK) {
+		handle_vm86_trap((struct kernel_vm86_regs *) regs,
+				error_code, 1);
+		return;
 	}
 
-	si_code = get_si_code(condition);
-	/* Ok, finally something we can handle */
-	send_sigtrap(tsk, regs, error_code, si_code);
-
 	/*
-	 * Disable additional traps. They'll be re-enabled when
-	 * the signal is delivered.
+	 * Single-stepping through system calls: ignore any exceptions in
+	 * kernel space, but re-enable TF when returning to user mode.
+	 *
+	 * We already checked v86 mode above, so we can check for kernel mode
+	 * by just checking the CPL of CS.
 	 */
-clear_dr7:
-	set_debugreg(0, 7);
-	preempt_conditional_cli(regs);
-	return;
-
-#ifdef CONFIG_X86_32
-debug_vm86:
-	/* reenable preemption: handle_vm86_trap() might sleep */
-	dec_preempt_count();
-	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-	conditional_cli(regs);
-	return;
-#endif
-
-clear_TF_reenable:
-	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-	regs->flags &= ~X86_EFLAGS_TF;
+	if ((dr6 & DR_STEP) && !user_mode(regs)) {
+		tsk->thread.vdr6 &= ~DR_STEP;
+		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+		regs->flags &= ~X86_EFLAGS_TF;
+	}
+	si_code = get_si_code(dr6);
+	if (tsk->thread.vdr6 & (DR_STEP|DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3))
+		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
 	return;
 }


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 04/11] Introduce virtual debug register in thread_struct and wrapper-routines around process related functions
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
                   ` (2 preceding siblings ...)
  2009-03-05  4:38 ` [patch 03/11] Modifying generic debug exception to use virtual debug registers prasad
@ 2009-03-05  4:38 ` prasad
  2009-03-10 14:35   ` Ingo Molnar
  2009-03-05  4:38 ` [patch 05/11] Use wrapper routines around debug registers in processor " prasad
                   ` (7 subsequent siblings)
  11 siblings, 1 reply; 71+ messages in thread
From: prasad @ 2009-03-05  4:38 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 4 --]
[-- Type: text/plain, Size: 2949 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch introduces virtual debug registers to be used by the per-thread
structure and wrapper routines to manage debug registers by process-related
functions.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/include/asm/debugreg.h  |   27 +++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h |   10 +++-------
 2 files changed, 30 insertions(+), 7 deletions(-)

Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/debugreg.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/include/asm/debugreg.h
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/debugreg.h
@@ -49,6 +49,8 @@
 
 #define DR_LOCAL_ENABLE_SHIFT 0    /* Extra shift to the local enable bit */
 #define DR_GLOBAL_ENABLE_SHIFT 1   /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1)      /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2)     /* Global enable for reg 0 */
 #define DR_ENABLE_SIZE 2           /* 2 enable bits per register */
 
 #define DR_LOCAL_ENABLE_MASK (0x55)  /* Set  local bits for all 4 regs */
@@ -67,4 +69,29 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+/* For process management */
+void flush_thread_hw_breakpoint(struct task_struct *tsk);
+int copy_thread_hw_breakpoint(struct task_struct *tsk,
+		struct task_struct *child, unsigned long clone_flags);
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8]);
+void switch_to_thread_hw_breakpoint(struct task_struct *tsk);
+
+/* For CPU management */
+void load_debug_registers(void);
+static inline void disable_debug_registers(void)
+{
+	set_debugreg(0UL, 7);
+}
+
+/* For use by ptrace */
+unsigned long thread_get_debugreg(struct task_struct *tsk, int n);
+int thread_set_debugreg(struct task_struct *tsk, int n, unsigned long val);
+
+#endif	/* __KERNEL__ */
+
 #endif /* _ASM_X86_DEBUGREG_H */
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/processor.h
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/include/asm/processor.h
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/processor.h
@@ -427,13 +427,9 @@ struct thread_struct {
 	unsigned long		ip;
 	unsigned long		fs;
 	unsigned long		gs;
-	/* Hardware debugging registers: */
-	unsigned long		debugreg0;
-	unsigned long		debugreg1;
-	unsigned long		debugreg2;
-	unsigned long		debugreg3;
-	unsigned long		debugreg6;
-	unsigned long		debugreg7;
+	/* Hardware breakpoint info */
+	unsigned long	vdr6;
+	struct thread_hw_breakpoint	*hw_breakpoint_info;
 	/* Fault info: */
 	unsigned long		cr2;
 	unsigned long		trap_no;


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 05/11] Use wrapper routines around debug registers in processor related functions
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
                   ` (3 preceding siblings ...)
  2009-03-05  4:38 ` [patch 04/11] Introduce virtual debug register in thread_struct and wrapper-routines around process related functions prasad
@ 2009-03-05  4:38 ` prasad
  2009-03-05  4:40 ` [patch 06/11] Use virtual debug registers in process/thread handling code prasad
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 71+ messages in thread
From: prasad @ 2009-03-05  4:38 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 5 --]
[-- Type: text/plain, Size: 3654 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of wrapper routines to access the debug/breakpoint
registers.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/smpboot.c |    3 +++
 arch/x86/power/cpu_32.c   |   16 +++-------------
 arch/x86/power/cpu_64.c   |   15 +++------------
 3 files changed, 9 insertions(+), 25 deletions(-)

Index: linux-2.6-tip/arch/x86/power/cpu_32.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/power/cpu_32.c
+++ linux-2.6-tip/arch/x86/power/cpu_32.c
@@ -12,6 +12,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/xcr.h>
+#include <asm/debugreg.h>
 
 static struct saved_context saved_context;
 
@@ -47,6 +48,7 @@ static void __save_processor_state(struc
 	ctxt->cr2 = read_cr2();
 	ctxt->cr3 = read_cr3();
 	ctxt->cr4 = read_cr4_safe();
+	disable_debug_registers();
 }
 
 /* Needed by apm.c */
@@ -79,19 +81,7 @@ static void fix_processor_context(void)
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
 
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7) {
-		set_debugreg(current->thread.debugreg0, 0);
-		set_debugreg(current->thread.debugreg1, 1);
-		set_debugreg(current->thread.debugreg2, 2);
-		set_debugreg(current->thread.debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-	}
-
+	load_debug_registers();
 }
 
 static void __restore_processor_state(struct saved_context *ctxt)
Index: linux-2.6-tip/arch/x86/power/cpu_64.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/power/cpu_64.c
+++ linux-2.6-tip/arch/x86/power/cpu_64.c
@@ -15,6 +15,7 @@
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
 #include <asm/xcr.h>
+#include <asm/debugreg.h>
 
 static void fix_processor_context(void);
 
@@ -70,6 +71,7 @@ static void __save_processor_state(struc
 	ctxt->cr3 = read_cr3();
 	ctxt->cr4 = read_cr4();
 	ctxt->cr8 = read_cr8();
+	disable_debug_registers();
 }
 
 void save_processor_state(void)
@@ -158,16 +160,5 @@ static void fix_processor_context(void)
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
 
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7){
-                loaddebug(&current->thread, 0);
-                loaddebug(&current->thread, 1);
-                loaddebug(&current->thread, 2);
-                loaddebug(&current->thread, 3);
-                /* no 4 and 5 */
-                loaddebug(&current->thread, 6);
-                loaddebug(&current->thread, 7);
-	}
+	load_debug_registers();
 }
Index: linux-2.6-tip/arch/x86/kernel/smpboot.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6-tip/arch/x86/kernel/smpboot.c
@@ -63,6 +63,7 @@
 #include <asm/apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
+#include <asm/debugreg.h>
 #include <linux/mc146818rtc.h>
 
 #include <asm/smpboot_hooks.h>
@@ -337,6 +338,7 @@ notrace static void __cpuinit start_seco
 	setup_secondary_clock();
 
 	wmb();
+	load_debug_registers();
 	cpu_idle();
 }
 
@@ -1312,6 +1314,7 @@ void cpu_disable_common(void)
 	remove_cpu_from_maps(cpu);
 	unlock_vector_lock();
 	fixup_irqs();
+	disable_debug_registers();
 }
 
 int native_cpu_disable(void)


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 06/11] Use virtual debug registers in process/thread handling code
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
                   ` (4 preceding siblings ...)
  2009-03-05  4:38 ` [patch 05/11] Use wrapper routines around debug registers in processor " prasad
@ 2009-03-05  4:40 ` prasad
  2009-03-10 14:49   ` Ingo Molnar
  2009-03-05  4:40 ` [patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints prasad
                   ` (5 subsequent siblings)
  11 siblings, 1 reply; 71+ messages in thread
From: prasad @ 2009-03-05  4:40 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 6 --]
[-- Type: text/plain, Size: 6638 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch enables the use of abstract/virtual debug registers in
process-handling routines.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/process_32.c |   43 +++++++++++++++++++++++++------------------
 arch/x86/kernel/process_64.c |   41 ++++++++++++++++++++++++-----------------
 2 files changed, 49 insertions(+), 35 deletions(-)

Index: linux-2.6-tip/arch/x86/kernel/process_32.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/process_32.c
+++ linux-2.6-tip/arch/x86/kernel/process_32.c
@@ -59,6 +59,8 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -233,6 +235,8 @@ EXPORT_SYMBOL(kernel_thread);
  */
 void exit_thread(void)
 {
+	struct task_struct *tsk = current;
+
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
 		struct task_struct *tsk = current;
@@ -253,6 +257,8 @@ void exit_thread(void)
 		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		put_cpu();
 	}
+	if (unlikely(tsk->thread.hw_breakpoint_info))
+		flush_thread_hw_breakpoint(tsk);
 
 	ds_exit_thread(current);
 }
@@ -261,14 +267,9 @@ void flush_thread(void)
 {
 	struct task_struct *tsk = current;
 
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	if (unlikely(tsk->thread.hw_breakpoint_info))
+		flush_thread_hw_breakpoint(tsk);
 	/*
 	 * Forget coprocessor state..
 	 */
@@ -312,7 +313,15 @@ int copy_thread(int nr, unsigned long cl
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->thread.hw_breakpoint_info = NULL;
+	p->thread.io_bitmap_ptr = NULL;
+
 	tsk = current;
+	err = -ENOMEM;
+	if (unlikely(tsk->thread.hw_breakpoint_info)) {
+		if (copy_thread_hw_breakpoint(tsk, p, clone_flags))
+			goto out;
+	}
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
@@ -331,11 +340,13 @@ int copy_thread(int nr, unsigned long cl
 	if (clone_flags & CLONE_SETTLS)
 		err = do_set_thread_area(p, -1,
 			(struct user_desc __user *)childregs->si, 0);
-
+ out:
 	if (err && p->thread.io_bitmap_ptr) {
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
 
 	ds_copy_thread(p, current);
 
@@ -437,16 +448,6 @@ __switch_to_xtra(struct task_struct *pre
 	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
@@ -595,6 +596,12 @@ __switch_to(struct task_struct *prev_p, 
 
 	percpu_write(current_task, next_p);
 
+	/*
+	 * Handle debug registers.  This must be done _after_ current
+	 * is updated.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
 	return prev_p;
 }
 
Index: linux-2.6-tip/arch/x86/kernel/process_64.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/process_64.c
+++ linux-2.6-tip/arch/x86/kernel/process_64.c
@@ -53,6 +53,8 @@
 #include <asm/proto.h>
 #include <asm/ia32.h>
 #include <asm/idle.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
 
@@ -277,13 +279,9 @@ void flush_thread(void)
 	}
 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
 
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+	if (unlikely(tsk->thread.hw_breakpoint_info))
+		flush_thread_hw_breakpoint(tsk);
 	/*
 	 * Forget coprocessor state..
 	 */
@@ -303,6 +301,8 @@ void release_thread(struct task_struct *
 			BUG();
 		}
 	}
+	if (unlikely(me->thread.hw_breakpoint_info))
+		flush_thread_hw_breakpoint(me);
 }
 
 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
@@ -358,13 +358,21 @@ int copy_thread(int nr, unsigned long cl
 
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
+	p->thread.hw_breakpoint_info = NULL;
+	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
 	savesegment(fs, p->thread.fsindex);
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
-	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
+	err = -ENOMEM;
+	if (unlikely(me->thread.hw_breakpoint_info)) {
+		if (copy_thread_hw_breakpoint(me, p, clone_flags))
+			goto out;
+	}
+
+if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
 			p->thread.io_bitmap_max = 0;
@@ -401,6 +409,9 @@ out:
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+	if (err)
+		flush_thread_hw_breakpoint(p);
+
 	return err;
 }
 
@@ -503,16 +514,6 @@ static inline void __switch_to_xtra(stru
 	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		loaddebug(next, 0);
-		loaddebug(next, 1);
-		loaddebug(next, 2);
-		loaddebug(next, 3);
-		/* no 4 and 5 */
-		loaddebug(next, 6);
-		loaddebug(next, 7);
-	}
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
@@ -535,6 +536,12 @@ static inline void __switch_to_xtra(stru
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
+	/*
+	 * Handle debug registers.  This must be done _after_ current
+	 * is updated.
+	 */
+	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+		switch_to_thread_hw_breakpoint(next_p);
 }
 
 /*


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
                   ` (5 preceding siblings ...)
  2009-03-05  4:40 ` [patch 06/11] Use virtual debug registers in process/thread handling code prasad
@ 2009-03-05  4:40 ` prasad
  2009-03-05  4:40 ` [patch 08/11] Modify Ptrace routines to access breakpoint registers prasad
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 71+ messages in thread
From: prasad @ 2009-03-05  4:40 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 7 --]
[-- Type: text/plain, Size: 1126 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch disables re-enabling of Hardware Breakpoint registers through
the signal handling code. This is now done during
hw_breakpoint_handler().

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/signal.c |    9 ---------
 1 file changed, 9 deletions(-)

Index: linux-2.6-tip/arch/x86/kernel/signal.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/signal.c
+++ linux-2.6-tip/arch/x86/kernel/signal.c
@@ -795,15 +795,6 @@ static void do_signal(struct pt_regs *re
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		/*
-		 * Re-enable any watchpoints before delivering the
-		 * signal to user space. The processor register will
-		 * have been cleared if the watchpoint triggered
-		 * inside the kernel.
-		 */
-		if (current->thread.debugreg7)
-			set_debugreg(current->thread.debugreg7, 7);
-
 		/* Whee! Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
 			/*


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 08/11] Modify Ptrace routines to access breakpoint registers
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
                   ` (6 preceding siblings ...)
  2009-03-05  4:40 ` [patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints prasad
@ 2009-03-05  4:40 ` prasad
  2009-03-10 14:40   ` Ingo Molnar
  2009-03-05  4:41 ` [patch 09/11] Cleanup HW Breakpoint registers before kexec prasad
                   ` (3 subsequent siblings)
  11 siblings, 1 reply; 71+ messages in thread
From: prasad @ 2009-03-05  4:40 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 8 --]
[-- Type: text/plain, Size: 9043 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch modifies the ptrace code to use the new wrapper routines around the 
debug/breakpoint registers.

[K.Prasad: Adapted the ptrace routines to changes post x86/x86_64 merger]

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/ptrace.c |  242 +++++++++++++++++++++++++++++------------------
 1 file changed, 152 insertions(+), 90 deletions(-)

Index: linux-2.6-tip/arch/x86/kernel/ptrace.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/ptrace.c
+++ linux-2.6-tip/arch/x86/kernel/ptrace.c
@@ -33,6 +33,7 @@
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
 
 #include "tls.h"
 
@@ -133,11 +134,6 @@ static int set_segment_reg(struct task_s
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-	return TASK_SIZE - 3;
-}
-
 #else  /* CONFIG_X86_64 */
 
 #define FLAG_MASK		(FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -262,15 +258,6 @@ static int set_segment_reg(struct task_s
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(task, TIF_IA32))
-		return IA32_PAGE_OFFSET - 3;
-#endif
-	return TASK_SIZE_MAX - 7;
-}
-
 #endif	/* CONFIG_X86_32 */
 
 static unsigned long get_flags(struct task_struct *task)
@@ -461,95 +448,170 @@ static int genregs_set(struct task_struc
 }
 
 /*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
  */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len,
+		unsigned *type)
 {
-	switch (n) {
-	case 0:		return child->thread.debugreg0;
-	case 1:		return child->thread.debugreg1;
-	case 2:		return child->thread.debugreg2;
-	case 3:		return child->thread.debugreg3;
-	case 6:		return child->thread.debugreg6;
-	case 7:		return child->thread.debugreg7;
+	int temp = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+	*len = (temp & 0xc) | 0x40;
+	*type = (temp & 0x3) | 0x80;
+	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
+
+static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+{
+	struct task_struct *tsk = current;
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	int i;
+
+	/* Store in the virtual DR6 register the fact that the breakpoint
+	 * was hit so the thread's debugger will see it.
+	 */
+	if (thbi) {
+		i = bp - thbi->vdr_bps;
+		tsk->thread.vdr6 |= (DR_TRAP0 << i);
 	}
-	return 0;
 }
 
-static int ptrace_set_debugreg(struct task_struct *child,
-			       int n, unsigned long data)
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk,
+		struct thread_hw_breakpoint *thbi, unsigned long data)
 {
+	struct hw_breakpoint *bp;
 	int i;
+	int rc = 0;
+	unsigned long old_dr7 = thbi->vdr7;
 
-	if (unlikely(n == 4 || n == 5))
-		return -EIO;
+	data &= ~DR_CONTROL_RESERVED;
 
-	if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-		return -EIO;
+	/* Loop through all the hardware breakpoints, making the
+	 * appropriate changes to each.
+	 */
+ restore_settings:
+	thbi->vdr7 = data;
+	bp = &thbi->vdr_bps[0];
+	for (i = 0; i < HB_NUM; (++i, ++bp)) {
+		int enabled;
+		unsigned len, type;
+
+		enabled = decode_dr7(data, i, &len, &type);
+
+		/* Unregister the breakpoint before trying to change it */
+		if (bp->status)
+			__unregister_user_hw_breakpoint(tsk, bp);
 
-	switch (n) {
-	case 0:		child->thread.debugreg0 = data; break;
-	case 1:		child->thread.debugreg1 = data; break;
-	case 2:		child->thread.debugreg2 = data; break;
-	case 3:		child->thread.debugreg3 = data; break;
+		/* Now register the breakpoint if it should be enabled.
+		 * New invalid entries will raise an error here.
+		 */
+		if (enabled) {
+			bp->triggered = ptrace_triggered;
+			bp->info.len = len;
+			bp->info.type = type;
+
+			bp->priority = HW_BREAKPOINT_PRIO_PTRACE;
+			if (rc == 0 && __register_user_hw_breakpoint(tsk,
+									bp) < 0)
+				break;
+		}
+	}
 
-	case 6:
-		if ((data & ~0xffffffffUL) != 0)
-			return -EIO;
-		child->thread.debugreg6 = data;
-		break;
+	/* If anything above failed, restore the original settings */
+	if (i < HB_NUM) {
+		rc = -EIO;
+		data = old_dr7;
+		goto restore_settings;
+	}
+	return rc;
+}
 
-	case 7:
-		/*
-		 * Sanity-check data. Take one half-byte at once with
-		 * check = (val >> (16 + 4*i)) & 0xf. It contains the
-		 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-		 * 2 and 3 are LENi. Given a list of invalid values,
-		 * we do mask |= 1 << invalid_value, so that
-		 * (mask >> check) & 1 is a correct test for invalid
-		 * values.
-		 *
-		 * R/Wi contains the type of the breakpoint /
-		 * watchpoint, LENi contains the length of the watched
-		 * data in the watchpoint case.
-		 *
-		 * The invalid values are:
-		 * - LENi == 0x10 (undefined), so mask |= 0x0f00.	[32-bit]
-		 * - R/Wi == 0x10 (break on I/O reads or writes), so
-		 *   mask |= 0x4444.
-		 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-		 *   0x1110.
-		 *
-		 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-		 *
-		 * See the Intel Manual "System Programming Guide",
-		 * 15.2.4
-		 *
-		 * Note that LENi == 0x10 is defined on x86_64 in long
-		 * mode (i.e. even for 32-bit userspace software, but
-		 * 64-bit kernel), so the x86_64 mask value is 0x5454.
-		 * See the AMD manual no. 24593 (AMD64 System Programming)
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+	struct thread_hw_breakpoint *thbi;
+	unsigned long val = 0;
+
+	mutex_lock(&hw_breakpoint_mutex);
+	thbi = tsk->thread.hw_breakpoint_info;
+	if (n < HB_NUM) {
+		if (thbi)
+			val = thbi->vdr_bps[n].info.address;
+	} else if (n == 6) {
+		val = tsk->thread.vdr6;
+	} else if (n == 7) {
+		if (thbi)
+			val = thbi->vdr7;
+	}
+	mutex_unlock(&hw_breakpoint_mutex);
+	return val;
+}
+
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+	struct thread_hw_breakpoint *thbi;
+	int rc = -EIO;
+
+	/* We have to hold this lock the entire time, to prevent thbi
+	 * from being deallocated out from under us.
+	 */
+	mutex_lock(&hw_breakpoint_mutex);
+
+	/* There are no DR4 or DR5 registers */
+	if (n == 4 || n == 5)
+		;
+
+	/* Writes to DR6 modify the virtualized value */
+	else if (n == 6) {
+		tsk->thread.vdr6 = val;
+		rc = 0;
+	}
+
+	else if (!tsk->thread.hw_breakpoint_info && val == 0)
+		rc = 0;		/* Minor optimization */
+
+	else if ((thbi = alloc_thread_hw_breakpoint(tsk)) == NULL)
+		rc = -ENOMEM;
+
+	/* Writes to DR0 - DR3 change a breakpoint address */
+	else if (n < HB_NUM) {
+		struct hw_breakpoint *bp = &thbi->vdr_bps[n];
+
+		/* If the breakpoint is registered then unregister it,
+		 * change it, and re-register it.  Revert to the original
+		 * address if an error occurs.
 		 */
-#ifdef CONFIG_X86_32
-#define	DR7_MASK	0x5f54
-#else
-#define	DR7_MASK	0x5554
-#endif
-		data &= ~DR_CONTROL_RESERVED;
-		for (i = 0; i < 4; i++)
-			if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-				return -EIO;
-		child->thread.debugreg7 = data;
-		if (data)
-			set_tsk_thread_flag(child, TIF_DEBUG);
-		else
-			clear_tsk_thread_flag(child, TIF_DEBUG);
-		break;
+		if (bp->status) {
+			unsigned long old_addr = bp->info.address;
+
+			__unregister_user_hw_breakpoint(tsk, bp);
+
+			bp->info.address = val;
+			rc = __register_user_hw_breakpoint(tsk, bp);
+			if (rc < 0) {
+				bp->info.address = old_addr;
+				__register_user_hw_breakpoint(tsk, bp);
+			}
+		} else {
+			bp->info.address = val;
+			rc = 0;
+		}
 	}
 
-	return 0;
+	/* All that's left is DR7 */
+	else
+		rc = ptrace_write_dr7(tsk, thbi, val);
+
+	mutex_unlock(&hw_breakpoint_mutex);
+	return rc;
 }
 
 /*
@@ -871,7 +933,7 @@ long arch_ptrace(struct task_struct *chi
 		else if (addr >= offsetof(struct user, u_debugreg[0]) &&
 			 addr <= offsetof(struct user, u_debugreg[7])) {
 			addr -= offsetof(struct user, u_debugreg[0]);
-			tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+			tmp = ptrace_get_debugreg(child, addr/sizeof(data));
 		}
 		ret = put_user(tmp, datap);
 		break;
@@ -889,7 +951,7 @@ long arch_ptrace(struct task_struct *chi
 			 addr <= offsetof(struct user, u_debugreg[7])) {
 			addr -= offsetof(struct user, u_debugreg[0]);
 			ret = ptrace_set_debugreg(child,
-						  addr / sizeof(data), data);
+						addr/sizeof(data), data);
 		}
 		break;
 


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 09/11] Cleanup HW Breakpoint registers before kexec
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
                   ` (7 preceding siblings ...)
  2009-03-05  4:40 ` [patch 08/11] Modify Ptrace routines to access breakpoint registers prasad
@ 2009-03-05  4:41 ` prasad
  2009-03-10 14:42   ` Ingo Molnar
  2009-03-05  4:41 ` [patch 10/11] Sample HW breakpoint over kernel data address prasad
                   ` (2 subsequent siblings)
  11 siblings, 1 reply; 71+ messages in thread
From: prasad @ 2009-03-05  4:41 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 9 --]
[-- Type: text/plain, Size: 1819 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch disables Hardware breakpoints before doing a 'kexec' on the machine.

[K.Prasad: Split-out from the bigger patch and minor changes following
           re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/kernel/machine_kexec_32.c |    2 ++
 arch/x86/kernel/machine_kexec_64.c |    2 ++
 2 files changed, 4 insertions(+)

Index: linux-2.6-tip/arch/x86/kernel/machine_kexec_32.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/machine_kexec_32.c
+++ linux-2.6-tip/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
 #include <asm/desc.h>
 #include <asm/system.h>
 #include <asm/cacheflush.h>
+#include <asm/debugreg.h>
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	disable_debug_registers();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
Index: linux-2.6-tip/arch/x86/kernel/machine_kexec_64.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/machine_kexec_64.c
+++ linux-2.6-tip/arch/x86/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 #include <asm/io.h>
+#include <asm/debugreg.h>
 
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
@@ -234,6 +235,7 @@ void machine_kexec(struct kimage *image)
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	disable_debug_registers();
 
 	control_page = page_address(image->control_code_page) + PAGE_SIZE;
 	memcpy(control_page, relocate_kernel, PAGE_SIZE);


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 10/11] Sample HW breakpoint over kernel data address
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
                   ` (8 preceding siblings ...)
  2009-03-05  4:41 ` [patch 09/11] Cleanup HW Breakpoint registers before kexec prasad
@ 2009-03-05  4:41 ` prasad
  2009-03-05  4:43 ` prasad
  2009-03-05  4:43 ` [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces prasad
  11 siblings, 0 replies; 71+ messages in thread
From: prasad @ 2009-03-05  4:41 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 10 --]
[-- Type: text/plain, Size: 4320 bytes --]

This patch introduces a sample kernel module to demonstrate the use of the
Hardware Breakpoint feature. It places a breakpoint over the kernel variable
'pid_max' to monitor all write operations and emits a function-backtrace
whenever such a write occurs.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
---
 samples/Kconfig                         |    6 ++
 samples/Makefile                        |    3 -
 samples/hw_breakpoint/Makefile          |    1 
 samples/hw_breakpoint/data_breakpoint.c |   80 ++++++++++++++++++++++++++++++++
 4 files changed, 89 insertions(+), 1 deletion(-)

Index: linux-2.6-tip/samples/Kconfig
===================================================================
--- linux-2.6-tip.orig/samples/Kconfig
+++ linux-2.6-tip/samples/Kconfig
@@ -39,5 +39,11 @@ config SAMPLE_KRETPROBES
 	default m
 	depends on SAMPLE_KPROBES && KRETPROBES
 
+config SAMPLE_HW_BREAKPOINT
+	tristate "Build kernel hardware breakpoint examples -- loadable modules only"
+	depends on m
+	help
+	  This builds kernel hardware breakpoint example modules.
+
 endif # SAMPLES
 
Index: linux-2.6-tip/samples/Makefile
===================================================================
--- linux-2.6-tip.orig/samples/Makefile
+++ linux-2.6-tip/samples/Makefile
@@ -1,3 +1,4 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/
+obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/ \
+			   hw_breakpoint/
Index: linux-2.6-tip/samples/hw_breakpoint/Makefile
===================================================================
--- /dev/null
+++ linux-2.6-tip/samples/hw_breakpoint/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
Index: linux-2.6-tip/samples/hw_breakpoint/data_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/samples/hw_breakpoint/data_breakpoint.c
@@ -0,0 +1,80 @@
+/*
+ * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
+ *
+ * This file is a kernel module that places a breakpoint over 'pid_max' kernel
+ * variable using Hardware Breakpoint register. The corresponding handler which
+ * prints a backtrace is invoked everytime a write operation is performed on
+ * that variable.
+ *
+ * After inserting this module, invoke a write operation using
+ * 'echo <desired_value> > /proc/sys/kernel/pid_max'
+ * to find the function-call backtrace.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+#include <linux/module.h>	/* Needed by all modules */
+#include <linux/kernel.h>	/* Needed for KERN_INFO */
+#include <linux/init.h>		/* Needed for the macros */
+
+#include <asm/hw_breakpoint.h>
+
+struct hw_breakpoint pid_max_hbkpt;
+
+void pid_max_hbkpt_installed(struct hw_breakpoint *temp, struct pt_regs
+								*temp_regs)
+{
+	printk(KERN_INFO "pid_max_hbkpt ENABLED\n");
+}
+
+void pid_max_hbkpt_uninstalled(struct hw_breakpoint *temp, struct
+							pt_regs * temp_regs)
+{
+	printk(KERN_INFO "pid_max_hbkpt DISABLED\n");
+}
+
+void pid_max_hbkpt_handler(struct hw_breakpoint *temp, struct pt_regs
+								*temp_regs)
+{
+	printk(KERN_INFO "pid_max value is changed\n");
+	dump_stack();
+	printk(KERN_INFO "Dump stack from pid_max_hbkpt_handler\n");
+}
+
+static int __init hw_break_module_init(void)
+{
+	int ret;
+
+#ifdef CONFIG_X86
+	pid_max_hbkpt.info.name = "pid_max";
+	pid_max_hbkpt.info.type = HW_BREAKPOINT_WRITE;
+	pid_max_hbkpt.info.len = HW_BREAKPOINT_LEN_4;
+	pid_max_hbkpt.priority = HW_BREAKPOINT_PRIO_NORMAL;
+
+	pid_max_hbkpt.installed = (void *)pid_max_hbkpt_installed;
+	pid_max_hbkpt.uninstalled = (void *)pid_max_hbkpt_uninstalled;
+	pid_max_hbkpt.triggered = (void *)pid_max_hbkpt_handler;
+#endif /* CONFIG_X86 */
+
+	ret = register_kernel_hw_breakpoint(&pid_max_hbkpt);
+
+	if (ret < 0) {
+		printk(KERN_INFO "Breakpoint registration failed\n");
+		return ret;
+	} else
+		printk(KERN_INFO "HW Breakpoint for pid_max write installed\n");
+
+	return 0;
+}
+
+static void __exit hw_break_module_exit(void)
+{
+	unregister_kernel_hw_breakpoint(&pid_max_hbkpt);
+	printk(KERN_INFO "HW Breakpoint for pid_max write uninstalled\n");
+}
+
+module_init(hw_break_module_init);
+module_exit(hw_break_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("K.Prasad");
+MODULE_DESCRIPTION("pid_max breakpoint");


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 10/11] Sample HW breakpoint over kernel data address
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
                   ` (9 preceding siblings ...)
  2009-03-05  4:41 ` [patch 10/11] Sample HW breakpoint over kernel data address prasad
@ 2009-03-05  4:43 ` prasad
  2009-03-05  4:43 ` [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces prasad
  11 siblings, 0 replies; 71+ messages in thread
From: prasad @ 2009-03-05  4:43 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 10 --]
[-- Type: text/plain, Size: 4320 bytes --]

This patch introduces a sample kernel module to demonstrate the use of the
Hardware Breakpoint feature. It places a breakpoint over the kernel variable
'pid_max' to monitor all write operations and emits a function-backtrace
whenever such a write occurs.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
---
 samples/Kconfig                         |    6 ++
 samples/Makefile                        |    3 -
 samples/hw_breakpoint/Makefile          |    1 
 samples/hw_breakpoint/data_breakpoint.c |   80 ++++++++++++++++++++++++++++++++
 4 files changed, 89 insertions(+), 1 deletion(-)

Index: linux-2.6-tip/samples/Kconfig
===================================================================
--- linux-2.6-tip.orig/samples/Kconfig
+++ linux-2.6-tip/samples/Kconfig
@@ -39,5 +39,11 @@ config SAMPLE_KRETPROBES
 	default m
 	depends on SAMPLE_KPROBES && KRETPROBES
 
+config SAMPLE_HW_BREAKPOINT
+	tristate "Build kernel hardware breakpoint examples -- loadable modules only"
+	depends on m
+	help
+	  This builds kernel hardware breakpoint example modules.
+
 endif # SAMPLES
 
Index: linux-2.6-tip/samples/Makefile
===================================================================
--- linux-2.6-tip.orig/samples/Makefile
+++ linux-2.6-tip/samples/Makefile
@@ -1,3 +1,4 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/
+obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/ \
+			   hw_breakpoint/
Index: linux-2.6-tip/samples/hw_breakpoint/Makefile
===================================================================
--- /dev/null
+++ linux-2.6-tip/samples/hw_breakpoint/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
Index: linux-2.6-tip/samples/hw_breakpoint/data_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/samples/hw_breakpoint/data_breakpoint.c
@@ -0,0 +1,80 @@
+/*
+ * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
+ *
+ * This file is a kernel module that places a breakpoint over 'pid_max' kernel
+ * variable using Hardware Breakpoint register. The corresponding handler which
+ * prints a backtrace is invoked everytime a write operation is performed on
+ * that variable.
+ *
+ * After inserting this module, invoke a write operation using
+ * 'echo <desired_value> > /proc/sys/kernel/pid_max'
+ * to find the function-call backtrace.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+#include <linux/module.h>	/* Needed by all modules */
+#include <linux/kernel.h>	/* Needed for KERN_INFO */
+#include <linux/init.h>		/* Needed for the macros */
+
+#include <asm/hw_breakpoint.h>
+
+struct hw_breakpoint pid_max_hbkpt;
+
+void pid_max_hbkpt_installed(struct hw_breakpoint *temp, struct pt_regs
+								*temp_regs)
+{
+	printk(KERN_INFO "pid_max_hbkpt ENABLED\n");
+}
+
+void pid_max_hbkpt_uninstalled(struct hw_breakpoint *temp, struct
+							pt_regs * temp_regs)
+{
+	printk(KERN_INFO "pid_max_hbkpt DISABLED\n");
+}
+
+void pid_max_hbkpt_handler(struct hw_breakpoint *temp, struct pt_regs
+								*temp_regs)
+{
+	printk(KERN_INFO "pid_max value is changed\n");
+	dump_stack();
+	printk(KERN_INFO "Dump stack from pid_max_hbkpt_handler\n");
+}
+
+static int __init hw_break_module_init(void)
+{
+	int ret;
+
+#ifdef CONFIG_X86
+	pid_max_hbkpt.info.name = "pid_max";
+	pid_max_hbkpt.info.type = HW_BREAKPOINT_WRITE;
+	pid_max_hbkpt.info.len = HW_BREAKPOINT_LEN_4;
+	pid_max_hbkpt.priority = HW_BREAKPOINT_PRIO_NORMAL;
+
+	pid_max_hbkpt.installed = (void *)pid_max_hbkpt_installed;
+	pid_max_hbkpt.uninstalled = (void *)pid_max_hbkpt_uninstalled;
+	pid_max_hbkpt.triggered = (void *)pid_max_hbkpt_handler;
+#endif /* CONFIG_X86 */
+
+	ret = register_kernel_hw_breakpoint(&pid_max_hbkpt);
+
+	if (ret < 0) {
+		printk(KERN_INFO "Breakpoint registration failed\n");
+		return ret;
+	} else
+		printk(KERN_INFO "HW Breakpoint for pid_max write installed\n");
+
+	return 0;
+}
+
+static void __exit hw_break_module_exit(void)
+{
+	unregister_kernel_hw_breakpoint(&pid_max_hbkpt);
+	printk(KERN_INFO "HW Breakpoint for pid_max write uninstalled\n");
+}
+
+module_init(hw_break_module_init);
+module_exit(hw_break_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("K.Prasad");
+MODULE_DESCRIPTION("pid_max breakpoint");


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
       [not found] <20090305043440.189041194@linux.vnet.ibm.com>
                   ` (10 preceding siblings ...)
  2009-03-05  4:43 ` prasad
@ 2009-03-05  4:43 ` prasad
  2009-03-05  6:37   ` Frederic Weisbecker
  2009-03-05 14:54   ` Steven Rostedt
  11 siblings, 2 replies; 71+ messages in thread
From: prasad @ 2009-03-05  4:43 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: ftrace_hbkpt_12 --]
[-- Type: text/plain, Size: 12966 bytes --]

This patch adds an ftrace plugin to detect and profile memory access over
kernel variables. It uses HW Breakpoint interfaces to 'watch' memory
addresses.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
---
 kernel/trace/Kconfig      |    6 
 kernel/trace/Makefile     |    1 
 kernel/trace/trace.h      |   15 +
 kernel/trace/trace_ksym.c |  399 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 421 insertions(+)

Index: linux-2.6-tip/kernel/trace/Kconfig
===================================================================
--- linux-2.6-tip.orig/kernel/trace/Kconfig
+++ linux-2.6-tip/kernel/trace/Kconfig
@@ -249,6 +249,12 @@ config POWER_TRACER
 	  power management decisions, specifically the C-state and P-state
 	  behavior.
 
+config KSYM_TRACER
+	bool "Trace read and write access on kernel memory locations"
+	select TRACING
+	help
+	  This tracer helps find read and write operations on any given kernel
+	  symbol i.e. /proc/kallsyms.
 
 config STACK_TRACER
 	bool "Trace max stack"
Index: linux-2.6-tip/kernel/trace/Makefile
===================================================================
--- linux-2.6-tip.orig/kernel/trace/Makefile
+++ linux-2.6-tip/kernel/trace/Makefile
@@ -41,5 +41,6 @@ obj-$(CONFIG_WORKQUEUE_TRACER) += trace_
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACER) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 
 libftrace-y := ftrace.o
Index: linux-2.6-tip/kernel/trace/trace.h
===================================================================
--- linux-2.6-tip.orig/kernel/trace/trace.h
+++ linux-2.6-tip/kernel/trace/trace.h
@@ -12,6 +12,8 @@
 #include <trace/kmemtrace.h>
 #include <trace/power.h>
 
+#include <asm/hw_breakpoint.h>
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -34,6 +36,7 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
+	TRACE_KSYM,
 
 	__TRACE_LAST_TYPE,
 };
@@ -191,6 +194,17 @@ struct kmemtrace_free_entry {
 	const void *ptr;
 };
 
+struct trace_ksym {
+	struct trace_entry	ent;
+	struct hw_breakpoint	*ksym_hbkpt;
+	unsigned long		ksym_addr;
+	unsigned long		ip;
+	pid_t			pid;
+	struct hlist_node	ksym_hlist;
+	char			ksym_name[KSYM_NAME_LEN];
+	char			p_name[TASK_COMM_LEN];
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -302,6 +316,7 @@ extern void __ftrace_bad_type(void);
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
 			  TRACE_KMEM_FREE);	\
+		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM); \
 		__ftrace_bad_type();					\
 	} while (0)
 
Index: linux-2.6-tip/kernel/trace/trace_ksym.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/kernel/trace/trace_ksym.c
@@ -0,0 +1,399 @@
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+/* For now, let us restrict the no. of symbols traced simultaneously to number
+ * of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HB_NUM
+
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
+
+#define KSYM_DEBUG 1
+
+static struct trace_array *ksym_trace_array;
+
+DEFINE_MUTEX(ksym_tracer_mutex);
+
+static unsigned int ksym_filter_entry_count;
+static unsigned int ksym_tracing_enabled;
+
+static HLIST_HEAD(ksym_filter_head);
+
+/* HW Breakpoint related callback functions */
+void ksym_hbkpt_installed(struct hw_breakpoint *temp, struct pt_regs
+								*temp_regs)
+{
+}
+
+void ksym_hbkpt_uninstalled(struct hw_breakpoint *temp, struct
+							pt_regs * temp_regs)
+{
+}
+
+void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
+{
+	struct ring_buffer_event *event;
+	struct trace_array *tr;
+	struct trace_ksym *entry;
+	int pc;
+
+	if (!ksym_tracing_enabled)
+		return;
+
+	tr = ksym_trace_array;
+	pc = preempt_count();
+
+	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
+							sizeof(*entry), 0, pc);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_SYMBOL_LEN);
+	entry->ksym_hbkpt = hbkpt;
+	entry->ip = instruction_pointer(regs);
+	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
+
+	entry->pid = current->pid;
+	trace_buffer_unlock_commit(tr, event, 0, pc);
+}
+
+/* Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Break points (Not available yet)
+ *
+ */
+static int ksym_trace_get_access_type(char *access_str)
+{
+	int pos, access = 0;
+
+	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
+		switch (access_str[pos]) {
+		case 'r':
+			access += (pos == 0) ? 4 : -1;
+			break;
+		case 'w':
+			access += (pos == 1) ? 2 : -1;
+			break;
+		case '-':
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	switch (access) {
+	case 6:
+		access = HW_BREAKPOINT_RW;
+		break;
+	case 2:
+		access = HW_BREAKPOINT_WRITE;
+		break;
+	case 0:
+		access = 0;
+	}
+
+	return access;
+}
+
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
+ *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
+ *    <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ */
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+							unsigned long *addr)
+{
+	char *delimiter = ":";
+	int ret;
+
+	ret = -EINVAL;
+	*ksymname = strsep(&input_string, delimiter);
+	*addr = kallsyms_lookup_name(*ksymname);
+
+	/* Check for malformed request: (2), (1) and (5) */
+	if ((!input_string) ||
+		(strlen(input_string) != KSYM_TRACER_OP_LEN + 1) ||
+		(*addr == 0))
+		goto return_code;
+
+	ret = ksym_trace_get_access_type(input_string);
+
+return_code:
+	return ret;
+}
+
+static int process_new_ksym_entry(struct trace_ksym *entry, char *ksymname,
+			     int op, unsigned long addr)
+{
+	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
+			" new requests for tracing can be accepted now.\n",
+			KSYM_TRACER_MAX);
+		return -ENOSPC;
+	}
+
+	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
+	if (!entry->ksym_hbkpt)
+		return -ENOMEM;
+
+	entry->ksym_hbkpt->info.name = ksymname;
+	entry->ksym_hbkpt->info.type = op;
+	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
+	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
+	entry->ksym_hbkpt->priority = HW_BREAKPOINT_PRIO_NORMAL;
+
+	entry->ksym_hbkpt->installed = (void *)ksym_hbkpt_installed;
+	entry->ksym_hbkpt->uninstalled = (void *)ksym_hbkpt_uninstalled;
+	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
+
+	if ((register_kernel_hw_breakpoint(entry->ksym_hbkpt)) < 0) {
+		printk(KERN_INFO "ksym_tracer request failed. Try again"
+					" later!!\n");
+		kfree(entry);
+		return -EAGAIN;
+	}
+	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
+	printk(KERN_INFO "ksym_tracer changes are now effective\n");
+
+	ksym_filter_entry_count++;
+
+	return 0;
+}
+
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
+	ssize_t ret, cnt = 0;
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
+				entry->ksym_hbkpt->info.name);
+		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"-w-\n");
+		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
+			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+								"rw-\n");
+	}
+	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
+	mutex_unlock(&ksym_tracer_mutex);
+
+	return ret;
+}
+
+static ssize_t ksym_trace_filter_write(struct file *file,
+					const char __user *buffer,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char *input_string, *ksymname = NULL;
+	unsigned long ksym_addr = 0;
+	int ret, op, changed = 0;
+
+	input_string = kzalloc(count, GFP_KERNEL);
+	if (!input_string)
+		return -ENOMEM;
+
+	/* Ignore echo "" > ksym_trace_filter */
+	if (count == 0)
+		return 0;
+
+	if (copy_from_user(input_string, buffer, count))
+		return -EFAULT;
+
+	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+
+	if (ret < 0)
+		goto err_ret;
+	mutex_lock(&ksym_tracer_mutex);
+
+	ret = -EINVAL;
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		if (entry->ksym_addr == ksym_addr) {
+			/* Check for malformed request: (6) */
+			if (entry->ksym_hbkpt->info.type != op)
+				changed = 1;
+			else
+				goto err_ret;
+			break;
+		}
+	}
+	if (changed) {
+		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+		entry->ksym_hbkpt->info.type = op;
+		if (op > 0) {
+			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
+			if (ret > 0) {
+				ret = count;
+				goto unlock_ret_path;
+			}
+			if (ret == 0) {
+				ret = -ENOSPC;
+				unregister_kernel_hw_breakpoint(entry->\
+								ksym_hbkpt);
+			}
+		}
+		ksym_filter_entry_count--;
+		hlist_del(&(entry->ksym_hlist));
+		kfree(entry->ksym_hbkpt);
+		kfree(entry);
+		ret = count;
+		goto err_ret;
+	} else {
+		/* Check for malformed request: (4) */
+		if (op == 0)
+			goto err_ret;
+
+		ret = process_new_ksym_entry(entry, ksymname, op, ksym_addr);
+		if (ret)
+			goto err_ret;
+	}
+	ret = count;
+	goto unlock_ret_path;
+
+err_ret:
+	kfree(input_string);
+
+unlock_ret_path:
+	mutex_unlock(&ksym_tracer_mutex);
+	return ret;
+}
+
+static const struct file_operations ksym_tracing_fops = {
+	.open		= tracing_open_generic,
+	.read		= ksym_trace_filter_read,
+	.write		= ksym_trace_filter_write,
+};
+
+static int ksym_trace_init(struct trace_array *tr)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+	ksym_tracing_enabled = 1;
+	ksym_trace_array = tr;
+
+	return 0;
+}
+
+static void ksym_trace_reset(struct trace_array *tr)
+{
+	ksym_tracing_enabled = 0;
+}
+
+#ifdef CONFIG_FTRACE_SELFTEST
+int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+	/* TODO: Will be implemented later */
+	return 0;
+}
+#endif /* CONFIG_FTRACE_SELFTEST */
+
+static void ksym_trace_print_header(struct seq_file *m)
+{
+
+	seq_puts(m,
+		 "#       TASK-PID      CPU#      Symbol         Type    "
+		 "Function         \n");
+	seq_puts(m,
+		 "#          |           |          |              |         "
+		 "|            \n");
+}
+
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct trace_ksym *field;
+	char str[KSYM_SYMBOL_LEN];
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
+				field->pid, iter->cpu, field->ksym_name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	switch (field->ksym_hbkpt->info.type) {
+	case HW_BREAKPOINT_WRITE:
+		ret = trace_seq_printf(s, " W  ");
+		break;
+	case HW_BREAKPOINT_RW:
+		ret = trace_seq_printf(s, " RW ");
+		break;
+	default:
+		return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	sprint_symbol(str, field->ip);
+	ret = trace_seq_printf(s, "%-20s\n", str);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+struct tracer ksym_tracer __read_mostly =
+{
+	.name		= "ksym_tracer",
+	.init		= ksym_trace_init,
+	.reset		= ksym_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_ksym,
+#endif
+	.print_header   = ksym_trace_print_header,
+	.print_line	= ksym_trace_output
+};
+
+__init static int init_ksym_trace(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	ksym_filter_entry_count = 0;
+
+	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
+				    NULL, &ksym_tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ksym_trace_filter' file\n");
+
+	return register_tracer(&ksym_tracer);
+
+}
+device_initcall(init_ksym_trace);


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05  4:43 ` [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces prasad
@ 2009-03-05  6:37   ` Frederic Weisbecker
  2009-03-05  9:16     ` Ingo Molnar
                       ` (2 more replies)
  2009-03-05 14:54   ` Steven Rostedt
  1 sibling, 3 replies; 71+ messages in thread
From: Frederic Weisbecker @ 2009-03-05  6:37 UTC (permalink / raw)
  To: prasad
  Cc: mingo, Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath

On Thu, Mar 05, 2009 at 10:13:33AM +0530, prasad@linux.vnet.ibm.com wrote:
> This patch adds an ftrace plugin to detect and profile memory access over
> kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> addresses.
> 
> Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> ---


Hi,

Nice feature. And moreover the standardized hardware breakpoints could
be helpful for tracing.

Just some comments below.


>  kernel/trace/Kconfig      |    6 
>  kernel/trace/Makefile     |    1 
>  kernel/trace/trace.h      |   15 +
>  kernel/trace/trace_ksym.c |  399 ++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 421 insertions(+)
> 
> Index: linux-2.6-tip/kernel/trace/Kconfig
> ===================================================================
> --- linux-2.6-tip.orig/kernel/trace/Kconfig
> +++ linux-2.6-tip/kernel/trace/Kconfig
> @@ -249,6 +249,12 @@ config POWER_TRACER
>  	  power management decisions, specifically the C-state and P-state
>  	  behavior.
>  
> +config KSYM_TRACER
> +	bool "Trace read and write access on kernel memory locations"
> +	select TRACING
> +	help
> +	  This tracer helps find read and write operations on any given kernel
> +	  symbol i.e. /proc/kallsyms.
>
>  config STACK_TRACER
>  	bool "Trace max stack"
> Index: linux-2.6-tip/kernel/trace/Makefile
> ===================================================================
> --- linux-2.6-tip.orig/kernel/trace/Makefile
> +++ linux-2.6-tip/kernel/trace/Makefile
> @@ -41,5 +41,6 @@ obj-$(CONFIG_WORKQUEUE_TRACER) += trace_
>  obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
>  obj-$(CONFIG_EVENT_TRACER) += trace_events.o
>  obj-$(CONFIG_EVENT_TRACER) += events.o
> +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
>  
>  libftrace-y := ftrace.o
> Index: linux-2.6-tip/kernel/trace/trace.h
> ===================================================================
> --- linux-2.6-tip.orig/kernel/trace/trace.h
> +++ linux-2.6-tip/kernel/trace/trace.h
> @@ -12,6 +12,8 @@
>  #include <trace/kmemtrace.h>
>  #include <trace/power.h>
>  
> +#include <asm/hw_breakpoint.h>
> +
>  enum trace_type {
>  	__TRACE_FIRST_TYPE = 0,
>  
> @@ -34,6 +36,7 @@ enum trace_type {
>  	TRACE_KMEM_FREE,
>  	TRACE_POWER,
>  	TRACE_BLK,
> +	TRACE_KSYM,
>  
>  	__TRACE_LAST_TYPE,
>  };
> @@ -191,6 +194,17 @@ struct kmemtrace_free_entry {
>  	const void *ptr;
>  };
>  
> +struct trace_ksym {
> +	struct trace_entry	ent;
> +	struct hw_breakpoint	*ksym_hbkpt;
> +	unsigned long		ksym_addr;
> +	unsigned long		ip;
> +	pid_t			pid;


Just a doubt here.
The current pid is automatically recorded by trace_buffer_lock_reserve()
(or unlock_commit, I don't remember), so if this pid is the current one, you
don't need to reserve room for it: the current pid is already in struct
trace_entry.


> +	struct hlist_node	ksym_hlist;
> +	char			ksym_name[KSYM_NAME_LEN];
> +	char			p_name[TASK_COMM_LEN];
> +};
> +
>  /*
>   * trace_flag_type is an enumeration that holds different
>   * states when a trace occurs. These are:
> @@ -302,6 +316,7 @@ extern void __ftrace_bad_type(void);
>  			  TRACE_KMEM_ALLOC);	\
>  		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
>  			  TRACE_KMEM_FREE);	\
> +		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM); \
>  		__ftrace_bad_type();					\
>  	} while (0)
>  
> Index: linux-2.6-tip/kernel/trace/trace_ksym.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6-tip/kernel/trace/trace_ksym.c
> @@ -0,0 +1,399 @@
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <linux/debugfs.h>
> +#include <linux/ftrace.h>
> +#include <linux/kallsyms.h>
> +#include <linux/uaccess.h>
> +
> +#include "trace.h"
> +#include "trace_output.h"
> +
> +/* For now, let us restrict the no. of symbols traced simultaneously to number
> + * of available hardware breakpoint registers.
> + */
> +#define KSYM_TRACER_MAX HB_NUM
> +
> +#define KSYM_TRACER_OP_LEN 3 /* rw- */
> +#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
> +
> +#define KSYM_DEBUG 1
> +
> +static struct trace_array *ksym_trace_array;
> +
> +DEFINE_MUTEX(ksym_tracer_mutex);
> +
> +static unsigned int ksym_filter_entry_count;
> +static unsigned int ksym_tracing_enabled;
> +
> +static HLIST_HEAD(ksym_filter_head);
> +
> +/* HW Breakpoint related callback functions */
> +void ksym_hbkpt_installed(struct hw_breakpoint *temp, struct pt_regs
> +								*temp_regs)
> +{
> +}
> +
> +void ksym_hbkpt_uninstalled(struct hw_breakpoint *temp, struct
> +							pt_regs * temp_regs)
> +{
> +}
> +
> +void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
> +{
> +	struct ring_buffer_event *event;
> +	struct trace_array *tr;
> +	struct trace_ksym *entry;
> +	int pc;
> +
> +	if (!ksym_tracing_enabled)
> +		return;
> +
> +	tr = ksym_trace_array;
> +	pc = preempt_count();
> +
> +	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
> +							sizeof(*entry), 0, pc);
> +	if (!event)
> +		return;
> +
> +	entry = ring_buffer_event_data(event);
> +	strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_SYMBOL_LEN);
> +	entry->ksym_hbkpt = hbkpt;
> +	entry->ip = instruction_pointer(regs);
> +	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
> +
> +	entry->pid = current->pid;


Ah, so yes you don't need this field.



> +	trace_buffer_unlock_commit(tr, event, 0, pc);
> +}
> +
> +/* Valid access types are represented as
> + *
> + * rw- : Set Read/Write Access Breakpoint
> + * -w- : Set Write Access Breakpoint
> + * --- : Clear Breakpoints
> + * --x : Set Execution Break points (Not available yet)
> + *
> + */
> +static int ksym_trace_get_access_type(char *access_str)
> +{
> +	int pos, access = 0;
> +
> +	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
> +		switch (access_str[pos]) {
> +		case 'r':
> +			access += (pos == 0) ? 4 : -1;
> +			break;
> +		case 'w':
> +			access += (pos == 1) ? 2 : -1;
> +			break;
> +		case '-':
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	}
> +
> +	switch (access) {
> +	case 6:
> +		access = HW_BREAKPOINT_RW;
> +		break;
> +	case 2:
> +		access = HW_BREAKPOINT_WRITE;
> +		break;
> +	case 0:
> +		access = 0;
> +	}
> +
> +	return access;
> +}
> +
> +/*
> + * There can be several possible malformed requests and we attempt to capture
> + * all of them. We enumerate some of the rules
> + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
> + *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
> + *    <module>:<ksym_name>:<op>.
> + * 2. No delimiter symbol ':' in the input string
> + * 3. Spurious operator symbols or symbols not in their respective positions
> + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
> + * 5. Kernel symbol not a part of /proc/kallsyms
> + * 6. Duplicate requests
> + */
> +static int parse_ksym_trace_str(char *input_string, char **ksymname,
> +							unsigned long *addr)
> +{
> +	char *delimiter = ":";
> +	int ret;
> +
> +	ret = -EINVAL;
> +	*ksymname = strsep(&input_string, delimiter);
> +	*addr = kallsyms_lookup_name(*ksymname);
> +
> +	/* Check for malformed request: (2), (1) and (5) */
> +	if ((!input_string) ||
> +		(strlen(input_string) != KSYM_TRACER_OP_LEN + 1) ||
> +		(*addr == 0))
> +		goto return_code;
> +
> +	ret = ksym_trace_get_access_type(input_string);
> +
> +return_code:
> +	return ret;
> +}
> +
> +static int process_new_ksym_entry(struct trace_ksym *entry, char *ksymname,
> +			     int op, unsigned long addr)
> +{
> +	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
> +		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
> +			" new requests for tracing can be accepted now.\n",
> +			KSYM_TRACER_MAX);
> +		return -ENOSPC;
> +	}
> +
> +	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);


I'm not sure I understand, you passed an allocated entry to that function, no?
If you are using entry as a local variable, it doesn't make sense to pass it
as a parameter.


> +	if (!entry)
> +		return -ENOMEM;
>
> +	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
> +	if (!entry->ksym_hbkpt)
> +		return -ENOMEM;


Ouch, what happens here to the memory pointed by entry?


> +
> +	entry->ksym_hbkpt->info.name = ksymname;
> +	entry->ksym_hbkpt->info.type = op;
> +	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
> +	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
> +	entry->ksym_hbkpt->priority = HW_BREAKPOINT_PRIO_NORMAL;
> +
> +	entry->ksym_hbkpt->installed = (void *)ksym_hbkpt_installed;
> +	entry->ksym_hbkpt->uninstalled = (void *)ksym_hbkpt_uninstalled;
> +	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
> +
> +	if ((register_kernel_hw_breakpoint(entry->ksym_hbkpt)) < 0) {
> +		printk(KERN_INFO "ksym_tracer request failed. Try again"
> +					" later!!\n");
> +		kfree(entry);
> +		return -EAGAIN;


You forgot to free entry->ksym_hbkpt


> +	}
> +	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
> +	printk(KERN_INFO "ksym_tracer changes are now effective\n");
> +
> +	ksym_filter_entry_count++;
> +
> +	return 0;
> +}
> +
> +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
> +	ssize_t ret, cnt = 0;
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
> +				entry->ksym_hbkpt->info.name);
> +		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"-w-\n");
> +		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"rw-\n");
> +	}
> +	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
> +	mutex_unlock(&ksym_tracer_mutex);
> +
> +	return ret;
> +}
> +
> +static ssize_t ksym_trace_filter_write(struct file *file,
> +					const char __user *buffer,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char *input_string, *ksymname = NULL;
> +	unsigned long ksym_addr = 0;
> +	int ret, op, changed = 0;
> +
> +	input_string = kzalloc(count, GFP_KERNEL);
> +	if (!input_string)
> +		return -ENOMEM;
> +
> +	/* Ignore echo "" > ksym_trace_filter */
> +	if (count == 0)
> +		return 0;


You forgot to free input_string in !count case.


> +
> +	if (copy_from_user(input_string, buffer, count))
> +		return -EFAULT;


Ditto.

> +	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
> +
> +	if (ret < 0)
> +		goto err_ret;


Ah, here you didn't forget.


> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	ret = -EINVAL;
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		if (entry->ksym_addr == ksym_addr) {
> +			/* Check for malformed request: (6) */
> +			if (entry->ksym_hbkpt->info.type != op)
> +				changed = 1;
> +			else
> +				goto err_ret;
> +			break;
> +		}
> +	}
> +	if (changed) {
> +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +		entry->ksym_hbkpt->info.type = op;
> +		if (op > 0) {
> +			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +			if (ret > 0) {
> +				ret = count;
> +				goto unlock_ret_path;
> +			}
> +			if (ret == 0) {
> +				ret = -ENOSPC;
> +				unregister_kernel_hw_breakpoint(entry->\
> +								ksym_hbkpt);
> +			}
> +		}
> +		ksym_filter_entry_count--;
> +		hlist_del(&(entry->ksym_hlist));
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +		ret = count;
> +		goto err_ret;
> +	} else {
> +		/* Check for malformed request: (4) */
> +		if (op == 0)
> +			goto err_ret;
> +
> +		ret = process_new_ksym_entry(entry, ksymname, op, ksym_addr);


You are passing an allocated entry as a parameter, but later, in
process_new_ksym_entry(), you allocate new space for entry.
I'm confused.


> +		if (ret)
> +			goto err_ret;
> +	}
> +	ret = count;
> +	goto unlock_ret_path;
> +
> +err_ret:
> +	kfree(input_string);
> +
> +unlock_ret_path:
> +	mutex_unlock(&ksym_tracer_mutex);
> +	return ret;
> +}
> +
> +static const struct file_operations ksym_tracing_fops = {
> +	.open		= tracing_open_generic,
> +	.read		= ksym_trace_filter_read,
> +	.write		= ksym_trace_filter_write,
> +};
> +
> +static int ksym_trace_init(struct trace_array *tr)
> +{
> +	int cpu;
> +
> +	for_each_online_cpu(cpu)
> +		tracing_reset(tr, cpu);
> +	ksym_tracing_enabled = 1;
> +	ksym_trace_array = tr;
> +
> +	return 0;
> +}
> +
> +static void ksym_trace_reset(struct trace_array *tr)
> +{
> +	ksym_tracing_enabled = 0;
> +}
> +
> +#ifdef CONFIG_FTRACE_SELFTEST
> +int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
> +{
> +	/* TODO: Will be implemented later */
> +	return 0;
> +}
> +#endif /* CONFIG_FTRACE_SELFTEST */
> +
> +static void ksym_trace_print_header(struct seq_file *m)
> +{
> +
> +	seq_puts(m,
> +		 "#       TASK-PID      CPU#      Symbol         Type    "
> +		 "Function         \n");
> +	seq_puts(m,
> +		 "#          |           |          |              |         "
> +		 "|            \n");
> +}
> +
> +static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
> +{
> +	struct trace_entry *entry = iter->ent;
> +	struct trace_seq *s = &iter->seq;
> +	struct trace_ksym *field;
> +	char str[KSYM_SYMBOL_LEN];
> +	int ret;
> +
> +	trace_assign_type(field, entry);
> +
> +	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
> +				field->pid, iter->cpu, field->ksym_name);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	switch (field->ksym_hbkpt->info.type) {
> +	case HW_BREAKPOINT_WRITE:
> +		ret = trace_seq_printf(s, " W  ");
> +		break;
> +	case HW_BREAKPOINT_RW:
> +		ret = trace_seq_printf(s, " RW ");
> +		break;
> +	default:
> +		return TRACE_TYPE_PARTIAL_LINE;
> +	}
> +
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	sprint_symbol(str, field->ip);
> +	ret = trace_seq_printf(s, "%-20s\n", str);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	return TRACE_TYPE_HANDLED;
> +}
> +
> +struct tracer ksym_tracer __read_mostly =
> +{
> +	.name		= "ksym_tracer",
> +	.init		= ksym_trace_init,
> +	.reset		= ksym_trace_reset,
> +#ifdef CONFIG_FTRACE_SELFTEST
> +	.selftest	= trace_selftest_startup_ksym,
> +#endif
> +	.print_header   = ksym_trace_print_header,
> +	.print_line	= ksym_trace_output
> +};
> +
> +__init static int init_ksym_trace(void)
> +{
> +	struct dentry *d_tracer;
> +	struct dentry *entry;
> +
> +	d_tracer = tracing_init_dentry();
> +	ksym_filter_entry_count = 0;
> +
> +	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
> +				    NULL, &ksym_tracing_fops);
> +	if (!entry)
> +		pr_warning("Could not create debugfs "
> +			   "'ksym_trace_filter' file\n");
> +
> +	return register_tracer(&ksym_tracer);
> +
> +}
> +device_initcall(init_ksym_trace);


Well, the rest looks good.

 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05  6:37   ` Frederic Weisbecker
@ 2009-03-05  9:16     ` Ingo Molnar
  2009-03-05 13:15       ` K.Prasad
  2009-03-05 11:33     ` K.Prasad
  2009-03-05 15:00     ` Steven Rostedt
  2 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-05  9:16 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath


* Frederic Weisbecker <fweisbec@gmail.com> wrote:

> On Thu, Mar 05, 2009 at 10:13:33AM +0530, prasad@linux.vnet.ibm.com wrote:
> > This patch adds an ftrace plugin to detect and profile memory access over
> > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > addresses.
> > 
> > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > ---
> 
> 
> Hi,
> 
> Nice feature. And moreover the standardized hardware 
> breakpoints could be helpful for tracing.

yeah. The feature is much more alive now.

> Just some comments below.

One other thing:

+#ifdef CONFIG_FTRACE_SELFTEST
+int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+       /* TODO: Will be implemented later */
+       return 0;
+}
+#endif /* CONFIG_FTRACE_SELFTEST */

This needs to be implemented before i can pick the code up into 
tip:tracing, as otherwise we will not notice it fast enough if 
some of this stuff breaks.

Basically the ftrace plugin will be the main usage vector of 
this facility, so the self-test is a must-have.

Looks very nice otherwise.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05  6:37   ` Frederic Weisbecker
  2009-03-05  9:16     ` Ingo Molnar
@ 2009-03-05 11:33     ` K.Prasad
  2009-03-05 12:19       ` K.Prasad
  2009-03-05 12:28       ` Frederic Weisbecker
  2009-03-05 15:00     ` Steven Rostedt
  2 siblings, 2 replies; 71+ messages in thread
From: K.Prasad @ 2009-03-05 11:33 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: mingo, Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath

On Thu, Mar 05, 2009 at 07:37:04AM +0100, Frederic Weisbecker wrote:
> On Thu, Mar 05, 2009 at 10:13:33AM +0530, prasad@linux.vnet.ibm.com wrote:
> > This patch adds an ftrace plugin to detect and profile memory access over
> > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > addresses.
> > 
> > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > ---
> 
> 
> Hi,
> 
> Nice feature. And moreover the standardized hardware breakpoints could
> be helpful for tracing.
> 
> Just some comments below.
> 
>

Hi,
  Thanks for reviewing the code and pointing out the potential memory
leaks. The next iteration of this code should contain fixes for them.
I've explained the usage of 'entry' field inline.
 
> > +struct trace_ksym {
> > +	struct trace_entry	ent;
> > +	struct hw_breakpoint	*ksym_hbkpt;
> > +	unsigned long		ksym_addr;
> > +	unsigned long		ip;
> > +	pid_t			pid;
> 
> 
> Just a doubt here.
> The current pid is automatically recorded on trace_buffer_lock_reserve()
> (or unlock_commit, don't remember), so if this pid is the current one, you
> don't need to reserve a room for it, current pid is on struct trace_entry.
>

It's a carry-over from an old version of the code which used the old
ring-buffer APIs like ring_buffer_lock_reserve(). I will now use the
"pid" field in "struct trace_entry".
 
> > +static int process_new_ksym_entry(struct trace_ksym *entry, char *ksymname,
> > +			     int op, unsigned long addr)
> > +{
> > +	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
> > +		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
> > +			" new requests for tracing can be accepted now.\n",
> > +			KSYM_TRACER_MAX);
> > +		return -ENOSPC;
> > +	}
> > +
> > +	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
> 
> 
> I'm not sure I understand, you passed an allocated entry to that function, no?
> If your are using entry as a local variable, it doesn't make sense to pass it
> as a parameter.
> 
> 
> > +	if (!entry)
> > +		return -ENOMEM;
> >
> > +	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
> > +	if (!entry->ksym_hbkpt)
> > +		return -ENOMEM;
> 
> 
> Ouch, what happens here to the memory pointed by entry?
> 
>

A potential leak....will fix this and the others you've pointed below.
 
> > +
> > +	entry->ksym_hbkpt->info.name = ksymname;
> > +	entry->ksym_hbkpt->info.type = op;
> > +	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
> > +	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
> > +	entry->ksym_hbkpt->priority = HW_BREAKPOINT_PRIO_NORMAL;
> > +
> > +	entry->ksym_hbkpt->installed = (void *)ksym_hbkpt_installed;
> > +	entry->ksym_hbkpt->uninstalled = (void *)ksym_hbkpt_uninstalled;
> > +	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
> > +
> > +	if ((register_kernel_hw_breakpoint(entry->ksym_hbkpt)) < 0) {
> > +		printk(KERN_INFO "ksym_tracer request failed. Try again"
> > +					" later!!\n");
> > +		kfree(entry);
> > +		return -EAGAIN;
> 
> 
> You forgot to free entry->ksym_hbkpt
> 
> 
> > +	}
> > +	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
> > +	printk(KERN_INFO "ksym_tracer changes are now effective\n");
> > +
> > +	ksym_filter_entry_count++;
> > +
> > +	return 0;
> > +}
> > +
> > +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
> > +						size_t count, loff_t *ppos)
> > +{
> > +	struct trace_ksym *entry;
> > +	struct hlist_node *node;
> > +	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
> > +	ssize_t ret, cnt = 0;
> > +
> > +	mutex_lock(&ksym_tracer_mutex);
> > +
> > +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> > +		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
> > +				entry->ksym_hbkpt->info.name);
> > +		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
> > +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> > +								"-w-\n");
> > +		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
> > +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> > +								"rw-\n");
> > +	}
> > +	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
> > +	mutex_unlock(&ksym_tracer_mutex);
> > +
> > +	return ret;
> > +}
> > +
> > +static ssize_t ksym_trace_filter_write(struct file *file,
> > +					const char __user *buffer,
> > +						size_t count, loff_t *ppos)
> > +{
> > +	struct trace_ksym *entry;
> > +	struct hlist_node *node;
> > +	char *input_string, *ksymname = NULL;
> > +	unsigned long ksym_addr = 0;
> > +	int ret, op, changed = 0;
> > +
> > +	input_string = kzalloc(count, GFP_KERNEL);
> > +	if (!input_string)
> > +		return -ENOMEM;
> > +
> > +	/* Ignore echo "" > ksym_trace_filter */
> > +	if (count == 0)
> > +		return 0;
> 
> 
> You forgot to free input_string in !count case.
> 
> 
> > +
> > +	if (copy_from_user(input_string, buffer, count))
> > +		return -EFAULT;
> 
> 
> Ditto.
> 
> > +	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
> > +
> > +	if (ret < 0)
> > +		goto err_ret;
> 
> 
> Ah, here you didn't forget.
> 
> 
> > +	mutex_lock(&ksym_tracer_mutex);
> > +
> > +	ret = -EINVAL;
> > +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> > +		if (entry->ksym_addr == ksym_addr) {
> > +			/* Check for malformed request: (6) */
> > +			if (entry->ksym_hbkpt->info.type != op)
> > +				changed = 1;
> > +			else
> > +				goto err_ret;
> > +			break;
> > +		}
> > +	}
> > +	if (changed) {
> > +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> > +		entry->ksym_hbkpt->info.type = op;
> > +		if (op > 0) {
> > +			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> > +			if (ret > 0) {
> > +				ret = count;
> > +				goto unlock_ret_path;
> > +			}
> > +			if (ret == 0) {
> > +				ret = -ENOSPC;
> > +				unregister_kernel_hw_breakpoint(entry->\
> > +								ksym_hbkpt);
> > +			}
> > +		}
> > +		ksym_filter_entry_count--;
> > +		hlist_del(&(entry->ksym_hlist));
> > +		kfree(entry->ksym_hbkpt);
> > +		kfree(entry);
> > +		ret = count;
> > +		goto err_ret;
> > +	} else {
> > +		/* Check for malformed request: (4) */
> > +		if (op == 0)
> > +			goto err_ret;
> > +
> > +		ret = process_new_ksym_entry(entry, ksymname, op, ksym_addr);
> 
> 
> You are passing an allocated entry as a parameter, but later on process_new_ksym_entry()
> you allocate a new space for entry.
> I'm confused.
> 
>

When changed = 1, entry points to the existing instance of 'struct
trace_ksym' and will be used for changing the type of breakpoint. If the
input is a new request to ksym_trace_filter file process_new_ksym_entry()
takes a pointer to 'struct trace_ksym' i.e entry for
allocation/initialisation rather than use it as a parameter in the true
sense.

This is similar to the usage of parameters 'ksymname and addr' in
parse_ksym_trace_str() where they are used to return multiple values.

I hope you find the usage acceptable.
 
> > +
> > +__init static int init_ksym_trace(void)
> > +{
> > +	struct dentry *d_tracer;
> > +	struct dentry *entry;
> > +
> > +	d_tracer = tracing_init_dentry();
> > +	ksym_filter_entry_count = 0;
> > +
> > +	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
> > +				    NULL, &ksym_tracing_fops);
> > +	if (!entry)
> > +		pr_warning("Could not create debugfs "
> > +			   "'ksym_trace_filter' file\n");
> > +
> > +	return register_tracer(&ksym_tracer);
> > +
> > +}
> > +device_initcall(init_ksym_trace);
> 
> 
> Well, the rest looks good.
> 
>

Thanks again for your comments.

-- K.Prasad 

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05 11:33     ` K.Prasad
@ 2009-03-05 12:19       ` K.Prasad
  2009-03-05 12:30         ` Frederic Weisbecker
  2009-03-05 12:28       ` Frederic Weisbecker
  1 sibling, 1 reply; 71+ messages in thread
From: K.Prasad @ 2009-03-05 12:19 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: mingo, Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath

On Thu, Mar 05, 2009 at 05:03:59PM +0530, K.Prasad wrote:
> On Thu, Mar 05, 2009 at 07:37:04AM +0100, Frederic Weisbecker wrote:
> > On Thu, Mar 05, 2009 at 10:13:33AM +0530, prasad@linux.vnet.ibm.com wrote:
> > > This patch adds an ftrace plugin to detect and profile memory access over
> > > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > > addresses.
> > > 
> > > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > > ---
> > 
> > 
> > > +
> > > +		ret = process_new_ksym_entry(entry, ksymname, op, ksym_addr);
> > 
> > 
> > You are passing an allocated entry as a parameter, but later on process_new_ksym_entry()
> > you allocate a new space for entry.
> > I'm confused.
> > 
> >
> 
> When changed = 1, entry points to the existing instance of 'struct
> trace_ksym' and will be used for changing the type of breakpoint. If the
> input is a new request to ksym_trace_filter file process_new_ksym_entry()
> takes a pointer to 'struct trace_ksym' i.e entry for
> allocation/initialisation rather than use it as a parameter in the true
> sense.
> 
> This is similar to the usage of parameters 'ksymname and addr' in
> parse_ksym_trace_str() where they are used to return multiple values.
> 
> I hope you find the usage acceptable.
>

aah....but entry isn't used anywhere in
ksym_trace_filter_write() after process_new_ksym_entry(). I was trying
to explain why I used entry as a parameter to let
process_new_ksym_entry() return multiple values, but it isn't used after
that. I will remove it, and thanks for pointing it out.

-- K.Prasad


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05 11:33     ` K.Prasad
  2009-03-05 12:19       ` K.Prasad
@ 2009-03-05 12:28       ` Frederic Weisbecker
  1 sibling, 0 replies; 71+ messages in thread
From: Frederic Weisbecker @ 2009-03-05 12:28 UTC (permalink / raw)
  To: K.Prasad
  Cc: mingo, Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath

On Thu, Mar 05, 2009 at 05:03:59PM +0530, K.Prasad wrote:
> On Thu, Mar 05, 2009 at 07:37:04AM +0100, Frederic Weisbecker wrote:
> > On Thu, Mar 05, 2009 at 10:13:33AM +0530, prasad@linux.vnet.ibm.com wrote:
> > > This patch adds an ftrace plugin to detect and profile memory access over
> > > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > > addresses.
> > > 
> > > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > > ---
> > 
> > 
> > Hi,
> > 
> > Nice feature. And moreover the standardized hardware breakpoints could
> > be helpful for tracing.
> > 
> > Just some comments below.
> > 
> >
> 
> Hi,
>   Thanks for reviewing the code and pointing out the potential memory
> leaks. The next iteration of this code should contain fixes for them.
> I've explained the usage of 'entry' field inline.
>  
> > > +struct trace_ksym {
> > > +	struct trace_entry	ent;
> > > +	struct hw_breakpoint	*ksym_hbkpt;
> > > +	unsigned long		ksym_addr;
> > > +	unsigned long		ip;
> > > +	pid_t			pid;
> > 
> > 
> > Just a doubt here.
> > The current pid is automatically recorded on trace_buffer_lock_reserve()
> > (or unlock_commit, don't remember), so if this pid is the current one, you
> > don't need to reserve a room for it, current pid is on struct trace_entry.
> >
> 
> It's a carriage from an old version of the code which used the old
> ring-buffer APIs like ring_buffer_lock_reserve(). I will now use the
> "pid" field in "struct trace_entry".
>  
> > > +static int process_new_ksym_entry(struct trace_ksym *entry, char *ksymname,
> > > +			     int op, unsigned long addr)
> > > +{
> > > +	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
> > > +		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
> > > +			" new requests for tracing can be accepted now.\n",
> > > +			KSYM_TRACER_MAX);
> > > +		return -ENOSPC;
> > > +	}
> > > +
> > > +	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
> > 
> > 
> > I'm not sure I understand, you passed an allocated entry to that function, no?
> > If your are using entry as a local variable, it doesn't make sense to pass it
> > as a parameter.
> > 
> > 
> > > +	if (!entry)
> > > +		return -ENOMEM;
> > >
> > > +	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
> > > +	if (!entry->ksym_hbkpt)
> > > +		return -ENOMEM;
> > 
> > 
> > Ouch, what happens here to the memory pointed by entry?
> > 
> >
> 
> A potential leak....will fix this and the others you've pointed below.
>  
> > > +
> > > +	entry->ksym_hbkpt->info.name = ksymname;
> > > +	entry->ksym_hbkpt->info.type = op;
> > > +	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
> > > +	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
> > > +	entry->ksym_hbkpt->priority = HW_BREAKPOINT_PRIO_NORMAL;
> > > +
> > > +	entry->ksym_hbkpt->installed = (void *)ksym_hbkpt_installed;
> > > +	entry->ksym_hbkpt->uninstalled = (void *)ksym_hbkpt_uninstalled;
> > > +	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
> > > +
> > > +	if ((register_kernel_hw_breakpoint(entry->ksym_hbkpt)) < 0) {
> > > +		printk(KERN_INFO "ksym_tracer request failed. Try again"
> > > +					" later!!\n");
> > > +		kfree(entry);
> > > +		return -EAGAIN;
> > 
> > 
> > You forgot to free entry->ksym_hbkpt
> > 
> > 
> > > +	}
> > > +	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
> > > +	printk(KERN_INFO "ksym_tracer changes are now effective\n");
> > > +
> > > +	ksym_filter_entry_count++;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
> > > +						size_t count, loff_t *ppos)
> > > +{
> > > +	struct trace_ksym *entry;
> > > +	struct hlist_node *node;
> > > +	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
> > > +	ssize_t ret, cnt = 0;
> > > +
> > > +	mutex_lock(&ksym_tracer_mutex);
> > > +
> > > +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> > > +		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
> > > +				entry->ksym_hbkpt->info.name);
> > > +		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
> > > +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> > > +								"-w-\n");
> > > +		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
> > > +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> > > +								"rw-\n");
> > > +	}
> > > +	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
> > > +	mutex_unlock(&ksym_tracer_mutex);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static ssize_t ksym_trace_filter_write(struct file *file,
> > > +					const char __user *buffer,
> > > +						size_t count, loff_t *ppos)
> > > +{
> > > +	struct trace_ksym *entry;
> > > +	struct hlist_node *node;
> > > +	char *input_string, *ksymname = NULL;
> > > +	unsigned long ksym_addr = 0;
> > > +	int ret, op, changed = 0;
> > > +
> > > +	input_string = kzalloc(count, GFP_KERNEL);
> > > +	if (!input_string)
> > > +		return -ENOMEM;
> > > +
> > > +	/* Ignore echo "" > ksym_trace_filter */
> > > +	if (count == 0)
> > > +		return 0;
> > 
> > 
> > You forgot to free input_string in !count case.
> > 
> > 
> > > +
> > > +	if (copy_from_user(input_string, buffer, count))
> > > +		return -EFAULT;
> > 
> > 
> > Ditto.
> > 
> > > +	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
> > > +
> > > +	if (ret < 0)
> > > +		goto err_ret;
> > 
> > 
> > Ah, here you didn't forget.
> > 
> > 
> > > +	mutex_lock(&ksym_tracer_mutex);
> > > +
> > > +	ret = -EINVAL;
> > > +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> > > +		if (entry->ksym_addr == ksym_addr) {
> > > +			/* Check for malformed request: (6) */
> > > +			if (entry->ksym_hbkpt->info.type != op)
> > > +				changed = 1;
> > > +			else
> > > +				goto err_ret;
> > > +			break;
> > > +		}
> > > +	}
> > > +	if (changed) {
> > > +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> > > +		entry->ksym_hbkpt->info.type = op;
> > > +		if (op > 0) {
> > > +			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> > > +			if (ret > 0) {
> > > +				ret = count;
> > > +				goto unlock_ret_path;
> > > +			}
> > > +			if (ret == 0) {
> > > +				ret = -ENOSPC;
> > > +				unregister_kernel_hw_breakpoint(entry->\
> > > +								ksym_hbkpt);
> > > +			}
> > > +		}
> > > +		ksym_filter_entry_count--;
> > > +		hlist_del(&(entry->ksym_hlist));
> > > +		kfree(entry->ksym_hbkpt);
> > > +		kfree(entry);
> > > +		ret = count;
> > > +		goto err_ret;
> > > +	} else {
> > > +		/* Check for malformed request: (4) */
> > > +		if (op == 0)
> > > +			goto err_ret;
> > > +
> > > +		ret = process_new_ksym_entry(entry, ksymname, op, ksym_addr);
> > 
> > 
> > You are passing an allocated entry as a parameter, but later on process_new_ksym_entry()
> > you allocate a new space for entry.
> > I'm confused.
> > 
> >
> 
> When changed = 1, entry points to the existing instance of 'struct
> trace_ksym' and will be used for changing the type of breakpoint. If the
> input is a new request to ksym_trace_filter file process_new_ksym_entry()
> takes a pointer to 'struct trace_ksym' i.e entry for
> allocation/initialisation rather than use it as a parameter in the true
> sense.
> 
> This is similar to the usage of parameters 'ksymname and addr' in
> parse_ksym_trace_str() where they are used to return multiple values.
> 
> I hope you find the usage acceptable.


Hmm. I understand the case of ksymname and addr in parse_ksym_trace_str()

But I don't understand the case here.
You pass the "entry" pointer to process_new_ksym_entry() but:

- this is only a pointer of type struct trace_ksym * and not
  struct trace_ksym **.
  Once inside process_new_ksym_entry() it is no longer the same
  variable that the caller passed: it is a local copy. You overwrite
  it with kzalloc(), but that change is not visible to the caller,
  which keeps the original address stored in its own pointer.

- you are not reusing it in the caller after it has called
  process_new_ksym_entry()

But you do use it in the callee, because you insert it into the list.
So the code is not wrong; it's just that such a purely internal
pointer is generally expected to be declared inside the function itself:

static int process_new_ksym_entry(char *ksymname,
			     int op, unsigned long addr)
{
	struct trace_ksym *entry

	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);

	...
}

Otherwise when such a parameter is passed, the code reader would expect that

1) this is a value that we will use inside this function (not the case, the value
   is immediately overridden).
2) this is a secondary return value (not the case, or we would need a pointer to
   a pointer).

Well, sorry perhaps I'm a bit annoying with that :-)
It's just for the code readability...I mean the code flow for the reader's eyes.
But the code action itself is not broken.


Thanks.
Frederic.

  
> > > +
> > > +__init static int init_ksym_trace(void)
> > > +{
> > > +	struct dentry *d_tracer;
> > > +	struct dentry *entry;
> > > +
> > > +	d_tracer = tracing_init_dentry();
> > > +	ksym_filter_entry_count = 0;
> > > +
> > > +	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
> > > +				    NULL, &ksym_tracing_fops);
> > > +	if (!entry)
> > > +		pr_warning("Could not create debugfs "
> > > +			   "'ksym_trace_filter' file\n");
> > > +
> > > +	return register_tracer(&ksym_tracer);
> > > +
> > > +}
> > > +device_initcall(init_ksym_trace);
> > 
> > 
> > Well, the rest looks good.
> > 
> >
> 
> Thanks again for your comments.
> 
> -- K.Prasad 


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05 12:19       ` K.Prasad
@ 2009-03-05 12:30         ` Frederic Weisbecker
  0 siblings, 0 replies; 71+ messages in thread
From: Frederic Weisbecker @ 2009-03-05 12:30 UTC (permalink / raw)
  To: K.Prasad
  Cc: mingo, Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath

On Thu, Mar 05, 2009 at 05:49:29PM +0530, K.Prasad wrote:
> On Thu, Mar 05, 2009 at 05:03:59PM +0530, K.Prasad wrote:
> > On Thu, Mar 05, 2009 at 07:37:04AM +0100, Frederic Weisbecker wrote:
> > > On Thu, Mar 05, 2009 at 10:13:33AM +0530, prasad@linux.vnet.ibm.com wrote:
> > > > This patch adds an ftrace plugin to detect and profile memory access over
> > > > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > > > addresses.
> > > > 
> > > > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > > > ---
> > > 
> > > 
> > > > +
> > > > +		ret = process_new_ksym_entry(entry, ksymname, op, ksym_addr);
> > > 
> > > 
> > > You are passing an allocated entry as a parameter, but later on process_new_ksym_entry()
> > > you allocate a new space for entry.
> > > I'm confused.
> > > 
> > >
> > 
> > When changed = 1, entry points to the existing instance of 'struct
> > trace_ksym' and will be used for changing the type of breakpoint. If the
> > input is a new request to ksym_trace_filter file process_new_ksym_entry()
> > takes a pointer to 'struct trace_ksym' i.e entry for
> > allocation/initialisation rather than use it as a parameter in the true
> > sense.
> > 
> > This is similar to the usage of parameters 'ksymname and addr' in
> > parse_ksym_trace_str() where they are used to return multiple values.
> > 
> > I hope you find the usage acceptable.
> >
> 
> aah....but entry isn't used anywhere anywhere in
> ksym_trace_filter_write() after process_new_ksym_entry(). I was trying
> to explain why I used entry as a parameter to let
> process_new_ksym_entry() return multiple values, but it isn't used after
> that. I will remove it, and thanks for pointing it.
> 
> -- K.Prasad
> 

Ah, I thought I misunderstood something :-)

Thanks.


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05  9:16     ` Ingo Molnar
@ 2009-03-05 13:15       ` K.Prasad
  2009-03-05 13:28         ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: K.Prasad @ 2009-03-05 13:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frederic Weisbecker, Andrew Morton, Linux Kernel Mailing List,
	Alan Stern, Roland McGrath

On Thu, Mar 05, 2009 at 10:16:11AM +0100, Ingo Molnar wrote:
> 
> * Frederic Weisbecker <fweisbec@gmail.com> wrote:
> 
> > On Thu, Mar 05, 2009 at 10:13:33AM +0530, prasad@linux.vnet.ibm.com wrote:
> > > This patch adds an ftrace plugin to detect and profile memory access over
> > > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > > addresses.
> > > 
> > > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > > ---
> > 
> > 
> > Hi,
> > 
> > Nice feature. And moreover the standardized hardware 
> > breakpoints could be helpful for tracing.
> 
> yeah. The feature is much more alive now.
> 
> > Just some comments below.
> 
> One other thing:
> 
> +#ifdef CONFIG_FTRACE_SELFTEST
> +int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
> +{
> +       /* TODO: Will be implemented later */
> +       return 0;
> +}
> +#endif /* CONFIG_FTRACE_SELFTEST */
> 
> This needs to be implemented before i can pick the code up into 
> tip:tracing, as otherwise we will not notice it fast enough if 
> some of this stuff breaks.
> 
> Basically the ftrace plugin will be the main usage vector of 
> this facility, so the self-test is a must-have.
> 
> Looks very nice otherwise.
> 
> 	Ingo

Thanks for the comments.

Test-cases for the hardware breakpoint interfaces can be the following:

- Basic sanity test to check if the API is intact
- Perform various types of memory accesses, like read, write (I/O and 
  others when implemented) on a dummy kernel variable and verify the 
  trigger of the exception handler.

While the above can be a part of trace_selftest_startup_ksym(),
rigorous testing would involve:

i) stressing the HW breakpoint infrastructure to confirm sane behaviour
when interoperated with other users of a)breakpoint register b)the
do_debug() exception. This will involve simultaneous use of kprobes,
hardware breakpoint interface and requests from user-space (say through
GDB).
ii) Verifying successful HB_NUM number of register_ requests.
iii) Verifying right priority resolution, and handling user-space
requests.

These, in my opinion, would better fit in a full-featured test-suite
such as LTP, as opposed to startup testing in ftrace.

I will implement trace_selftest_startup_ksym() to contain the first two
test-cases in the next iteration of this code.

Thanks,
K.Prasad


Thanks,
K.Prasad



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05 13:15       ` K.Prasad
@ 2009-03-05 13:28         ` Ingo Molnar
  0 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-05 13:28 UTC (permalink / raw)
  To: K.Prasad
  Cc: Frederic Weisbecker, Andrew Morton, Linux Kernel Mailing List,
	Alan Stern, Roland McGrath


* K.Prasad <prasad@linux.vnet.ibm.com> wrote:

> On Thu, Mar 05, 2009 at 10:16:11AM +0100, Ingo Molnar wrote:
> > 
> > * Frederic Weisbecker <fweisbec@gmail.com> wrote:
> > 
> > > On Thu, Mar 05, 2009 at 10:13:33AM +0530, prasad@linux.vnet.ibm.com wrote:
> > > > This patch adds an ftrace plugin to detect and profile memory access over
> > > > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > > > addresses.
> > > > 
> > > > Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com> 
> > > > ---
> > > 
> > > 
> > > Hi,
> > > 
> > > Nice feature. And moreover the standardized hardware 
> > > breakpoints could be helpful for tracing.
> > 
> > yeah. The feature is much more alive now.
> > 
> > > Just some comments below.
> > 
> > One other thing:
> > 
> > +#ifdef CONFIG_FTRACE_SELFTEST
> > +int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
> > +{
> > +       /* TODO: Will be implemented later */
> > +       return 0;
> > +}
> > +#endif /* CONFIG_FTRACE_SELFTEST */
> > 
> > This needs to be implemented before i can pick the code up into 
> > tip:tracing, as otherwise we will not notice it fast enough if 
> > some of this stuff breaks.
> > 
> > Basically the ftrace plugin will be the main usage vector of 
> > this facility, so the self-test is a must-have.
> > 
> > Looks very nice otherwise.
> > 
> > 	Ingo
> 
> Thanks for the comments.
> 
> Test-cases for the hardware breakpoint interfaces can be the following:
> 
> - Basic sanity test to check if the API is intact
> - Perform various types of memory accesses, like read, write (I/O and 
>   others when implemented) on a dummy kernel variable and verify the 
>   trigger of the exception handler.
> 
> While the above can be a part of trace_selftest_startup_ksym(),
> rigorous testing would involve:
> 
> i) stressing the HW breakpoint infrastructure to confirm sane behaviour
> when interoperated with other users of a)breakpoint register b)the
> do_debug() exception. This will involve simultaneous use of kprobes,
> hardware breakpoint interface and requests from user-space (say through
> GDB).
> ii) Verifying successful HB_NUM number of register_ requests.
> iii) Verifying right priority resolution, and handling user-space
> requests.
> 
> These, in my opinion, would better fit in a full-featured 
> test-suite such as LTP, as opposed to startup testing in 
> ftrace.

sure. It's just a quick self-test to make sure basic 
functionality is ok.

> I will implement trace_selftest_startup_ksym() to contain the 
> first two test-cases in the next iteration of this code.

Thanks.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05  4:43 ` [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces prasad
  2009-03-05  6:37   ` Frederic Weisbecker
@ 2009-03-05 14:54   ` Steven Rostedt
  1 sibling, 0 replies; 71+ messages in thread
From: Steven Rostedt @ 2009-03-05 14:54 UTC (permalink / raw)
  To: K.Prasad
  Cc: mingo, Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath


On Thu, 5 Mar 2009, prasad@linux.vnet.ibm.com wrote:
>  };
> @@ -191,6 +194,17 @@ struct kmemtrace_free_entry {
>  	const void *ptr;
>  };
>  
> +struct trace_ksym {
> +	struct trace_entry	ent;
> +	struct hw_breakpoint	*ksym_hbkpt;
> +	unsigned long		ksym_addr;
> +	unsigned long		ip;
> +	pid_t			pid;

pid not needed, see below.

> +	struct hlist_node	ksym_hlist;
> +	char			ksym_name[KSYM_NAME_LEN];
> +	char			p_name[TASK_COMM_LEN];
> +};
> +
>  /*
>   * trace_flag_type is an enumeration that holds different
>   * states when a trace occurs. These are:
> @@ -302,6 +316,7 @@ extern void __ftrace_bad_type(void);
>  			  TRACE_KMEM_ALLOC);	\
>  		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
>  			  TRACE_KMEM_FREE);	\
> +		IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM); \
>  		__ftrace_bad_type();					\
>  	} while (0)
>  
> Index: linux-2.6-tip/kernel/trace/trace_ksym.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6-tip/kernel/trace/trace_ksym.c
> @@ -0,0 +1,399 @@
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <linux/debugfs.h>
> +#include <linux/ftrace.h>
> +#include <linux/kallsyms.h>
> +#include <linux/uaccess.h>
> +
> +#include "trace.h"
> +#include "trace_output.h"
> +
> +/* For now, let us restrict the no. of symbols traced simultaneously to number
> + * of available hardware breakpoint registers.
> + */
> +#define KSYM_TRACER_MAX HB_NUM
> +
> +#define KSYM_TRACER_OP_LEN 3 /* rw- */
> +#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
> +
> +#define KSYM_DEBUG 1
> +
> +static struct trace_array *ksym_trace_array;
> +
> +DEFINE_MUTEX(ksym_tracer_mutex);
> +
> +static unsigned int ksym_filter_entry_count;
> +static unsigned int ksym_tracing_enabled;
> +
> +static HLIST_HEAD(ksym_filter_head);
> +
> +/* HW Breakpoint related callback functions */
> +void ksym_hbkpt_installed(struct hw_breakpoint *temp, struct pt_regs
> +								*temp_regs)
> +{
> +}
> +
> +void ksym_hbkpt_uninstalled(struct hw_breakpoint *temp, struct
> +							pt_regs * temp_regs)
> +{
> +}
> +
> +void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
> +{
> +	struct ring_buffer_event *event;
> +	struct trace_array *tr;
> +	struct trace_ksym *entry;
> +	int pc;
> +
> +	if (!ksym_tracing_enabled)
> +		return;
> +
> +	tr = ksym_trace_array;
> +	pc = preempt_count();
> +
> +	event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
> +							sizeof(*entry), 0, pc);
> +	if (!event)
> +		return;
> +
> +	entry = ring_buffer_event_data(event);
> +	strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_SYMBOL_LEN);
> +	entry->ksym_hbkpt = hbkpt;
> +	entry->ip = instruction_pointer(regs);
> +	strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
> +
> +	entry->pid = current->pid;

You just duplicated the pid. In trace_buffer_lock_reserve we record
the pid in entry->ent.pid.


> +	trace_buffer_unlock_commit(tr, event, 0, pc);
> +}
> +
> +/* Valid access types are represented as
> + *
> + * rw- : Set Read/Write Access Breakpoint
> + * -w- : Set Write Access Breakpoint
> + * --- : Clear Breakpoints
> + * --x : Set Execution Break points (Not available yet)
> + *
> + */
> +static int ksym_trace_get_access_type(char *access_str)
> +{
> +	int pos, access = 0;
> +
> +	for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
> +		switch (access_str[pos]) {
> +		case 'r':
> +			access += (pos == 0) ? 4 : -1;
> +			break;
> +		case 'w':
> +			access += (pos == 1) ? 2 : -1;
> +			break;
> +		case '-':
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	}
> +
> +	switch (access) {
> +	case 6:
> +		access = HW_BREAKPOINT_RW;
> +		break;
> +	case 2:
> +		access = HW_BREAKPOINT_WRITE;
> +		break;
> +	case 0:
> +		access = 0;
> +	}
> +
> +	return access;
> +}
> +
> +/*
> + * There can be several possible malformed requests and we attempt to capture
> + * all of them. We enumerate some of the rules
> + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
> + *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
> + *    <module>:<ksym_name>:<op>.
> + * 2. No delimiter symbol ':' in the input string
> + * 3. Spurious operator symbols or symbols not in their respective positions
> + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
> + * 5. Kernel symbol not a part of /proc/kallsyms
> + * 6. Duplicate requests
> + */
> +static int parse_ksym_trace_str(char *input_string, char **ksymname,
> +							unsigned long *addr)
> +{
> +	char *delimiter = ":";
> +	int ret;
> +
> +	ret = -EINVAL;
> +	*ksymname = strsep(&input_string, delimiter);
> +	*addr = kallsyms_lookup_name(*ksymname);
> +
> +	/* Check for malformed request: (2), (1) and (5) */
> +	if ((!input_string) ||
> +		(strlen(input_string) != KSYM_TRACER_OP_LEN + 1) ||
> +		(*addr == 0))
> +		goto return_code;
> +
> +	ret = ksym_trace_get_access_type(input_string);
> +
> +return_code:
> +	return ret;
> +}
> +
> +static int process_new_ksym_entry(struct trace_ksym *entry, char *ksymname,
> +			     int op, unsigned long addr)
> +{
> +	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
> +		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
> +			" new requests for tracing can be accepted now.\n",
> +			KSYM_TRACER_MAX);
> +		return -ENOSPC;
> +	}
> +
> +	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
> +	if (!entry)
> +		return -ENOMEM;
> +
> +	entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
> +	if (!entry->ksym_hbkpt)
> +		return -ENOMEM;
> +
> +	entry->ksym_hbkpt->info.name = ksymname;
> +	entry->ksym_hbkpt->info.type = op;
> +	entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
> +	entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
> +	entry->ksym_hbkpt->priority = HW_BREAKPOINT_PRIO_NORMAL;
> +
> +	entry->ksym_hbkpt->installed = (void *)ksym_hbkpt_installed;
> +	entry->ksym_hbkpt->uninstalled = (void *)ksym_hbkpt_uninstalled;
> +	entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
> +
> +	if ((register_kernel_hw_breakpoint(entry->ksym_hbkpt)) < 0) {
> +		printk(KERN_INFO "ksym_tracer request failed. Try again"
> +					" later!!\n");
> +		kfree(entry);
> +		return -EAGAIN;
> +	}
> +	hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
> +	printk(KERN_INFO "ksym_tracer changes are now effective\n");
> +
> +	ksym_filter_entry_count++;
> +
> +	return 0;
> +}
> +
> +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
> +	ssize_t ret, cnt = 0;
> +
> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
> +				entry->ksym_hbkpt->info.name);
> +		if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"-w-\n");
> +		else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
> +			cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
> +								"rw-\n");
> +	}
> +	ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
> +	mutex_unlock(&ksym_tracer_mutex);
> +
> +	return ret;
> +}
> +
> +static ssize_t ksym_trace_filter_write(struct file *file,
> +					const char __user *buffer,
> +						size_t count, loff_t *ppos)
> +{
> +	struct trace_ksym *entry;
> +	struct hlist_node *node;
> +	char *input_string, *ksymname = NULL;
> +	unsigned long ksym_addr = 0;
> +	int ret, op, changed = 0;
> +
> +	input_string = kzalloc(count, GFP_KERNEL);
> +	if (!input_string)
> +		return -ENOMEM;
> +
> +	/* Ignore echo "" > ksym_trace_filter */
> +	if (count == 0)
> +		return 0;
> +
> +	if (copy_from_user(input_string, buffer, count))
> +		return -EFAULT;
> +
> +	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
> +
> +	if (ret < 0)
> +		goto err_ret;
> +	mutex_lock(&ksym_tracer_mutex);
> +
> +	ret = -EINVAL;
> +	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
> +		if (entry->ksym_addr == ksym_addr) {
> +			/* Check for malformed request: (6) */
> +			if (entry->ksym_hbkpt->info.type != op)
> +				changed = 1;
> +			else
> +				goto err_ret;
> +			break;
> +		}
> +	}
> +	if (changed) {
> +		unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +		entry->ksym_hbkpt->info.type = op;
> +		if (op > 0) {
> +			ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
> +			if (ret > 0) {
> +				ret = count;
> +				goto unlock_ret_path;
> +			}
> +			if (ret == 0) {
> +				ret = -ENOSPC;
> +				unregister_kernel_hw_breakpoint(entry->\
> +								ksym_hbkpt);
> +			}
> +		}
> +		ksym_filter_entry_count--;
> +		hlist_del(&(entry->ksym_hlist));
> +		kfree(entry->ksym_hbkpt);
> +		kfree(entry);
> +		ret = count;
> +		goto err_ret;
> +	} else {
> +		/* Check for malformed request: (4) */
> +		if (op == 0)
> +			goto err_ret;
> +
> +		ret = process_new_ksym_entry(entry, ksymname, op, ksym_addr);
> +		if (ret)
> +			goto err_ret;
> +	}
> +	ret = count;
> +	goto unlock_ret_path;
> +
> +err_ret:
> +	kfree(input_string);
> +
> +unlock_ret_path:
> +	mutex_unlock(&ksym_tracer_mutex);
> +	return ret;
> +}
> +
> +static const struct file_operations ksym_tracing_fops = {
> +	.open		= tracing_open_generic,
> +	.read		= ksym_trace_filter_read,
> +	.write		= ksym_trace_filter_write,
> +};
> +
> +static int ksym_trace_init(struct trace_array *tr)
> +{
> +	int cpu;
> +
> +	for_each_online_cpu(cpu)
> +		tracing_reset(tr, cpu);
> +	ksym_tracing_enabled = 1;
> +	ksym_trace_array = tr;
> +
> +	return 0;
> +}
> +
> +static void ksym_trace_reset(struct trace_array *tr)
> +{
> +	ksym_tracing_enabled = 0;
> +}
> +
> +#ifdef CONFIG_FTRACE_SELFTEST
> +int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
> +{
> +	/* TODO: Will be implemented later */
> +	return 0;
> +}
> +#endif /* CONFIG_FTRACE_SELFTEST */
> +
> +static void ksym_trace_print_header(struct seq_file *m)
> +{
> +
> +	seq_puts(m,
> +		 "#       TASK-PID      CPU#      Symbol         Type    "
> +		 "Function         \n");
> +	seq_puts(m,
> +		 "#          |           |          |              |         "
> +		 "|            \n");
> +}
> +
> +static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
> +{
> +	struct trace_entry *entry = iter->ent;
> +	struct trace_seq *s = &iter->seq;
> +	struct trace_ksym *field;
> +	char str[KSYM_SYMBOL_LEN];
> +	int ret;
> +
> +	trace_assign_type(field, entry);
> +
> +	ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
> +				field->pid, iter->cpu, field->ksym_name);

s/field->pid/entry->pid/

-- Steve

> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	switch (field->ksym_hbkpt->info.type) {
> +	case HW_BREAKPOINT_WRITE:
> +		ret = trace_seq_printf(s, " W  ");
> +		break;
> +	case HW_BREAKPOINT_RW:
> +		ret = trace_seq_printf(s, " RW ");
> +		break;
> +	default:
> +		return TRACE_TYPE_PARTIAL_LINE;
> +	}
> +
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	sprint_symbol(str, field->ip);
> +	ret = trace_seq_printf(s, "%-20s\n", str);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
> +	return TRACE_TYPE_HANDLED;
> +}
> +
> +struct tracer ksym_tracer __read_mostly =
> +{
> +	.name		= "ksym_tracer",
> +	.init		= ksym_trace_init,
> +	.reset		= ksym_trace_reset,
> +#ifdef CONFIG_FTRACE_SELFTEST
> +	.selftest	= trace_selftest_startup_ksym,
> +#endif
> +	.print_header   = ksym_trace_print_header,
> +	.print_line	= ksym_trace_output
> +};
> +
> +__init static int init_ksym_trace(void)
> +{
> +	struct dentry *d_tracer;
> +	struct dentry *entry;
> +
> +	d_tracer = tracing_init_dentry();
> +	ksym_filter_entry_count = 0;
> +
> +	entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
> +				    NULL, &ksym_tracing_fops);
> +	if (!entry)
> +		pr_warning("Could not create debugfs "
> +			   "'ksym_trace_filter' file\n");
> +
> +	return register_tracer(&ksym_tracer);
> +
> +}
> +device_initcall(init_ksym_trace);
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces
  2009-03-05  6:37   ` Frederic Weisbecker
  2009-03-05  9:16     ` Ingo Molnar
  2009-03-05 11:33     ` K.Prasad
@ 2009-03-05 15:00     ` Steven Rostedt
  2 siblings, 0 replies; 71+ messages in thread
From: Steven Rostedt @ 2009-03-05 15:00 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: prasad, mingo, Andrew Morton, Linux Kernel Mailing List,
	Alan Stern, Roland McGrath


On Thu, 5 Mar 2009, Frederic Weisbecker wrote:
> >  
> > +struct trace_ksym {
> > +	struct trace_entry	ent;
> > +	struct hw_breakpoint	*ksym_hbkpt;
> > +	unsigned long		ksym_addr;
> > +	unsigned long		ip;
> > +	pid_t			pid;
> 
> 
> Just a doubt here.
> The current pid is automatically recorded on trace_buffer_lock_reserve()
> (or unlock_commit, don't remember), so if this pid is the current one, you
> don't need to reserve a room for it, current pid is on struct trace_entry.

Heh, I guess I should have read the rest of the thread before replying.
Frederic beat me to it ;-)

-- Steve

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-05  4:37 ` [patch 01/11] Introducing generic hardware breakpoint handler interfaces prasad
@ 2009-03-10 13:50   ` Ingo Molnar
  2009-03-10 14:19     ` Alan Stern
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 13:50 UTC (permalink / raw)
  To: prasad
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern, Roland McGrath


* prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:


> +static u8			tprio[HB_NUM];	/* Thread bp max priorities */
> +LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
> +static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
> +DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);

hm, why do we need the whole 'priority' mechanism? It seems very 
over-designed to me.

The likelihood of both user-space and kernel-space to use 
hw-breakpoints is very low to begin with. And if they use them, 
the likelihood of there being more than 4 debugregs required in 
the same context is even lower.

If that happens we shouldn't try to be too smart about them - 
just override user-space ones with kernel space ones and that's 
it. No explicit priorities are needed.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-05  4:38 ` [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces prasad
@ 2009-03-10 14:09   ` Ingo Molnar
  2009-03-10 14:59     ` Alan Stern
  2009-03-12  2:46     ` Roland McGrath
  0 siblings, 2 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 14:09 UTC (permalink / raw)
  To: prasad
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern, Roland McGrath


* prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:

> +/*
> + * Handle debug exception notifications.
> + */
> +
> +int __kprobes hw_breakpoint_handler(struct die_args *args)
> +{
> +	struct cpu_hw_breakpoint *chbi;
> +	int i;
> +	struct hw_breakpoint *bp;
> +	struct thread_hw_breakpoint *thbi = NULL;
> +
> +	/* The DR6 value is stored in args->err */
> +#define DR6	(args->err)

that's ugly - what's wrong with an old-fashioned "int db6 = 
args->err" type of approach?

> +
> +	if (DR6 & DR_STEP)
> +		return NOTIFY_DONE;
> +
> +	chbi = &per_cpu(cpu_bp, get_cpu());
> +
> +	/* Disable all breakpoints so that the callbacks can run without
> +	 * triggering recursive debug exceptions.
> +	 */
> +	set_debugreg(0UL, 7);
> +
> +	/* Assert that local interrupts are disabled
> +	 * Reset the DRn bits in the virtualized register value.
> +	 * The ptrace trigger routine will add in whatever is needed.
> +	 */
> +	current->thread.vdr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
> +
> +	/* Are we a victim of lazy debug-register switching? */
> +	if (!chbi->bp_task)
> +		;
> +	else if (chbi->bp_task != current) {
> +
> +		/* No user breakpoints are valid.  Perform the belated
> +		 * debug-register switch.
> +		 */
> +		switch_to_none_hw_breakpoint();
> +	} else {
> +		thbi = chbi->bp_task->thread.hw_breakpoint_info;
> +	}
> +
> +	/* Handle all the breakpoints that were triggered */
> +	for (i = 0; i < HB_NUM; ++i) {
> +		if (likely(!(DR6 & (DR_TRAP0 << i))))
> +			continue;
> +
> +		/* Find the corresponding hw_breakpoint structure and
> +		 * invoke its triggered callback.
> +		 */
> +		if (i < chbi->cur_kbpdata->num_kbps)
> +			bp = chbi->cur_kbpdata->bps[i];
> +		else if (thbi)
> +			bp = thbi->bps[i];
> +		else		/* False alarm due to lazy DR switching */
> +			continue;
> +		if (bp) {
> +			switch (bp->info.type) {
> +			case HW_BREAKPOINT_WRITE:
> +			case HW_BREAKPOINT_RW:
> +				if (bp->triggered)
> +					(bp->triggered)(bp, args->regs);
> +				/* Re-enable the breakpoints */
> +				set_debugreg(thbi ? thbi->tkdr7 :
> +						chbi->cur_kbpdata->mkdr7, 7);
> +				put_cpu_no_resched();
> +
> +				return NOTIFY_STOP;
> +			/*
> +			 * Presently we allow instruction breakpoints only in
> +			 * user-space when requested through ptrace.
> +			 */
> +			case HW_BREAKPOINT_EXECUTE:
> +				if (arch_check_va_in_userspace(bp->info.address,
> +								current)) {
> +					(bp->triggered)(bp, args->regs);
> +	/* We'll return NOTIFY_DONE, do_debug will take care of the rest */
> +					return NOTIFY_DONE;
> +				}
> +			}

the linebreaks here became so ugly because the whole loop body 
should be moved inside a helper function.

> +++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
> @@ -0,0 +1,132 @@
> +#ifndef	_I386_HW_BREAKPOINT_H
> +#define	_I386_HW_BREAKPOINT_H
> +
> +#ifdef	__KERNEL__
> +#define	__ARCH_HW_BREAKPOINT_H
> +
> +struct arch_hw_breakpoint {
> +	char		*name; /* Contains name of the symbol to set bkpt */
> +	unsigned long	address;
> +	u8		len;
> +	u8		type;
> +} __attribute__((packed));

hm, why packed and why u8 ? We don't expose this to user-space, 
do we? (if yes then 'unsigned long' is wrong and __KERNEL__ is 
wrong as well)

> +#include <linux/kdebug.h>
> +#include <asm-generic/hw_breakpoint.h>
> +
> +/* HW breakpoint accessor routines */
> +static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
> +{
> +	return (const void *) bp->info.address;
> +}
> +
> +static inline const void __user *hw_breakpoint_get_uaddress
> +						(struct hw_breakpoint *bp)
> +{
> +	return (const void __user *) bp->info.address;
> +}
> +
> +static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
> +{
> +	return bp->info.len;
> +}
> +
> +static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
> +{
> +	return bp->info.type;
> +}

why this redirection, why not just use the structure as-is? If 
there's any arch weirdness then that arch should have 
arch-special accessors - not the generic code.

> +
> +/* Kernel symbol lookup routine for installing Data HW Breakpoint Address */
> +static inline unsigned long hw_breakpoint_lookup_name(const char *name)
> +{
> +	return kallsyms_lookup_name(name);
> +}

A wrapper around kallsyms_lookup_name() is quite pointless - 
please use kallsyms_lookup_name() directly.

> +/* Per-thread HW breakpoint and debug register info */
> +struct thread_hw_breakpoint {
> +
> +	/* utrace support */
> +	struct list_head	node;		/* Entry in thread list */
> +	struct list_head	thread_bps;	/* Thread's breakpoints */
> +	struct hw_breakpoint	*bps[HB_NUM];	/* Highest-priority bps */
> +	unsigned long		tdr[HB_NUM];	/*  and their addresses */

Please rename it to something like ->hw_breakpoint[] and 
->address[] - 'bps' and 'tdr' look quite meaningless.

> +	int			num_installed;	/* Number of installed bps */
> +	unsigned		gennum;		/* update-generation number */

i suspect the gennum we can get rid of if we get rid of the 
notion of priorities, right?

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-10 13:50   ` Ingo Molnar
@ 2009-03-10 14:19     ` Alan Stern
  2009-03-10 14:50       ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Alan Stern @ 2009-03-10 14:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> 
> 
> > +static u8			tprio[HB_NUM];	/* Thread bp max priorities */
> > +LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
> > +static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
> > +DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);

If nobody minds, I'll answer some of these questions on Prasad's behalf 
because they address parts of the code that were written before he took 
over the project.

> hm, why do we need the whole 'priority' mechanism? It seems very 
> over-designed to me.

This was done at Roland McGrath's express request.  We should see what 
he has to say about it.

> The likelyhood of both user-space and kernel-space to use 
> hw-breakpoints is very low to begin with. And if they use them, 
> the likelyhood of there being more than 4 debugregs required in 
> the same context is even lower.

Not all architectures have 4 debug registers.  Most have only one.

> If that happens we shouldnt try to be too smart about them - 
> just override user-space ones with kernel space ones and that's 
> it. No explicit priorities are needed.

Roland really did not want it done this way.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 04/11] Introduce virtual debug register in thread_struct and wrapper-routines around process related functions
  2009-03-05  4:38 ` [patch 04/11] Introduce virtual debug register in thread_struct and wrapper-routines around process related functions prasad
@ 2009-03-10 14:35   ` Ingo Molnar
  2009-03-10 15:53     ` Alan Stern
  2009-03-12  2:26     ` Roland McGrath
  0 siblings, 2 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 14:35 UTC (permalink / raw)
  To: prasad
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern, Roland McGrath


* prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:

> This patch introduces virtual debug registers to used by the 
> per-thread structure ad wrapper routines to manage debug 
> registers by process-related functions.

this is somewhat confusing. It would be much clearer to name it 
'user debug registers'.
  
and why is this:

> @@ -427,13 +427,9 @@ struct thread_struct {
>  	unsigned long		ip;
>  	unsigned long		fs;
>  	unsigned long		gs;
> -	/* Hardware debugging registers: */
> -	unsigned long		debugreg0;
> -	unsigned long		debugreg1;
> -	unsigned long		debugreg2;
> -	unsigned long		debugreg3;
> -	unsigned long		debugreg6;
> -	unsigned long		debugreg7;
> +	/* Hardware breakpoint info */
> +	unsigned long	vdr6;
> +	struct thread_hw_breakpoint	*hw_breakpoint_info;

detached from thread_struct? There's a lot of complications 
(alloc/free, locking, etc.) from this for no good reason - the 
hardware-breakpoints info structure is alway per thread and is 
quite small, so there's no reason not to embedd it directly 
inside thread_struct.

That way we get its allocation and freeing logic for free in 
essence.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 08/11] Modify Ptrace routines to access breakpoint registers
  2009-03-05  4:40 ` [patch 08/11] Modify Ptrace routines to access breakpoint registers prasad
@ 2009-03-10 14:40   ` Ingo Molnar
  2009-03-10 15:54     ` Alan Stern
  2009-03-12  3:14     ` Roland McGrath
  0 siblings, 2 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 14:40 UTC (permalink / raw)
  To: prasad
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern, Roland McGrath


* prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:

> -static unsigned long debugreg_addr_limit(struct task_struct *task)
> -{
> -#ifdef CONFIG_IA32_EMULATION
> -	if (test_tsk_thread_flag(task, TIF_IA32))
> -		return IA32_PAGE_OFFSET - 3;
> -#endif
> -	return TASK_SIZE_MAX - 7;
> -}
> -

I don't see where this security check has been carried over into 
the generic code. The new code has:

 +int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
 +{
 +       return (va < TASK_SIZE);
 +}

but i think that misses the detail that it's not just the start 
address of an x86 breakpoint that has to be considered, but also 
the end address of it.

For example a hardware breakpoint can be at 0xbfffffff with a 
length of 4 bytes - thus overlapping into kernel-space by 3 
bytes. It is important to not let that happen.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 09/11] Cleanup HW Breakpoint registers before kexec
  2009-03-05  4:41 ` [patch 09/11] Cleanup HW Breakpoint registers before kexec prasad
@ 2009-03-10 14:42   ` Ingo Molnar
  0 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 14:42 UTC (permalink / raw)
  To: prasad
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern, Roland McGrath


* prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:

> +	disable_debug_registers();

naming nit: please name them properly:

   breakpoints_disable();
   breakpoints_enable();

new kernel APIs always get named left to right in ascending 
specificity order.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 06/11] Use virtual debug registers in process/thread handling code
  2009-03-05  4:40 ` [patch 06/11] Use virtual debug registers in process/thread handling code prasad
@ 2009-03-10 14:49   ` Ingo Molnar
  2009-03-10 16:05     ` Alan Stern
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 14:49 UTC (permalink / raw)
  To: prasad
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern, Roland McGrath


* prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:

> @@ -437,16 +448,6 @@ __switch_to_xtra(struct task_struct *pre
>  	else if (next->debugctlmsr != prev->debugctlmsr)
>  		update_debugctlmsr(next->debugctlmsr);
>  
> -	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
> -		set_debugreg(next->debugreg0, 0);
> -		set_debugreg(next->debugreg1, 1);
> -		set_debugreg(next->debugreg2, 2);
> -		set_debugreg(next->debugreg3, 3);
> -		/* no 4 and 5 */
> -		set_debugreg(next->debugreg6, 6);
> -		set_debugreg(next->debugreg7, 7);
> -	}
> -
>  	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
>  	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
>  		/* prev and next are different */
> @@ -595,6 +596,12 @@ __switch_to(struct task_struct *prev_p, 
>  
>  	percpu_write(current_task, next_p);
>  
> +	/*
> +	 * Handle debug registers.  This must be done _after_ current
> +	 * is updated.
> +	 */
> +	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
> +		switch_to_thread_hw_breakpoint(next_p);

why does this have to be called after 'current' has been 
updated? AFAICS switch_to_thread_hw_breakpoint() does not take a 
look at 'current'.

Speaking of switch_to_thread_hw_breakpoint(), i dont like that 
function at all:

- why does it have to do a list of debug registers?

- why does it worry about IPIs arriving when context-switches on 
  x86 are always done with interrupts disabled?

- also, what do the ->installed() and ->uninstalled() callbacks 
  do - nothing uses it!

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-10 14:19     ` Alan Stern
@ 2009-03-10 14:50       ` Ingo Molnar
  2009-03-11 12:57         ` K.Prasad
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 14:50 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> > 
> > 
> > > +static u8			tprio[HB_NUM];	/* Thread bp max priorities */
> > > +LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
> > > +static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
> > > +DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
> 
> If nobody minds, I'll answer some of these questions on 
> Prasad's behalf because they address parts of the code that 
> were written before he took over the project.
> 
> > hm, why do we need the whole 'priority' mechanism? It seems 
> > very over-designed to me.
> 
> This was done at Roland McGrath's express request.  We should 
> see what he has to say about it.
> 
> > The likelihood of both user-space and kernel-space using 
> > hw-breakpoints is very low to begin with. And if they use 
> > them, the likelihood of there being more than 4 debugregs 
> > required in the same context is even lower.
> 
> Not all architectures have 4 debug registers.  Most have only 
> one.
>
> > If that happens we shouldnt try to be too smart about them - 
> > just override user-space ones with kernel space ones and 
> > that's it. No explicit priorities are needed.
> 
> Roland really did not want it done this way.

Well i guess i'll have to wait for Roland's reply then.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 14:09   ` Ingo Molnar
@ 2009-03-10 14:59     ` Alan Stern
  2009-03-10 15:18       ` Ingo Molnar
  2009-03-12  2:46     ` Roland McGrath
  1 sibling, 1 reply; 71+ messages in thread
From: Alan Stern @ 2009-03-10 14:59 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> 
> > +/*
> > + * Handle debug exception notifications.
> > + */
> > +
> > +int __kprobes hw_breakpoint_handler(struct die_args *args)
> > +{
> > +	struct cpu_hw_breakpoint *chbi;
> > +	int i;
> > +	struct hw_breakpoint *bp;
> > +	struct thread_hw_breakpoint *thbi = NULL;
> > +
> > +	/* The DR6 value is stored in args->err */
> > +#define DR6	(args->err)
> 
> that's ugly - what's wrong with an old-fashioned "int db6 = 
> args->err" type of approach?

Yes, it is ugly.  It was a holdover from an earlier version, and in 
fact it's likely to change in the future to become even more ugly.  But 
for now, you're right -- a simple assignment would be better.

> > +++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
> > @@ -0,0 +1,132 @@
> > +#ifndef	_I386_HW_BREAKPOINT_H
> > +#define	_I386_HW_BREAKPOINT_H
> > +
> > +#ifdef	__KERNEL__
> > +#define	__ARCH_HW_BREAKPOINT_H
> > +
> > +struct arch_hw_breakpoint {
> > +	char		*name; /* Contains name of the symbol to set bkpt */
> > +	unsigned long	address;
> > +	u8		len;
> > +	u8		type;
> > +} __attribute__((packed));
> 
> hm, why packed and why u8 ? We dont expose this to user-space, 
> do we? (if yes then 'unsigned long' is wrong and __KERNEL__ is 
> wrong as well)

I can't remember why this was made packed; there doesn't seem to be any 
important reason for it.  The structure is not exposed to userspace.  
The len and type fields are u8 because they contain values no larger 
than 255.

> > +#include <linux/kdebug.h>
> > +#include <asm-generic/hw_breakpoint.h>
> > +
> > +/* HW breakpoint accessor routines */
> > +static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
> > +{
> > +	return (const void *) bp->info.address;
> > +}
> > +
> > +static inline const void __user *hw_breakpoint_get_uaddress
> > +						(struct hw_breakpoint *bp)
> > +{
> > +	return (const void __user *) bp->info.address;
> > +}
> > +
> > +static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
> > +{
> > +	return bp->info.len;
> > +}
> > +
> > +static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
> > +{
> > +	return bp->info.type;
> > +}
> 
> why this redirection, why dont just use the structure as-is? If 
> there's any arch weirdness then that arch should have 
> arch-special accessors - not the generic code.

These _are_ the arch-specific accessors.  Look at the filename:
arch/x86/include/asm/hw_breakpoint.h.

> > +	int			num_installed;	/* Number of installed bps */
> > +	unsigned		gennum;		/* update-generation number */
> 
> i suspect the gennum we can get rid of if we get rid of the 
> notion of priorities, right?

No.  gennum has nothing to do with priorities.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 14:59     ` Alan Stern
@ 2009-03-10 15:18       ` Ingo Molnar
  2009-03-10 17:11         ` Alan Stern
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 15:18 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> > 
> > > +/*
> > > + * Handle debug exception notifications.
> > > + */
> > > +
> > > +int __kprobes hw_breakpoint_handler(struct die_args *args)
> > > +{
> > > +	struct cpu_hw_breakpoint *chbi;
> > > +	int i;
> > > +	struct hw_breakpoint *bp;
> > > +	struct thread_hw_breakpoint *thbi = NULL;
> > > +
> > > +	/* The DR6 value is stored in args->err */
> > > +#define DR6	(args->err)
> > 
> > that's ugly - what's wrong with an old-fashioned "int db6 = 
> > args->err" type of approach?
> 
> Yes, it is ugly.  It was a holdover from an earlier version, and in 
> fact it's likely to change in the future to become even more ugly.  But 
> for now, you're right -- a simple assignment would be better.
> 
> > > +++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
> > > @@ -0,0 +1,132 @@
> > > +#ifndef	_I386_HW_BREAKPOINT_H
> > > +#define	_I386_HW_BREAKPOINT_H
> > > +
> > > +#ifdef	__KERNEL__
> > > +#define	__ARCH_HW_BREAKPOINT_H
> > > +
> > > +struct arch_hw_breakpoint {
> > > +	char		*name; /* Contains name of the symbol to set bkpt */
> > > +	unsigned long	address;
> > > +	u8		len;
> > > +	u8		type;
> > > +} __attribute__((packed));
> > 
> > hm, why packed and why u8 ? We dont expose this to user-space, 
> > do we? (if yes then 'unsigned long' is wrong and __KERNEL__ is 
> > wrong as well)
> 
> I can't remember why this was made packed; there doesn't seem to be any 
> important reason for it.  The structure is not exposed to userspace.  
> The len and type fields are u8 because they contain values no larger 
> than 255.
> 
> > > +#include <linux/kdebug.h>
> > > +#include <asm-generic/hw_breakpoint.h>
> > > +
> > > +/* HW breakpoint accessor routines */
> > > +static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
> > > +{
> > > +	return (const void *) bp->info.address;
> > > +}
> > > +
> > > +static inline const void __user *hw_breakpoint_get_uaddress
> > > +						(struct hw_breakpoint *bp)
> > > +{
> > > +	return (const void __user *) bp->info.address;
> > > +}
> > > +
> > > +static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
> > > +{
> > > +	return bp->info.len;
> > > +}
> > > +
> > > +static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
> > > +{
> > > +	return bp->info.type;
> > > +}
> > 
> > why this redirection, why dont just use the structure as-is? 
> > If there's any arch weirdness then that arch should have 
> > arch-special accessors - not the generic code.
> 
> These _are_ the arch-specific accessors.  Look at the 
> filename: arch/x86/include/asm/hw_breakpoint.h.

I very well know which file this is, you need to read my reply 
again.

These are very generic-sounding fields and they should not be 
hidden via pointless wrappers by the generic code. Dont let the 
tail wag the dog. If there's architecture weirdness that does 
not fit the generic code, then _that_ architecture should have 
the ugliness - not the generic code. (note that these accessors 
are used by the generic code so the uglification spreads there)

> > > +	int			num_installed;	/* Number of installed bps */
> > > +	unsigned		gennum;		/* update-generation number */
> > 
> > i suspect the gennum we can get rid of if we get rid of the 
> > notion of priorities, right?
> 
> No.  gennum has nothing to do with priorities.

Well it's introduced because we have a priority-sorted list of 
breakpoints, not an array. A list needs to be maintained and when 
updated it's reloaded. I was thinking about possibly getting rid 
of that list complication and going back to the simpler array. But 
it's hard because the lifetime of a kernel space breakpoint 
spans context-switches so there has to be separation.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 04/11] Introduce virtual debug register in thread_struct and wrapper-routines around process related functions
  2009-03-10 14:35   ` Ingo Molnar
@ 2009-03-10 15:53     ` Alan Stern
  2009-03-10 17:06       ` Ingo Molnar
  2009-03-12  2:26     ` Roland McGrath
  1 sibling, 1 reply; 71+ messages in thread
From: Alan Stern @ 2009-03-10 15:53 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> and why is this:
> 
> > @@ -427,13 +427,9 @@ struct thread_struct {
> >  	unsigned long		ip;
> >  	unsigned long		fs;
> >  	unsigned long		gs;
> > -	/* Hardware debugging registers: */
> > -	unsigned long		debugreg0;
> > -	unsigned long		debugreg1;
> > -	unsigned long		debugreg2;
> > -	unsigned long		debugreg3;
> > -	unsigned long		debugreg6;
> > -	unsigned long		debugreg7;
> > +	/* Hardware breakpoint info */
> > +	unsigned long	vdr6;
> > +	struct thread_hw_breakpoint	*hw_breakpoint_info;
> 
> detached from thread_struct? There's a lot of complications 
> (alloc/free, locking, etc.) from this for no good reason - the 
> hardware-breakpoints info structure is alway per thread and is 
> quite small, so there's no reason not to embedd it directly 
> inside thread_struct.

The only reason for separating it out was to avoid bogging down the 
vast majority of threads which aren't debugged.  If you think the extra 
overhead isn't worth worrying about then the hw-breakpoint info 
structure can be embedded.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 08/11] Modify Ptrace routines to access breakpoint registers
  2009-03-10 14:40   ` Ingo Molnar
@ 2009-03-10 15:54     ` Alan Stern
  2009-03-12  3:14     ` Roland McGrath
  1 sibling, 0 replies; 71+ messages in thread
From: Alan Stern @ 2009-03-10 15:54 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> 
> * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> 
> > -static unsigned long debugreg_addr_limit(struct task_struct *task)
> > -{
> > -#ifdef CONFIG_IA32_EMULATION
> > -	if (test_tsk_thread_flag(task, TIF_IA32))
> > -		return IA32_PAGE_OFFSET - 3;
> > -#endif
> > -	return TASK_SIZE_MAX - 7;
> > -}
> > -
> 
> I dont see where this security check has been carried over into 
> the generic code. The new code has:

Probably the IA32_EMULATION stuff was added after the hw-breakpoint 
patch was written.

>  +int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
>  +{
>  +       return (va < TASK_SIZE);
>  +}
> 
> but i think that misses the detail that it's not just the start 
> address of an x86 breakpoint that has to be considered, but also 
> the end addess of it.
> 
> For example a hardware breakpoint can be at 0xbfffffff with a 
> length of 4 bytes - thus overlapping into kernel-space by 3 
> bytes. It is important to not let that happen.

Quite correct.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 06/11] Use virtual debug registers in process/thread handling code
  2009-03-10 14:49   ` Ingo Molnar
@ 2009-03-10 16:05     ` Alan Stern
  2009-03-10 16:58       ` Ingo Molnar
  2009-03-10 17:07       ` Ingo Molnar
  0 siblings, 2 replies; 71+ messages in thread
From: Alan Stern @ 2009-03-10 16:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> > @@ -595,6 +596,12 @@ __switch_to(struct task_struct *prev_p, 
> >  
> >  	percpu_write(current_task, next_p);
> >  
> > +	/*
> > +	 * Handle debug registers.  This must be done _after_ current
> > +	 * is updated.
> > +	 */
> > +	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
> > +		switch_to_thread_hw_breakpoint(next_p);
> 
> why does this have to be called after 'current' has been 
> updated? AFAICS switch_to_thread_hw_breakpoint() does not take a 
> look at 'current'.

There was a discussion about this on LKML last October 17, and you were 
in the CC list.  Here is the reason, extracted from one of those 
messages:


There's a problem with moving the switch_to_thread_hw_breakpoint() call 
before current is updated.  Suppose a kernel breakpoint is triggered in 
between the two.  The hw-breakpoint handler will see that current is 
different from the task pointer stored in the chbi area, so it will 
think the task pointer is leftover from an old task (lazy switching) 
and will erase it.  Then until the next context switch, no 
user-breakpoints will be installed.

The real problem is that it's impossible to update both current and 
chbi->bp_task at the same instant, so there will always be a window in 
which they disagree and a breakpoint might get triggered.  Since we use 
lazy switching, we are forced to assume that a disagreement means that 
current is correct and chbi->bp_task is old.  But if you move the code 
above then you'll create a window in which current is old and 
chbi->bp_task is correct.


> Speaking of switch_to_thread_hw_breakpoint(), i dont like that 
> function at all:
> 
> - why does it have to do a list of debug registers?

I'm not sure I understand the point of this question.  Are you asking
why the hw_breakpoint structures are stored on a list?  Because there
can be an arbitrarily large number of them.

> - why does it worry about IPIs arriving when context-switches on 
>   x86 are always done with interrupts disabled?

The routine gets invoked at times other than during a context switch.  
However you may be right that these times are all mutually exclusive.  
If so then a good deal of complication can be removed.

> - also, what do the ->installed() and ->uninstalled() callbacks 
>   do - nothing uses it!

What do you mean?  They do what any callback does.  And of course 
nothing uses them -- the code hasn't been merged yet!

The intention is to let programs (or kernel debuggers) know when the 
statistics they are gathering are contaminated because the breakpoint 
in question has been uninstalled, and when the statistics are again 
valid because the breakpoint has been re-installed.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 06/11] Use virtual debug registers in process/thread handling code
  2009-03-10 16:05     ` Alan Stern
@ 2009-03-10 16:58       ` Ingo Molnar
  2009-03-10 17:07       ` Ingo Molnar
  1 sibling, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 16:58 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > > @@ -595,6 +596,12 @@ __switch_to(struct task_struct *prev_p, 
> > >  
> > >  	percpu_write(current_task, next_p);
> > >  
> > > +	/*
> > > +	 * Handle debug registers.  This must be done _after_ current
> > > +	 * is updated.
> > > +	 */
> > > +	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
> > > +		switch_to_thread_hw_breakpoint(next_p);
> > 
> > why does this have to be called after 'current' has been 
> > updated? AFAICS switch_to_thread_hw_breakpoint() does not take a 
> > look at 'current'.
> 
> There was a discussion about this on LKML last October 17, and 
> you were in the CC list. [...]

I am on the Cc: list of thousands of messages per month. 
Consider it a very volatile form of storage.

Instead put these:

> There's a problem with moving the 
> switch_to_thread_hw_breakpoint() call before current is 
> updated.  Suppose a kernel breakpoint is triggered in between 
> the two.  The hw-breakpoint handler will see that current is 
> different from the task pointer stored in the chbi area, so it 
> will think the task pointer is leftover from an old task (lazy 
> switching) and will erase it.  Then until the next context 
> switch, no user-breakpoints will be installed.
> 
> The real problem is that it's impossible to update both 
> current and chbi->bp_task at the same instant, so there will 
> always be a window in which they disagree and a breakpoint 
> might get triggered.  Since we use lazy switching, we are 
> forced to assume that a disagreement means that current is 
> correct and chbi->bp_task is old.  But if you move the code 
> above then you'll create a window in which current is old and 
> chbi->bp_task is correct.

inside these:

  /*
   * ......
   */

Thanks,

        Ingo


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 04/11] Introduce virtual debug register in thread_struct and wrapper-routines around process related functions
  2009-03-10 15:53     ` Alan Stern
@ 2009-03-10 17:06       ` Ingo Molnar
  0 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 17:06 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > and why is this:
> > 
> > > @@ -427,13 +427,9 @@ struct thread_struct {
> > >  	unsigned long		ip;
> > >  	unsigned long		fs;
> > >  	unsigned long		gs;
> > > -	/* Hardware debugging registers: */
> > > -	unsigned long		debugreg0;
> > > -	unsigned long		debugreg1;
> > > -	unsigned long		debugreg2;
> > > -	unsigned long		debugreg3;
> > > -	unsigned long		debugreg6;
> > > -	unsigned long		debugreg7;
> > > +	/* Hardware breakpoint info */
> > > +	unsigned long	vdr6;
> > > +	struct thread_hw_breakpoint	*hw_breakpoint_info;
> > 
> > detached from thread_struct? There's a lot of complications 
> > (alloc/free, locking, etc.) from this for no good reason - the 
> > hardware-breakpoints info structure is alway per thread and is 
> > quite small, so there's no reason not to embedd it directly 
> > inside thread_struct.
> 
> The only reason for separating it out was to avoid bogging 
> down the vast majority of threads which aren't debugged.  If 
> you think the extra overhead isn't worth worrying about then 
> the hw-breakpoint info structure can be embedded.

yeah. This new facility is barely used, and such things should 
always strive for 100% dumb simplicity. If the overhead of that 
structure is ever a problem we can allocate it dynamically. (but 
generally it's just not worth the pain)

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 06/11] Use virtual debug registers in process/thread handling code
  2009-03-10 16:05     ` Alan Stern
  2009-03-10 16:58       ` Ingo Molnar
@ 2009-03-10 17:07       ` Ingo Molnar
  2009-03-10 20:10         ` Alan Stern
  1 sibling, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 17:07 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> > Speaking of switch_to_thread_hw_breakpoint(), i dont like 
> > that function at all:
> > 
> > - why does it have to do a list of debug registers?
> 
> I'm not sure I understand the point of this question.  Are you 
> asking why the hw_breakpoint structures are stored on a list?  
> Because there can be an arbitrarily large number of them.

But that does not make much sense. There's just 4 hardware 
registers. There's no sane way to overcommit them hence we 
_should not_.

> > - why does it worry about IPIs arriving when context-switches on 
> >   x86 are always done with interrupts disabled?
> 
> The routine gets invoked at times other than during a context 
> switch.  However you may be right that these times are all 
> mutually exclusive.  If so then a good deal of complication 
> can be removed.

Yes.

> > - also, what do the ->installed() and ->uninstalled() callbacks 
> >   do - nothing uses it!
> 
> What do you mean?  They do what any callback does.  And of 
> course nothing uses them -- the code hasn't been merged yet!

No need to get testy - i'm the maintainer and you are trying to 
get stuff into two subsystems i maintain. I ask such questions 
when i see something added that has no immediate purpose.

If a later patch needs a particular facility then submit it 
together with that use. It's not that hard to add callbacks - 
but right now it just distracts from the immediate purpose of 
these patches.

And please dont try to get stuff merged if you are not willing 
to answer simple questions like that in a constructive way.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 15:18       ` Ingo Molnar
@ 2009-03-10 17:11         ` Alan Stern
  2009-03-10 17:26           ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Alan Stern @ 2009-03-10 17:11 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> > > why this redirection, why dont just use the structure as-is? 
> > > If there's any arch weirdness then that arch should have 
> > > arch-special accessors - not the generic code.
> > 
> > These _are_ the arch-specific accessors.  Look at the 
> > filename: arch/x86/include/asm/hw_breakpoint.h.
> 
> I very well know which file this is, you need to read my reply 
> again.
> 
> These are very generic-sounding fields and they should not be 
> hidden via pointless wrappers by the generic code. Dont let the 
> tail wag the dog. If there's architecture weirdness that does 
> not fit the generic code, then _that_ architecture should have 
> the ugliness - not the generic code. (note that these accessors 
> are used by the generic code so the uglification spreads there)

Hm.  I haven't been keeping careful track of all the updates Prasad has
been making.  In my fairly-old copy of the hw-breakpoint work, the
accessors are _not_ used by the generic code.  They are there for
future users of the API, not for internal use by the API itself.  Is 
there something I'm missing?

I have the feeling that this doesn't really address your comment, but
I'm not sure if that's because I don't understand your point or you
don't understand mine...

> These are very generic-sounding fields ...

Would you be happier if the field names were changed to be less 
generic-sounding?

> > > > +	int			num_installed;	/* Number of installed bps */
> > > > +	unsigned		gennum;		/* update-generation number */
> > > 
> > > i suspect the gennum we can get rid of if we get rid of the 
> > > notion of priorities, right?
> > 
> > No.  gennum has nothing to do with priorities.
> 
> Well it's introduced because we have a priority-sorted list of 
> breakpoints not an array.

More generally, it's there because kernel & userspace breakpoints can
be installed and uninstalled while a task is running -- and yes, this
is partially because breakpoints are prioritized.  (Although it's worth
pointing out that even your suggestion of always prioritizing kernel
breakpoints above userspace breakpoints would have the same effect.)  
However the fact that the breakpoints are stored in a list rather than
an array doesn't seem to be relevant.

> A list needs to be maintained and when 
> updated it's reloaded.

The same is true of an array.

> I was thinking about possibly getting rid 
> of that list complication and go back to the simpler array. But 
> it's hard because the lifetime of a kernel space breakpoint 
> spans context-switches so there has to be separation.

Yes, kernel breakpoints have to be kept separate from userspace 
breakpoints.  But even if you focus just on userspace breakpoints, you 
still need to use a list because debuggers can try to register an 
arbitrarily large number of breakpoints.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 17:11         ` Alan Stern
@ 2009-03-10 17:26           ` Ingo Molnar
  2009-03-10 20:30             ` Alan Stern
  2009-03-14  3:40             ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-10 17:26 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > > > why this redirection, why dont just use the structure as-is? 
> > > > If there's any arch weirdness then that arch should have 
> > > > arch-special accessors - not the generic code.
> > > 
> > > These _are_ the arch-specific accessors.  Look at the 
> > > filename: arch/x86/include/asm/hw_breakpoint.h.
> > 
> > I very well know which file this is, you need to read my reply 
> > again.
> > 
> > These are very generic-sounding fields and they should not be 
> > hidden via pointless wrappers by the generic code. Dont let the 
> > tail wag the dog. If there's architecture weirdness that does 
> > not fit the generic code, then _that_ architecture should have 
> > the ugliness - not the generic code. (note that these accessors 
> > are used by the generic code so the uglification spreads there)
> 
> Hm.  I haven't been keeping careful track of all the updates 
> Prasad has been making.  In my fairly-old copy of the 
> hw-breakpoint work, the accessors are _not_ used by the 
> generic code.  They are there for future users of the API, not 
> for internal use by the API itself.  Is there something I'm 
> missing?

Right, they do seem unused at the moment. I was going over the 
patches and this stuck out as wrong.

> I have the feeling that this doesn't really address your 
> comment, but I'm not sure if that's because I don't understand 
> your point or you don't understand mine...

Removing them addresses my comment.

> > These are very generic-sounding fields ...
> 
> Would you be happier if the field names were changed to be 
> less generic-sounding?

Not sure what to make of this kind of reply. This isn't about me 
being happier. Generic-sounding accessors for generic-sounding 
fields are an easily recognizable pattern for broken design.

> > > > > +	int			num_installed;	/* Number of installed bps */
> > > > > +	unsigned		gennum;		/* update-generation number */
> > > > 
> > > > i suspect the gennum we can get rid of if we get rid of the 
> > > > notion of priorities, right?
> > > 
> > > No.  gennum has nothing to do with priorities.
> > 
> > Well it's introduced because we have a priority-sorted list of 
> > breakpoints not an array.
> 
> More generally, it's there because kernel & userspace 
> breakpoints can be installed and uninstalled while a task is 
> running -- and yes, this is partially because breakpoints are 
> prioritized.  (Although it's worth pointing out that even your 
> suggestion of always prioritizing kernel breakpoints above 
> userspace breakpoints would have the same effect.)  However 
> the fact that the breakpoints are stored in a list rather than 
> an array doesn't seem to be relevant.
> 
> > A list needs to be maintained and when updated it's 
> > reloaded.
> 
> The same is true of an array.

Not if we do what the previous code did: reload the full 
array unconditionally. (it's just 4 entries)

> > I was thinking about possibly getting rid of that list 
> > complication and go back to the simpler array. But it's hard 
> > because the lifetime of a kernel space breakpoint spans 
> > context-switches so there has to be separation.
> 
> Yes, kernel breakpoints have to be kept separate from 
> userspace breakpoints.  But even if you focus just on 
> userspace breakpoints, you still need to use a list because 
> debuggers can try to register an arbitrarily large number of 
> breakpoints.

That 'arbitrarily large number of breakpoints' worries me. It's a 
pretty broken concept for a 4-item resource that cannot be 
time-shared and hence cannot be overcommitted.

Seems to me that much of the complexity of this patchset:

 28 files changed, 2439 insertions(+), 199 deletions(-)

Could be eliminated via a very simple exclusive reservation 
mechanism.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 06/11] Use virtual debug registers in process/thread handling code
  2009-03-10 17:07       ` Ingo Molnar
@ 2009-03-10 20:10         ` Alan Stern
  2009-03-11 11:53           ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Alan Stern @ 2009-03-10 20:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > > Speaking of switch_to_thread_hw_breakpoint(), i dont like 
> > > that function at all:
> > > 
> > > - why does it have to do a list of debug registers?
> > 
> > I'm not sure I understand the point of this question.  Are you 
> > asking why the hw_breakpoint structures are stored on a list?  
> > Because there can be an arbitrarily large number of them.
> 
> But that does not make much sense. There's just 4 hardware 
> registers. There's no sane way to overcommit them hence we 
> _should not_.

The number of hardware registers will vary according to the
architecture.  Our intention was to make the hardware breakpoint
interface architecture-neutral, as nearly as possible.  Hence we
decided to let callers register arbitrary numbers of breakpoints, and
inform them when the breakpoints actually got installed in or
uninstalled from the debug registers.

If you think this design decision is a bad one, we can discuss it.  But 
Roland should be involved, because it is in large part his design.

> > > - why does it worry about IPIs arriving when context-switches on 
> > >   x86 are always done with interrupts disabled?
> > 
> > The routine gets invoked at times other than during a context 
> > switch.  However you may be right that these times are all 
> > mutually exclusive.  If so then a good deal of complication 
> > can be removed.
> 
> Yes.

After looking through it more carefully, I think you're right -- if a
kernel breakpoint change does occur while
switch_to_thread_hw_breakpoint() is running then the IPI will arrive
immediately afterward, so there's no need to check for it explicitly.  
(When this was written I probably wasn't aware that interrupts are
disabled during context switches.)  So all the stuff involving "goto
restart" can be removed.


> > > - also, what do the ->installed() and ->uninstalled() callbacks 
> > >   do - nothing uses it!
> > 
> > What do you mean?  They do what any callback does.  And of 
> > course nothing uses them -- the code hasn't been merged yet!
> 
> No need to get testy - i'm the maintainer and you are trying to 
> get stuff into two subsystems i maintain. I ask such questions 
> when i see something added that has no immediate purpose.

Email is hopeless for conveying emotional nuances.  I didn't intend 
that statement to sound testy; if it did I apologize.

> If a later patch needs a particular facility then submit it 
> together with that use. It's not that hard to add callbacks - 
> but right now it just distracts from the immediate purpose of 
> these patches.

Prasad can take out the callback parts for now.  And if we do change
the design so that breakpoints don't get installed and uninstalled at
random times then the callbacks won't be needed at all.

> And please dont try to get stuff merged if you are not willing 
> to answer simple questions like that in a constructive way.

Wasn't the remainder of that reply (the part you omitted) constructive?

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 17:26           ` Ingo Molnar
@ 2009-03-10 20:30             ` Alan Stern
  2009-03-11 12:12               ` Ingo Molnar
  2009-03-14  3:41               ` Benjamin Herrenschmidt
  2009-03-14  3:40             ` Benjamin Herrenschmidt
  1 sibling, 2 replies; 71+ messages in thread
From: Alan Stern @ 2009-03-10 20:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> > More generally, it's there because kernel & userspace 
> > breakpoints can be installed and uninstalled while a task is 
> > running -- and yes, this is partially because breakpoints are 
> > prioritized.  (Although it's worth pointing out that even your 
> > suggestion of always prioritizing kernel breakpoints above 
> > userspace breakpoints would have the same effect.)  However 
> > the fact that the breakpoints are stored in a list rather than 
> > an array doesn't seem to be relevant.
> > 
> > > A list needs to be maintained and when updated it's 
> > > reloaded.
> > 
> > The same is true of an array.
> 
> Not if what we do what the previous code did: reloaded the full 
> array unconditionally. (it's just 4 entries)

But that array still has to be set up somehow.  It is private to the 
task; the only logical place to set it up is when the CPU switches to 
that task.

In the old code, it wasn't possible for task B or the kernel to
affect the contents of task A's debug registers.  With hw-breakpoints 
it _is_ possible, because the balance between debug registers allocated 
to kernel breakpoints and debug registers allocated to userspace 
breakpoints can change.  That's why the additional complexity is 
needed.

> > Yes, kernel breakpoints have to be kept separate from 
> > userspace breakpoints.  But even if you focus just on 
> > userspace breakpoints, you still need to use a list because 
> > debuggers can try to register an arbitrarily large number of 
> > breakpoints.
> 
> That 'arbitrarily larg number of breakpoints' worries me. It's a 
> pretty broken concept for a 4-items resource that cannot be 
> time-shared and hence cannot be overcommitted.

Suppose we never allow callers to register more breakpoints than will
fit in the CPU's registers.  Do we then use a simple first-come
first-served algorithm, with no prioritization?  If we do prioritize
some breakpoint registrations more highly than others, how do we inform
callers that their breakpoint has been kicked out by one of higher
priority?  And how do we let them know when the higher-priority
breakpoint has been unregistered, so they can try again?

> Seems to me that much of the complexity of this patchset:
> 
>  28 files changed, 2439 insertions(+), 199 deletions(-)
> 
> Could be eliminated via a very simple exclusive reservation 
> mechanism.

Can it really be as simple as all that?

Roland, what do you think?

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 06/11] Use virtual debug registers in process/thread handling code
  2009-03-10 20:10         ` Alan Stern
@ 2009-03-11 11:53           ` Ingo Molnar
  0 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-11 11:53 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > * Alan Stern <stern@rowland.harvard.edu> wrote:
> > 
> > > > Speaking of switch_to_thread_hw_breakpoint(), i dont like 
> > > > that function at all:
> > > > 
> > > > - why does it have to do a list of debug registers?
> > > 
> > > I'm not sure I understand the point of this question.  Are you 
> > > asking why the hw_breakpoint structures are stored on a list?  
> > > Because there can be an arbitrarily large number of them.
> > 
> > But that does not make much sense. There's just 4 hardware 
> > registers. There's no sane way to overcommit them hence we 
> > _should not_.
> 
> The number of hardware registers will vary according to the 
> architecture.  Our intention was to make the hardware 
> breakpoint interface architecture-neutral, as nearly as 
> possible.  Hence we decided to let callers register arbitrary 
> numbers of breakpoints, and inform them when the breakpoints 
> actually got installed in or uninstalled from the debug 
> registers.

This may sound like handwaving, but the thing is, it's best to do 
these kinds of things gradually. Keep it clean, design for sane 
hardware first (and x86, as a rare exception i guess, is rather 
sane when it comes to hw debug features), add quirks on an 
as-needed basis.

That principle is _especially_ true when a feature with 
borderline utility is merged. We had to do that with KGDB: had 
to strip down a decade of cruft and it really helped.

> If you think this design decision is a bad one, we can discuss 
> it.  But Roland should be involved, because it is in large 
> part his design.

Sure.

> > > > - why does it worry about IPIs arriving when context-switches on 
> > > >   x86 are always done with interrupts disabled?
> > > 
> > > The routine gets invoked at times other than during a 
> > > context switch.  However you may be right that these times 
> > > are all mutually exclusive.  If so then a good deal of 
> > > complication can be removed.
> > 
> > Yes.
> 
> After looking through it more carefully, I think you're right 
> -- if a kernel breakpoint change does occur while 
> switch_to_thread_hw_breakpoint() is running then the IPI will 
> arrive immediately afterward, so there's no need to check for 
> it explicitly.  (When this was written I probably wasn't aware 
> that interrupts are disabled during context switches.)  So all 
> the stuff involving "goto restart" can be removed.

Good - that certainly makes the code we execute during 
context-switch a lot more palatable.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 20:30             ` Alan Stern
@ 2009-03-11 12:12               ` Ingo Molnar
  2009-03-11 12:50                 ` K.Prasad
                                   ` (2 more replies)
  2009-03-14  3:41               ` Benjamin Herrenschmidt
  1 sibling, 3 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-11 12:12 UTC (permalink / raw)
  To: Alan Stern
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
> 
> > > More generally, it's there because kernel & userspace 
> > > breakpoints can be installed and uninstalled while a task is 
> > > running -- and yes, this is partially because breakpoints are 
> > > prioritized.  (Although it's worth pointing out that even your 
> > > suggestion of always prioritizing kernel breakpoints above 
> > > userspace breakpoints would have the same effect.)  However 
> > > the fact that the breakpoints are stored in a list rather than 
> > > an array doesn't seem to be relevant.
> > > 
> > > > A list needs to be maintained and when updated it's 
> > > > reloaded.
> > > 
> > > The same is true of an array.
> > 
> > Not if what we do what the previous code did: reloaded the full 
> > array unconditionally. (it's just 4 entries)
> 
> But that array still has to be set up somehow.  It is private 
> to the task; the only logical place to set it up is when the 
> CPU switches to that task.
> 
> In the old code, it wasn't possible for task B or the kernel 
> to affect the contents of task A's debug registers.  With 
> hw-breakpoints it _is_ possible, because the balance between 
> debug registers allocated to kernel breakpoints and debug 
> registers allocated to userspace breakpoints can change.  
> That's why the additional complexity is needed.

Yes - but we dont really need any scheduler complexity for this.

An IPI is enough to reload debug registers in an affected task 
(and calculate the real debug register layout) - and the next 
context switches will pick up changes automatically.

Am i missing anything? I'm trying to find the design that has 
the minimal possible complexity. (without killing any necessary 
features)

> > > Yes, kernel breakpoints have to be kept separate from 
> > > userspace breakpoints.  But even if you focus just on 
> > > userspace breakpoints, you still need to use a list 
> > > because debuggers can try to register an arbitrarily large 
> > > number of breakpoints.
> > 
> > That 'arbitrarily large number of breakpoints' worries me. 
> > It's a pretty broken concept for a 4-items resource that 
> > cannot be time-shared and hence cannot be overcommitted.
> 
> Suppose we never allow callers to register more breakpoints 
> than will fit in the CPU's registers.  Do we then use a simple 
> first-come first-served algorithm, with no prioritization?  If 
> we do prioritize some breakpoint registrations more highly 
> than others, how do we inform callers that their breakpoint 
> has been kicked out by one of higher priority?  And how do we 
> let them know when the higher-priority breakpoint has been 
> unregistered, so they can try again?

For an un-shareable resource like this (and this is really a 
rare case [and we shouldnt even consider switching between user 
and kernel debug registers at system call time]), the best 
approach is to have a rigid reservation mechanism with clear, 
hard, early failures in the overcommit case.

Silently breaking a user-space debugging session just because 
the admin has a debug register based system-wide profiling 
running, is pretty much the worst usage model. It does not give 
user-space any idea about what happened - the breakpoints just 
"dont work".

So i'd suggest a really simple scheme (depicted for x86 but 
applicable on other architectures too):

 - we have a system-wide resource of 4 debug registers.

 - kernel-side can allocate debug registers system-wide (it 
   takes effect on all CPUs, at once), up to 4 of them. The 5th 
   allocation will fail.

 - user-side uses the ptrace APIs - and if it runs into the 
   limit, ptrace should return a failure.

There's the following special case: the kernel reserves a debug 
register when there's tasks in the system that already have 
reserved all debug registers. I.e. the constraint was not known 
when the user-space session started, and the kernel violates it 
afterwards.

There's a couple of choices here, with various scales of 
conflict resolution:

 1- silently override the user-space breakpoint

 2- notify the user-space task via a signal - SIGXCPU or so.

 3- reject the kernel-space allocation with a sufficiently 
    informative log message: "task 123 already uses 4 debug 
    registers, cannot allocate more kernel breakpoints" - 
    leaving the resolution of the conflict to the admin.

#1 isnt particularly good because it brings back a
   'silent failure' mode.

#2 might be too brutal: starting something innocuous-looking
   might kill a debug session. OTOH user-space debuggers could 
   catch the signal and inform the user.

#3 is probably the most informative (and hence probably the
   best) variant. It also leaves policy of how to resolve the 
   conflict to the admin.

> > Seems to me that much of the complexity of this patchset:
> > 
> >  28 files changed, 2439 insertions(+), 199 deletions(-)
> > 
> > Could be eliminated via a very simple exclusive reservation 
> > mechanism.
> 
> Can it really be as simple as all that?

Would be nice to have it simple. Reluctance regarding this 
patchset is mostly rooted in that diffstat above.

The changes it does in the x86 architecture code are nice 
generalizations and cleanups. Both the scheduler, task 
startup/exit and ptrace bits look pretty sane in terms of 
factoring out debug register details. But the breakpoint 
management looks very complex.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:12               ` Ingo Molnar
@ 2009-03-11 12:50                 ` K.Prasad
  2009-03-11 13:10                   ` Ingo Molnar
  2009-03-11 16:39                   ` Alan Stern
  2009-03-11 16:32                 ` Alan Stern
  2009-03-14  3:43                 ` Benjamin Herrenschmidt
  2 siblings, 2 replies; 71+ messages in thread
From: K.Prasad @ 2009-03-11 12:50 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Wed, Mar 11, 2009 at 01:12:20PM +0100, Ingo Molnar wrote:
> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > On Tue, 10 Mar 2009, Ingo Molnar wrote:
> > 
> > > > More generally, it's there because kernel & userspace 
> > > > breakpoints can be installed and uninstalled while a task is 
> > > > running -- and yes, this is partially because breakpoints are 
> > > > prioritized.  (Although it's worth pointing out that even your 
> > > > suggestion of always prioritizing kernel breakpoints above 
> > > > userspace breakpoints would have the same effect.)  However 
> > > > the fact that the breakpoints are stored in a list rather than 
> > > > an array doesn't seem to be relevant.
> > > > 
> > > > > A list needs to be maintained and when updated it's 
> > > > > reloaded.
> > > > 
> > > > The same is true of an array.
> > > 
> > > Not if what we do what the previous code did: reloaded the full 
> > > array unconditionally. (it's just 4 entries)
> > 
> > But that array still has to be set up somehow.  It is private 
> > to the task; the only logical place to set it up is when the 
> > CPU switches to that task.
> > 
> > In the old code, it wasn't possible for task B or the kernel 
> > to affect the contents of task A's debug registers.  With 
> > hw-breakpoints it _is_ possible, because the balance between 
> > debug registers allocated to kernel breakpoints and debug 
> > registers allocated to userspace breakpoints can change.  
> > That's why the additional complexity is needed.
> 
> Yes - but we dont really need any scheduler complexity for this.
> 
> An IPI is enough to reload debug registers in an affected task 
> (and calculate the real debug register layout) - and the next 
> context switches will pick up changes automatically.
> 
> Am i missing anything? I'm trying to find the design that has 
> the minimal possible complexity. (without killing any necessary 
> features)
> 
> > > > Yes, kernel breakpoints have to be kept separate from 
> > > > userspace breakpoints.  But even if you focus just on 
> > > > userspace breakpoints, you still need to use a list 
> > > > because debuggers can try to register an arbitrarily large 
> > > > number of breakpoints.
> > > 
> > > That 'arbitrarily large number of breakpoints' worries me. 
> > > It's a pretty broken concept for a 4-items resource that 
> > > cannot be time-shared and hence cannot be overcommitted.
> > 
> > Suppose we never allow callers to register more breakpoints 
> > than will fit in the CPU's registers.  Do we then use a simple 
> > first-come first-served algorithm, with no prioritization?  If 
> > we do prioritize some breakpoint registrations more highly 
> > than others, how do we inform callers that their breakpoint 
> > has been kicked out by one of higher priority?  And how do we 
> > let them know when the higher-priority breakpoint has been 
> > unregistered, so they can try again?
> 
> For an un-shareable resource like this (and this is really a 
> rare case [and we shouldnt even consider switching between user 
> and kernel debug registers at system call time]), the best 
> approach is to have a rigid reservation mechanism with clear, 
> hard, early failures in the overcommit case.
> 
> Silently breaking a user-space debugging sessions just because 
> the admin has a debug register based system-wide profiling 
> running, is pretty much the worst usage model. It does not give 
> user-space any idea about what happened - the breakpoints just 
> "dont work".
> 
> So i'd suggest a really simple scheme (depicted for x86 bug 
> applicable on other architectures too):
> 
>  - we have a system-wide resource of 4 debug registers.
> 
>  - kernel-side can allocate debug registers system-wide (it 
>    takes effect on all CPUs, at once), up to 4 of them. The 5th 
>    allocation will fail.
> 
>  - user-side uses the ptrace APIs - and if it runs into the 
>    limit, ptrace should return a failure.
> 
> There's the following special case: the kernel reserves a debug 
> register when there's tasks in the system that already have 
> reserved all debug registers. I.e. the constraint was not known 
> when the user-space session started, and the kernel violates it 
> afterwards.
> 
> There's a couple of choices here, with various scales of 
> conflict resolution:
> 
>  1- silently override the user-space breakpoint
> 
>  2- notify the user-space task via a signal - SIGXCPU or so.
> 
>  3- reject the kernel-space allocation with a sufficiently 
>     informative log message: "task 123 already uses 4 debug 
>     registers, cannot allocate more kernel breakpoints" - 
>     leaving the resolution of the conflict to the admin.
> 
> #1 isnt particularly good because it brings back a
>    'silentfailure' mode.
> 
> #2 might be too brutal: starting something innocous-looking
>    might kill a debug session. OTOH user-space debuggers could 
>    catch the signal and inform the user.
> 
> #3 is probably the most informative (and hence probably the
>    best) variant. It also leaves policy of how to resolve the 
>    conflict to the admin.
> 

While reserving more discussions after Roland posts his views, I thought
I'd share some of mine here.

The present implementation can be likened to #3 except that the
uninstalled() callback is invoked (the user-space call through ptrace
takes a higher priority and evicts the kernel-space requests even now).

After the task using four debug registers yields the CPU, the
kernel-space breakpoint requests are 'restored' and installed() is
called again.

Even if #3 was implemented as described, we would still retain a
majority of the complexity in balance_kernel_vs_user() to check newer
tasks with requests for breakpoint registers.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-10 14:50       ` Ingo Molnar
@ 2009-03-11 12:57         ` K.Prasad
  2009-03-11 13:35           ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: K.Prasad @ 2009-03-11 12:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Tue, Mar 10, 2009 at 03:50:36PM +0100, Ingo Molnar wrote:
> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > On Tue, 10 Mar 2009, Ingo Molnar wrote:
> > 
> > > * prasad@linux.vnet.ibm.com <prasad@linux.vnet.ibm.com> wrote:
> > > 
> > > 
> > > > +static u8			tprio[HB_NUM];	/* Thread bp max priorities */
> > > > +LIST_HEAD(kernel_bps);			/* Kernel breakpoint list */
> > > > +static LIST_HEAD(thread_list);			/* thread_hw_breakpoint list */
> > > > +DEFINE_PER_CPU(struct cpu_hw_breakpoint, cpu_bp);
> > 
> > If nobody minds, I'll answer some of these questions on 
> > Prasad's behalf because they address parts of the code that 
> > were written before he took over the project.
> > 
> > > hm, why do we need the whole 'priority' mechanism? It seems 
> > > very over-designed to me.
> > 
> > This was done at Roland McGrath's express request.  We should 
> > see what he has to say about it.
> > 
> > > The likelyhood of both user-space and kernel-space to use 
> > > hw-breakpoints is very low to begin with. And if they use 
> > > them, the likelyhood of there being more than 4 debugregs 
> > > required in the same context is even lower.
> > 
> > Not all architectures have 4 debug registers.  Most have only 
> > one.
> >
> > > If that happens we shouldnt try to be too smart about them - 
> > > just override user-space ones with kernel space ones and 
> > > that's it. No explicit priorities are needed.
> > 
> > Roland really did not want it done this way.
> 
> Well i guess i'll have to wait for Roland's reply then.
> 
> 	Ingo

For the benefit of continuing discussion on this topic, here's an
extract from an old mail (http://lkml.org/lkml/2007/2/5/465) from
Roland, explaining the need for prioritisation of requests. It must have
been utrace as a potential user that made him suggest this.

"I am all in favor of a facility to manage shared use of the debug
registers, such as your debugreg.h additions.  I just think it needs to be
a little more flexible.  An unobtrusive kernel facility has to get out of
the way when user-mode decides to use all its debug registers.  It's not
immediately important what it's going to do about it when contention arises,
but there has to be a way for the user-mode facilities to say they need to
allocate debugregs with priority and evict other squatters.  So, something
like code allocating a debugreg can supply a callback that's made when its
allocation has to be taken by something with higher priority.  

Even after utrace, there will always be the possibility of a traditional
uncoordinated user of the raw debug registers, if nothing else ptrace
compatibility will always be there for old users.  So anything new and
fancy needs to be prepared to back out of the way gracefully.  In the case
of kwatch, it can just have a handler function given by the caller to start
with.  It's OK if individual callers can specially declare "I am not
well-behaved" and eat debugregs so that well-behaved high-priority users
like ptrace just have to lose (breaking compatibility).  But no
well-behaved caller of kwatch will do that.

I certainly intend for later features based on utrace to include
higher-level treatment of watchpoints so that user debugging facilities can
also become responsive to debugreg allocation pressure.  (Eventually, the
user facilities might have easier ways of falling back to other methods and
getting out of the way of kernel debugreg consumers, than can be done for
the kernel-mode-tracing facilities.)  To that end, I'd like to see a clear
and robust interface for debugreg sharing, below the level of kwatch."

Thanks,
K.Prasad



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:50                 ` K.Prasad
@ 2009-03-11 13:10                   ` Ingo Molnar
  2009-03-14  3:46                     ` Benjamin Herrenschmidt
  2009-03-11 16:39                   ` Alan Stern
  1 sibling, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-11 13:10 UTC (permalink / raw)
  To: K.Prasad
  Cc: Alan Stern, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* K.Prasad <prasad@linux.vnet.ibm.com> wrote:

> Even if #3 was implemented as described, we would still retain 
> a majority of the complexity in balance_kernel_vs_user() to 
> check newer tasks with requests for breakpoint registers.

Not if it's implemented in a really simple way:

Kernel gets debug registers in db4..db3..db2..db1 order, and its 
allocation is essentially hardcoded - i.e. we dont try to be 
fancy.

User-space (gdb) on the other hand will try to allocate in the 
db1..db2..db3..db4 order.

Maintain a 'max debug register index' value driven by ptrace and 
maintain a 'min debug register index' driven by kernel-space 
hw-breakpoint allocations.

If they meet somewhere in between then we have overcommit which 
we dont allow. In all other cases (which i proffer covers 100% 
of the sane cases) they will mix amicably.

Sure, user-space can in principle do db4..db3..db2..db1 
allocations as well, but it would be silly and GDB does not do 
that.

So there's no real overlap between register usage - hence no 
need for any complex scheduling smarts. Keep it simple first, 
and only add complexity when it's justified.

[ for the special case of an architecture having just a single 
  debug register this will bring the expected behavior of either 
  allowing gdb to use the breakpoint or allowing the kernel to 
  use it. ]

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 01/11] Introducing generic hardware breakpoint handler interfaces
  2009-03-11 12:57         ` K.Prasad
@ 2009-03-11 13:35           ` Ingo Molnar
  0 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-11 13:35 UTC (permalink / raw)
  To: K.Prasad
  Cc: Alan Stern, Andrew Morton, Linux Kernel Mailing List, Roland McGrath


* K.Prasad <prasad@linux.vnet.ibm.com> wrote:

> For the benefit of continuing discussion on this topic, here's 
> an extract from an old mail 
> (http://lkml.org/lkml/2007/2/5/465) from Roland, explaining 
> the need for prioritisation of requests. It must have been 
> utrace as a potential user that made him suggest this.
> 
> "I am all in favor of a facility to manage shared use of the 
> debug registers, such as your debugreg.h additions.  I just 
> think it needs to be a little more flexible.  An unobtrusive 
> kernel facility has to get out of the way when user-mode 
> decides to use all its debug registers.  It's not immediately 
> important what it's going to about it when contention arises, 
> but there has to be a way for the user-mode facilities to say 
> they need to allocate debugregs with priority and evict other 
> squatters.  So, something like code allocating a debugreg can 
> supply a callback that's made when its allocation has to taken 
> by something with higher priority.
> 
> Even after utrace, there will always be the possibility of a 
> traditional uncoordinated user of the raw debug registers, if 
> nothing else ptrace compatibility will always be there for old 
> users.  So anything new and fancy needs to be prepared to back 
> out of the way gracefully.  In the case of kwatch, it can just 
> have a handler function given by the caller to start with.  
> It's OK if individual callers can specially declare "I am not 
> well-behaved" and eat debugregs so that well-behaved 
> high-priority users like ptrace just have to lose (breaking 
> compatibility).  But no well-behaved caller of kwatch will do 
> that.
> 
> I certainly intend for later features based on utrace to 
> include higher-level treatment of watchpoints so that user 
> debugging facilities can also become responsive to debugreg 
> allocation pressure.  (Eventually, the user facilities might 
> have easier ways of falling back to other methods and getting 
> out of the way of kernel debugreg consumers, than can be done 
> for the kernel-mode-tracing facilities.)  To that end, I'd 
> like to see a clear and robust interface for debugreg sharing, 
> below the level of kwatch."

This argument ignores the reality of debug registers: 
overcommitted usage of them causes silent failures and
unobvious behavior.

I think the simple reservation scheme i outlined in the
previous mail is the minimum amount of complexity that
still gets kernel-space hw-breakpoints going robustly.
If we add anything more fancy we want it based on actual
need and desire.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:12               ` Ingo Molnar
  2009-03-11 12:50                 ` K.Prasad
@ 2009-03-11 16:32                 ` Alan Stern
  2009-03-11 17:41                   ` K.Prasad
  2009-03-14  3:43                 ` Benjamin Herrenschmidt
  2 siblings, 1 reply; 71+ messages in thread
From: Alan Stern @ 2009-03-11 16:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Wed, 11 Mar 2009, Ingo Molnar wrote:

> > > Not if we do what the previous code did: reload the full 
> > > array unconditionally. (it's just 4 entries)
> > 
> > But that array still has to be set up somehow.  It is private 
> > to the task; the only logical place to set it up is when the 
> > CPU switches to that task.
> > 
> > In the old code, it wasn't possible for task B or the kernel 
> > to affect the contents of task A's debug registers.  With 
> > hw-breakpoints it _is_ possible, because the balance between 
> > debug registers allocated to kernel breakpoints and debug 
> > registers allocated to userspace breakpoints can change.  
> > That's why the additional complexity is needed.
> 
> Yes - but we dont really need any scheduler complexity for this.
> 
> An IPI is enough to reload debug registers in an affected task 
> (and calculate the real debug register layout) - and the next 
> context switches will pick up changes automatically.
> 
> Am i missing anything? I'm trying to find the design that has 
> the minimal possible complexity. (without killing any necessary 
> features)

I think you _are_ missing something, though it's not clear what.

"and the next context switches will pick up changes automatically" --
that may not be entirely right.  Yes, the next context switch will pick
up the changes to DR0-3, but it won't necessarily pick up the changes
to DR7.  However the details depend very much on how debug registers
are allocated; with no priorities or evictions much of the complexity
will disappear anyway.

> For an un-shareable resource like this (and this is really a 
> rare case [and we shouldnt even consider switching between user 
> and kernel debug registers at system call time]), the best 
> approach is to have a rigid reservation mechanism with clear, 
> hard, early failures in the overcommit case.
> 
> Silently breaking a user-space debugging sessions just because 
> the admin has a debug register based system-wide profiling 
> running, is pretty much the worst usage model. It does not give 
> user-space any idea about what happened - the breakpoints just 
> "dont work".
> 
> So i'd suggest a really simple scheme (depicted for x86 but 
> applicable on other architectures too):
> 
>  - we have a system-wide resource of 4 debug registers.
> 
>  - kernel-side can allocate debug registers system-wide (it 
>    takes effect on all CPUs, at once), up to 4 of them. The 5th 
>    allocation will fail.
> 
>  - user-side uses the ptrace APIs - and if it runs into the 
>    limit, ptrace should return a failure.

Roland, of course, is all in favor of making hw-breakpoints compatible 
with utrace.  The API should be flexible enough to encompass both 
legacy ptrace and utrace.

> There's the following special case: the kernel reserves a debug 
> register when there's tasks in the system that already have 
> reserved all debug registers. I.e. the constraint was not known 
> when the user-space session started, and the kernel violates it 
> afterwards.

Right.  Or the kernel tries to allocate 2 debug registers when 
userspace has already allocated 3, and so on...

> There's a couple of choices here, with various scales of 
> conflict resolution:
> 
>  1- silently override the user-space breakpoint
> 
>  2- notify the user-space task via a signal - SIGXCPU or so.
> 
>  3- reject the kernel-space allocation with a sufficiently 
>     informative log message: "task 123 already uses 4 debug 
>     registers, cannot allocate more kernel breakpoints" - 
>     leaving the resolution of the conflict to the admin.

We can't necessarily assign a particular task to the debug registers 
already in use.  There might be more than one task using them.  But of 
course we can always just say that they are already in use, and if 
necessary there could be a /proc interface with more information.

Besides, we have to be able to reject kernel breakpoint requests in any
case ("the 5th allocation will fail").

> #1 isnt particularly good because it brings back a
>    'silent failure' mode.

Agreed.

> #2 might be too brutal: starting something innocuous-looking
>    might kill a debug session. OTOH user-space debuggers could 
>    catch the signal and inform the user.
> 
> #3 is probably the most informative (and hence probably the
>    best) variant. It also leaves policy of how to resolve the 
>    conflict to the admin.

AFAICS, #3 really is "first come, first served".  What do you mean by 
"policy of how to resolve the conflict"?  It sounds like there are no 
policy choices involved; whoever requests the debug register first will 
get it.

> Would be nice to have it simple. Reluctance regarding this 
> patchset is mostly rooted in that diffstat above.

I'd be happy to implement #3.  Mostly it would just involve removing 
code from the patches.

> The changes it does in the x86 architecture code are nice 
> generalizations and cleanups. Both the scheduler, task 
> startup/exit and ptrace bits look pretty sane in terms of 
> factoring out debug register details. But the breakpoint 
> management looks very complex.

Yes, there's no denying it.  But I don't want to commit to any 
particular changes without Roland's input.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:50                 ` K.Prasad
  2009-03-11 13:10                   ` Ingo Molnar
@ 2009-03-11 16:39                   ` Alan Stern
  1 sibling, 0 replies; 71+ messages in thread
From: Alan Stern @ 2009-03-11 16:39 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Wed, 11 Mar 2009, K.Prasad wrote:

> The present implementation can be likened to #3 except that the
> uninstalled() callback is invoked (the user-space call through ptrace
> takes a higher priority and evicts the kernel-space requests even now).
> 
> After the task using four debug registers yields the CPU, the
> kernel-space breakpoint requests are 'restored' and installed() is
> called again.

No, that is wrong.  The kernel breakpoints do not get reinstalled until 
the userspace breakpoints are unregistered.  Merely yielding the CPU is 
not sufficient.

> Even if #3 was implemented as described, we would still retain a
> majority of the complexity in balance_kernel_vs_user() to check newer
> tasks with requests for breakpoint registers.

Some complexity is certainly needed, because at all times we need to
know the maximum number of breakpoints requested by any user task.  
The number of kernel breakpoints that can be allocated is limited to 4
minus this number.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 16:32                 ` Alan Stern
@ 2009-03-11 17:41                   ` K.Prasad
  2009-03-14  3:47                     ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 71+ messages in thread
From: K.Prasad @ 2009-03-11 17:41 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Andrew Morton, Linux Kernel Mailing List, Roland McGrath

On Wed, Mar 11, 2009 at 12:32:19PM -0400, Alan Stern wrote:
> On Wed, 11 Mar 2009, Ingo Molnar wrote:
> 
> > > > Not if what we do what the previous code did: reloaded the full 
> > > > array unconditionally. (it's just 4 entries)
> > > 
> > > But that array still has to be set up somehow.  It is private 
> > > to the task; the only logical place to set it up is when the 
> > > CPU switches to that task.
> > > 
> > > In the old code, it wasn't possible for task B or the kernel 
> > > to affect the contents of task A's debug registers.  With 
> > > hw-breakpoints it _is_ possible, because the balance between 
> > > debug registers allocated to kernel breakpoints and debug 
> > > registers allocated to userspace breakpoints can change.  
> > > That's why the additional complexity is needed.
> > 
> > Yes - but we dont really need any scheduler complexity for this.
> > 
> > An IPI is enough to reload debug registers in an affected task 
> > (and calculate the real debug register layout) - and the next 
> > context switches will pick up changes automatically.
> > 
> > Am i missing anything? I'm trying to find the design that has 
> > the minimal possible complexity. (without killing any necessary 
> > features)
> 
> I think you _are_ missing something, though it's not clear what.
> 
> "and the next context switches will pick up changes automatically" --
> that may not be entirely right.  Yes, the next context switch will pick
> up the changes to DR0-3, but it won't necessarily pick up the changes
> to DR7.  However the details depend very much on how debug registers
> are allocated; with no priorities or evictions much of the complexity
> will disappear anyway.
> 
> > For an un-shareable resource like this (and this is really a 
> > rare case [and we shouldnt even consider switching between user 
> > and kernel debug registers at system call time]), the best 
> > approach is to have a rigid reservation mechanism with clear, 
> > hard, early failures in the overcommit case.
> > 
> > Silently breaking a user-space debugging sessions just because 
> > the admin has a debug register based system-wide profiling 
> > running, is pretty much the worst usage model. It does not give 
> > user-space any idea about what happened - the breakpoints just 
> > "dont work".
> > 
> > So i'd suggest a really simple scheme (depicted for x86 but 
> > applicable on other architectures too):
> > 
> >  - we have a system-wide resource of 4 debug registers.
> > 
> >  - kernel-side can allocate debug registers system-wide (it 
> >    takes effect on all CPUs, at once), up to 4 of them. The 5th 
> >    allocation will fail.
> > 
> >  - user-side uses the ptrace APIs - and if it runs into the 
> >    limit, ptrace should return a failure.
> 
> Roland, of course, is all in favor of making hw-breakpoints compatible 
> with utrace.  The API should be flexible enough to encompass both 
> legacy ptrace and utrace.
> 
> > There's the following special case: the kernel reserves a debug 
> > register when there's tasks in the system that already have 
> > reserved all debug registers. I.e. the constraint was not known 
> > when the user-space session started, and the kernel violates it 
> > afterwards.
> 
> Right.  Or the kernel tries to allocate 2 debug registers when 
> userspace has already allocated 3, and so on...
> 
> > There's a couple of choices here, with various scales of 
> > conflict resolution:
> > 
> >  1- silently override the user-space breakpoint
> > 
> >  2- notify the user-space task via a signal - SIGXCPU or so.
> > 
> >  3- reject the kernel-space allocation with a sufficiently 
> >     informative log message: "task 123 already uses 4 debug 
> >     registers, cannot allocate more kernel breakpoints" - 
> >     leaving the resolution of the conflict to the admin.
> 
> We can't necessarily assign a particular task to the debug registers 
> already in use.  There might be more than one task using them.  But of 
> course we can always just say that they are already in use, and if 
> necessary there could be a /proc interface with more information.
> 
> Besides, we have to be able to reject kernel breakpoint requests in any
> case ("the 5th allocation will fail").
> 
> > #1 isnt particularly good because it brings back a
> >    'silent failure' mode.
> 
> Agreed.
> 
> > #2 might be too brutal: starting something innocuous-looking
> >    might kill a debug session. OTOH user-space debuggers could 
> >    catch the signal and inform the user.
> > 
> > #3 is probably the most informative (and hence probably the
> >    best) variant. It also leaves policy of how to resolve the 
> >    conflict to the admin.
> 
> AFAICS, #3 really is "first come, first served".  What do you mean by 
> "policy of how to resolve the conflict"?  It sounds like there are no 
> policy choices involved; whoever requests the debug register first will 
> get it.
>

With FCFS or an allocation mechanism without the (un)installed()
callbacks we'd lose the ability to record requests and service them
later when registers become available.

Say when (un)installed() callbacks are implemented for the proposed
ftrace-plugin to trace kernel symbols, they can automatically stop/start
tracing as and when registers become (un)available. This can be helpful when
we wish to profile memory access over a kernel variable for a long duration
(where small loss of tracing data can be tolerated), while the system would
permit simultaneous user-space access (say a GDB session using 'hbreak').

Are we fine with disallowing such usage? If we do, the requester of the
breakpoint register will have to 'poll' periodically to check availability.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 04/11] Introduce virtual debug register in thread_struct and wrapper-routines around process related functions
  2009-03-10 14:35   ` Ingo Molnar
  2009-03-10 15:53     ` Alan Stern
@ 2009-03-12  2:26     ` Roland McGrath
  1 sibling, 0 replies; 71+ messages in thread
From: Roland McGrath @ 2009-03-12  2:26 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Alan Stern

> detached from thread_struct? There's a lot of complications 
> (alloc/free, locking, etc.) from this for no good reason - the 
> hardware-breakpoints info structure is always per thread and is 
> quite small, so there's no reason not to embed it directly 
> inside thread_struct.

I do certainly think it's worthwhile to use a coherent struct type here
rather than many fields in thread_struct, independent of the allocation for
it being direct or indirect (not that you objected to that).  That makes
the code read cleaner, and should make it a minor change to most of the
code later if the allocation plan changes.

I think in the original effort another motivating factor was concern about
bloating the size of thread_struct.  The struct thread_hw_breakpoint is at
least a few times the size of the set of old fields it replaces.  We were
concerned that inflating every task's thread_struct for the benefit of
these rarely-used fancy new features might meet resistance from arch
maintainers like you.  If that issue doesn't hold you back from taking the
new code, then I think we are more than happy to start with thread_struct
directly containing a struct thread_hw_breakpoint member.

I do think we'll want to make it a pointer with later incremental changes.
(But those may not need to come very soon.)  Firstly, the size reduction to
task_struct is fairly compelling since it's for the vast majority of tasks
which never need to allocate it.  The hair potential is really not very
much at least to begin with, if you just make it allocate on first setup
and never free (no locking et al, just if (thread->hwbkpt) ...).  Second,
eventually we'd like to have the possibility of sharing the struct among
threads.  This will come later on when we have higher-level things that
would like to set common watchpoints on a whole group of threads (what
debuggers usually really do).  Such APIs are far improved and optimized by
updating many threads together, and when a big app has thousands of threads
to which all the same watchpoints apply, sharing at low level makes many of
those intraprocess context switches quicker.  As I said, all in the future.
But it's far from being entirely baseless to think a pointer makes good sense.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 14:09   ` Ingo Molnar
  2009-03-10 14:59     ` Alan Stern
@ 2009-03-12  2:46     ` Roland McGrath
  2009-03-13  3:43       ` Ingo Molnar
  2009-03-14  3:51       ` Benjamin Herrenschmidt
  1 sibling, 2 replies; 71+ messages in thread
From: Roland McGrath @ 2009-03-12  2:46 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Alan Stern

Perhaps it would help if asm-generic/hw_breakpoint.h had some kerneldoc
comments for the arch-specific functions that the arch's asm/hw_breakpoint.h
must define (in the style of asm-generic/syscall.h).  I note that Ingo
didn't have any comments about asm-generic/hw_breakpoint.h in his review.
Its purpose should be to make any arch maintainer understand why the API it
specifies for each arch to meet makes sense across the arch's.

> why this redirection, why dont just use the structure as-is? If 
> there's any arch weirdness then that arch should have 
> arch-special accessors - not the generic code.

The fields of arch_hw_breakpoint are arch-specific.  Another arch's
struct will not have .type and .len fields at all.  e.g., on powerpc
there is just one size supported, so hw_breakpoint_get_len() would be an
inline returning a constant.  Its type is encoded in low bits of the
address word, and the arch implementation may not want to use bit-field
called .type for that (and if it did, it couldn't use a bit-field called
.address with the meaning you'd want it to have).  

Having any fields in arch_hw_breakpoint at all be part of the API
restricts the arch implementation unreasonably.  So it has accessors to
fetch them instead.  (Arguably we could punt those accessors from the
API for hw_breakpoint users, but the arch-independent part of the
hw_breakpoint implementation might still want them, I'm not sure.)
Likewise, they need to be filled in by setters or by explicit type/len
arguments to the registration calls.  This appears to be a tenet we
worked out the first time around that has gotten lost in the shuffle
more recently.

I think it would be illustrative to have a second arch implementation to
compare to the x86 one.  Ingo has a tendency to pretend everything is an
x86 until shown the concrete evidence.  The obvious choice is powerpc.
Its facility is very simple, so the arch-specific part of the
implementation should be trivial--it's the "base case" of simplest
available hw_breakpoint arch, really.  Also, it happens that Prasad's
employer is interested in having that support.

For example, a sensible powerpc implementation would clearly demonstrate
why you need accessors or at least either pre-registration setters or
explicit type/len arguments in registration calls.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 08/11] Modify Ptrace routines to access breakpoint registers
  2009-03-10 14:40   ` Ingo Molnar
  2009-03-10 15:54     ` Alan Stern
@ 2009-03-12  3:14     ` Roland McGrath
  1 sibling, 0 replies; 71+ messages in thread
From: Roland McGrath @ 2009-03-12  3:14 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Alan Stern

> I dont see where this security check has been carried over into 
> the generic code. The new code has:
> 
>  +int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
>  +{
>  +       return (va < TASK_SIZE);
>  +}
> 
> but i think that misses the detail that it's not just the start 
> address of an x86 breakpoint that has to be considered, but also 
> the end address of it.

It also needs to be TASK_SIZE_OF(tsk), which is shorthand for the same
logic already in the 64-bit debugreg_addr_limit().  

For the end-of-range issue, it perhaps ought to check size-1 instead of
wordsize-1, i.e. through the end of the actual breakpoint range, not of the
word containing it.  What debugreg_addr_limit() does is the historical
ptrace check on x86, but I don't see a reason to disallow a 1-byte
watchpoint on the last addressable user-space byte if the hardware will
support it.

So either the arch check should take a size parameter, or the
arch-independent code can just call it with address+size-1.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-12  2:46     ` Roland McGrath
@ 2009-03-13  3:43       ` Ingo Molnar
  2009-03-13 14:04         ` Alan Stern
  2009-03-14  3:51       ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-13  3:43 UTC (permalink / raw)
  To: Roland McGrath
  Cc: prasad, Andrew Morton, Linux Kernel Mailing List, Alan Stern


* Roland McGrath <roland@redhat.com> wrote:

> Perhaps it would help if asm-generic/hw_breakpoint.h had some 
> kerneldoc comments for the arch-specific functions that the 
> arch's asm/hw_breakpoint.h must define (in the style of 
> asm-generic/syscall.h).  I note that Ingo didn't have any 
> comments about asm-generic/hw_breakpoint.h in his review. Its 
> purpose should be to make any arch maintainer understand why 
> the API it specifies for each arch to meet makes sense across 
> the arch's.
> 
> > why this redirection, why dont just use the structure as-is? 
> > If there's any arch weirdness then that arch should have 
> > arch-special accessors - not the generic code.
> 
> The fields of arch_hw_breakpoint are arch-specific.  Another 
> arch's struct will not have .type and .len fields at all.  
> e.g., on powerpc there is just one size supported, so 
> hw_breakpoint_get_len() would be an inline returning a 
> constant.  Its type is encoded in low bits of the address 
> word, and the arch implementation may not want to use 
> bit-field called .type for that (and if it did, it couldn't 
> use a bit-field called .address with the meaning you'd want it 
> to have).
> 
> Having any fields in arch_hw_breakpoint at all be part of the 
> API restricts the arch implementation unreasonably.  So it has 
> accessors to fetch them instead.  (Arguably we could punt 
> those accessors from the API for hw_breakpoint users, but the 
> arch-independent part of the hw_breakpoint implementation 
> might still want them, I'm not sure.) Likewise, they need to 
> be filled in by setters or by explicit type/len arguments to 
> the registration calls.  This appears to be a tenet we worked 
> out the first time around that has gotten lost in the shuffle 
> more recently.
> 
> I think it would be illustrative to have a second arch 
> implementation to compare to the x86 one.  Ingo has a tendency 
> to pretend everything is an x86 until shown the concrete 
> evidence.  The obvious choice is powerpc. Its facility is very 
> simple, so the arch-specific part of the implementation should 
> be trivial--it's the "base case" of simplest available 
> hw_breakpoint arch, really.  Also, it happens that Prasad's 
> employer is interested in having that support.
> 
> For example, a sensible powerpc implementation would clearly 
> demonstrate why you need accessors or at least either 
> pre-registration setters or explicit type/len arguments in 
> registration calls.

That would help. I indeed have a tendency to strike out code 
that's not immediately needed, i also tend to make sure that 
design is sane on the platform that 95%+ of our active 
developers/users use.

The core issue being discussed is the debug register allocation 
and scheduling model though, and you have not directly commented 
on that.

My argument in a nutshell is that a bottom-up for user + 
top-down for kernel use static allocator with no dynamic 
scheduling will get us most of the benefits with a tenth of the 
complexity.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13  3:43       ` Ingo Molnar
@ 2009-03-13 14:04         ` Alan Stern
  2009-03-13 14:13           ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Alan Stern @ 2009-03-13 14:04 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Roland McGrath, prasad, Andrew Morton, Linux Kernel Mailing List

On Fri, 13 Mar 2009, Ingo Molnar wrote:

> The core issue being discussed is the debug register allocation 
> and scheduling model though, and you have not directly commented 
> on that.
> 
> My argument in a nutshell is that a bottom-up for user + 
> top-down for kernel use static allocator with no dynamic 
> scheduling will get us most of the benefits with a tenth of the 
> complexity.

Take this even farther: We shouldn't restrict userspace to bottom-up
register allocation.  With very little additional effort we can
virtualize the debug registers; then userspace can allocate them in
whatever order it wants and still end up using the physical registers
in bottom-up order (or top-down, which is the order used by the current
patches).

After all, there's nothing to prevent programs other than gdb from 
using ptrace, and there's no guarantee that gdb will continue to 
allocate registers in increasing order.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13 14:04         ` Alan Stern
@ 2009-03-13 14:13           ` Ingo Molnar
  2009-03-13 19:01             ` K.Prasad
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-13 14:13 UTC (permalink / raw)
  To: Alan Stern
  Cc: Roland McGrath, prasad, Andrew Morton, Linux Kernel Mailing List


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Fri, 13 Mar 2009, Ingo Molnar wrote:
> 
> > The core issue being discussed is the debug register 
> > allocation and scheduling model though, and you have not 
> > directly commented on that.
> > 
> > My argument in a nutshell is that a bottom-up for user + 
> > top-down for kernel use static allocator with no dynamic 
> > scheduling will get us most of the benefits with a tenth of 
> > the complexity.
> 
> Take this even farther: We shouldn't restrict userspace to 
> bottom-up register allocation.  With very little additional 
> effort we can virtualize the debug registers; then userspace 
> can allocate them in whatever order it wants and still end up 
> using the physical registers in bottom-up order (or top-down, 
> which is the order used by the current patches).
> 
> After all, there's nothing to prevent programs other than gdb 
> from using ptrace, and there's no guarantee that gdb will 
> continue to allocate registers in increasing order.

In ~10 years of its existence no such usage arose, so i dont 
think it will magically appear now.

The thing is, kernel-side use of debug registers is a borderline 
item whose impact we should minimalize as much as possible. 
Linus in the past expressed that it is fine to not have _any_ 
management of user versus kernel debug registers. So we want to 
approach this from the minimalistic side. I offered such a very 
minimal design that is trivial in terms of correctness and 
impact.

We can still get this simple allocation model into .30 if we 
dont waste time arguing about it unnecessarily. If someone runs 
into limitations the model can be extended.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13 14:13           ` Ingo Molnar
@ 2009-03-13 19:01             ` K.Prasad
  2009-03-13 21:21               ` Alan Stern
  0 siblings, 1 reply; 71+ messages in thread
From: K.Prasad @ 2009-03-13 19:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, Roland McGrath, Andrew Morton, Linux Kernel Mailing List

On Fri, Mar 13, 2009 at 03:13:04PM +0100, Ingo Molnar wrote:
> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > On Fri, 13 Mar 2009, Ingo Molnar wrote:
> > 
> > > The core issue being discussed is the debug register 
> > > allocation and scheduling model though, and you have not 
> > > directly commented on that.
> > > 
> > > My argument in a nutshell is that a bottom-up for user + 
> > > top-down for kernel use static allocator with no dynamic 
> > > scheduling will get us most of the benefits with a tenth of 
> > > the complexity.
> > 
> > Take this even farther: We shouldn't restrict userspace to 
> > bottom-up register allocation.  With very little additional 
> > effort we can virtualize the debug registers; then userspace 
> > can allocate them in whatever order it wants and still end up 
> > using the physical registers in bottom-up order (or top-down, 
> > which is the order used by the current patches).
> > 
> > After all, there's nothing to prevent programs other than gdb 
> > from using ptrace, and there's no guarantee that gdb will 
> > continue to allocate registers in increasing order.
> 
> If in ~10 years of its existence no such usage arose so i dont 
> think it will magically appear now.
> 
> The thing is, kernel-side use of debug registers is a borderline 
> item whose impact we should minimalize as much as possible. 
> Linus in the past expressed that it is fine to not have _any_ 
> management of user versus kernel debug registers. So we want to 
> approach this from the minimalistic side. I offered such a very 
> minimal design that is trivial in terms of correctness and 
> impact.
> 
> We can still get this simple allocation model into .30 if we 
> dont waste time arguing about unnecessarily. If someone runs 
> into limitations the model can be extended.
> 
> 	Ingo

Here's a summary of the intended changes to the patchset, which I hope
to post early the following week. It tears down many features in the
present submission (The write-up below is done without the benefit of
actually having run into limitations while trying to chisel out code).

- Adopt a static allocation method for registers, say FCFS (and perhaps
  bottom-up for user-space allocations and the reverse for
  kernel-space), although individual counters to do book-keeping should also
  suffice.

- Use an array of HB_NUM size for storing the breakpoint requests (and
  not a linked-list implementation as done now).

- Define a HAVE_HW_BREAKPOINTS in arch/x86/Kconfig unconditionally, but
  build kernel/hw_breakpoint.o, samples/hw_breakpoint/data_breakpoint.o
  and kernel/trace/trace_ksym.o conditionally if
  HAVE_HW_BREAKPOINTS is declared. Declaring this flag will help
  a)prevent build failures in other archs b)Prevent ftrace from showing
  up availability of kernel symbol tracing even in unsupported archs.

- Simplify the switch_to_thread_hw_breakpoint() function (any help from
  Alan Stern here would be gladly accepted).

- Remove callbacks such as unregister/register.

- remove the code to implement prioritisation of requests

- Add histogram support to include a 'hit counter' to the traced kernel
  symbols.

- Address coding-style related comments.

Hope they are in sync with the comments received thus far. Let me
know if there are changes to be made.

Thanks,
K.Prasad


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13 19:01             ` K.Prasad
@ 2009-03-13 21:21               ` Alan Stern
  2009-03-14 12:24                 ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Alan Stern @ 2009-03-13 21:21 UTC (permalink / raw)
  To: K.Prasad
  Cc: Ingo Molnar, Roland McGrath, Andrew Morton, Linux Kernel Mailing List

On Sat, 14 Mar 2009, K.Prasad wrote:

> Here's a summary of the intended changes to the patchset, which I hope
> to post early the following week. It tears down many features in the
> present submission (The write-up below is done without the benefit of
> actually having run into limitations while trying to chisel out code).
> 
> - Adopt a static allocation method for registers, say FCFS (and perhaps
>   botton-up for user-space allocations and the reverse for
>   kernel-space), although individual counters to do book-keeping should also
>   suffice.

You can't enforce bottom-up allocation for userspace breakpoint
requests.  In fact, you'll have to add a parameter indicating which
debug register is requested.  The ptrace interface will use this
parameter; the utrace interface won't care so it will specify something
like HW_BREAKPOINT_ANY_REGISTER.

You will have to add an array of HB_NUM counters, to keep track of how
many tasks are using each debug register.

> - Use an array of HB_NUM size for storing the breakpoint requests (and
>   not a linked-list implementation as done now).
> 
> - Define a HAVE_HW_BREAKPOINTS in arch/x86/Kconfig unconditionally, but
>   build kernel/hw_breakpoint.o, samples/hw_breakpoint/data_breakpoint.o
>   and kernel/trace/trace_ksym.o build conditionally if
>   HAVE_HW_BREAKPOINTS is declared. Declaring this flag will help
>   a)prevent build failures in other archs b)Prevent ftrace from showing
>   up availability of kernel symbol tracing even in unsupported archs.

This isn't quite right.  At the moment kernel/hw_breakpoint.c isn't
built at all; instead it is #included by the corresponding
arch-specific source file.  Of course, you could change that.

> - Simplify the switch_to_thread_hw_breakpoint() function (any help from
>   Alan Stern here would be gladly accepted).

Sure.  It will depend on how you implement the other changes.

> - Remove callbacks such as unregister/register.
> 
> - remove the code to implement prioritisation of requests

Remove the inline accessors.  They can be added back when they are 
needed.

Some architectures have arbitrary-length debug regions, not 
fixed-length 1, 2, 4, or 8 bytes.  We should give some thought to 
making the interface compatible with such things.

> - Add histogram support to include a 'hit counter' to the traced kernel
>   symbols.
> 
> - Address coding-style related comments.
> 
> Hope they are not in sync with the comments received thus far. Let me
> know if there are changes to be made.

Another change we need to make is the way DR6 is passed to the debug 
notifier chain.  Currently it is passed by value when do_debug() 
calls notify_die().  Instead we need to pass it by reference so that 
the notifier routines can change its value.  Each time a notifier 
routine handles a breakpoint event, the corresponding bit in DR6 should 
be turned off.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 17:26           ` Ingo Molnar
  2009-03-10 20:30             ` Alan Stern
@ 2009-03-14  3:40             ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 71+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, prasad, Andrew Morton, Linux Kernel Mailing List,
	Roland McGrath

On Tue, 2009-03-10 at 18:26 +0100, Ingo Molnar wrote:
> 
> That 'arbitrarily larg number of breakpoints' worries me. It's a 
> pretty broken concept for a 4-items resource that cannot be 
> time-shared and hence cannot be overcommitted.
> 
> Seems to me that much of the complexity of this patchset:
> 
>  28 files changed, 2439 insertions(+), 199 deletions(-)
> 
> Could be eliminated via a very simple exclusive reservation 
> mechanism.
> 
I also have some worries about the bloat of this infrastructure,
especially in the context switching code.

I would prefer the arch to be in control of the state in the task struct
and just context switch the actual HW registers at that stage.

Ben.



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-10 20:30             ` Alan Stern
  2009-03-11 12:12               ` Ingo Molnar
@ 2009-03-14  3:41               ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 71+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:41 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, prasad, Andrew Morton, Linux Kernel Mailing List,
	Roland McGrath

On Tue, 2009-03-10 at 16:30 -0400, Alan Stern wrote:
> Suppose we never allow callers to register more breakpoints than will
> fit in the CPU's registers.  Do we then use a simple first-come
> first-served algorithm, with no prioritization?  If we do prioritize
> some breakpoint registrations more highly than others, how do we
> inform
> callers that their breakpoint has been kicked out by one of higher
> priority?  And how do we let them know when the higher-priority
> breakpoint has been unregistered, so they can try again?

Do we really need such a mess ? Honestly ... We've been living fine
before without any of that.

Ben.


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 12:12               ` Ingo Molnar
  2009-03-11 12:50                 ` K.Prasad
  2009-03-11 16:32                 ` Alan Stern
@ 2009-03-14  3:43                 ` Benjamin Herrenschmidt
  2 siblings, 0 replies; 71+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Stern, prasad, Andrew Morton, Linux Kernel Mailing List,
	Roland McGrath

On Wed, 2009-03-11 at 13:12 +0100, Ingo Molnar wrote:
> 
> #3 is probably the most informative (and hence probably the
>    best) variant. It also leaves policy of how to resolve the 
>    conflict to the admin.

Agreed.
> 
> Would be nice to have it simple. Reluctance regarding this 
> patchset is mostly rooted in that diffstat above.
> 
> The changes it does in the x86 architecture code are nice 
> generalizations and cleanups. Both the scheduler, task 
> startup/exit and ptrace bits look pretty sane in terms of 
> factoring out debug register details. But the breakpoint 
> management looks very complex

I agree there is some interest in generalization and cleanup, especially
as far as userspace APIs go, though it's a hard nut to crack as every
CPU family out there has some subtle differences in the way breakpoints
or watchpoints work (for example, alignment constraints, ability to do
ranges, the way they handle kernel vs. user, etc...)

I'm not yet sold.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 13:10                   ` Ingo Molnar
@ 2009-03-14  3:46                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 71+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: K.Prasad, Alan Stern, Andrew Morton, Linux Kernel Mailing List,
	Roland McGrath

On Wed, 2009-03-11 at 14:10 +0100, Ingo Molnar wrote:
> 
> Kernel gets debug registers in db4..db3..db2..db1 order, and its 
> allocation is essentially hardcoded - i.e. we dont try to be 
> fancy.
> 
> User-space (gdb) on the other hand will try to allocate in the 
> db1..db2..db3..db4 order.
> 
> Maintain a 'max debug register index' value driven by ptrace and 
> maintain a 'min debug register index' driven by kernel-space 
> hw-breakpoint allocations.

A few added details from the perspective of powerpc ...

breakpoints and watchpoints are separate resources with different
capacity depending on the chip, so far nothing fancy.

We also have the ability to do range breakpoints/watchpoints on some
processors by using pairs of registers, which adds some constraints to
the allocation.

We also have a value compare capability for watchpoints, but this can
also have a different capacity limitation from either the breakpoints
or the watchpoints themselves.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-11 17:41                   ` K.Prasad
@ 2009-03-14  3:47                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 71+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:47 UTC (permalink / raw)
  To: prasad
  Cc: Alan Stern, Ingo Molnar, Andrew Morton,
	Linux Kernel Mailing List, Roland McGrath

On Wed, 2009-03-11 at 23:11 +0530, K.Prasad wrote:
> With FCFS or an allocation mechanism without the (un)installed()
> callbacks we'd lose the ability to record requests and service them
> later when registers become availabile.
> 
> Say when (un)installed() callbacks are implemented for the proposed
> ftrace-plugin to trace kernel symbols, they can automatically stop/start
> tracing as and when registers become (un)available. This can be helpful when
> we wish to profile memory access over a kernel variable for a long duration
> (where small loss of tracing data can be tolerated), while the system would
> permit simultaneous user-space access (say a GDB session using 'hbreak').
> 
> Are we fine with disallowing such usage, which if done will let the requester
> of the breakpoint register 'poll' periodically to check availability.

Is that such a big deal ? Can't we just have the kernel degrade to
classic SW breakpoints ?

Smells like overengineering to me ...

Ben.



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-12  2:46     ` Roland McGrath
  2009-03-13  3:43       ` Ingo Molnar
@ 2009-03-14  3:51       ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 71+ messages in thread
From: Benjamin Herrenschmidt @ 2009-03-14  3:51 UTC (permalink / raw)
  To: Roland McGrath
  Cc: Ingo Molnar, prasad, Andrew Morton, Linux Kernel Mailing List,
	Alan Stern, David Gibson, Torez Smith

On Wed, 2009-03-11 at 19:46 -0700, Roland McGrath wrote:
> 
> I think it would be illustrative to have a second arch implementation to
> compare to the x86 one.  Ingo has a tendency to pretend everything is an
> x86 until shown the concrete evidence.  The obvious choice is powerpc.
> Its facility is very simple, so the arch-specific part of the
> implementation should be trivial--it's the "base case" of simplest
> available hw_breakpoint arch, really.  Also, it happens that Prasad's
> employer is interested in having that support.
> 
> For example, a sensible powerpc implementation would clearly demonstrate
> why you need accessors or at least either pre-registration setters or
> explicit type/len arguments in registration calls.

Well, we happen to be just in the middle of implementing support for
BookE HW debug facilities :-) (which have more HW breakpoints &
watchpoints than server PPCs along with fancy features like ranged
breakpoints or value compare) so it's a right time to give that a try.

I'm Ccing David Gibson and Torez Smith who both have been working on the
infrastructure to control the debug regs. For now we are just giving
pretty much direct access to the debug regs from ptrace (since they are
somewhat architected they are very similar if not identical between a
whole bunch of embedded powerpc's) but a more abstract interface would
be nice.

Ben.



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-13 21:21               ` Alan Stern
@ 2009-03-14 12:24                 ` Ingo Molnar
  2009-03-14 16:10                   ` Alan Stern
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2009-03-14 12:24 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Roland McGrath, Andrew Morton, Linux Kernel Mailing List


* Alan Stern <stern@rowland.harvard.edu> wrote:

> On Sat, 14 Mar 2009, K.Prasad wrote:
> 
> > Here's a summary of the intended changes to the patchset, which I hope
> > to post early the following week. It tears down many features in the
> > present submission (The write-up below is done without the benefit of
> > actually having run into limitations while trying to chisel out code).
> > 
> > - Adopt a static allocation method for registers, say FCFS (and perhaps
> >   botton-up for user-space allocations and the reverse for
> >   kernel-space), although individual counters to do book-keeping should also
> >   suffice.
> 
> You can't enforce bottom-up allocation for userspace breakpoint
> requests. [...]

That's not the point.

The point is to offer a reasonable and simple static allocator 
that will work fine with usual gdb usage. If something takes 
away db4 that's as if user-space took away all registers - tough 
luck.

You are trying to put complexity into a situation that is not 
schedulable hence not resolvable _anyway_. There's just 4 debug 
registers, not more. If the combined usage goes above four 
someone will lose anyway - even with your allocator.

With my proposal the 'loss' can indeed come sooner if user-space 
took db4 and there's nothing left for the kernel anymore - but 
that's just an uninteresting special case that wont occur with 
typical debug-register usage.

If it ever causes problems seriously _then_ will be the time to 
consider "is it worth adding a more complex, dynamic allocator 
for debug registers". Not now. This stuff is currently 
over-designed and not acceptable to me in its current form.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-14 12:24                 ` Ingo Molnar
@ 2009-03-14 16:10                   ` Alan Stern
  2009-03-14 16:39                     ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Alan Stern @ 2009-03-14 16:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: K.Prasad, Roland McGrath, Andrew Morton, Linux Kernel Mailing List

On Sat, 14 Mar 2009, Ingo Molnar wrote:

> 
> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > On Sat, 14 Mar 2009, K.Prasad wrote:
> > 
> > > Here's a summary of the intended changes to the patchset, which I hope
> > > to post early the following week. It tears down many features in the
> > > present submission (The write-up below is done without the benefit of
> > > actually having run into limitations while trying to chisel out code).
> > > 
> > > - Adopt a static allocation method for registers, say FCFS (and perhaps
> > >   botton-up for user-space allocations and the reverse for
> > >   kernel-space), although individual counters to do book-keeping should also
> > >   suffice.
> > 
> > You can't enforce bottom-up allocation for userspace breakpoint
> > requests. [...]
> 
> That's not the point.
> 
> The point is to offer a reasonable and simple static allocator 
> that will work fine with usual gdb usage. If something takes 
> away db4 that's as if user-space took away all registers - tough 
> luck.
> 
> You are trying to put complexity into a situation that is not 
> schedulable hence not resolvable _anyway_. There's just 4 debug 
> registers, not more. If the combined usage goes above four 
> someone will lose anyway - even with your allocator.

You are reading far more into my message than what I wrote.

I'm _not_ trying to put complexity anywhere.  All I did was point out
that Prasad was wrong to state that the kernel could adopt (or enforce)  
a bottom-up method for allocating debug registers for userspace 
breakpoints.  I trust you aren't trying to imply that he really was 
right?

> With my proposal the 'loss' can indeed come sooner if user-space 
> took db4 and there's nothing left for the kernel anymore - but 
> that's just an uninteresting special case that wont occur with 
> typical debug-register usage.
> 
> If it ever causes problems seriously _then_ will be the time to 
> consider "is it worth adding a more complex, dynamic allocator 
> for debug registers". Not now. This stuff is currently 
> over-designed and not acceptable to me in its current form.

My message didn't mention a word about more complex, dynamic
allocation.  Just the opposite, in fact -- because if we did virtualize
the debug registers then we _would_ be able to enforce bottom-up
allocation.

So in the end, you're _agreeing_ with what I wrote.  And yet the tone
of your reply suggests that you seemed to think that my message had
some deep, hostile intent.  It didn't.

Alan Stern


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
  2009-03-14 16:10                   ` Alan Stern
@ 2009-03-14 16:39                     ` Ingo Molnar
  0 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2009-03-14 16:39 UTC (permalink / raw)
  To: Alan Stern
  Cc: K.Prasad, Roland McGrath, Andrew Morton, Linux Kernel Mailing List


* Alan Stern <stern@rowland.harvard.edu> wrote:

> So in the end, you're _agreeing_ with what I wrote.  And yet 
> the tone of your reply suggests that you seemed to think that 
> my message had some deep, hostile intent.  It didn't.

Sorry about that - i didnt mean to convey any such message.

I guess i'll wait for the next series. All i'm striving for is 
for the whole series to be a lot simpler than what i've seen 
before.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090324152028.754123712@K.Prasad>
@ 2009-03-24 15:25 ` K.Prasad
  0 siblings, 0 replies; 71+ messages in thread
From: K.Prasad @ 2009-03-24 15:25 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, maneesh, Roland McGrath, Steven Rostedt,
	K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 15088 bytes --]

This patch introduces two new files named hw_breakpoint.[ch] inside x86 specific
directories. They contain functions which help validate and serve requests for 
using Hardware Breakpoint registers on x86 processors.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/Kconfig                     |    1 
 arch/x86/include/asm/hw_breakpoint.h |   73 +++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  441 +++++++++++++++++++++++++++++++++++
 4 files changed, 516 insertions(+), 1 deletion(-)

Index: linux-2.6-tip/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,441 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Unmasked kernel DR7 value */
+static unsigned long kdr7;
+
+/*
+ * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
+ * Used to clear and verify the status of bits corresponding to DR0 - DR3
+ */
+static const unsigned long	dr7_masks[HB_NUM] = {
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
+	0x0f000030,	/* LEN2, R/W2, G2, L2 */
+	0xf00000c0	/* LEN3, R/W3, G3, L3 */
+};
+
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long temp;
+
+	temp = (len | type) & 0xf;
+	temp <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	temp |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+				DR_GLOBAL_SLOWDOWN;
+	return temp;
+}
+
+/*
+ * Install the kernel breakpoints in their debug registers.
+ * If 0 <= pos < HB_NUM, then set the debug register corresponding to that number
+ * If 'pos' is negative, then all debug registers are updated
+ */
+void arch_install_kernel_hw_breakpoint(void *idx)
+{
+	int pos = *(int *)idx;
+	unsigned long dr7;
+	int i;
+
+	get_debugreg(dr7, 7);
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+
+	for (i = hbp_kernel_pos; i < HB_NUM; i++) {
+		if ((pos >= 0) && (i != pos))
+			continue;
+		dr7 &= ~(dr7_masks[i]);
+		if (hbp_kernel[i])
+			set_debugreg(hbp_kernel[i]->info.address, i);
+	}
+
+	dr7 |= kdr7;
+	/* No need to set DR6 */
+	set_debugreg(dr7, 7);
+}
+
+void arch_load_debug_registers(void)
+{
+	int pos = -1;
+	/*
+	 * A negative pos makes arch_install_kernel_hw_breakpoint() refresh
+	 * every kernel debug register slot on this CPU, not just one.
+	 */
+	arch_install_kernel_hw_breakpoint((void *)&pos);
+}
+
+/*
+ * Install the thread breakpoints in their debug registers.
+ */
+void arch_install_thread_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *thread = &(tsk->thread);
+
+	for (i = 0;  (i < hbp_kernel_pos) && hbp_user_refcount[i]; i++)
+		if (thread->hbp[i])
+			set_debugreg(thread->hbp[i]->info.address, i);
+
+	/* No need to set DR6 */
+
+	set_debugreg((kdr7 | thread->dr7), 7);
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ */
+void arch_install_none(void)
+{
+	/* Writing kdr7 alone leaves no thread-owned bits set in DR7 */
+	set_debugreg(kdr7, 7);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+	unsigned int len_in_bytes = 0;
+
+	switch (hbp_len) {
+	case HW_BREAKPOINT_LEN_1:
+		len_in_bytes = 1;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		len_in_bytes = 2;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		len_in_bytes = 4;
+		break;
+#ifdef CONFIG_X86_64
+	case HW_BREAKPOINT_LEN_8:
+		len_in_bytes = 8;
+		break;
+#endif
+	}
+	return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return ((va >= TASK_SIZE) && ((va + len) >= TASK_SIZE));
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	/*
+	 * User-space requests will always have the address field populated
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (bp->info.address)
+		return;
+	if (bp->info.name)
+		bp->info.address = (unsigned long)
+					kallsyms_lookup_name(bp->info.name);
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/*
+	 * Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address,
+							bp->info.len)) &&
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+				break;
+	case HW_BREAKPOINT_RW:
+				break;
+	default:
+		return ret;
+	}
+
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+#ifdef CONFIG_X86_64
+	case HW_BREAKPOINT_LEN_8:
+		*align = 7;
+		break;
+#endif
+	default:
+		return ret;
+	}
+
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Register a new user breakpoint structure.
+ */
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+}
+
+/*
+ * Modify an existing user breakpoint structure.
+ */
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	/* Check if the register to be modified was enabled by the thread */
+	if (!(thread->dr7 & (1 << (pos * DR_ENABLE_SIZE))))
+		return -EINVAL;
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+
+	return 0;
+}
+
+/*
+ * Unregister a user breakpoint structure.
+ */
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+
+	if (!thread->hbp[pos])
+		return;
+
+	thread->hbp[pos]->info.address = 0;
+	thread->dr7 &= ~dr7_masks[pos];
+}
+
+/*
+ * Register a kernel breakpoint structure: refresh slot 'pos' in kdr7 from
+ * hbp_kernel[pos] and reload that debug register on every CPU.
+ */
+void arch_register_kernel_hw_breakpoint(int pos)
+{
+	struct hw_breakpoint *bp = hbp_kernel[pos];
+
+	kdr7 &= ~(dr7_masks[pos]);
+	if (bp)
+		kdr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+
+	/* 'pos' lives on this stack, so wait until every CPU has read it */
+	on_each_cpu(arch_install_kernel_hw_breakpoint, (void *)&pos, 1);
+}
+
+/*
+ * Unregister a kernel breakpoint structure.
+ */
+void arch_unregister_kernel_hw_breakpoint(void)
+{
+	int i, pos = -1;
+
+	/*
+	 * Rebuild kdr7 from hbp_kernel[] and have every CPU reload its kernel
+	 * debug registers (pos = -1 selects all slots).  Wait for completion
+	 * because the other CPUs read 'pos' from this function's stack.
+	 */
+	for (i = hbp_kernel_pos; i < HB_NUM; i++) {
+		kdr7 &= ~(dr7_masks[i]);
+		if (hbp_kernel[i]) {
+			struct hw_breakpoint *bp = hbp_kernel[i];
+			kdr7 |= encode_dr7(i, bp->info.len, bp->info.type);
+		}
+	}
+	on_each_cpu(arch_install_kernel_hw_breakpoint, (void *)&pos, 1);
+}
+
+
+/*
+ * Copy out the debug register information for a core dump.  All eight
+ * slots are zeroed first; u_debugreg is a pointer here, so its size must
+ * be spelled out rather than taken with sizeof.  tsk must equal current.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int i;
+
+	memset(u_debugreg, 0, 8 * sizeof(u_debugreg[0]));
+	for (i = 0; i < thread->hbp_num_installed && thread->hbp[i]; ++i)
+		u_debugreg[i] = thread->hbp[i]->info.address;
+	u_debugreg[7] = thread->dr7;
+	u_debugreg[6] = thread->dr6;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	int i, rc = NOTIFY_DONE;
+	struct hw_breakpoint *bp;
+	/* The DR6 value is stored in args->err */
+	unsigned long dr7, dr6 = args->err;
+
+	if (dr6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	get_debugreg(dr7, 7);
+
+	/* Disable breakpoints during exception handling */
+	set_debugreg(0UL, 7);
+
+	/*
+	 * Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.dr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Lazy debug register switching */
+	if (last_debugged_task != current)
+		switch_to_none_hw_breakpoint();
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(dr6 & (DR_TRAP0 << i))))
+			continue;
+		/*
+		 * Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (hbp_user_refcount[i])
+			bp = current->thread.hbp[i];
+		else if (i >= hbp_kernel_pos)
+			bp = hbp_kernel[i];
+		else	/* False alarm due to lazy DR switching */
+			continue;
+
+		if (!bp)
+			break;
+
+		switch (bp->info.type) {
+		case HW_BREAKPOINT_WRITE:
+		case HW_BREAKPOINT_RW:
+			if (bp->triggered)
+				(bp->triggered)(bp, args->regs);
+
+			if (arch_check_va_in_userspace(bp->info.address,
+							bp->info.len))
+				rc = NOTIFY_DONE;
+			else
+				rc = NOTIFY_STOP;;
+			goto exit;
+		case HW_BREAKPOINT_EXECUTE:
+			/*
+			 * Presently we allow instruction breakpoints only in
+			 * user-space when requested through ptrace.
+			 */
+			if (arch_check_va_in_userspace(bp->info.address,
+							bp->info.len)) {
+				(bp->triggered)(bp, args->regs);
+				/*
+				 * do_debug will notify user through a SIGTRAP
+				 * signal So we are not requesting a
+				 * NOTIFY_STOP here
+				 */
+				rc = NOTIFY_DONE;
+				goto exit;
+			}
+		}
+	}
+
+	/* Stop processing further if the exception is a stray one */
+	if (!(dr6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		rc = NOTIFY_STOP;
+exit:
+	set_debugreg(dr7, 7);
+	return rc;
+}
Index: linux-2.6-tip/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,73 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+};
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+#ifdef CONFIG_X86_64
+#define HW_BREAKPOINT_LEN_8		0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define HW_BREAKPOINT_EXECUTE	0x80
+/* trigger on memory write */
+#define HW_BREAKPOINT_WRITE	0x81
+/* trigger on memory read or write */
+#define HW_BREAKPOINT_RW	0x83
+
+/* Total number of available HW breakpoint registers */
+#define HB_NUM 4
+
+extern struct hw_breakpoint *hbp_kernel[HB_NUM];
+extern unsigned int hbp_user_refcount[HB_NUM];
+
+/*
+ * Ptrace support: breakpoint trigger routine.
+ */
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+void arch_install_thread_hw_breakpoint(struct task_struct *tsk);
+void arch_install_none(void);
+void arch_install_kernel_hw_breakpoint(void *);
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
+int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk);
+void arch_load_debug_registers(void);
+void arch_register_kernel_hw_breakpoint(int pos);
+void arch_unregister_kernel_hw_breakpoint(void);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
Index: linux-2.6-tip/arch/x86/Kconfig
===================================================================
--- linux-2.6-tip.orig/arch/x86/Kconfig
+++ linux-2.6-tip/arch/x86/Kconfig
@@ -47,6 +47,7 @@ config X86
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_DMA_API_DEBUG
+	select HAVE_HW_BREAKPOINT
 
 config ARCH_DEFCONFIG
 	string


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090319234044.410725944@K.Prasad>
@ 2009-03-19 23:48 ` K.Prasad
  0 siblings, 0 replies; 71+ messages in thread
From: K.Prasad @ 2009-03-19 23:48 UTC (permalink / raw)
  To: Ingo Molnar, Linux Kernel Mailing List
  Cc: Alan Stern, Andrew Morton, Benjamin Herrenschmidt,
	Frederic Weisbecker, Maneesh Soni, Roland McGrath,
	Steven Rostedt, K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 14025 bytes --]

This patch introduces two new files, hw_breakpoint.c and hw_breakpoint.h, in
the x86-specific directories. They contain functions that validate and serve
requests to use the hardware breakpoint (debug) registers on x86 processors.

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/Kconfig                     |    1 
 arch/x86/include/asm/hw_breakpoint.h |   69 ++++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  384 +++++++++++++++++++++++++++++++++++
 4 files changed, 455 insertions(+), 1 deletion(-)

Index: linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,384 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Unmasked kernel DR7 value */
+static unsigned long kdr7;
+
+/*
+ * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
+ * Entry n selects every DR7 bit that belongs to debug register DRn: its LEN
+ * and R/W fields plus its G and L enable bits.  Used to clear and verify the
+ * status of the bits corresponding to one of DR0 - DR3.
+ */
+static const unsigned long	dr7_masks[HB_NUM] = {
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
+	0x0f000030,	/* LEN2, R/W2, G2, L2 */
+	0xf00000c0	/* LEN3, R/W3, G3, L3 */
+};
+
+
+/*
+ * Build the DR7 bits for debug register 'drnum': the low nibble of
+ * (len | type) placed in that register's control field, its global-enable
+ * bit, and the DR_GLOBAL_SLOWDOWN bit.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long ctrl_bits = (len | type) & 0xf;
+	unsigned long enable_bits;
+
+	/* Move the LEN/RW nibble to the control field of this register */
+	ctrl_bits <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+
+	/* Global-enable bit of this register */
+	enable_bits = DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE);
+
+	return ctrl_bits | enable_bits | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Install the kernel breakpoints in their debug registers.
+ *
+ * @bkpt: the struct hw_breakpoint being installed, passed as void * because
+ *        this routine is handed to on_each_cpu() by
+ *        arch_register_kernel_hw_breakpoint().
+ *
+ * Folds the new breakpoint's control bits into the file-wide 'kdr7' value,
+ * then reloads the address registers for every kernel slot
+ * (hbkpt_kernel_pos .. HB_NUM - 1) and finally DR7.
+ */
+void arch_install_kernel_hbkpt(void *bkpt)
+{
+	struct hw_breakpoint *bp;
+	int i;
+	unsigned long dr7;
+
+	bp = (struct hw_breakpoint *)bkpt;
+
+	/* Replace the control bits of slot 'hbkpt_kernel_pos' in kdr7 */
+	kdr7 &= ~(dr7_masks[hbkpt_kernel_pos]);
+	kdr7 |= encode_dr7(hbkpt_kernel_pos, bp->info.len, bp->info.type);
+
+	get_debugreg(dr7, 7);
+	/* Clear the bits corresponding to 'pos' register in dr7 */
+	dr7 &= ~(dr7_masks[hbkpt_kernel_pos]);
+	dr7 |= kdr7;
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+
+	/*
+	 * Kernel hbkpts always begin at 'hbkpt_kernel_pos' and run up to
+	 * HB_NUM.  NOTE(review): every hbkpt_kernel[] entry in that range is
+	 * dereferenced without a NULL check -- confirm the caller guarantees
+	 * they are all populated.
+	 */
+	for (i = hbkpt_kernel_pos; i < HB_NUM; i++)
+		set_debugreg(hbkpt_kernel[i]->info.address, i);
+
+	/* No need to set DR6 */
+	set_debugreg(dr7, 7);
+}
+
+/*
+ * Load the breakpoint addresses of task 'tsk' into the user debug register
+ * slots (0 .. hbkpt_user_max - 1), then write DR7 with the kernel bits and
+ * the thread's own bits combined.
+ */
+void arch_install_thread_hbkpt(struct task_struct *tsk)
+{
+	struct thread_struct *t = &tsk->thread;
+	int slot;
+
+	for (slot = 0; slot < hbkpt_user_max; slot++) {
+		/* Empty slots keep whatever address they currently hold */
+		if (!t->hbkpt[slot])
+			continue;
+		set_debugreg(t->hbkpt[slot]->info.address, slot);
+	}
+
+	/* DR6 is deliberately left untouched */
+	set_debugreg((kdr7 | t->dr7), 7);
+}
+
+/*
+ * Install the debug register values for just the kernel, no thread.
+ *
+ * Takes no arguments; declared with an explicit (void) so the definition
+ * is a real prototype and matches the declaration in asm/hw_breakpoint.h.
+ */
+void arch_install_none(void)
+{
+	/* Clear the user-space portion of dr7 by setting only kdr7 */
+	set_debugreg(kdr7, 7);
+}
+
+/*
+ * Return non-zero when 'va' lies low enough below TASK_SIZE to count as a
+ * user-space address (TASK_SIZE - 3 on 32-bit, TASK_SIZE - 7 on 64-bit).
+ * 'tsk' is not used by this implementation.
+ */
+int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
+{
+#ifdef CONFIG_X86_32
+	if (va <= TASK_SIZE - 3)
+		return 1;
+#else /* X86_64 */
+	if (va <= TASK_SIZE - 7)
+		return 1;
+#endif
+	return 0;
+}
+
+/*
+ * Return non-zero when 'va' is at or above TASK_SIZE, i.e. is treated as a
+ * kernel-space address.
+ */
+int arch_check_va_in_kernelspace(unsigned long va)
+{
+	if (va < TASK_SIZE)
+		return 0;
+	return 1;
+}
+
+/*
+ * Fill in the breakpoint address from its symbol name when needed.
+ *
+ * User-space requests always carry an address.  Kernel requests may give
+ * either an address or a symbol name; the name is only looked up when no
+ * address was supplied.  With neither set, the breakpoint is left as is.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	const char *sym = bp->info.name;
+
+	if (!bp->info.address && sym)
+		bp->info.address = (unsigned long)kallsyms_lookup_name(sym);
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ *
+ * @bp:    breakpoint whose info.type and info.len are checked
+ * @align: on success, set to the address alignment mask implied by the
+ *         length (0, 1 or 3 for lengths 1, 2 and 4)
+ * @tsk:   task passed through to arch_check_va_in_userspace()
+ *
+ * Returns 0 when the type and length are acceptable and a 'triggered'
+ * callback is set (the address is then resolved via arch_store_info()),
+ * -EINVAL otherwise.
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/* Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses
+	 *
+	 * NOTE(review): with '&&' the request is rejected only when the
+	 * address is outside user space AND the length is not
+	 * HW_BREAKPOINT_LEN_EXECUTE.  The statement above suggests '||' may
+	 * have been intended -- confirm before relying on this check.
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address, tsk)) &&
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+				break;
+	case HW_BREAKPOINT_RW:
+				break;
+	default:
+		return ret;
+	}
+
+	/* Translate the length encoding into an alignment mask */
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+	default:
+		return ret;
+	}
+
+	/* A breakpoint without a callback is rejected (ret stays -EINVAL) */
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Record a new user breakpoint in slot 'pos' of the task's saved DR7 value:
+ * the slot's old bits are dropped and replaced by the encoding of 'bp'.
+ * No debug register is written here.
+ */
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *t = &tsk->thread;
+
+	t->dr7 = (t->dr7 & ~dr7_masks[pos]) |
+			encode_dr7(pos, bp->info.len, bp->info.type);
+}
+
+/*
+ * Modify an existing user breakpoint structure.
+ *
+ * @pos: debug register slot to update
+ * @bp:  breakpoint carrying the new length and type
+ * @tsk: task whose saved DR7 value is updated
+ *
+ * Returns 0 on success, or -EINVAL when slot 'pos' is not currently enabled
+ * in the task's DR7 value.
+ */
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+		struct task_struct *tsk)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	/*
+	 * Both enable bits (L and G) of slot 'pos'.  encode_dr7() enables a
+	 * slot through its G bit, so testing only the L bit would reject
+	 * breakpoints registered by arch_register_user_hw_breakpoint().
+	 */
+	unsigned long enable_bits = (1UL | DR_GLOBAL_ENABLE) <<
+					(pos * DR_ENABLE_SIZE);
+
+	/* Check if the register to be modified was enabled by the thread */
+	if (!(thread->dr7 & enable_bits))
+		return -EINVAL;
+
+	thread->dr7 &= ~dr7_masks[pos];
+	thread->dr7 |= encode_dr7(pos, bp->info.len, bp->info.type);
+
+	return 0;
+}
+
+/*
+ * Drop the user breakpoint held in slot 'pos' of task 'tsk': its stored
+ * address is zeroed and the slot's bits are removed from the task's saved
+ * DR7 value.  Nothing happens when the slot is empty.  'bp' is not used.
+ */
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk)
+{
+	struct thread_struct *t = &tsk->thread;
+
+	if (t->hbkpt[pos]) {
+		t->hbkpt[pos]->info.address = 0;
+		t->dr7 &= ~dr7_masks[pos];
+	}
+}
+
+/*
+ * Register a kernel breakpoint: run arch_install_kernel_hbkpt() for 'bp'
+ * through on_each_cpu(), with the final (wait) argument set to 0.
+ */
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	void *arg = bp;
+
+	on_each_cpu(arch_install_kernel_hbkpt, arg, 0);
+}
+
+/*
+ * Unregister the kernel breakpoint in slot 'pos': remove the slot's bits
+ * from the shared kdr7 value and from the DR7 register of the CPU this
+ * runs on.
+ */
+void arch_unregister_kernel_hw_breakpoint(int pos)
+{
+	const unsigned long slot_bits = dr7_masks[pos];
+	unsigned long live_dr7;
+
+	kdr7 &= ~slot_bits;
+
+	get_debugreg(live_dr7, 7);
+	live_dr7 &= ~slot_bits;
+	set_debugreg(live_dr7, 7);
+}
+
+/* End of arch-specific hook routines */
+
+/*
+ * Copy out the debug register information for a core dump.
+ *
+ * @tsk:        task whose breakpoint state is dumped; must be equal to
+ *              current.
+ * @u_debugreg: output array of 8 entries.  Entries 0..3 receive the
+ *              addresses of the installed thread breakpoints, entry 6 the
+ *              thread's DR6 value and entry 7 its DR7 value; all other
+ *              entries are zeroed.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_struct *thread = &(tsk->thread);
+	int i;
+
+	/*
+	 * 'u_debugreg' is a pointer here (array parameters decay), so
+	 * sizeof u_debugreg would only cover one pointer.  Clear all eight
+	 * entries explicitly.
+	 */
+	memset(u_debugreg, 0, 8 * sizeof(u_debugreg[0]));
+	for (i = 0; i < thread->hbkpt_num_installed && thread->hbkpt[i]; ++i)
+		u_debugreg[i] = thread->hbkpt[i]->info.address;
+	u_debugreg[7] = thread->dr7;
+	u_debugreg[6] = thread->dr6;
+}
+
+/*
+ * Handle debug exception notifications.
+ *
+ * @args: die notifier arguments; args->err holds the DR6 value and
+ *        args->regs the register state handed to the callbacks.
+ *
+ * Single-step exceptions are not handled here (NOTIFY_DONE).  Otherwise
+ * DR7 is cleared for the duration of the handler and restored on every
+ * exit path below.  Returns NOTIFY_DONE when later notifiers should still
+ * run and NOTIFY_STOP when the exception is fully consumed.
+ */
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	int i;
+	struct hw_breakpoint *bp;
+	/* The DR6 value is stored in args->err */
+	unsigned long dr7, dr6 = args->err;
+
+	if (dr6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	/* Save DR7 so it can be restored at the exit labels */
+	get_debugreg(dr7, 7);
+
+	/* Disable breakpoints during exception handling */
+	set_debugreg(0UL, 7);
+
+	/* Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.dr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Lazy debug register switching */
+	if (last_debugged_task != current)
+		switch_to_none_hw_breakpoint();
+
+	/*
+	 * Walk the triggered breakpoints.  Note that the first breakpoint
+	 * whose type is handled below leaves the loop through a goto.
+	 */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(dr6 & (DR_TRAP0 << i))))
+			continue;
+
+		/* Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (i < hbkpt_user_max)
+			bp = current->thread.hbkpt[i];
+		else if (i >= hbkpt_kernel_pos)
+			bp = hbkpt_kernel[i];
+		else		/* False alarm due to lazy DR switching */
+			continue;
+		if (!bp)
+			goto ret_path;
+
+		switch (bp->info.type) {
+		case HW_BREAKPOINT_WRITE:
+		case HW_BREAKPOINT_RW:
+			if (bp->triggered)
+				(bp->triggered)(bp, args->regs);
+			/*
+			 * NOTE(review): no get_cpu() is visible in this
+			 * function, so this put_cpu_no_resched() looks
+			 * unbalanced -- confirm.  The breakpoints themselves
+			 * are re-enabled at the exit labels.
+			 */
+			put_cpu_no_resched();
+			if (arch_check_va_in_userspace(bp->info.address,
+							current))
+				goto ret_notify_done;
+			else
+				goto ret_notify_stop;
+		/*
+		 * Presently we allow instruction breakpoints only in
+		 * user-space when requested through ptrace.
+		 */
+		case HW_BREAKPOINT_EXECUTE:
+			if (arch_check_va_in_userspace(bp->info.address,
+							current)) {
+				/*
+				 * NOTE(review): unlike the data-breakpoint
+				 * case, the callback is invoked without a
+				 * NULL check.
+				 */
+				(bp->triggered)(bp, args->regs);
+			/*
+			 * do_debug will notify user through a SIGTRAP signal
+			 * So we are not requesting a NOTIFY_STOP here
+			 */
+				goto ret_notify_done;
+			}
+			/* Non-user execute breakpoint: keep scanning */
+		}
+	}
+
+ret_path:
+	/* Stop processing further if the exception is a stray one */
+	if (!(dr6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		goto ret_notify_stop;
+
+ret_notify_done:
+	/* Restore the DR7 value saved on entry */
+	set_debugreg(dr7, 7);
+	return NOTIFY_DONE;
+ret_notify_stop:
+	/* Restore the DR7 value saved on entry */
+	set_debugreg(dr7, 7);
+	return NOTIFY_STOP;
+}
Index: linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip.hbkpt/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,69 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+/* x86-specific description of one hardware breakpoint */
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address; /* Address loaded into the debug register */
+	u8		len;	/* One of the HW_BREAKPOINT_LEN_* encodings */
+	u8		type;	/* HW_BREAKPOINT_EXECUTE, _WRITE or _RW */
+};
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define HW_BREAKPOINT_EXECUTE	0x80
+/* trigger on memory write */
+#define HW_BREAKPOINT_WRITE	0x81
+/* trigger on memory read or write */
+#define HW_BREAKPOINT_RW	0x83
+
+/* Total number of available HW breakpoint registers */
+#define HB_NUM 4
+
+extern struct hw_breakpoint *hbkpt_kernel[HB_NUM];
+extern unsigned int hbkpt_user_max_refcount[HB_NUM];
+
+/*
+ * Ptrace support: routines to register, modify and unregister the user
+ * breakpoint held in slot 'pos' of task 'tsk'.
+ */
+int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+void arch_install_thread_hbkpt(struct task_struct *tsk);
+void arch_install_none(void);
+void arch_install_kernel_hbkpt(void *);
+int arch_check_va_in_userspace(unsigned long va,
+						struct task_struct *tsk);
+int arch_check_va_in_kernelspace(unsigned long va);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+int arch_modify_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+				struct task_struct *tsk);
+void arch_unregister_user_hw_breakpoint(int pos, struct hw_breakpoint *bp,
+					struct task_struct *tsk);
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void arch_unregister_kernel_hw_breakpoint(int pos);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip.hbkpt/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
Index: linux-2.6-tip.hbkpt/arch/x86/Kconfig
===================================================================
--- linux-2.6-tip.hbkpt.orig/arch/x86/Kconfig
+++ linux-2.6-tip.hbkpt/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_HW_BREAKPOINT
 
 config ARCH_DEFCONFIG
 	string


^ permalink raw reply	[flat|nested] 71+ messages in thread

* [Patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces
       [not found] <20090307045120.039324630@linux.vnet.ibm.com>
@ 2009-03-07  5:05 ` prasad
  0 siblings, 0 replies; 71+ messages in thread
From: prasad @ 2009-03-07  5:05 UTC (permalink / raw)
  To: mingo
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Stern,
	Roland McGrath, K.Prasad

[-- Attachment #1: 2 --]
[-- Type: text/plain, Size: 17464 bytes --]

From: Alan Stern <stern@rowland.harvard.edu>

This patch introduces two new files, hw_breakpoint.c and hw_breakpoint.h, in
the x86-specific directories. They contain functions that validate and serve
requests to use the hardware breakpoint (debug) registers on x86 processors.

[K.Prasad: Added more declarations to hw_breakpoint.h so that each
           hw_breakpoint.c file compiles independently. Split out from the
           bigger patch, with minor changes following re-basing]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
---
 arch/x86/include/asm/hw_breakpoint.h |  132 ++++++++++
 arch/x86/kernel/Makefile             |    2 
 arch/x86/kernel/hw_breakpoint.c      |  437 +++++++++++++++++++++++++++++++++++
 3 files changed, 570 insertions(+), 1 deletion(-)

Index: linux-2.6-tip/arch/x86/kernel/hw_breakpoint.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,437 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+
+static unsigned long		kdr7;		/* Unmasked kernel DR7 value */
+
+/* Masks for the bits in DR7 related to kernel breakpoints, for various
+ * values of num_kbps.  Entry n is the mask for when there are n kernel
+ * breakpoints, in debug registers 0 - (n-1).  The DR_GLOBAL_SLOWDOWN bit
+ * (GE) is handled specially.
+ *
+ * The table has HB_NUM + 1 entries because the count n ranges from 0 (no
+ * kernel breakpoints, empty mask) to HB_NUM (all registers).
+ */
+static const unsigned long	kdr7_masks[HB_NUM + 1] = {
+	0x00000000,	/* No kernel breakpoints */
+	0x000f0003,	/* LEN0, R/W0, G0, L0 */
+	0x00ff000f,	/* Same for 0,1 */
+	0x0fff003f,	/* Same for 0,1,2 */
+	0xffff00ff	/* Same for 0,1,2,3 */
+};
+
+/*
+ * Install the kernel breakpoints in their debug registers.
+ *
+ * @chbi: per-CPU breakpoint info; its cur_kbpdata pointer is refreshed from
+ *        the global cur_kbpdata before the registers are loaded.
+ */
+void arch_install_chbi(struct cpu_hw_breakpoint *chbi)
+{
+	struct hw_breakpoint **bps;
+
+	/* Don't allow debug exceptions while we update the registers */
+	set_debugreg(0UL, 7);
+	chbi->cur_kbpdata = rcu_dereference(cur_kbpdata);
+
+	/* Kernel breakpoints are stored starting in DR0 and going up */
+	bps = chbi->cur_kbpdata->bps;
+	/* Each case deliberately falls through to load the lower registers */
+	switch (chbi->cur_kbpdata->num_kbps) {
+	case 4:
+		set_debugreg(bps[3]->info.address, 3);
+		/* fall through */
+	case 3:
+		set_debugreg(bps[2]->info.address, 2);
+		/* fall through */
+	case 2:
+		set_debugreg(bps[1]->info.address, 1);
+		/* fall through */
+	case 1:
+		set_debugreg(bps[0]->info.address, 0);
+	}
+	/* No need to set DR6 */
+	set_debugreg(chbi->cur_kbpdata->mkdr7, 7);
+}
+
+/*
+ * Refresh the combined thread + kernel DR7 value of 'thbi': the masked
+ * kernel DR7 from 'thr_kbpdata' plus whichever thread DR7 bits fall outside
+ * the registers taken by the kernel breakpoints.
+ */
+void arch_update_thbi(struct thread_hw_breakpoint *thbi,
+			struct kernel_bp_data *thr_kbpdata)
+{
+	unsigned long kernel_mask = kdr7_masks[thr_kbpdata->num_kbps];
+	unsigned long user_bits = thbi->tdr7 & ~kernel_mask;
+
+	thbi->tkdr7 = thr_kbpdata->mkdr7 | user_bits;
+}
+
+/*
+ * Install the thread breakpoints in their debug registers.
+ *
+ * @thbi: thread breakpoint info supplying the addresses (tdr[]), the number
+ *        of installed breakpoints and the combined DR7 value (tkdr7).
+ */
+void arch_install_thbi(struct thread_hw_breakpoint *thbi)
+{
+	/* Install the user breakpoints.  Kernel breakpoints are stored
+	 * starting in DR0 and going up; there are num_kbps of them.
+	 * User breakpoints are stored starting in DR3 and going down,
+	 * as many as we have room for.
+	 */
+	/* Each case deliberately falls through to load the higher registers */
+	switch (thbi->num_installed) {
+	case 4:
+		set_debugreg(thbi->tdr[0], 0);
+		/* fall through */
+	case 3:
+		set_debugreg(thbi->tdr[1], 1);
+		/* fall through */
+	case 2:
+		set_debugreg(thbi->tdr[2], 2);
+		/* fall through */
+	case 1:
+		set_debugreg(thbi->tdr[3], 3);
+	}
+	/* No need to set DR6 */
+	set_debugreg(thbi->tkdr7, 7);
+}
+
+/*
+ * Load DR7 with the masked kernel value of this CPU's current kbpdata
+ * entry, so that only kernel breakpoints stay enabled.
+ */
+void arch_install_none(struct cpu_hw_breakpoint *chbi)
+{
+	struct kernel_bp_data *kbp = chbi->cur_kbpdata;
+
+	set_debugreg(kbp->mkdr7, 7);
+}
+
+/*
+ * Initialise the arch-specific part of a new kbpdata entry: its masked
+ * kernel DR7 value is kdr7 restricted to the bits of the first num_kbps
+ * registers plus the DR_GLOBAL_SLOWDOWN bit.
+ */
+void arch_new_kbpdata(struct kernel_bp_data *new_kbpdata)
+{
+	unsigned long keep;
+
+	keep = kdr7_masks[new_kbpdata->num_kbps] | DR_GLOBAL_SLOWDOWN;
+	new_kbpdata->mkdr7 = kdr7 & keep;
+}
+
+/*
+ * Copy the address of breakpoint 'bp' into entry 'i' of the thread's
+ * debug-register address array.
+ */
+void arch_store_thread_bp_array(struct thread_hw_breakpoint *thbi,
+					struct hw_breakpoint *bp, int i)
+{
+	unsigned long *slot = &thbi->tdr[i];
+
+	*slot = bp->info.address;
+}
+
+/*
+ * Return non-zero when 'va' is below TASK_SIZE, i.e. is treated as a
+ * user-space address.  'tsk' is not used by this implementation.
+ */
+int arch_check_va_in_userspace(unsigned long va, struct task_struct *tsk)
+{
+	if (va < TASK_SIZE)
+		return 1;
+	return 0;
+}
+
+/*
+ * Return non-zero when 'va' is at or above TASK_SIZE, i.e. is treated as a
+ * kernel-space address.
+ */
+int arch_check_va_in_kernelspace(unsigned long va)
+{
+	if (va < TASK_SIZE)
+		return 0;
+	return 1;
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ *
+ * Only the address needs work here: when the request carries no address,
+ * it is resolved from the symbol name.  A request with neither an address
+ * nor a name is left unchanged.
+ */
+void arch_store_info(struct hw_breakpoint *bp)
+{
+	/*
+	 * User-space requests will always have the address field populated
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (bp->info.address)
+		return;
+	/* Never hand a NULL symbol name to kallsyms_lookup_name() */
+	if (bp->info.name)
+		bp->info.address = (unsigned long)
+					kallsyms_lookup_name(bp->info.name);
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ *
+ * @bp:    breakpoint whose info.type and info.len are checked
+ * @align: on success, set to the address alignment mask implied by the
+ *         length (0, 1 or 3 for lengths 1, 2 and 4)
+ * @tsk:   task passed through to arch_check_va_in_userspace()
+ *
+ * Returns 0 when the type and length are acceptable and a 'triggered'
+ * callback is set (the address is then resolved via arch_store_info()),
+ * -EINVAL otherwise.
+ */
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk)
+{
+	int ret = -EINVAL;
+
+	switch (bp->info.type) {
+
+	/* Ptrace-refactoring code
+	 * For now, we'll allow instruction breakpoint only for user-space
+	 * addresses
+	 *
+	 * NOTE(review): with '&&' the request is rejected only when the
+	 * address is outside user space AND the length is not
+	 * HW_BREAKPOINT_LEN_EXECUTE.  The statement above suggests '||' may
+	 * have been intended -- confirm before relying on this check.
+	 */
+	case HW_BREAKPOINT_EXECUTE:
+		if ((!arch_check_va_in_userspace(bp->info.address, tsk)) &&
+			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
+			return ret;
+		break;
+	case HW_BREAKPOINT_WRITE:
+				break;
+	case HW_BREAKPOINT_RW:
+				break;
+	default:
+		return ret;
+	}
+
+	/* Translate the length encoding into an alignment mask */
+	switch (bp->info.len) {
+	case HW_BREAKPOINT_LEN_1:
+		*align = 0;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		*align = 1;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		*align = 3;
+		break;
+	default:
+		return ret;
+	}
+
+	/* A breakpoint without a callback is rejected (ret stays -EINVAL) */
+	if (bp->triggered) {
+		ret = 0;
+		arch_store_info(bp);
+	}
+	return ret;
+}
+
+/*
+ * Build the DR7 bits for debug register 'drnum': the low nibble of
+ * (len | type) placed in that register's control field, its global-enable
+ * bit, and the DR_GLOBAL_SLOWDOWN bit.
+ */
+static unsigned long encode_dr7(int drnum, unsigned len, unsigned type)
+{
+	unsigned long ctrl_bits = (len | type) & 0xf;
+	unsigned long enable_bits;
+
+	/* Move the LEN/RW nibble to the control field of this register */
+	ctrl_bits <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+
+	/* Global-enable bit of this register */
+	enable_bits = DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE);
+
+	return ctrl_bits | enable_bits | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Calculate the DR7 value for a list of kernel or user breakpoints.
+ *
+ * @thbi: thread breakpoint info whose thread_bps list is encoded, or NULL
+ *        to encode the global kernel_bps list instead.
+ *
+ * Kernel breakpoints take registers from DR0 upwards, user breakpoints from
+ * DR3 downwards.  At most HB_NUM list entries are accumulated; the kernel
+ * DR7 mask selects the appropriate bits later.
+ */
+static unsigned long calculate_dr7(struct thread_hw_breakpoint *thbi)
+{
+	struct list_head *head = thbi ? &thbi->thread_bps : &kernel_bps;
+	int step = thbi ? -1 : 1;	/* direction of register assignment */
+	int drnum = thbi ? HB_NUM - 1 : 0;
+	int count = 0;
+	unsigned long dr7 = 0;
+	struct hw_breakpoint *bp;
+
+	list_for_each_entry(bp, head, node) {
+		/* Accumulate the bits for the current debug register */
+		dr7 |= encode_dr7(drnum, bp->info.len, bp->info.type);
+		if (++count >= HB_NUM)
+			break;
+		drnum += step;
+	}
+	return dr7;
+}
+
+/*
+ * Arch hook for registering a user breakpoint: recompute the thread's DR7
+ * value from its breakpoint list.  'bp' itself is not examined here.
+ */
+void arch_register_user_hw_breakpoint(struct hw_breakpoint *bp,
+		struct thread_hw_breakpoint *thbi)
+{
+	unsigned long new_tdr7 = calculate_dr7(thbi);
+
+	thbi->tdr7 = new_tdr7;
+
+	/* If this is an execution breakpoint for the current PC address,
+	 * we should clear the task's RF so that the bp will be certain
+	 * to trigger.
+	 *
+	 * FIXME: It's not so easy to get hold of the task's PC as a linear
+	 * address!  ptrace.c does this already...
+	 */
+}
+
+/*
+ * Arch hook for unregistering a user breakpoint: recompute the thread's
+ * DR7 value from its breakpoint list.  'bp' itself is not examined here.
+ */
+void arch_unregister_user_hw_breakpoint(struct hw_breakpoint *bp,
+					struct thread_hw_breakpoint *thbi)
+{
+	unsigned long new_tdr7 = calculate_dr7(thbi);
+
+	thbi->tdr7 = new_tdr7;
+}
+
+/*
+ * Arch hook for registering a kernel breakpoint: recompute the unmasked
+ * kernel DR7 value from the kernel breakpoint list.  'bp' is not examined.
+ */
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	unsigned long new_kdr7 = calculate_dr7(NULL);
+
+	kdr7 = new_kdr7;
+}
+
+/*
+ * Arch hook for unregistering a kernel breakpoint: recompute the unmasked
+ * kernel DR7 value from the kernel breakpoint list.  'bp' is not examined.
+ */
+void arch_unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+{
+	unsigned long new_kdr7 = calculate_dr7(NULL);
+
+	kdr7 = new_kdr7;
+}
+
+
+/* End of arch-specific hook routines */
+
+
+/*
+ * Copy out the debug register information for a core dump.
+ *
+ * @tsk:        task whose breakpoint state is dumped; must be equal to
+ *              current.
+ * @u_debugreg: output array of 8 entries.  When the task has breakpoint
+ *              info, entries 0..3 receive the virtualized breakpoint
+ *              addresses and entry 7 the virtualized DR7; entry 6 always
+ *              receives the virtualized DR6.  All other entries are zeroed.
+ */
+void dump_thread_hw_breakpoint(struct task_struct *tsk, int u_debugreg[8])
+{
+	struct thread_hw_breakpoint *thbi = tsk->thread.hw_breakpoint_info;
+	int i;
+
+	/*
+	 * 'u_debugreg' is a pointer here (array parameters decay), so
+	 * sizeof u_debugreg would only cover one pointer.  Clear all eight
+	 * entries explicitly.
+	 */
+	memset(u_debugreg, 0, 8 * sizeof(u_debugreg[0]));
+	if (thbi) {
+		for (i = 0; i < HB_NUM; ++i)
+			u_debugreg[i] = thbi->vdr_bps[i].info.address;
+		u_debugreg[7] = thbi->vdr7;
+	}
+	u_debugreg[6] = tsk->thread.vdr6;
+}
+
+/*
+ * Handle debug exception notifications.
+ *
+ * @args: die notifier arguments; args->err holds the DR6 value and
+ *        args->regs the register state handed to the callbacks.
+ *
+ * Single-step exceptions are not handled here (NOTIFY_DONE).  Returns
+ * NOTIFY_STOP after a data breakpoint has been handled or when the
+ * exception is a stray one, NOTIFY_DONE otherwise.
+ *
+ * NOTE(review): get_cpu() below is paired with put_cpu_no_resched() only on
+ * the WRITE/RW path, and DR7 (cleared on entry) is only rewritten on that
+ * path.  The other return statements leave both as they are -- confirm that
+ * this is intended or compensated for elsewhere.
+ */
+
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	struct cpu_hw_breakpoint *chbi;
+	int i;
+	struct hw_breakpoint *bp;
+	struct thread_hw_breakpoint *thbi = NULL;
+
+	/* The DR6 value is stored in args->err */
+#define DR6	(args->err)
+
+	if (DR6 & DR_STEP)
+		return NOTIFY_DONE;
+
+	chbi = &per_cpu(cpu_bp, get_cpu());
+
+	/* Disable all breakpoints so that the callbacks can run without
+	 * triggering recursive debug exceptions.
+	 */
+	set_debugreg(0UL, 7);
+
+	/* Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.vdr6 &= ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3);
+
+	/* Are we a victim of lazy debug-register switching? */
+	if (!chbi->bp_task)
+		;	/* No thread breakpoints are loaded on this CPU */
+	else if (chbi->bp_task != current) {
+
+		/* No user breakpoints are valid.  Perform the belated
+		 * debug-register switch.
+		 */
+		switch_to_none_hw_breakpoint();
+	} else {
+		thbi = chbi->bp_task->thread.hw_breakpoint_info;
+	}
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HB_NUM; ++i) {
+		if (likely(!(DR6 & (DR_TRAP0 << i))))
+			continue;
+
+		/* Find the corresponding hw_breakpoint structure and
+		 * invoke its triggered callback.
+		 */
+		if (i < chbi->cur_kbpdata->num_kbps)
+			bp = chbi->cur_kbpdata->bps[i];
+		else if (thbi)
+			bp = thbi->bps[i];
+		else		/* False alarm due to lazy DR switching */
+			continue;
+		if (bp) {
+			switch (bp->info.type) {
+			case HW_BREAKPOINT_WRITE:
+			case HW_BREAKPOINT_RW:
+				if (bp->triggered)
+					(bp->triggered)(bp, args->regs);
+				/* Re-enable the breakpoints */
+				set_debugreg(thbi ? thbi->tkdr7 :
+						chbi->cur_kbpdata->mkdr7, 7);
+				put_cpu_no_resched();
+
+				return NOTIFY_STOP;
+			/*
+			 * Presently we allow instruction breakpoints only in
+			 * user-space when requested through ptrace.
+			 */
+			case HW_BREAKPOINT_EXECUTE:
+				if (arch_check_va_in_userspace(bp->info.address,
+								current)) {
+					/*
+					 * NOTE(review): unlike the data
+					 * breakpoint case, the callback is
+					 * invoked without a NULL check.
+					 */
+					(bp->triggered)(bp, args->regs);
+	/* We'll return NOTIFY_DONE, do_debug will take care of the rest */
+					return NOTIFY_DONE;
+				}
+			}
+		}
+	}
+	/* Stop processing further if the exception is a stray one */
+	if (!(DR6 & ~(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)))
+		return NOTIFY_STOP;
+
+	return NOTIFY_DONE;
+#undef DR6
+}
Index: linux-2.6-tip/arch/x86/include/asm/hw_breakpoint.h
===================================================================
--- /dev/null
+++ linux-2.6-tip/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,132 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+/* x86-specific description of one hardware breakpoint (packed layout) */
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address; /* Address loaded into the debug register */
+	u8		len;	/* One of the HW_BREAKPOINT_LEN_* encodings */
+	u8		type;	/* HW_BREAKPOINT_EXECUTE, _WRITE or _RW */
+} __attribute__((packed));
+
+#include <linux/kdebug.h>
+#include <asm-generic/hw_breakpoint.h>
+
+/* HW breakpoint accessor routines */
+/* Return the breakpoint's address as a kernel pointer */
+static inline const void *hw_breakpoint_get_kaddress(struct hw_breakpoint *bp)
+{
+	return (const void *) bp->info.address;
+}
+
+/* Return the breakpoint's address as a __user-annotated pointer */
+static inline const void __user *hw_breakpoint_get_uaddress
+						(struct hw_breakpoint *bp)
+{
+	return (const void __user *) bp->info.address;
+}
+
+/* Return the breakpoint's stored length encoding (info.len) */
+static inline unsigned hw_breakpoint_get_len(struct hw_breakpoint *bp)
+{
+	return bp->info.len;
+}
+
+/* Return the breakpoint's stored type encoding (info.type) */
+static inline unsigned hw_breakpoint_get_type(struct hw_breakpoint *bp)
+{
+	return bp->info.type;
+}
+
+/*
+ * Kernel symbol lookup routine for installing Data HW Breakpoint Address.
+ * Thin wrapper that returns the result of kallsyms_lookup_name(name).
+ */
+static inline unsigned long hw_breakpoint_lookup_name(const char *name)
+{
+	return kallsyms_lookup_name(name);
+}
+
+/* Available HW breakpoint length encodings */
+#define HW_BREAKPOINT_LEN_1		0x40
+#define HW_BREAKPOINT_LEN_2		0x44
+#define HW_BREAKPOINT_LEN_4		0x4c
+#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+
+/* Available HW breakpoint type encodings */
+#define HW_BREAKPOINT_EXECUTE	0x80	/* trigger on instruction execute */
+#define HW_BREAKPOINT_WRITE	0x81	/* trigger on memory write */
+#define HW_BREAKPOINT_RW	0x83	/* trigger on memory read or write */
+
+#define HB_NUM 4 /* Total number of available HW breakpoint registers */
+
+/*
+ * Per-thread HW breakpoint and debug register info.
+ *
+ * tdr[] holds the addresses that arch_install_thbi() loads into the debug
+ * registers, and tkdr7 is the DR7 value it installs; tkdr7 is recomputed
+ * from tdr7 and the kernel data by arch_update_thbi().
+ */
+struct thread_hw_breakpoint {
+
+	/* utrace support */
+	struct list_head	node;		/* Entry in thread list */
+	struct list_head	thread_bps;	/* Thread's breakpoints */
+	struct hw_breakpoint	*bps[HB_NUM];	/* Highest-priority bps */
+	unsigned long		tdr[HB_NUM];	/*  and their addresses */
+	int			num_installed;	/* Number of installed bps */
+	unsigned		gennum;		/* update-generation number */
+
+	/* Only the portions below are arch-specific */
+
+	/* ptrace support -- Note that vdr6 is stored directly in the
+	 * thread_struct so that it is always available.
+	 */
+	unsigned long		vdr7;			/* Virtualized DR7 */
+	struct hw_breakpoint	vdr_bps[HB_NUM];	/* Breakpoints
+			representing virtualized debug registers 0 - 3 */
+	unsigned long		tdr7;		/* Thread's DR7 value */
+	unsigned long		tkdr7;		/* Thread + kernel DR7 value */
+};
+
+/*
+ * Kernel-space breakpoint data.
+ *
+ * bps[0 .. num_kbps - 1] are the breakpoints loaded into DR0 upwards by
+ * arch_install_chbi(); mkdr7 is filled in by arch_new_kbpdata().
+ */
+struct kernel_bp_data {
+	unsigned		gennum;		/* Generation number */
+	int			num_kbps;	/* Number of kernel bps */
+	struct hw_breakpoint	*bps[HB_NUM];	/* Loaded breakpoints */
+
+	/* Only the portions below are arch-specific */
+	unsigned long		mkdr7;		/* Masked kernel DR7 value */
+};
+
+/*
+ * Per-CPU debug register info.
+ *
+ * bp_task is compared against 'current' in hw_breakpoint_handler() to
+ * detect stale user breakpoints; it is NULL when no thread's breakpoints
+ * are loaded.
+ */
+struct cpu_hw_breakpoint {
+	struct kernel_bp_data	*cur_kbpdata;	/* Current kbpdata[] entry */
+	struct task_struct	*bp_task;	/* The thread whose bps
+			are currently loaded in the debug registers */
+};
+
+/*
+ * Ptrace support: breakpoint trigger routine.
+ */
+
+int __register_user_hw_breakpoint(struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+void __unregister_user_hw_breakpoint(struct task_struct *tsk,
+			struct hw_breakpoint *bp);
+
+
+void arch_update_thbi(struct thread_hw_breakpoint *thbi,
+				struct kernel_bp_data *thr_kbpdata);
+void arch_install_thbi(struct thread_hw_breakpoint *thbi);
+void arch_install_none(struct cpu_hw_breakpoint *chbi);
+void arch_install_chbi(struct cpu_hw_breakpoint *chbi);
+void arch_new_kbpdata(struct kernel_bp_data *new_kbpdata);
+void arch_store_thread_bp_array(struct thread_hw_breakpoint *thbi,
+				struct hw_breakpoint *bp, int i);
+int arch_check_va_in_userspace(unsigned long va,
+						struct task_struct *tsk);
+int arch_check_va_in_kernelspace(unsigned long va);
+void arch_store_info(struct hw_breakpoint *bp);
+int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
+				unsigned int *align, struct task_struct *tsk);
+void arch_register_user_hw_breakpoint(struct hw_breakpoint *bp,
+				struct thread_hw_breakpoint *thbi);
+void arch_unregister_user_hw_breakpoint(struct hw_breakpoint *bp,
+					struct thread_hw_breakpoint *thbi);
+void arch_register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+void arch_unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+int hw_breakpoint_handler(struct die_args *args);
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
Index: linux-2.6-tip/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/Makefile
+++ linux-2.6-tip/arch/x86/kernel/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x86
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o


^ permalink raw reply	[flat|nested] 71+ messages in thread

end of thread, other threads:[~2009-03-24 15:26 UTC | newest]

Thread overview: 71+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20090305043440.189041194@linux.vnet.ibm.com>
2009-03-05  4:37 ` [patch 01/11] Introducing generic hardware breakpoint handler interfaces prasad
2009-03-10 13:50   ` Ingo Molnar
2009-03-10 14:19     ` Alan Stern
2009-03-10 14:50       ` Ingo Molnar
2009-03-11 12:57         ` K.Prasad
2009-03-11 13:35           ` Ingo Molnar
2009-03-05  4:38 ` [patch 02/11] x86 architecture implementation of Hardware Breakpoint interfaces prasad
2009-03-10 14:09   ` Ingo Molnar
2009-03-10 14:59     ` Alan Stern
2009-03-10 15:18       ` Ingo Molnar
2009-03-10 17:11         ` Alan Stern
2009-03-10 17:26           ` Ingo Molnar
2009-03-10 20:30             ` Alan Stern
2009-03-11 12:12               ` Ingo Molnar
2009-03-11 12:50                 ` K.Prasad
2009-03-11 13:10                   ` Ingo Molnar
2009-03-14  3:46                     ` Benjamin Herrenschmidt
2009-03-11 16:39                   ` Alan Stern
2009-03-11 16:32                 ` Alan Stern
2009-03-11 17:41                   ` K.Prasad
2009-03-14  3:47                     ` Benjamin Herrenschmidt
2009-03-14  3:43                 ` Benjamin Herrenschmidt
2009-03-14  3:41               ` Benjamin Herrenschmidt
2009-03-14  3:40             ` Benjamin Herrenschmidt
2009-03-12  2:46     ` Roland McGrath
2009-03-13  3:43       ` Ingo Molnar
2009-03-13 14:04         ` Alan Stern
2009-03-13 14:13           ` Ingo Molnar
2009-03-13 19:01             ` K.Prasad
2009-03-13 21:21               ` Alan Stern
2009-03-14 12:24                 ` Ingo Molnar
2009-03-14 16:10                   ` Alan Stern
2009-03-14 16:39                     ` Ingo Molnar
2009-03-14  3:51       ` Benjamin Herrenschmidt
2009-03-05  4:38 ` [patch 03/11] Modifying generic debug exception to use virtual debug registers prasad
2009-03-05  4:38 ` [patch 04/11] Introduce virtual debug register in thread_struct and wrapper-routines around process related functions prasad
2009-03-10 14:35   ` Ingo Molnar
2009-03-10 15:53     ` Alan Stern
2009-03-10 17:06       ` Ingo Molnar
2009-03-12  2:26     ` Roland McGrath
2009-03-05  4:38 ` [patch 05/11] Use wrapper routines around debug registers in processor " prasad
2009-03-05  4:40 ` [patch 06/11] Use virtual debug registers in process/thread handling code prasad
2009-03-10 14:49   ` Ingo Molnar
2009-03-10 16:05     ` Alan Stern
2009-03-10 16:58       ` Ingo Molnar
2009-03-10 17:07       ` Ingo Molnar
2009-03-10 20:10         ` Alan Stern
2009-03-11 11:53           ` Ingo Molnar
2009-03-05  4:40 ` [patch 07/11] Modify signal handling code to refrain from re-enabling HW Breakpoints prasad
2009-03-05  4:40 ` [patch 08/11] Modify Ptrace routines to access breakpoint registers prasad
2009-03-10 14:40   ` Ingo Molnar
2009-03-10 15:54     ` Alan Stern
2009-03-12  3:14     ` Roland McGrath
2009-03-05  4:41 ` [patch 09/11] Cleanup HW Breakpoint registers before kexec prasad
2009-03-10 14:42   ` Ingo Molnar
2009-03-05  4:41 ` [patch 10/11] Sample HW breakpoint over kernel data address prasad
2009-03-05  4:43 ` prasad
2009-03-05  4:43 ` [patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces prasad
2009-03-05  6:37   ` Frederic Weisbecker
2009-03-05  9:16     ` Ingo Molnar
2009-03-05 13:15       ` K.Prasad
2009-03-05 13:28         ` Ingo Molnar
2009-03-05 11:33     ` K.Prasad
2009-03-05 12:19       ` K.Prasad
2009-03-05 12:30         ` Frederic Weisbecker
2009-03-05 12:28       ` Frederic Weisbecker
2009-03-05 15:00     ` Steven Rostedt
2009-03-05 14:54   ` Steven Rostedt
     [not found] <20090307045120.039324630@linux.vnet.ibm.com>
2009-03-07  5:05 ` [Patch 02/11] x86 architecture implementation of Hardware " prasad
     [not found] <20090319234044.410725944@K.Prasad>
2009-03-19 23:48 ` K.Prasad
     [not found] <20090324152028.754123712@K.Prasad>
2009-03-24 15:25 ` K.Prasad

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.