All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Kwatch: kernel watchpoints using CPU debug registers
@ 2007-01-16 16:55 Alan Stern
  2007-01-16 23:35 ` Christoph Hellwig
  2007-01-17  9:44 ` Ingo Molnar
  0 siblings, 2 replies; 25+ messages in thread
From: Alan Stern @ 2007-01-16 16:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Prasanna S Panchamukhi, Kernel development list

From: Alan Stern <stern@rowland.harvard.edu>

This patch (as839) implements the Kwatch (kernel-space hardware-based
watchpoints) API for the i386 architecture.  The API is explained in
the kerneldoc for register_kwatch() in arch/i386/kernel/kwatch.c.

The original version of the patch was written by Vamsi Krishna S and
Bharata Rao.  It was later updated by Prasanna S Panchamukhi for 2.6.13
and then again by me for 2.6.20.

Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>

---

Hardware-based watchpoints can sometimes be indispensable for finding the 
source of problems.  Although this patch is only for the x86 architecture, 
it should still be useful.  And there's no downside to adopting it, since 
it has virtually no overhead with CONFIG_KWATCH isn't selected.


Index: usb-2.6/arch/i386/kernel/debugreg.c
===================================================================
--- /dev/null
+++ usb-2.6/arch/i386/kernel/debugreg.c
@@ -0,0 +1,182 @@
+/*
+ *  Debug register
+ *  arch/i386/kernel/debugreg.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> and
+ *		Bharata Rao <bharata@in.ibm.com> to provide debug register
+ *		allocation mechanism.
+ * 2004-Oct	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
+ *		idr_allocations mechanism as suggested by Andi Kleen.
+ */
+
+/*
+ * These routines provide a debug register allocation mechanism.
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <asm/debugreg.h>
+
+struct debugreg {
+	int flag;
+	int use_count;
+};
+
+struct debugreg dr_list[DR_MAX];
+static spinlock_t dr_lock = SPIN_LOCK_UNLOCKED;
+unsigned long dr7_global_mask = DR_CONTROL_RESERVED | DR_GLOBAL_SLOWDOWN |
+		DR_GLOBAL_ENABLE_MASK;
+
+static unsigned long dr7_global_reg_mask(unsigned int regnum)
+{
+	return (0xf << (16 + regnum * 4)) | (0x1 << (regnum * 2));
+}
+
+static int get_dr(int regnum, int flag)
+{
+	if (flag == DR_ALLOC_GLOBAL && !dr_list[regnum].flag) {
+		dr_list[regnum].flag = flag;
+		dr7_global_mask |= dr7_global_reg_mask(regnum);
+		return regnum;
+	}
+	if (flag == DR_ALLOC_LOCAL &&
+			dr_list[regnum].flag != DR_ALLOC_GLOBAL) {
+		dr_list[regnum].flag = flag;
+		dr_list[regnum].use_count++;
+		return regnum;
+	}
+	return -1;
+}
+
+static void free_dr(int regnum)
+{
+	if (dr_list[regnum].flag == DR_ALLOC_LOCAL) {
+		if (!--dr_list[regnum].use_count)
+			dr_list[regnum].flag = 0;
+	} else {
+		dr_list[regnum].flag = 0;
+		dr_list[regnum].use_count = 0;
+		dr7_global_mask &= ~(dr7_global_reg_mask(regnum));
+	}
+}
+
+int dr_alloc(int regnum, int flag)
+{
+	int ret = -1;
+
+	spin_lock(&dr_lock);
+	if (regnum >= 0 && regnum < DR_MAX)
+		ret = get_dr(regnum, flag);
+	else if (regnum == DR_ANY) {
+
+		/* gdb allocates local debug registers starting from 0.
+		 * To help avoid conflicts, we'll start from the other end.
+		 */
+		for (regnum = DR_MAX - 1; regnum >= 0; --regnum) {
+			ret = get_dr(regnum, flag);
+			if (ret >= 0)
+				break;
+		}
+	} else
+		printk(KERN_ERR "dr_alloc: "
+				"Cannot allocate debug register %d\n", regnum);
+	spin_unlock(&dr_lock);
+	return ret;
+}
+
+void dr_free(int regnum)
+{
+	spin_lock(&dr_lock);
+	if (regnum < 0 || regnum >= DR_MAX || !dr_list[regnum].flag)
+		printk(KERN_ERR "dr_free: "
+				"Cannot free debug register %d\n", regnum);
+	else
+		free_dr(regnum);
+	spin_unlock(&dr_lock);
+}
+
+void dr_inc_use_count(unsigned long mask)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	spin_lock(&dr_lock);
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if (mask & dr_local_enable)
+			dr_list[i].use_count++;
+	}
+	spin_unlock(&dr_lock);
+}
+
+void dr_dec_use_count(unsigned long mask)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	spin_lock(&dr_lock);
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if (mask & dr_local_enable)
+			free_dr(i);
+	}
+	spin_unlock(&dr_lock);
+}
+
+int dr_is_global(int regnum)
+{
+	return (dr_list[regnum].flag == DR_ALLOC_GLOBAL);
+}
+
+/*
+ * This routine decides if a ptrace request is for enabling or disabling
+ * a debug reg, and accordingly calls dr_alloc() or dr_free().
+ *
+ * gdb uses ptrace to write to debug registers.  It assumes that writing to
+ * a debug register always succeds and it doesn't check the return value of
+ * ptrace.  Now with this new global debug register allocation/freeing,
+ * ptrace request for a local debug register will fail if the required debug
+ * register is already globally allocated.  Since gdb doesn't notice this
+ * failure, it sometimes tries to free a debug register which it does not
+ * own.
+ *
+ * Returns -1 if the ptrace request tries to locally allocate a debug register
+ * that is already globally allocated.  Otherwise returns >0 or 0 according
+ * as any debug registers are or are not locally allocated in the new setting.
+ */
+int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	if (new_dr7 & DR_LOCAL_ENABLE_MASK & dr7_global_mask)
+		return -1;
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if ((old_dr7 ^ new_dr7) & dr_local_enable) {
+			if (new_dr7 & dr_local_enable)
+				dr_alloc(i, DR_ALLOC_LOCAL);
+			else
+				dr_free(i);
+		}
+	}
+	return new_dr7 & DR_LOCAL_ENABLE_MASK;
+}
+
+EXPORT_SYMBOL(dr_alloc);
+EXPORT_SYMBOL(dr_free);
Index: usb-2.6/arch/i386/kernel/process.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/process.c
+++ usb-2.6/arch/i386/kernel/process.c
@@ -51,6 +51,7 @@
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
 #endif
+#include <asm/debugreg.h>
 
 #include <linux/err.h>
 
@@ -356,9 +357,10 @@ EXPORT_SYMBOL(kernel_thread);
  */
 void exit_thread(void)
 {
+	struct task_struct *tsk = current;
+
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
-		struct task_struct *tsk = current;
 		struct thread_struct *t = &tsk->thread;
 		int cpu = get_cpu();
 		struct tss_struct *tss = &per_cpu(init_tss, cpu);
@@ -376,12 +378,16 @@ void exit_thread(void)
 		tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		put_cpu();
 	}
+ 	if (unlikely(tsk->thread.debugreg[7]))
+ 		dr_dec_use_count(tsk->thread.debugreg[7]);
 }
 
 void flush_thread(void)
 {
 	struct task_struct *tsk = current;
 
+	if (unlikely(tsk->thread.debugreg[7]))
+		dr_dec_use_count(tsk->thread.debugreg[7]);
 	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));	
 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
@@ -462,6 +468,9 @@ int copy_thread(int nr, unsigned long cl
 		desc->b = LDT_entry_b(&info);
 	}
 
+	if (unlikely(tsk->thread.debugreg[7]))
+		dr_inc_use_count(tsk->thread.debugreg[7]);
+
 	err = 0;
  out:
 	if (err && p->thread.io_bitmap_ptr) {
@@ -537,14 +546,22 @@ static noinline void __switch_to_xtra(st
 
 	next = &next_p->thread;
 
+	/*
+	 * Don't reload global debug registers. Don't touch the global debug
+	 * register settings in dr7.
+	 */
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg[0], 0);
-		set_debugreg(next->debugreg[1], 1);
-		set_debugreg(next->debugreg[2], 2);
-		set_debugreg(next->debugreg[3], 3);
+		if (!dr_is_global(0))
+			set_debugreg(next->debugreg[0], 0);
+		if (!dr_is_global(1))
+			set_debugreg(next->debugreg[1], 1);
+		if (!dr_is_global(2))
+			set_debugreg(next->debugreg[2], 2);
+		if (!dr_is_global(3))
+			set_debugreg(next->debugreg[3], 3);
 		/* no 4 and 5 */
 		set_debugreg(next->debugreg[6], 6);
-		set_debugreg(next->debugreg[7], 7);
+		set_process_dr7(next->debugreg[7]);
 	}
 
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
Index: usb-2.6/arch/i386/kernel/ptrace.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/ptrace.c
+++ usb-2.6/arch/i386/kernel/ptrace.c
@@ -412,6 +412,7 @@ long arch_ptrace(struct task_struct *chi
 			ret = putreg(child, addr, data);
 			break;
 		}
+
 		/* We need to be very careful here.  We implicitly
 		   want to modify a portion of the task_struct, and we
 		   have to be selective about what portions we allow someone
@@ -421,10 +422,18 @@ long arch_ptrace(struct task_struct *chi
 		if (addr >= (long) &dummy->u_debugreg[0] &&
 		    addr <= (long) &dummy->u_debugreg[7]) {
 
-			if (addr == (long) &dummy->u_debugreg[4]) break;
-			if (addr == (long) &dummy->u_debugreg[5]) break;
-			if (addr < (long) &dummy->u_debugreg[4] &&
-			    ((unsigned long) data) >= TASK_SIZE-3) break;
+			addr -= (long) &dummy->u_debugreg;
+			addr = addr >> 2;
+			if (addr < 4) {
+				if ((unsigned long) data >= TASK_SIZE-3)
+					break;
+				if (dr_is_global(addr)) {
+					ret = -EBUSY;
+					break;
+				}
+			}
+			else if (addr == 4 || addr == 5)
+				break;
 
 			/* Sanity-check data. Take one half-byte at once with
 			 * check = (val >> (16 + 4*i)) & 0xf. It contains the
@@ -456,18 +465,21 @@ long arch_ptrace(struct task_struct *chi
 			 * See the AMD manual no. 24593 (AMD64 System
 			 * Programming) */
 
-			if (addr == (long) &dummy->u_debugreg[7]) {
+			else if (addr == 7) {
 				data &= ~DR_CONTROL_RESERVED;
 				for (i = 0; i < 4; i++)
 					if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
 						goto out_tsk;
-				if (data)
+				i = enable_debugreg(child->thread.debugreg[7], data);
+				if (i < 0) {
+					ret = -EBUSY;
+					break;
+				}
+				if (i)
 					set_tsk_thread_flag(child, TIF_DEBUG);
 				else
 					clear_tsk_thread_flag(child, TIF_DEBUG);
 			}
-			addr -= (long) &dummy->u_debugreg;
-			addr = addr >> 2;
 			child->thread.debugreg[addr] = data;
 			ret = 0;
 		}
Index: usb-2.6/arch/i386/kernel/signal.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/signal.c
+++ usb-2.6/arch/i386/kernel/signal.c
@@ -25,6 +25,7 @@
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/debugreg.h>
 #include "sigframe.h"
 
 #define DEBUG_SIG 0
@@ -594,7 +595,7 @@ static void fastcall do_signal(struct pt
 		 * inside the kernel.
 		 */
 		if (unlikely(current->thread.debugreg[7]))
-			set_debugreg(current->thread.debugreg[7], 7);
+			set_process_dr7(current->thread.debugreg[7]);
 
 		/* Whee!  Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
Index: usb-2.6/arch/i386/kernel/traps.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/traps.c
+++ usb-2.6/arch/i386/kernel/traps.c
@@ -808,6 +808,7 @@ fastcall void __kprobes do_debug(struct 
 	struct task_struct *tsk = current;
 
 	get_debugreg(condition, 6);
+	set_debugreg(0, 6);	/* DR6 is never cleared by the CPU */
 
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
 					SIGTRAP) == NOTIFY_STOP)
@@ -849,7 +850,7 @@ fastcall void __kprobes do_debug(struct 
 	 * the signal is delivered.
 	 */
 clear_dr7:
-	set_debugreg(0, 7);
+	set_process_dr7(0);
 	return;
 
 debug_vm86:
Index: usb-2.6/include/asm-i386/debugreg.h
===================================================================
--- usb-2.6.orig/include/asm-i386/debugreg.h
+++ usb-2.6/include/asm-i386/debugreg.h
@@ -33,6 +33,7 @@
 
 #define DR_RW_EXECUTE (0x0)   /* Settings for the access types to trap on */
 #define DR_RW_WRITE (0x1)
+#define DR_RW_IO (0x2)
 #define DR_RW_READ (0x3)
 
 #define DR_LEN_1 (0x0) /* Settings for data length to trap on */
@@ -61,4 +62,63 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+#define DR_MAX	4
+#define DR_ANY	(DR_MAX + 1)
+
+/* global or local allocation requests */
+#define DR_ALLOC_GLOBAL		1
+#define DR_ALLOC_LOCAL		2
+
+#ifdef CONFIG_KWATCH
+
+/* Set the type, len and global flag in dr7 for a debug register */
+#define SET_DR7(dr, regnum, type, len)	do {		\
+		dr &= ~(0xf << (16 + (regnum) * 4));	\
+		dr |= (((((len) - 1) << 2) | (type)) <<	\
+				(16 + (regnum) * 4)) |	\
+			(0x2 << ((regnum) * 2));	\
+	} while (0)
+
+/* Disable a debug register by clearing the global/local flag in dr7 */
+#define RESET_DR7(dr, regnum)	dr &= ~(0x3 << ((regnum) * 2))
+
+extern int dr_alloc(int regnum, int flag);
+extern void dr_free(int regnum);
+extern void dr_inc_use_count(unsigned long mask);
+extern void dr_dec_use_count(unsigned long mask);
+extern int dr_is_global(int regnum);
+extern unsigned long dr7_global_mask;
+extern int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7);
+
+static inline void set_process_dr7(unsigned long new_dr7)
+{
+	unsigned long dr7;
+
+	get_debugreg(dr7, 7);
+	dr7 = (dr7 & dr7_global_mask) | (new_dr7 & ~dr7_global_mask);
+	set_debugreg(dr7, 7);
+}
+
+#else
+
+static inline void dr_inc_use_count(unsigned long mask)
+{
+}
+static inline void dr_dec_use_count(unsigned long mask)
+{
+}
+static inline int dr_is_global(int regnum)
+{
+	return 0;
+}
+static inline int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7)
+{
+	return (new_dr7 != 0);
+}
+static inline void set_process_dr7(unsigned long new_dr7)
+{
+	set_debugreg(new_dr7, 7);
+}
+
+#endif				/* CONFIG_KWATCH */
 #endif
Index: usb-2.6/arch/i386/kernel/kwatch.c
===================================================================
--- /dev/null
+++ usb-2.6/arch/i386/kernel/kwatch.c
@@ -0,0 +1,281 @@
+/*
+ *  Kernel Watchpoint interface.
+ *  arch/i386/kernel/kwatch.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> for
+ *		Kernel Watchpoint implementation.
+ * 2004-Oct	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> to
+ *		to make use of notifiers.
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/kwatch.h>
+#include <asm/kdebug.h>
+#include <asm/debugreg.h>
+#include <asm/bitops.h>
+
+#define RF_MASK	0x00010000
+
+static struct kwatch kwatch_list[DR_MAX];
+static spinlock_t kwatch_lock = SPIN_LOCK_UNLOCKED;
+static unsigned long kwatch_in_progress;	/* currently being handled */
+
+struct dr_info {
+	int debugreg;
+	unsigned long addr;
+	int type;
+};
+
+static void write_dr(int debugreg, unsigned long addr)
+{
+	switch (debugreg) {
+		case 0:	set_debugreg(addr, 0);	break;
+		case 1:	set_debugreg(addr, 1);	break;
+		case 2:	set_debugreg(addr, 2);	break;
+		case 3:	set_debugreg(addr, 3);	break;
+		case 6:	set_debugreg(addr, 6);	break;
+		case 7:	set_debugreg(addr, 7);	break;
+	}
+}
+
+#define write_dr7(val)	set_debugreg((val), 7)
+#define read_dr7(val)	get_debugreg((val), 7)
+
+static void write_smp_dr(void *info)
+{
+	struct dr_info *dr = (struct dr_info *)info;
+
+	if (cpu_has_de && dr->type == DR_TYPE_IO)
+		set_in_cr4(X86_CR4_DE);
+	write_dr(dr->debugreg, dr->addr);
+}
+
+/* Update the debug register on all CPUs */
+static void sync_dr(int debugreg, unsigned long addr, int type)
+{
+	struct dr_info dr;
+	dr.debugreg = debugreg;
+	dr.addr = addr;
+	dr.type = type;
+	smp_call_function(write_smp_dr, &dr, 0, 0);
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+int kwatch_handler(unsigned long condition, struct pt_regs *regs)
+{
+	unsigned int debugreg;
+	unsigned long addr;
+
+	/* Using the debug status register value, find the debug register
+	 * number and the address for which the trap occurred. */
+	if (condition & DR_TRAP0) {
+		debugreg = 0;
+		get_debugreg(addr, 0);
+	} else if (condition & DR_TRAP1) {
+		debugreg = 1;
+		get_debugreg(addr, 1);
+	} else if (condition & DR_TRAP2) {
+		debugreg = 2;
+		get_debugreg(addr, 2);
+	} else if (condition & DR_TRAP3) {
+		debugreg = 3;
+		get_debugreg(addr, 3);
+	} else
+		return 0;
+
+	/* We're in an interrupt, but this is clear and BUG()-safe. */
+	preempt_disable();
+
+	/* If we are recursing, we already hold the lock. */
+	if (kwatch_in_progress)
+		goto recursed;
+
+	set_bit(debugreg, &kwatch_in_progress);
+
+	spin_lock(&kwatch_lock);
+	if ((unsigned long) kwatch_list[debugreg].addr != addr)
+		goto out;
+
+	if (kwatch_list[debugreg].handler)
+		kwatch_list[debugreg].handler(&kwatch_list[debugreg], regs);
+
+	if (kwatch_list[debugreg].type == DR_TYPE_EXECUTE)
+		regs->eflags |= RF_MASK;
+      out:
+	clear_bit(debugreg, &kwatch_in_progress);
+	spin_unlock(&kwatch_lock);
+	preempt_enable_no_resched();
+	return 0;
+
+      recursed:
+	if (kwatch_list[debugreg].type == DR_TYPE_EXECUTE)
+		regs->eflags |= RF_MASK;
+	preempt_enable_no_resched();
+	return 1;
+}
+
+/**
+ * register_kwatch - register a hardware watchpoint
+ * @addr: address of the watchpoint
+ * @length: extent of the watchpoint (1, 2, or 4 bytes)
+ * @type: type of access to trap (read, write, I/O, or execute)
+ * @handler: callback routine to invoke when a trap occurs
+ *
+ * Allocates and returns a debug register and installs the requested
+ * watchpoint.
+ *
+ * @length must be 1, 2, or 4, and @type must be one of %DR_TYPE_RW
+ * (read or write), %DR_TYPE_WRITE (write only), %DR_TYPE_IO (I/O space
+ * access), or %DR_TYPE_EXECUTE.  Note that %DR_TYPE_IO is available only
+ * on processors with Debugging Extensions, and @length must be 1 for
+ * %DR_TYPE_EXECUTE.
+ *
+ * When a trap occurs, @handler is invoked in_interrupt with a pointer
+ * to a struct kwatch containing the watchpoint information and a pointer
+ * to the CPU register values at the time of the trap.  %DR_TYPE_EXECUTE
+ * traps occur before the watch-pointed instruction executes; all other
+ * types occur after the memory or I/O access has taken place.
+ *
+ * Returns a debug register number or a negative error code.
+ */
+int register_kwatch(void *addr, u8 length, u8 type, kwatch_handler_t handler)
+{
+	int debugreg;
+	unsigned long dr7, flags;
+
+	switch (length) {
+	case 1:
+	case 2:
+	case 4:
+		break;
+	default:
+		return -EINVAL;
+	}
+	switch (type) {
+	case DR_TYPE_WRITE:
+	case DR_TYPE_RW:
+		break;
+	case DR_TYPE_IO:
+		if (cpu_has_de)
+			break;
+		return -EINVAL;
+	case DR_TYPE_EXECUTE:
+		if (length == 1)
+			break;
+		/* FALL THROUGH */
+	default:
+		return -EINVAL;
+	}
+	if (!handler)
+		return -EINVAL;
+
+	debugreg = dr_alloc(DR_ANY, DR_ALLOC_GLOBAL);
+	if (debugreg < 0)
+		return -EBUSY;
+
+	spin_lock_irqsave(&kwatch_lock, flags);
+	kwatch_list[debugreg].addr = addr;
+	kwatch_list[debugreg].length = length;
+	kwatch_list[debugreg].type = type;
+	kwatch_list[debugreg].handler = handler;
+	spin_unlock_irqrestore(&kwatch_lock, flags);
+
+	if (type == DR_TYPE_IO)
+		set_in_cr4(X86_CR4_DE);
+	write_dr(debugreg, (unsigned long) addr);
+	sync_dr(debugreg, (unsigned long) addr, type);
+
+	read_dr7(dr7);
+	SET_DR7(dr7, debugreg, type, length);
+	write_dr7(dr7);
+	sync_dr(7, dr7, 0);
+	return debugreg;
+}
+
+/**
+ * unregister_kwatch - free a previously-allocated debugging watchpoint
+ * @debugreg: the debugging register to deallocate
+ *
+ * Removes a hardware watchpoint and deallocates the corresponding
+ * debugging register.  @debugreg must previously have been allocated
+ * by register_kwatch().
+ */
+void unregister_kwatch(int debugreg)
+{
+	unsigned long flags;
+	unsigned long dr7;
+
+	if (debugreg < 0 || debugreg >= DR_MAX ||
+			!kwatch_list[debugreg].handler)
+		return;
+
+	read_dr7(dr7);
+	RESET_DR7(dr7, debugreg);
+	write_dr7(dr7);
+	sync_dr(7, dr7, 0);
+
+	spin_lock_irqsave(&kwatch_lock, flags);
+	kwatch_list[debugreg].addr = 0;
+	kwatch_list[debugreg].handler = NULL;
+	spin_unlock_irqrestore(&kwatch_lock, flags);
+
+	dr_free(debugreg);
+}
+
+/*
+ * Wrapper routine to for handling exceptions.
+ */
+int kwatch_exceptions_notify(struct notifier_block *self, unsigned long val,
+			     void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	switch (val) {
+	case DIE_DEBUG:
+		if (kwatch_handler(args->err, args->regs))
+			return NOTIFY_STOP;
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kwatch_exceptions_nb = {
+	.notifier_call = kwatch_exceptions_notify,
+	.priority = 0x7ffffffe	/* we need to notified second */
+};
+
+static int __init init_kwatch(void)
+{
+	int err = 0;
+
+	err = register_die_notifier(&kwatch_exceptions_nb);
+	return err;
+}
+
+__initcall(init_kwatch);
+
+EXPORT_SYMBOL_GPL(register_kwatch);
+EXPORT_SYMBOL_GPL(unregister_kwatch);
Index: usb-2.6/arch/i386/kernel/Makefile
===================================================================
--- usb-2.6.orig/arch/i386/kernel/Makefile
+++ usb-2.6/arch/i386/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+obj-$(CONFIG_KWATCH)		+= debugreg.o kwatch.o
 
 # Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
Index: usb-2.6/include/asm-i386/kwatch.h
===================================================================
--- /dev/null
+++ usb-2.6/include/asm-i386/kwatch.h
@@ -0,0 +1,60 @@
+#ifndef _ASM_KWATCH_H
+#define _ASM_KWATCH_H
+/*
+ *  Kernel Watchpoint interface.
+ *  include/asm-i386/kwatch.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> for
+ *		Kernel Watchpoint implementation.
+ */
+#include <linux/types.h>
+#include <linux/ptrace.h>
+
+struct kwatch;
+typedef void (*kwatch_handler_t) (struct kwatch *, struct pt_regs *);
+
+struct kwatch {
+	void *addr;		/* location of watchpoint */
+	u8 length;		/* range of address */
+	u8 type;		/* type of watchpoint */
+	kwatch_handler_t handler;
+};
+
+#define DR_TYPE_EXECUTE 	0x0	/* Watchpoint types */
+#define DR_TYPE_WRITE		0x1
+#define DR_TYPE_IO		0x2
+#define DR_TYPE_RW		0x3
+
+#ifdef CONFIG_KWATCH
+extern int register_kwatch(void *addr, u8 length, u8 type,
+		kwatch_handler_t handler);
+extern void unregister_kwatch(int debugreg);
+
+#else
+
+static inline int register_kwatch(void *addr, u8 length, u8 type,
+		kwatch_handler_t handler)
+{
+	return -ENOSYS;
+}
+static inline void unregister_kwatch(int debugreg)
+{
+}
+#endif
+#endif				/* _ASM_KWATCH_H */
Index: usb-2.6/arch/i386/Kconfig
===================================================================
--- usb-2.6.orig/arch/i386/Kconfig
+++ usb-2.6/arch/i386/Kconfig
@@ -1210,6 +1210,14 @@ config KPROBES
 	  a probepoint and specifies the callback.  Kprobes is useful
 	  for kernel debugging, non-intrusive instrumentation and testing.
 	  If in doubt, say "N".
+
+config KWATCH
+	bool "Kwatch points (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Kwatch enables kernel-space data watchpoints using the processor's
+	  debug registers.  It can be very useful for kernel debugging.
+	  If in doubt, say "N".
 endmenu
 
 source "arch/i386/Kconfig.debug"


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-16 16:55 [PATCH] Kwatch: kernel watchpoints using CPU debug registers Alan Stern
@ 2007-01-16 23:35 ` Christoph Hellwig
  2007-01-17 16:33   ` Alan Stern
  2007-01-17  9:44 ` Ingo Molnar
  1 sibling, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2007-01-16 23:35 UTC (permalink / raw)
  To: Alan Stern; +Cc: Andrew Morton, Prasanna S Panchamukhi, Kernel development list

Fir4st I'd say thanks a lot for forward-porting this, it's really useful
feature for all kinds of nasty debugging.

I think you should split this into two patches, one for the debugreg
infrastructure, and one for the actual kwatch code.

Also I think you provide one (or even a few) example wathes for
trivial things, say updating i_ino for an inode given through debugfs.

Some comments on the code below:

> --- /dev/null
> +++ usb-2.6/arch/i386/kernel/debugreg.c
> @@ -0,0 +1,182 @@
> +/*
> + *  Debug register
> + *  arch/i386/kernel/debugreg.c

Please don't put in comments that mention the name of the containing
file.  Also the "Debug register" comments seems rather useless.

> + * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> and
> + *		Bharata Rao <bharata@in.ibm.com> to provide debug register
> + *		allocation mechanism.
> + * 2004-Oct	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
> + *		idr_allocations mechanism as suggested by Andi Kleen.

I think these kinds of comments aren't in fashion anymore either, all
changelogs should be in git commit messages and initial credits go
into the first commit message.

> +struct debugreg dr_list[DR_MAX];
> +static spinlock_t dr_lock = SPIN_LOCK_UNLOCKED;

I think you're supposed to use magic DEFINE_SPINLOCK macro these days.

> +unsigned long dr7_global_mask = DR_CONTROL_RESERVED | DR_GLOBAL_SLOWDOWN |
> +		DR_GLOBAL_ENABLE_MASK;

I'd rahter keep this static and make  set_process_dr7 a non-inline
function.

> +
> +static unsigned long dr7_global_reg_mask(unsigned int regnum)
> +{
> +	return (0xf << (16 + regnum * 4)) | (0x1 << (regnum * 2));
> +}
> +
> +static int get_dr(int regnum, int flag)
> +{
> +	if (flag == DR_ALLOC_GLOBAL && !dr_list[regnum].flag) {
> +		dr_list[regnum].flag = flag;
> +		dr7_global_mask |= dr7_global_reg_mask(regnum);
> +		return regnum;
> +	}
> +	if (flag == DR_ALLOC_LOCAL &&
> +			dr_list[regnum].flag != DR_ALLOC_GLOBAL) {
> +		dr_list[regnum].flag = flag;
> +		dr_list[regnum].use_count++;
> +		return regnum;
> +	}
> +	return -1;

This looks rather poorly structured, as the function does compltely
different things depending on the flags passed in.

> +static void free_dr(int regnum)
> +{
> +	if (dr_list[regnum].flag == DR_ALLOC_LOCAL) {
> +		if (!--dr_list[regnum].use_count)
> +			dr_list[regnum].flag = 0;
> +	} else {
> +		dr_list[regnum].flag = 0;
> +		dr_list[regnum].use_count = 0;
> +		dr7_global_mask &= ~(dr7_global_reg_mask(regnum));
> +	}
> +}

Same here.

> +int dr_alloc(int regnum, int flag)
> +{
> +	int ret = -1;
> +
> +	spin_lock(&dr_lock);
> +	if (regnum >= 0 && regnum < DR_MAX)
> +		ret = get_dr(regnum, flag);
> +	else if (regnum == DR_ANY) {
> +
> +		/* gdb allocates local debug registers starting from 0.
> +		 * To help avoid conflicts, we'll start from the other end.
> +		 */
> +		for (regnum = DR_MAX - 1; regnum >= 0; --regnum) {
> +			ret = get_dr(regnum, flag);
> +			if (ret >= 0)
> +				break;
> +		}
> +	} else
> +		printk(KERN_ERR "dr_alloc: "
> +				"Cannot allocate debug register %d\n", regnum);
> +	spin_unlock(&dr_lock);
> +	return ret;

I suspect this should be replaced wit ha global and local variant
to fix the above mentioned issue.  It's a tiny bit duplicated code,
but seems much cleaner.

> +static int get_dr(int regnum, int flag)
> +{
> +	if (flag == DR_ALLOC_GLOBAL && !dr_list[regnum].flag) {
> +		dr_list[regnum].flag = flag;
> +		dr7_global_mask |= dr7_global_reg_mask(regnum);
> +		return regnum;
> +	}
> +	if (flag == DR_ALLOC_LOCAL &&
> +			dr_list[regnum].flag != DR_ALLOC_GLOBAL) {
> +		dr_list[regnum].flag = flag;
> +		dr_list[regnum].use_count++;
> +		return regnum;
> +	}
> +	return -1;

Same comments about global vs local here.

> +
> +EXPORT_SYMBOL(dr_alloc);
> +EXPORT_SYMBOL(dr_free);

I don't think we want these exported at all, and if a proper modular
user shows up they should be _GPL as they're fairly lowlevel.

Btw, the naming in the whole debugregs code should be consolidated to
be debugreg_ instead of all kinds of different variants.

> +#ifdef CONFIG_KWATCH
> +
> +/* Set the type, len and global flag in dr7 for a debug register */
> +#define SET_DR7(dr, regnum, type, len)	do {		\
> +		dr &= ~(0xf << (16 + (regnum) * 4));	\
> +		dr |= (((((len) - 1) << 2) | (type)) <<	\
> +				(16 + (regnum) * 4)) |	\
> +			(0x2 << ((regnum) * 2));	\
> +	} while (0)
> +
> +/* Disable a debug register by clearing the global/local flag in dr7 */
> +#define RESET_DR7(dr, regnum)	dr &= ~(0x3 << ((regnum) * 2))

I don't think there's any point in making these macros conditional.
Then again is there a good reason to mke these macros?

> + *  Kernel Watchpoint interface.
> + *  arch/i386/kernel/kwatch.c
> + *
> + *
> + * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> for
> + *		Kernel Watchpoint implementation.
> + * 2004-Oct	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> to
> + *		to make use of notifiers.
> + */

Same comments about these comments applies as in debugreg.c

> +#include <linux/kprobes.h>
> +#include <linux/ptrace.h>
> +#include <linux/spinlock.h>
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <asm/kwatch.h>
> +#include <asm/kdebug.h>
> +#include <asm/debugreg.h>
> +#include <asm/bitops.h>

I think this should be linux/bitops.h these days.

> +
> +#define RF_MASK	0x00010000
> +
> +static struct kwatch kwatch_list[DR_MAX];
> +static spinlock_t kwatch_lock = SPIN_LOCK_UNLOCKED;



> +static unsigned long kwatch_in_progress;	/* currently being handled */

Give that this is a bitmap the comment is rather misleading, it should
probably be:

/*
 * Bitmap of registers beeing handled.
 */

> +static void write_dr(int debugreg, unsigned long addr)
> +{
> +	switch (debugreg) {
> +		case 0:	set_debugreg(addr, 0);	break;
> +		case 1:	set_debugreg(addr, 1);	break;
> +		case 2:	set_debugreg(addr, 2);	break;
> +		case 3:	set_debugreg(addr, 3);	break;
> +		case 6:	set_debugreg(addr, 6);	break;
> +		case 7:	set_debugreg(addr, 7);	break;
> +	}
> +}

What's the point of this wrapper?

> +
> +#define write_dr7(val)	set_debugreg((val), 7)
> +#define read_dr7(val)	get_debugreg((val), 7)

And these?

> +	if (kwatch_in_progress)
> +		goto recursed;
> +

I don't think there's any point in this goto, just handle it inside
the if block

> +	set_bit(debugreg, &kwatch_in_progress);
> +
> +	spin_lock(&kwatch_lock);
> +	if ((unsigned long) kwatch_list[debugreg].addr != addr)
> +		goto out;
> +
> +	if (kwatch_list[debugreg].handler)
> +		kwatch_list[debugreg].handler(&kwatch_list[debugreg], regs);
> +
> +	if (kwatch_list[debugreg].type == DR_TYPE_EXECUTE)
> +		regs->eflags |= RF_MASK;
> +      out:

Again, I think the goto here could be avoided and actually make the code
cleanere.  Also a local variable for kwatch_list[debugreg] with a short
would probably make this section of code a lot more readable.

> +
> +static int __init init_kwatch(void)
> +{
> +	int err = 0;
> +
> +	err = register_die_notifier(&kwatch_exceptions_nb);
> +	return err;
> +}

Just remove the err local variable here.

> +EXPORT_SYMBOL_GPL(register_kwatch);
> +EXPORT_SYMBOL_GPL(unregister_kwatch);

Please move these exports close to the actual function definition.

> --- /dev/null
> +++ usb-2.6/include/asm-i386/kwatch.h
> @@ -0,0 +1,60 @@
> +#ifndef _ASM_KWATCH_H
> +#define _ASM_KWATCH_H
> +/*
> + *  Kernel Watchpoint interface.
> + *  include/asm-i386/kwatch.h

> + * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> for
> + *		Kernel Watchpoint implementation.
> + */

Same comments once again.

> +#include <linux/types.h>
> +#include <linux/ptrace.h>
> +
> +struct kwatch;
> +typedef void (*kwatch_handler_t) (struct kwatch *, struct pt_regs *);
> +
> +struct kwatch {
> +	void *addr;		/* location of watchpoint */
> +	u8 length;		/* range of address */
> +	u8 type;		/* type of watchpoint */
> +	kwatch_handler_t handler;
> +};
> +
> +#define DR_TYPE_EXECUTE 	0x0	/* Watchpoint types */
> +#define DR_TYPE_WRITE		0x1
> +#define DR_TYPE_IO		0x2
> +#define DR_TYPE_RW		0x3

I think large parts of this header should go into a new linux/kwatch.h
so that generic code can use kwatches.

> +config KWATCH
> +	bool "Kwatch points (EXPERIMENTAL)"
> +	depends on EXPERIMENTAL
> +	help
> +	  Kwatch enables kernel-space data watchpoints using the processor's
> +	  debug registers.  It can be very useful for kernel debugging.
> +	  If in doubt, say "N".

I think we want different options for debugregs and kwatch.  The debugreg
one probably doesn't have to be actually user-visible, though.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-16 16:55 [PATCH] Kwatch: kernel watchpoints using CPU debug registers Alan Stern
  2007-01-16 23:35 ` Christoph Hellwig
@ 2007-01-17  9:44 ` Ingo Molnar
  2007-01-17 16:17   ` Alan Stern
  1 sibling, 1 reply; 25+ messages in thread
From: Ingo Molnar @ 2007-01-17  9:44 UTC (permalink / raw)
  To: Alan Stern
  Cc: Andrew Morton, Prasanna S Panchamukhi, Kernel development list,
	Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> From: Alan Stern <stern@rowland.harvard.edu>
> 
> This patch (as839) implements the Kwatch (kernel-space hardware-based 
> watchpoints) API for the i386 architecture.  The API is explained in 
> the kerneldoc for register_kwatch() in arch/i386/kernel/kwatch.c.

i think it would be nice to have this ontop of Roland's utrace 
infrastructure, which nicely modularizes all hardware debugging 
capabilities and detaches it from ptrace.

	Ingo

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-17  9:44 ` Ingo Molnar
@ 2007-01-17 16:17   ` Alan Stern
  2007-01-18  0:12     ` Christoph Hellwig
  2007-02-06  4:25     ` [PATCH] " Roland McGrath
  0 siblings, 2 replies; 25+ messages in thread
From: Alan Stern @ 2007-01-17 16:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Prasanna S Panchamukhi, Kernel development list,
	Roland McGrath

On Wed, 17 Jan 2007, Ingo Molnar wrote:

> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > From: Alan Stern <stern@rowland.harvard.edu>
> > 
> > This patch (as839) implements the Kwatch (kernel-space hardware-based 
> > watchpoints) API for the i386 architecture.  The API is explained in 
> > the kerneldoc for register_kwatch() in arch/i386/kernel/kwatch.c.
> 
> i think it would be nice to have this ontop of Roland's utrace 
> infrastructure, which nicely modularizes all hardware debugging 
> capabilities and detaches it from ptrace.

I'll be happy to move this over to the utrace setting, once it is merged.  
Do you think it would be better to include the current version of kwatch 
now or to wait for utrace?

Roland, is there a schedule for when you plan to get utrace into -mm?

Alan Stern


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-16 23:35 ` Christoph Hellwig
@ 2007-01-17 16:33   ` Alan Stern
  0 siblings, 0 replies; 25+ messages in thread
From: Alan Stern @ 2007-01-17 16:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andrew Morton, Prasanna S Panchamukhi, Kernel development list

On Tue, 16 Jan 2007, Christoph Hellwig wrote:

> Fir4st I'd say thanks a lot for forward-porting this, it's really useful
> feature for all kinds of nasty debugging.
> 
> I think you should split this into two patches, one for the debugreg
> infrastructure, and one for the actual kwatch code.
> 
> Also I think you provide one (or even a few) example wathes for
> trivial things, say updating i_ino for an inode given through debugfs.
> 
> Some comments on the code below:

Many thanks for your detailed comments and suggestions.  It probably was
obvious that most of the things you picked up on were inherited from the
original Kwatch patch.  I'll update my patch in accordance with your
suggestions.

Responses to just a couple of the comments:

> I suspect this should be replaced wit ha global and local variant
> to fix the above mentioned issue.  It's a tiny bit duplicated code,
> but seems much cleaner.

It would indeed be cleaner.  And in fact the local variant would have a
large amount of dead code, which could be left out entirely (at least from
the initial version).  That's because the only current user of local debug 
register allocations is ptrace.

> > +static void write_dr(int debugreg, unsigned long addr)
> > +{
> > +	switch (debugreg) {
> > +		case 0:	set_debugreg(addr, 0);	break;
> > +		case 1:	set_debugreg(addr, 1);	break;
> > +		case 2:	set_debugreg(addr, 2);	break;
> > +		case 3:	set_debugreg(addr, 3);	break;
> > +		case 6:	set_debugreg(addr, 6);	break;
> > +		case 7:	set_debugreg(addr, 7);	break;
> > +	}
> > +}
> 
> What's the point of this wrapper?

It is called from two different places, and it's better than including
the "switch" in each place.

> I think large parts of this header should go into a new linux/kwatch.h
> so that generic code can use kwatches.

In the long run that may well be true.  For now, I'm a little hesitant to
put something which works only on i386 under include/linux.

> > +config KWATCH
> > +	bool "Kwatch points (EXPERIMENTAL)"
> > +	depends on EXPERIMENTAL
> > +	help
> > +	  Kwatch enables kernel-space data watchpoints using the processor's
> > +	  debug registers.  It can be very useful for kernel debugging.
> > +	  If in doubt, say "N".
> 
> I think we want different options for debugregs and kwatch.  The debugreg
> one probably doesn't have to be actually user-visible, though.

It's easier to start out like this and then change it later when someone
comes up with another use for debugregs.  Or perhaps by then the whole
thing will have been moved over to utrace, making the issue academic.

Alan Stern


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-17 16:17   ` Alan Stern
@ 2007-01-18  0:12     ` Christoph Hellwig
  2007-01-18  7:31       ` Ingo Molnar
  2007-02-06  4:25     ` [PATCH] " Roland McGrath
  1 sibling, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2007-01-18  0:12 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Andrew Morton, Prasanna S Panchamukhi,
	Kernel development list, Roland McGrath

On Wed, Jan 17, 2007 at 11:17:37AM -0500, Alan Stern wrote:
> I'll be happy to move this over to the utrace setting, once it is merged.  
> Do you think it would be better to include the current version of kwatch 
> now or to wait for utrace?
> 
> Roland, is there a schedule for when you plan to get utrace into -mm?

Even if it goes into mainline soon we'll need a lot of time for all
architectures to catch up, so I think kwatch should definitely comes first.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-18  0:12     ` Christoph Hellwig
@ 2007-01-18  7:31       ` Ingo Molnar
  2007-01-18 15:37         ` Alan Stern
  2007-01-18 22:33         ` Christoph Hellwig
  0 siblings, 2 replies; 25+ messages in thread
From: Ingo Molnar @ 2007-01-18  7:31 UTC (permalink / raw)
  To: Christoph Hellwig, Alan Stern, Andrew Morton,
	Prasanna S Panchamukhi, Kernel development list, Roland McGrath


* Christoph Hellwig <hch@infradead.org> wrote:

> > I'll be happy to move this over to the utrace setting, once it is 
> > merged.  Do you think it would be better to include the current 
> > version of kwatch now or to wait for utrace?
> > 
> > Roland, is there a schedule for when you plan to get utrace into 
> > -mm?
> 
> Even if it goes into mainline soon we'll need a lot of time for all 
> architectures to catch up, so I think kwatch should definitely comes 
> first.

i disagree. Utrace is a once-in-a-lifetime opportunity to clean up the 
/huge/ ptrace mess. Ptrace has been a very large PITA, for many, many 
years, precisely because it was done in the 'oh, lets get this feature 
added first, think about it later' manner. Roland's work is a large 
logistical undertaking and we should not make it more complex than it 
is. Once it's in we can add debugging features ontop of that. To me work 
that cleans up existing mess takes precedence before work that adds to 
the mess.

	Ingo

ps. please fix your mailer to not emit those silly Mail-Followup-To 
headers! It collapses To: and Cc: lines into one huge unnecessary To: 
line.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-18  7:31       ` Ingo Molnar
@ 2007-01-18 15:37         ` Alan Stern
  2007-01-18 22:33         ` Christoph Hellwig
  1 sibling, 0 replies; 25+ messages in thread
From: Alan Stern @ 2007-01-18 15:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Andrew Morton, Prasanna S Panchamukhi,
	Kernel development list, Roland McGrath

On Thu, 18 Jan 2007, Ingo Molnar wrote:

> 
> * Christoph Hellwig <hch@infradead.org> wrote:
> 
> > > I'll be happy to move this over to the utrace setting, once it is 
> > > merged.  Do you think it would be better to include the current 
> > > version of kwatch now or to wait for utrace?
> > > 
> > > Roland, is there a schedule for when you plan to get utrace into 
> > > -mm?
> > 
> > Even if it goes into mainline soon we'll need a lot of time for all 
> > architectures to catch up, so I think kwatch should definitely comes 
> > first.
> 
> i disagree. Utrace is a once-in-a-lifetime opportunity to clean up the 
> /huge/ ptrace mess. Ptrace has been a very large PITA, for many, many 
> years, precisely because it was done in the 'oh, lets get this feature 
> added first, think about it later' manner. Roland's work is a large 
> logistical undertaking and we should not make it more complex than it 
> is. Once it's in we can add debugging features ontop of that. To me work 
> that cleans up existing mess takes precedence before work that adds to 
> the mess.

Interestingly, the current version of utrace makes no special provision
for watchpoints, either in kernel or user space.  Instead it relies on the
legacy ptrace mechanism for setting debug registers in the target
process's user area.  Perhaps an explicit watchpoint implementation should
be added to utrace, but that's beyond the scope of this discussion.

Furthermore, utrace is explicitly intended for tracing user programs, not
for tracing the kernel.  Kwatch, however, is just the opposite: It is
intended for setting up watchpoints in kernel space.  In that sense it is
pretty much orthogonal to utrace.  Although it would affect the utrace
patches, the changes would be basically transparent (i.e., move the new
code from one ptrace handler to another instead of moving the old code).

If Kwatch is to be subsumed anywhere, I think it should be under the
Kprobes/Systemtap project.  Again, that's a separate question -- so far 
they have avoided data watchpoints.

Alan Stern


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-18  7:31       ` Ingo Molnar
  2007-01-18 15:37         ` Alan Stern
@ 2007-01-18 22:33         ` Christoph Hellwig
  2007-01-22 16:54           ` [PATCH - revised] " Alan Stern
  1 sibling, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2007-01-18 22:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Alan Stern, Andrew Morton,
	Prasanna S Panchamukhi, Kernel development list, Roland McGrath

On Thu, Jan 18, 2007 at 08:31:59AM +0100, Ingo Molnar wrote:
> 
> * Christoph Hellwig <hch@infradead.org> wrote:
> 
> > > I'll be happy to move this over to the utrace setting, once it is 
> > > merged.  Do you think it would be better to include the current 
> > > version of kwatch now or to wait for utrace?
> > > 
> > > Roland, is there a schedule for when you plan to get utrace into 
> > > -mm?
> > 
> > Even if it goes into mainline soon we'll need a lot of time for all 
> > architectures to catch up, so I think kwatch should definitely comes 
> > first.
> 
> i disagree. Utrace is a once-in-a-lifetime opportunity to clean up the 
> /huge/ ptrace mess. Ptrace has been a very large PITA, for many, many 
> years, precisely because it was done in the 'oh, lets get this feature 
> added first, think about it later' manner. Roland's work is a large 
> logistical undertaking and we should not make it more complex than it 
> is. Once it's in we can add debugging features ontop of that. To me work 
> that cleans up existing mess takes precedence before work that adds to 
> the mess.

Utrace doesn't provide any kind of watchpoint infrastructure now, and
utrace will take a lot of time to get ready for inclusion, mostly because
it really needs asll the arch maintainers to help out (and various not
so easy core fixes aswell).

I'm all for merging utrace, and I wish we'd be much further into the
merging process already, but blocking mostly unrelated functionality for
it is more than dumb.


> ps. please fix your mailer to not emit those silly Mail-Followup-To 
> headers! It collapses To: and Cc: lines into one huge unnecessary To: 
> line.

This header is absolutely intentation as far too many folks seem to randomly
drop to or cc lines on mailing lists.  and of course it's alsmost esential
on lists with braindead reply to list policies (e.g. Debian)

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH - revised] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-18 22:33         ` Christoph Hellwig
@ 2007-01-22 16:54           ` Alan Stern
  0 siblings, 0 replies; 25+ messages in thread
From: Alan Stern @ 2007-01-22 16:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Ingo Molnar, Prasanna S Panchamukhi,
	Kernel development list, Roland McGrath

From: Alan Stern <stern@rowland.harvard.edu>

This patch (as839b) implements the Kwatch (kernel-space hardware-based
watchpoints) API for the i386 architecture.  The API is explained in
the kerneldoc for register_kwatch() in arch/i386/kernel/kwatch.c, and
there is demonstration code in include/asm-i386/kwatch.h.

The original version of the patch was written by Vamsi Krishna S and
Bharata B Rao in 2002.  It was updated in 2004 by Prasanna S Panchamukhi
for 2.6.13 and then again (here) by me for 2.6.20.

Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>

---

This version of the patch has been revised along the lines suggested by
Christoph.  Note that it is meant to apply on top of the i386-ptrace
whitespace cleanup patch I submitted last week.

The impact of kwatch upon utrace is minimal.  Kwatch is intended for 
adding hardware watchpoints in the kernel, whereas utrace is intended for 
general debugging (not including watchpoints at the moment, oddly enough) 
of user processes.  The two don't really overlap, and adding kwatch should 
not result in any significant delay in adding utrace.

Alan Stern


Index: usb-2.6/arch/i386/kernel/debugreg.c
===================================================================
--- /dev/null
+++ usb-2.6/arch/i386/kernel/debugreg.c
@@ -0,0 +1,269 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) 2007 Alan Stern
+ */
+
+/*
+ * These routines provide a debug register allocation mechanism.
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <asm/debugreg.h>
+
+struct debugreg {
+	int flag;
+	int use_count;
+};
+
+/* flag values */
+#define DR_ALLOC_NONE		0
+#define DR_ALLOC_GLOBAL		1
+#define DR_ALLOC_LOCAL		2
+
+static struct debugreg dr_list[DR_MAX];
+static DEFINE_SPINLOCK(dr_lock);
+static unsigned long dr7_global_mask = DR_CONTROL_RESERVED |
+		DR_GLOBAL_SLOWDOWN | DR_GLOBAL_ENABLE_MASK;
+
+/*
+ * Set the process's debug register 7 value: Keep all existing global
+ * bit settings and install the process's local bit settings.
+ */
+void set_process_debugreg7(unsigned long new_dr7)
+{
+	unsigned long dr7;
+
+	get_debugreg(dr7, 7);
+	dr7 = (dr7 & dr7_global_mask) | (new_dr7 & ~dr7_global_mask);
+	set_debugreg(dr7, 7);
+}
+
+/**
+ * debugreg7_clear_bits - clear the type, len, and global-enable flag bits
+ * @old_dr7: original value for debug register 7
+ * @regnum: number of the debug register to disable
+ *
+ * @regnum must lie in the range 0 - 3.
+ *
+ * Returns the new value for debug register 7, with the appropriate bits
+ * cleared to disable the specified debug register.
+ */
+unsigned long debugreg7_clear_bits(unsigned long old_dr7, int regnum)
+{
+	unsigned long new_dr7;
+
+	new_dr7 = old_dr7 & ~(0xf <<
+			(DR_CONTROL_SHIFT + regnum * DR_CONTROL_SIZE));
+	new_dr7 &= ~(0x2 << (regnum * DR_ENABLE_SIZE));
+	return new_dr7;
+}
+
+/**
+ * debugreg7_set_bits - set the type, len, and global-enable flag bits
+ * @old_dr7: original value for debug register 7
+ * @regnum: number of the debug register to enable
+ * @type: type of debugging watchpoint
+ * @len: length of the debugging watchpoint
+ *
+ * @regnum must lie in the range DR_FIRST_ADDR - DR_LAST_ADDR (0 - 3).
+ * @type must lie in the range DR_WR_EXECUTE - DR_RW_READ (0 - 3).
+ * @len must be 1, 2, or 4.
+ *
+ * Returns the new value for debug register 7, with the appropriate bits
+ * set to enable a new watchpoint for the specified debug register.
+ */
+unsigned long debugreg7_set_bits(unsigned long old_dr7, int regnum,
+		u8 type, u8 len)
+{
+	unsigned long new_dr7;
+
+	--len;
+	new_dr7 = old_dr7 & ~(0xf <<
+			(DR_CONTROL_SHIFT + regnum * DR_CONTROL_SIZE));
+	new_dr7 |= (((len << 2) | type) <<
+			(DR_CONTROL_SHIFT + regnum * DR_CONTROL_SIZE));
+	new_dr7 |= (0x2 << (regnum * DR_ENABLE_SIZE));
+	return new_dr7;
+}
+
+static unsigned long dr7_global_reg_mask(unsigned int regnum)
+{
+	return (0xf << (DR_CONTROL_SHIFT + regnum * DR_CONTROL_SIZE)) |
+			(0x1 << (regnum * DR_ENABLE_SIZE));
+}
+
+static int get_global_dr(int regnum)
+{
+	if (dr_list[regnum].flag == DR_ALLOC_NONE) {
+		dr_list[regnum].flag = DR_ALLOC_GLOBAL;
+		dr7_global_mask |= dr7_global_reg_mask(regnum);
+		return regnum;
+	}
+	return -1;
+}
+
+static void free_global_dr(int regnum)
+{
+	dr_list[regnum].flag = DR_ALLOC_NONE;
+	dr_list[regnum].use_count = 0;
+	dr7_global_mask &= ~dr7_global_reg_mask(regnum);
+}
+
+static int get_local_dr(int regnum)
+{
+	if (dr_list[regnum].flag != DR_ALLOC_GLOBAL) {
+		dr_list[regnum].flag = DR_ALLOC_LOCAL;
+		dr_list[regnum].use_count++;
+		return regnum;
+	}
+	return -1;
+}
+
+static void free_local_dr(int regnum)
+{
+	if (dr_list[regnum].flag == DR_ALLOC_LOCAL) {
+		if (!--dr_list[regnum].use_count)
+			dr_list[regnum].flag = DR_ALLOC_NONE;
+	}
+}
+
+/**
+ * debugreg_global_alloc - allocate a debug register for global use
+ * @regnum: register number to allocate or %DR_ANY
+ *
+ * Returns -1 if @regnum is already allocated, otherwise returns
+ * @regnum and does a global allocation.
+ */
+int debugreg_global_alloc(int regnum)
+{
+	int ret = -1;
+
+	spin_lock(&dr_lock);
+	if (regnum >= 0 && regnum < DR_MAX)
+		ret = get_global_dr(regnum);
+	else if (regnum == DR_ANY) {
+
+		/*
+		 * gdb allocates local debug registers starting from 0.
+		 * To help avoid conflicts, we'll start from the other end.
+		 */
+		for (regnum = DR_MAX - 1; regnum >= 0; --regnum) {
+			ret = get_global_dr(regnum);
+			if (ret >= 0)
+				break;
+		}
+	} else
+		printk(KERN_ERR "%s: Cannot allocate debug register %d\n",
+				__FUNCTION__,  regnum);
+	spin_unlock(&dr_lock);
+	return ret;
+}
+
+/**
+ * debugreg_global_free - release a global debug register allocation
+ * @regnum: the number of the register to deallocate
+ *
+ * @regnum must previously have been allocated by debugreg_global_alloc().
+ */
+void debugreg_global_free(int regnum)
+{
+	spin_lock(&dr_lock);
+	if (regnum < 0 || regnum >= DR_MAX ||
+			dr_list[regnum].flag != DR_ALLOC_GLOBAL)
+		printk(KERN_ERR "%s: Cannot free debug register %d\n",
+				__FUNCTION__,  regnum);
+	else
+		free_global_dr(regnum);
+	spin_unlock(&dr_lock);
+}
+
+/*
+ * Increment the usage counts for locally-enabled debug registers.
+ * Must be called when a process using debug registers forks or is cloned.
+ */
+void debugreg_inc_use_count(unsigned long mask)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	spin_lock(&dr_lock);
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if (mask & dr_local_enable)
+			dr_list[i].use_count++;
+	}
+	spin_unlock(&dr_lock);
+}
+
+/*
+ * Decrement the usage counts for locally-enabled debug registers.
+ * Must be called when a process using debug registers exits or execs.
+ */
+void debugreg_dec_use_count(unsigned long mask)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	spin_lock(&dr_lock);
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if (mask & dr_local_enable)
+			free_local_dr(i);
+	}
+	spin_unlock(&dr_lock);
+}
+
+/* Report whether a debug register is globally allocated. */
+int debugreg_is_global(int regnum)
+{
+	return (dr_list[regnum].flag == DR_ALLOC_GLOBAL);
+}
+
+/*
+ * This routine decides if a ptrace request is for enabling or disabling
+ * a debug reg, and accordingly calls dr_alloc() or dr_free().
+ *
+ * gdb uses ptrace to write to debug registers.  It assumes that writing to
+ * a debug register always succeds and it doesn't check the return value of
+ * ptrace.  Now with this new global debug register allocation/freeing,
+ * ptrace request for a local debug register will fail if the required debug
+ * register is already globally allocated.  Since gdb doesn't notice this
+ * failure, it sometimes tries to free a debug register which it does not
+ * own.
+ *
+ * Returns -1 if the ptrace request tries to locally allocate a debug register
+ * that is already globally allocated.  Otherwise returns >0 or 0 according
+ * as any debug registers are or are not locally allocated in the new setting.
+ */
+int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	if (new_dr7 & DR_LOCAL_ENABLE_MASK & dr7_global_mask)
+		return -1;
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if ((old_dr7 ^ new_dr7) & dr_local_enable) {
+			if (new_dr7 & dr_local_enable)
+				get_local_dr(i);
+			else
+				free_local_dr(i);
+		}
+	}
+	return new_dr7 & DR_LOCAL_ENABLE_MASK;
+}
Index: usb-2.6/arch/i386/kernel/process.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/process.c
+++ usb-2.6/arch/i386/kernel/process.c
@@ -51,6 +51,7 @@
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
 #endif
+#include <asm/debugreg.h>
 
 #include <linux/err.h>
 
@@ -356,9 +357,10 @@ EXPORT_SYMBOL(kernel_thread);
  */
 void exit_thread(void)
 {
+	struct task_struct *tsk = current;
+
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
-		struct task_struct *tsk = current;
 		struct thread_struct *t = &tsk->thread;
 		int cpu = get_cpu();
 		struct tss_struct *tss = &per_cpu(init_tss, cpu);
@@ -376,15 +378,20 @@ void exit_thread(void)
 		tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		put_cpu();
 	}
+	if (unlikely(test_thread_flag(TIF_DEBUG)))
+ 		debugreg_dec_use_count(tsk->thread.debugreg[7]);
 }
 
 void flush_thread(void)
 {
 	struct task_struct *tsk = current;
 
+	if (unlikely(test_thread_flag(TIF_DEBUG))) {
+		debugreg_dec_use_count(tsk->thread.debugreg[7]);
+		clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	}
 	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));	
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
 	/*
 	 * Forget coprocessor state..
 	 */
@@ -462,6 +469,9 @@ int copy_thread(int nr, unsigned long cl
 		desc->b = LDT_entry_b(&info);
 	}
 
+	if (unlikely(test_thread_flag(TIF_DEBUG)))
+		debugreg_inc_use_count(tsk->thread.debugreg[7]);
+
 	err = 0;
  out:
 	if (err && p->thread.io_bitmap_ptr) {
@@ -537,14 +547,22 @@ static noinline void __switch_to_xtra(st
 
 	next = &next_p->thread;
 
+	/*
+	 * Don't reload global debug registers. Don't touch the global debug
+	 * register settings in dr7.
+	 */
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg[0], 0);
-		set_debugreg(next->debugreg[1], 1);
-		set_debugreg(next->debugreg[2], 2);
-		set_debugreg(next->debugreg[3], 3);
+		if (!debugreg_is_global(0))
+			set_debugreg(next->debugreg[0], 0);
+		if (!debugreg_is_global(1))
+			set_debugreg(next->debugreg[1], 1);
+		if (!debugreg_is_global(2))
+			set_debugreg(next->debugreg[2], 2);
+		if (!debugreg_is_global(3))
+			set_debugreg(next->debugreg[3], 3);
 		/* no 4 and 5 */
 		set_debugreg(next->debugreg[6], 6);
-		set_debugreg(next->debugreg[7], 7);
+		set_process_debugreg7(next->debugreg[7]);
 	}
 
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
Index: usb-2.6/arch/i386/kernel/ptrace.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/ptrace.c
+++ usb-2.6/arch/i386/kernel/ptrace.c
@@ -412,6 +412,7 @@ long arch_ptrace(struct task_struct *chi
 			ret = putreg(child, addr, data);
 			break;
 		}
+
 		/* We need to be very careful here.  We implicitly
 		   want to modify a portion of the task_struct, and we
 		   have to be selective about what portions we allow someone
@@ -421,10 +422,18 @@ long arch_ptrace(struct task_struct *chi
 		if (addr >= (long) &dummy->u_debugreg[0] &&
 		    addr <= (long) &dummy->u_debugreg[7]) {
 
-			if (addr == (long) &dummy->u_debugreg[4]) break;
-			if (addr == (long) &dummy->u_debugreg[5]) break;
-			if (addr < (long) &dummy->u_debugreg[4] &&
-			    ((unsigned long) data) >= TASK_SIZE-3) break;
+			addr -= (long) &dummy->u_debugreg;
+			addr = addr >> 2;
+			if (addr < 4) {
+				if ((unsigned long) data >= TASK_SIZE-3)
+					break;
+				if (debugreg_is_global(addr)) {
+					ret = -EBUSY;
+					break;
+				}
+			}
+			else if (addr == 4 || addr == 5)
+				break;
 
 			/* Sanity-check data. Take one half-byte at once with
 			 * check = (val >> (16 + 4*i)) & 0xf. It contains the
@@ -456,18 +465,21 @@ long arch_ptrace(struct task_struct *chi
 			 * See the AMD manual no. 24593 (AMD64 System
 			 * Programming) */
 
-			if (addr == (long) &dummy->u_debugreg[7]) {
+			else if (addr == 7) {
 				data &= ~DR_CONTROL_RESERVED;
 				for (i = 0; i < 4; i++)
 					if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
 						goto out_tsk;
-				if (data)
+				i = enable_debugreg(child->thread.debugreg[7], data);
+				if (i < 0) {
+					ret = -EBUSY;
+					break;
+				}
+				if (i)
 					set_tsk_thread_flag(child, TIF_DEBUG);
 				else
 					clear_tsk_thread_flag(child, TIF_DEBUG);
 			}
-			addr -= (long) &dummy->u_debugreg;
-			addr = addr >> 2;
 			child->thread.debugreg[addr] = data;
 			ret = 0;
 		}
Index: usb-2.6/arch/i386/kernel/signal.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/signal.c
+++ usb-2.6/arch/i386/kernel/signal.c
@@ -25,6 +25,7 @@
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/debugreg.h>
 #include "sigframe.h"
 
 #define DEBUG_SIG 0
@@ -594,7 +595,7 @@ static void fastcall do_signal(struct pt
 		 * inside the kernel.
 		 */
 		if (unlikely(current->thread.debugreg[7]))
-			set_debugreg(current->thread.debugreg[7], 7);
+			set_process_debugreg7(current->thread.debugreg[7]);
 
 		/* Whee!  Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
Index: usb-2.6/arch/i386/kernel/traps.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/traps.c
+++ usb-2.6/arch/i386/kernel/traps.c
@@ -808,6 +808,7 @@ fastcall void __kprobes do_debug(struct 
 	struct task_struct *tsk = current;
 
 	get_debugreg(condition, 6);
+	set_debugreg(0, 6);	/* DR6 is never cleared by the CPU */
 
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
 					SIGTRAP) == NOTIFY_STOP)
@@ -849,7 +850,7 @@ fastcall void __kprobes do_debug(struct 
 	 * the signal is delivered.
 	 */
 clear_dr7:
-	set_debugreg(0, 7);
+	set_process_debugreg7(0);
 	return;
 
 debug_vm86:
Index: usb-2.6/include/asm-i386/debugreg.h
===================================================================
--- usb-2.6.orig/include/asm-i386/debugreg.h
+++ usb-2.6/include/asm-i386/debugreg.h
@@ -33,6 +33,7 @@
 
 #define DR_RW_EXECUTE (0x0)   /* Settings for the access types to trap on */
 #define DR_RW_WRITE (0x1)
+#define DR_RW_IO (0x2)
 #define DR_RW_READ (0x3)
 
 #define DR_LEN_1 (0x0) /* Settings for data length to trap on */
@@ -61,4 +62,43 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+#define DR_MAX	4
+#define DR_ANY	(DR_MAX + 1)
+
+#ifdef CONFIG_KWATCH
+
+extern void set_process_debugreg7(unsigned long new_dr7);
+extern unsigned long debugreg7_set_bits(unsigned long old_dr7, int regnum,
+		u8 type, u8 len);
+extern unsigned long debugreg7_clear_bits(unsigned long old_dr7, int regnum);
+extern int debugreg_global_alloc(int regnum);
+extern void debugreg_global_free(int regnum);
+extern void debugreg_inc_use_count(unsigned long mask);
+extern void debugreg_dec_use_count(unsigned long mask);
+extern int debugreg_is_global(int regnum);
+extern int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7);
+
+#else
+
+static inline void set_process_debugreg7(unsigned long new_dr7);
+{
+	set_debugreg(new_dr7, 7);
+}
+static inline void debugreg_inc_use_count(unsigned long mask)
+{
+}
+static inline void debugreg_dec_use_count(unsigned long mask)
+{
+}
+static inline int debugreg_is_global(int regnum)
+{
+	return 0;
+}
+static inline int enable_debugreg(unsigned long old_dr7,
+		unsigned long new_dr7)
+{
+	return (new_dr7 != 0);
+}
+
+#endif				/* CONFIG_KWATCH */
 #endif
Index: usb-2.6/arch/i386/kernel/kwatch.c
===================================================================
--- /dev/null
+++ usb-2.6/arch/i386/kernel/kwatch.c
@@ -0,0 +1,274 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) 2007 Alan Stern
+ */
+
+/* Kwatch: a kernel watchpoint interface, using the CPU's debug registers. */
+
+#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <asm/kwatch.h>
+#include <asm/kdebug.h>
+#include <asm/debugreg.h>
+
+#define RF_MASK	0x00010000
+
+static struct kwatch kwatch_list[DR_MAX];
+static DEFINE_SPINLOCK(kwatch_lock);
+static unsigned long kwatch_in_progress;	/* bitmap of registers
+							being handled */
+
+static void write_debugreg(unsigned long addr, int debugreg)
+{
+	switch (debugreg) {
+		case 0:	set_debugreg(addr, 0);	break;
+		case 1:	set_debugreg(addr, 1);	break;
+		case 2:	set_debugreg(addr, 2);	break;
+		case 3:	set_debugreg(addr, 3);	break;
+		case 6:	set_debugreg(addr, 6);	break;
+		case 7:	set_debugreg(addr, 7);	break;
+	}
+}
+
+#ifdef CONFIG_SMP
+
+struct dr_info {
+	int debugreg;
+	int type;
+	unsigned long addr;
+};
+
+static void write_smp_debugreg(void *info)
+{
+	struct dr_info *dr = (struct dr_info *) info;
+
+	if (cpu_has_de && dr->type == KWATCH_TYPE_IO)
+		set_in_cr4(X86_CR4_DE);
+	write_debugreg(dr->addr, dr->debugreg);
+}
+
+/* Update a debug register on all CPUs */
+static void sync_debugreg(unsigned long addr, int debugreg, int type)
+{
+	struct dr_info dr;
+
+	dr.debugreg = debugreg;
+	dr.type = type;
+	dr.addr = addr;
+	smp_call_function(write_smp_debugreg, &dr, 0, 0);
+}
+
+#else
+
+static inline void sync_debugreg(unsigned long addr, int debugreg, int type)
+{
+}
+#endif	/* CONFIG_SMP */
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+static int kwatch_handler(unsigned long condition, struct pt_regs *regs)
+{
+	unsigned int debugreg;
+	unsigned long addr;
+	struct kwatch *kw;
+	int recursed = 0;
+
+	/* The debug status register value is passed in "condition". */
+	if (condition & DR_TRAP0) {
+		debugreg = 0;
+		get_debugreg(addr, 0);
+	} else if (condition & DR_TRAP1) {
+		debugreg = 1;
+		get_debugreg(addr, 1);
+	} else if (condition & DR_TRAP2) {
+		debugreg = 2;
+		get_debugreg(addr, 2);
+	} else if (condition & DR_TRAP3) {
+		debugreg = 3;
+		get_debugreg(addr, 3);
+	} else
+		return recursed;
+	kw = &kwatch_list[debugreg];
+
+	/* We're in an interrupt, but this is clear and BUG()-safe. */
+	preempt_disable();
+
+	/* If we are recursing, we already hold the lock. */
+	if (kwatch_in_progress)
+		recursed = 1;
+	else
+		spin_lock(&kwatch_lock);
+	set_bit(debugreg, &kwatch_in_progress);
+
+	if ((unsigned long) kw->addr == addr) {
+		if (!recursed && kw->handler)
+			kw->handler(kw, regs);
+		if (kw->type == KWATCH_TYPE_EXECUTE)
+			regs->eflags |= RF_MASK;
+	}
+
+	clear_bit(debugreg, &kwatch_in_progress);
+	if (!recursed)
+		spin_unlock(&kwatch_lock);
+
+	preempt_enable_no_resched();
+	return recursed;
+}
+
+/**
+ * register_kwatch - register a hardware watchpoint
+ * @addr: address of the watchpoint
+ * @length: extent of the watchpoint (1, 2, or 4 bytes)
+ * @type: type of access to trap (read, write, I/O, or execute)
+ * @handler: callback routine to invoke when a trap occurs
+ *
+ * Allocates and returns a debug register and installs the requested
+ * watchpoint.
+ *
+ * @length must be 1, 2, or 4, and @type must be one of %KWATCH_TYPE_RW
+ * (read or write), %KWATCH_TYPE_WRITE (write only), %KWATCH_TYPE_IO
+ * (IO-space access), or %KWATCH_TYPE_EXECUTE.  Note that %KWATCH_TYPE_IO
+ * is available only on processors with Debugging Extensions, and @length
+ * must be 1 for %KWATCH_TYPE_EXECUTE.
+ *
+ * When a trap occurs, @handler is invoked in_interrupt with a pointer
+ * to a struct kwatch containing the watchpoint information and a pointer
+ * to the CPU register values at the time of the trap.  %KWATCH_TYPE_EXECUTE
+ * traps occur before the watch-pointed instruction executes; all other
+ * types occur after the memory or I/O access has taken place.
+ *
+ * Returns a debug register number or a negative error code.
+ */
+int register_kwatch(void *addr, u8 length, u8 type, kwatch_handler_t handler)
+{
+	int debugreg;
+	unsigned long dr7, flags;
+
+	switch (length) {
+	case 1:
+	case 2:
+	case 4:
+		break;
+	default:
+		return -EINVAL;
+	}
+	switch (type) {
+	case KWATCH_TYPE_WRITE:
+	case KWATCH_TYPE_RW:
+		break;
+	case KWATCH_TYPE_IO:
+		if (cpu_has_de)
+			break;
+		return -EINVAL;
+	case KWATCH_TYPE_EXECUTE:
+		if (length == 1)
+			break;
+		/* FALL THROUGH */
+	default:
+		return -EINVAL;
+	}
+	if (!handler)
+		return -EINVAL;
+
+	debugreg = debugreg_global_alloc(DR_ANY);
+	if (debugreg < 0)
+		return -EBUSY;
+
+	spin_lock_irqsave(&kwatch_lock, flags);
+	kwatch_list[debugreg].addr = addr;
+	kwatch_list[debugreg].length = length;
+	kwatch_list[debugreg].type = type;
+	kwatch_list[debugreg].handler = handler;
+	spin_unlock_irqrestore(&kwatch_lock, flags);
+
+	if (type == KWATCH_TYPE_IO)
+		set_in_cr4(X86_CR4_DE);
+	write_debugreg((unsigned long) addr, debugreg);
+	sync_debugreg((unsigned long) addr, debugreg, type);
+
+	get_debugreg(dr7, 7);
+	dr7 = debugreg7_set_bits(dr7, debugreg, type, length);
+	set_debugreg(dr7, 7);
+	sync_debugreg(dr7, 7, 0);
+	return debugreg;
+}
+EXPORT_SYMBOL_GPL(register_kwatch);
+
+/**
+ * unregister_kwatch - free a previously-allocated debugging watchpoint
+ * @debugreg: the debugging register to deallocate
+ *
+ * Removes a hardware watchpoint and deallocates the corresponding
+ * debugging register.  @debugreg must previously have been allocated
+ * by register_kwatch().
+ */
+void unregister_kwatch(int debugreg)
+{
+	unsigned long flags;
+	unsigned long dr7;
+
+	if (debugreg < 0 || debugreg >= DR_MAX ||
+			!kwatch_list[debugreg].handler)
+		return;
+
+	get_debugreg(dr7, 7);
+	dr7 = debugreg7_clear_bits(dr7, debugreg);
+	set_debugreg(dr7, 7);
+	sync_debugreg(dr7, 7, 0);
+
+	spin_lock_irqsave(&kwatch_lock, flags);
+	kwatch_list[debugreg].addr = 0;
+	kwatch_list[debugreg].handler = NULL;
+	spin_unlock_irqrestore(&kwatch_lock, flags);
+
+	debugreg_global_free(debugreg);
+}
+EXPORT_SYMBOL_GPL(unregister_kwatch);
+
+/*
+ * Wrapper routine to for handling exceptions.
+ */
+static int kwatch_exceptions_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	struct die_args *args = (struct die_args *) data;
+
+	if (val == DIE_DEBUG) {
+		if (kwatch_handler(args->err, args->regs))
+			return NOTIFY_STOP;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kwatch_exceptions_nb = {
+	.notifier_call = kwatch_exceptions_notify,
+	.priority = 0x7ffffffe		/* we need to notified second */
+};
+
+static int __init init_kwatch(void)
+{
+	return register_die_notifier(&kwatch_exceptions_nb);
+}
+
+__initcall(init_kwatch);
Index: usb-2.6/arch/i386/kernel/Makefile
===================================================================
--- usb-2.6.orig/arch/i386/kernel/Makefile
+++ usb-2.6/arch/i386/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+obj-$(CONFIG_KWATCH)		+= debugreg.o kwatch.o
 
 # Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
Index: usb-2.6/include/asm-i386/kwatch.h
===================================================================
--- /dev/null
+++ usb-2.6/include/asm-i386/kwatch.h
@@ -0,0 +1,99 @@
+#ifndef _ASM_KWATCH_H
+#define _ASM_KWATCH_H
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) 2007 Alan Stern
+ */
+
+/*
+ * Kernel watchpoint interface.  Use these routines to create and delete
+ * hardware watchpoints within the kernel, using the CPU's debug registers.
+ *
+ * This sample code sets a watchpoint on pid_max and registers a callback
+ * function for writes to that variable.
+ *
+ * ----------------------------------------------------------------------
+ * #include <asm/kwatch.h>
+ *
+ * static void kwatch_handler(struct kwatch *p, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Watchpoint triggered\n");
+ * 	dump_stack();
+ * 	.......<do anything>........
+ * }
+ *
+ * static int debugreg_num;
+ *
+ * static int init_module(void)
+ * {
+ * 	..........<do anything>............
+ * 	debugreg_num = register_kwatch(&pid_max,
+ * 			 4, KWATCH_TYPE_WRITE, kwatch_handler);
+ * 	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ * 	..........<do anything>............
+ * 	unregister_kwatch(debugreg_num);
+ * 	..........<do anything>............
+ * }
+ * ----------------------------------------------------------------------
+ *
+ * Test this by changing the value of pid_max in /proc/sys/kernel/pid_max:
+ *
+ * 	# echo 1000 > /proc/sys/kernel/pid_max
+ *
+ * The output from kwatch_handler() will show up in the system log.
+ */
+
+#include <linux/types.h>
+
+struct kwatch;
+struct pt_regs;
+typedef void (*kwatch_handler_t)(struct kwatch *, struct pt_regs *);
+
+struct kwatch {
+	void *addr;		/* location of watchpoint */
+	u8 length;		/* range of address */
+	u8 type;		/* type of watchpoint */
+	kwatch_handler_t handler;
+};
+
+#define KWATCH_TYPE_EXECUTE 	0x0	/* Watchpoint types */
+#define KWATCH_TYPE_WRITE	0x1
+#define KWATCH_TYPE_IO		0x2
+#define KWATCH_TYPE_RW		0x3
+
+#ifdef	CONFIG_KWATCH
+extern int register_kwatch(void *addr, u8 length, u8 type,
+		kwatch_handler_t handler);
+extern void unregister_kwatch(int debugreg);
+
+#else
+
+static inline int register_kwatch(void *addr, u8 length, u8 type,
+		kwatch_handler_t handler)
+{
+	return -ENOSYS;
+}
+static inline void unregister_kwatch(int debugreg)
+{
+}
+#endif
+#endif	/* _ASM_KWATCH_H */
Index: usb-2.6/arch/i386/Kconfig
===================================================================
--- usb-2.6.orig/arch/i386/Kconfig
+++ usb-2.6/arch/i386/Kconfig
@@ -1210,6 +1210,14 @@ config KPROBES
 	  a probepoint and specifies the callback.  Kprobes is useful
 	  for kernel debugging, non-intrusive instrumentation and testing.
 	  If in doubt, say "N".
+
+config KWATCH
+	bool "Kwatch points (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Kwatch enables kernel-space data watchpoints using the processor's
+	  debug registers.  It can be very useful for kernel debugging.
+	  If in doubt, say "N".
 endmenu
 
 source "arch/i386/Kconfig.debug"


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-17 16:17   ` Alan Stern
  2007-01-18  0:12     ` Christoph Hellwig
@ 2007-02-06  4:25     ` Roland McGrath
  1 sibling, 0 replies; 25+ messages in thread
From: Roland McGrath @ 2007-02-06  4:25 UTC (permalink / raw)
  To: Alan Stern; +Cc: Prasanna S Panchamukhi, Kernel development list

Sorry I've been slow in giving you feedback on kwatch.

> I'll be happy to move this over to the utrace setting, once it is merged.  
> Do you think it would be better to include the current version of kwatch 
> now or to wait for utrace?
> 
> Roland, is there a schedule for when you plan to get utrace into -mm?

Since you've asked, I'll mention that I've been discussing this with Andrew
lately and we plan to work on merging it into -mm as soon as we can manage.

The kwatch implementation is pretty much orthogonal to the utrace patch as
it is so far.  As you've noted, it doesn't change the nature of the setting
of the debug registers; it only moves around the existing code for setting
them in raw form.  Hence it doesn't much matter which order the work is
merged at this stage.  There's no reason to withhold kwatch waiting for utrace.

I do have a problem with kwatch, however.  The existing user ABI includes
setting all of the debug registers, and this interface has never before
expressed a situation where you can set some but not all of them.  Having
ptrace suddenly fail with EBUSY when it never did before is not OK.  No
well-behaved kernel-mode tracing/debugging facility should perturb the user
experience in this way.  It is certainly understandable that one will
sometimes want to do invasive kernel-mode debugging and on special
occasions choose to be ill-behaved in this way (you might know your
userland work load doesn't include running gdb with watchpoints).  
But kwatch as it stands does not even make it possible to write a
well-behaved facility.

I am all in favor of a facility to manage shared use of the debug
registers, such as your debugreg.h additions.  I just think it needs to be
a little more flexible.  An unobtrusive kernel facility has to get out of
the way when user-mode decides to use all its debug registers.  It's not
immediately important what it's going to about it when contention arises,
but there has to be a way for the user-mode facilities to say they need to
allocate debugregs with priority and evict other squatters.  So, something
like code allocating a debugreg can supply a callback that's made when its
allocation has to taken by something with higher priority.  

Even after utrace, there will always be the possibility of a traditional
uncoordinated user of the raw debug registers, if nothing else ptrace
compatibility will always be there for old users.  So anything new and
fancy needs to be prepared to back out of the way gracefully.  In the case
of kwatch, it can just have a handler function given by the caller to start
with.  It's OK if individual callers can specially declare "I am not
well-behaved" and eat debugregs so that well-behaved high-priority users
like ptrace just have to lose (breaking compatibility).  But no
well-behaved caller of kwatch will do that.  

As a later improvement, kwatch could try a thing or two to stave off giving
up and telling its caller the watchpoint couldn't stay for the current
task.  For example, if a watchpoint is in kernel memory, you could switch
in your debugreg settings on entering the kernel and restore the user
watchpoints before returning to user mode.  Then you'd need to make
get_user et al somehow observe the user-mode watchpoints.  But it could be
investigated if the need arises.  Note that you can already silently do
something simple like juggling your kwatch debugreg assignments around if
the higher-priority consumer evicting you has left some other debugregs unused.

I certainly intend for later features based on utrace to include
higher-level treatment of watchpoints so that user debugging facilities can
also become responsive to debugreg allocation pressure.  (Eventually, the
user facilities might have easier ways of falling back to other methods and
getting out of the way of kernel debugreg consumers, than can be done for
the kernel-mode-tracing facilities.)  To that end, I'd like to see a clear
and robust interface for debugreg sharing, below the level of kwatch.  I'd
also like to see a thin layer on that giving a machine-independent kernel
source API for talking about watchpoints, which you pretty much have rolled
into the kwatch interface now.  But these are further refinements, not
barriers to including kwatch.

Also, an unrelated minor point.  I think it's error-prone to have an
integer argument to unregister_kwatch.  I think it makes most sense to have
the caller provide the space and call register/unregister with a pointer,
in the style of kprobes.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-23 16:55             ` Alan Stern
@ 2007-02-24  0:08               ` Roland McGrath
  0 siblings, 0 replies; 25+ messages in thread
From: Roland McGrath @ 2007-02-24  0:08 UTC (permalink / raw)
  To: Alan Stern; +Cc: Prasanna S Panchamukhi, Kernel development list

> I think the best approach will be not to reset dr7 at all.  Then there 
> won't be any need to worry about restoring it.  Leaving a userspace 
> watchpoint enabled while running in the kernel ought not to matter; a 
> system call shouldn't touch any address in userspace more than once or 
> twice.

Hmm.  That sounds reasonable.  But I wonder why the old code clears %dr7.
It's been that way for a long time (since 2.4 at least).

> My idea was to put 4 hwbkpt structures in thread_struct, so they would
> always be available for use by ptrace.  However it turned out not to be
> feasible to replace the debugreg array with something more sophisticated,
> because of conflicting declarations and problems with the ordering of
> #includes.  So instead I have been forced to replace debugreg[] with a
> pointer to a structure which can be allocated as needed.

I think that's preferable anyway.  Most tasks most of the time will never
need that storage, so why not make thread_struct a little smaller?
(There is also the potential for sharing, which I mentioned earlier.)

> This raises the possibility that a PTRACE syscall might fail because the 
> allocation fails.  Hopefully that won't be an issue?

It's not a new issue, anyway, after utrace.  The utrace-based ptrace can
fail for PTRACE_ATTACH because of OOM too, which wasn't possible before.
I think it's survivable.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-23  2:19           ` Roland McGrath
@ 2007-02-23 16:55             ` Alan Stern
  2007-02-24  0:08               ` Roland McGrath
  0 siblings, 1 reply; 25+ messages in thread
From: Alan Stern @ 2007-02-23 16:55 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Prasanna S Panchamukhi, Kernel development list

On Thu, 22 Feb 2007, Roland McGrath wrote:

> > Yes, you are wrong -- although perhaps you shouldn't be.
> > 
> > The current version of do_debug() clears dr7 when a debug interrupt is 
> > fielded.  As a result, if a system call touches two watchpoint addresses 
> > in userspace only the first access will be noticed.
> 
> Ah, I see.  I think it would indeed be nice to fix this.
> 
> > This is probably a bug in do_debug().  It would be better to disable each
> > individual userspace watchpoint as it is triggered (or even not to disable
> > it at all).  dr7 would be restored when the SIGTRAP is delivered.  (But
> > what if the user is blocking or ignoring SIGTRAP?)
> 
> The user blocking or ignoring it doesn't come up, because it's a
> force_sig_info call.  However, a debugger will indeed swallow the signal
> through ptrace/utrace means.  In ptrace, the dr7 is always going to get
> reset because there will always be a context switch out and back in that
> does it.  But with utrace it's now possible to swallow the signal and keep
> going without a context switch (e.g. a breakpoint that is just doing
> logging but not stopping).  So perhaps we should have a TIF_RESTORE_DR7
> that goes into _TIF_WORK_MASK and gets handled in do_notify_resume
> (or maybe it's TIF_HWBKPT).

I think the best approach will be not to reset dr7 at all.  Then there 
won't be any need to worry about restoring it.  Leaving a userspace 
watchpoint enabled while running in the kernel ought not to matter; a 
system call shouldn't touch any address in userspace more than once or 
twice.

> > I've worked out a plan for implementing combined user/kernel mode
> > breakpoints and watchpoints (call them "debugpoints" as a catch-all
> > term).  It should work transparently with ptrace and should accomodate
> > whatever scheme utrace decides to adopt.  There won't need to be a
> > separate kwatch facility on top of it; the new combined facility will
> > handle debugpoints in both userspace and kernelspace.
> 
> That sounds great.  I'm not thrilled with the name "debugpoint", I have to
> tell you.  The hardware documentation calls all these things "breakpoints",
> and I think "data breakpoint" and "instruction breakpoint" are pretty good
> terms.  How about "hwbkpt" for the facility API?

Okay, I'll change the name.

> The one caveat I have here is that I don't want ptrace (via utrace) to have
> to supply the usual structure.  I probably only think this because it would
> be a pain for the ptrace/utrace implementation to find a place to stick it.
> But I have a rationalization.  The old ptrace interface, and the
> utrace_regset for debugregs, is not really a "debugpoint user" in the sense
> you're defining it.  It's an access to the "raw" debugregs as part of the
> thread's virtual CPU context.  You can use ptrace to set a watchpoint, then
> detach ptrace, and the thread will get a SIGTRAP later though there is no
> remnant at that point of the debugger interface that made it come about.
> For the degenerate case of medium-high priority with no handler callbacks
> (that should instead be an error at registration time if no slot is free),
> you shouldn't really need any per-caller storage (there can only be one
> such caller per slot).  

My idea was to put 4 hwbkpt structures in thread_struct, so they would
always be available for use by ptrace.  However it turned out not to be
feasible to replace the debugreg array with something more sophisticated,
because of conflicting declarations and problems with the ordering of
#includes.  So instead I have been forced to replace debugreg[] with a
pointer to a structure which can be allocated as needed.

This raises the possibility that a PTRACE syscall might fail because the 
allocation fails.  Hopefully that won't be an issue?

Alan Stern


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-21 20:35         ` Alan Stern
  2007-02-22 11:43           ` S. P. Prasanna
@ 2007-02-23  2:19           ` Roland McGrath
  2007-02-23 16:55             ` Alan Stern
  1 sibling, 1 reply; 25+ messages in thread
From: Roland McGrath @ 2007-02-23  2:19 UTC (permalink / raw)
  To: Alan Stern; +Cc: Prasanna S Panchamukhi, Kernel development list

> Yes, you are wrong -- although perhaps you shouldn't be.
> 
> The current version of do_debug() clears dr7 when a debug interrupt is 
> fielded.  As a result, if a system call touches two watchpoint addresses 
> in userspace only the first access will be noticed.

Ah, I see.  I think it would indeed be nice to fix this.

> This is probably a bug in do_debug().  It would be better to disable each
> individual userspace watchpoint as it is triggered (or even not to disable
> it at all).  dr7 would be restored when the SIGTRAP is delivered.  (But
> what if the user is blocking or ignoring SIGTRAP?)

The user blocking or ignoring it doesn't come up, because it's a
force_sig_info call.  However, a debugger will indeed swallow the signal
through ptrace/utrace means.  In ptrace, the dr7 is always going to get
reset because there will always be a context switch out and back in that
does it.  But with utrace it's now possible to swallow the signal and keep
going without a context switch (e.g. a breakpoint that is just doing
logging but not stopping).  So perhaps we should have a TIF_RESTORE_DR7
that goes into _TIF_WORK_MASK and gets handled in do_notify_resume
(or maybe it's TIF_HWBKPT).

You should not actually need to disable user watchpoints, because in data
watchpoints the exception comes after the instruction completes.  Only for
instruction watchpoints does the exception come before the instruction
executes, and no user watchpoints can be in the address range containing
kernel code.  

SIGTRAP both doesn't queue, and doesn't give %dr6 values in its siginfo_t.
All user watchpoints will be handled via the signal; this is the only way
ptrace can report them, and is also the utrace way of doing things.
do_debug can happen inside kernel code, and tracing of user-level tasks can
only safely do anything at the point just before returning to user mode,
where signals are handled.  So, getting to send_sigtrap in do_debug is
enough to say "one or more user debug exceptions happened".  The %dr6 value
that collects in the thread state to be seen by ptrace, or by utrace-based
things using your new facility, needs to collect all the %dr6 bits that
were set by the hardware and weren't consumed by kernel-level tracing.  An
eventual utrace-based thing might in fact have some other way to tie in so
that the event details could just be in some call made by do_debug and not
recorded in the thread's virtual %dr6 value.  But at least for ptrace, they
should collect there if it becomes possible for more than one exception to
happen while in kernel mode or in a single user instruction.  (A single
instruction can cause multiple exceptions at the hardware level.)

> I've worked out a plan for implementing combined user/kernel mode
> breakpoints and watchpoints (call them "debugpoints" as a catch-all
> term).  It should work transparently with ptrace and should accomodate
> whatever scheme utrace decides to adopt.  There won't need to be a
> separate kwatch facility on top of it; the new combined facility will
> handle debugpoints in both userspace and kernelspace.

That sounds great.  I'm not thrilled with the name "debugpoint", I have to
tell you.  The hardware documentation calls all these things "breakpoints",
and I think "data breakpoint" and "instruction breakpoint" are pretty good
terms.  How about "hwbkpt" for the facility API?

> To work with ptrace, the new scheme will completely virtualize the debug
> registers.  So for example, a ptrace call might request a debugpoint in
> dr0, but it could end up that the physical register used is really dr2
> instead.  The various bits in dr6 and dr7 will be mapped in such a way
> that the entire procedure is transparent to the user.  Debugpoints 
> registered in kernelspace or by utrace won't care, of course.

I think that's a fine idea.  

The one caveat I have here is that I don't want ptrace (via utrace) to have
to supply the usual structure.  I probably only think this because it would
be a pain for the ptrace/utrace implementation to find a place to stick it.
But I have a rationalization.  The old ptrace interface, and the
utrace_regset for debugregs, is not really a "debugpoint user" in the sense
you're defining it.  It's an access to the "raw" debugregs as part of the
thread's virtual CPU context.  You can use ptrace to set a watchpoint, then
detach ptrace, and the thread will get a SIGTRAP later though there is no
remnant at that point of the debugger interface that made it come about.
For the degenerate case of medium-high priority with no handler callbacks
(that should instead be an error at registration time if no slot is free),
you shouldn't really need any per-caller storage (there can only be one
such caller per slot).  

> There are two things I am uncertain about: vm86 mode and kprobes.  I don't
> know anything about how either of them works.  

I know about kprobes.  I don't know about vm86, but I can read the code.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-21 20:35         ` Alan Stern
@ 2007-02-22 11:43           ` S. P. Prasanna
  2007-02-23  2:19           ` Roland McGrath
  1 sibling, 0 replies; 25+ messages in thread
From: S. P. Prasanna @ 2007-02-22 11:43 UTC (permalink / raw)
  To: Alan Stern; +Cc: Roland McGrath, Kernel development list

On Wed, Feb 21, 2007 at 03:35:13PM -0500, Alan Stern wrote:
> Going back to something you mentioned earlier...
> 
[...]
> On Fri, 9 Feb 2007, Roland McGrath wrote:
> There are two things I am uncertain about: vm86 mode and kprobes.  I don't
> know anything about how either of them works.  Judging from the current
> code, nothing much should be needed -- debug traps in vm86 mode are
> handled by calling handle_vm86_trap(), and kprobes puts itself at the
> start of the notify_die() chain so it can handle single-step traps.  
> Eventually it will be necessary to check with someone who really 
> understands the issues.

Yes, Kprobes needs to get notified first to handle single-step traps. So kwatch
getting notified secound should be fine.

Thanks
Prasanna
-- 
Prasanna S.P.
Linux Technology Center
India Software Labs, IBM Bangalore
Email: prasanna@in.ibm.com
Ph: 91-80-41776329

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-09 23:31       ` Roland McGrath
  2007-02-10  4:32         ` Alan Stern
@ 2007-02-21 20:35         ` Alan Stern
  2007-02-22 11:43           ` S. P. Prasanna
  2007-02-23  2:19           ` Roland McGrath
  1 sibling, 2 replies; 25+ messages in thread
From: Alan Stern @ 2007-02-21 20:35 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Prasanna S Panchamukhi, Kernel development list

Going back to something you mentioned earlier...

On Fri, 9 Feb 2007, Roland McGrath wrote:

> I don't think I really object to the ABI change of clearing %dr6 after an
> exception so that it does not accumulate multiple results.  But first I'll
> have to convince myself that we never actually do want to accumulate
> multiple results.  Hmm, I think we can, so maybe I do object.  If you set
> two watchpoints inside a user buffer and then do a system call that touches
> both those addresses (e.g. read), then you will go through do_debug (to
> send_sigtrap) twice before returning to user mode.  When the syscall is
> done, you'll have a pending SIGTRAP for the debugger to handle.  By looking
> at your %dr6 the debugger can see that both watchpoints hit.  (gdb does not
> handle this case, but it should.)  Am I wrong?

Yes, you are wrong -- although perhaps you shouldn't be.

The current version of do_debug() clears dr7 when a debug interrupt is 
fielded.  As a result, if a system call touches two watchpoint addresses 
in userspace only the first access will be noticed.

This is probably a bug in do_debug().  It would be better to disable each
individual userspace watchpoint as it is triggered (or even not to disable
it at all).  dr7 would be restored when the SIGTRAP is delivered.  (But
what if the user is blocking or ignoring SIGTRAP?)


Moving on...

I've worked out a plan for implementing combined user/kernel mode 
breakpoints and watchpoints (call them "debugpoints" as a catch-all 
term).  It should work transparently with ptrace and should accomodate 
whatever scheme utrace decides to adopt.  There won't need to be a 
separate kwatch facility on top of it; the new combined facility will 
handle debugpoints in both userspace and kernelspace.

The idea is that callers can register and unregister a struct debugpoint, 
which contains fields for the type, length, address, and priority as well 
as three callback pointers (for installed, uninstalled, and triggered).  
The debug core will keep these structures sorted by priority and will 
allocate the available debug registers based on the priorities of the 
userspace and kernelspace requests.  Each CPU will have its own array of 
pointers to these structures, indicating which debugpoints are currently 
enabled.

To work with ptrace, the new scheme will completely virtualize the debug
registers.  So for example, a ptrace call might request a debugpoint in
dr0, but it could end up that the physical register used is really dr2
instead.  The various bits in dr6 and dr7 will be mapped in such a way
that the entire procedure is transparent to the user.  Debugpoints 
registered in kernelspace or by utrace won't care, of course.

There are two things I am uncertain about: vm86 mode and kprobes.  I don't
know anything about how either of them works.  Judging from the current
code, nothing much should be needed -- debug traps in vm86 mode are
handled by calling handle_vm86_trap(), and kprobes puts itself at the
start of the notify_die() chain so it can handle single-step traps.  
Eventually it will be necessary to check with someone who really 
understands the issues.

Alan Stern


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-10  4:32         ` Alan Stern
@ 2007-02-18  3:03           ` Roland McGrath
  0 siblings, 0 replies; 25+ messages in thread
From: Roland McGrath @ 2007-02-18  3:03 UTC (permalink / raw)
  To: Alan Stern; +Cc: Prasanna S Panchamukhi, Kernel development list

> How do you suggest this be handled?  Maybe we should just keep track of a
> maximum user priority level for each slot, allowing it to go up but not
> down until all user processes have given up the slot.  (I.e., in the
> example above the later kwatch requests would still fail because we would
> continue to remember the high user priority level so long as the first
> process maintained its allocation.)  That would be overly pessimistic, but
> it would at least be safe.

I think that is certainly fine to start with.  Like I said before, we can
start conservative and then worry about more complexity as the concrete
needs arise.  I don't think it will really be any trouble to change some of
these low-level interfaces later to accomodate more sophisticated callers
and implementations.  

When the issue does arise, scanning all the necessary tasks may not really
need to be so costly.  That is, if rather than scanning all tasks in the
system, it's a list of debugreg allocations.  The callers doing fancy
allocation can be responsible for passing in storage for a struct
containing the list structure.  That would naturally embed in struct
kwatch.  Having the debugreg allocation routines pass in such a structure
would also be useful for another kind of flexibility I'd like to have
eventually.  That is, "local" allocations that are local to a group of
tasks rather than just one.  For example, a debugger most often actually
wants to share watchpoints among all the threads sharing an address space.
If identical settings are in fact shared, the storage requirements for
using watchpoints in many-threaded processes scale that much better.

I think we have a while before we have to actually figure any of that out
in detail.  The simple starting point covers all our immediate concrete
concerns.


Thanks,
Roland


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-09 23:31       ` Roland McGrath
@ 2007-02-10  4:32         ` Alan Stern
  2007-02-18  3:03           ` Roland McGrath
  2007-02-21 20:35         ` Alan Stern
  1 sibling, 1 reply; 25+ messages in thread
From: Alan Stern @ 2007-02-10  4:32 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Prasanna S Panchamukhi, Kernel development list

On Fri, 9 Feb 2007, Roland McGrath wrote:

> I don't think I really object to the ABI change of clearing %dr6 after an
> exception so that it does not accumulate multiple results.  But first I'll
> have to convince myself that we never actually do want to accumulate
> multiple results.  Hmm, I think we can, so maybe I do object.  If you set
> two watchpoints inside a user buffer and then do a system call that touches
> both those addresses (e.g. read), then you will go through do_debug (to
> send_sigtrap) twice before returning to user mode.  When the syscall is
> done, you'll have a pending SIGTRAP for the debugger to handle.  By looking
> at your %dr6 the debugger can see that both watchpoints hit.  (gdb does not
> handle this case, but it should.)  Am I wrong?

I think you're right.

> So this gets to the more complicated view of %dr6 handling that I had first
> had in mind yesterday.  Each allocation "owns" one of the low 4 bits in
> %dr6 too.  Only the dr6 bits owned by the userland "raw" allocation
> (i.e. ptrace/utrace_regset) should appear nonzero in thread.debugreg[6].
> So when kwatch swallows a debug exception, it should mask off its bit from
> %dr6 in the CPU, but not clear %dr6 completely.  That way you can have a
> sequence of user dr0 hit, kwatch dr3 hit, user dr1 hit, all inside one
> system call (including interrupt handlers), and when it gets to the
> userland debugger examining dr6 it sees the low 2 bits both set.

Okay; I'll fix this too.  Come to think of it, kwatch needs to handle 
multiple hits as well -- there might be two watchpoints set to the same 
address.

> > It's really quite a tricky matter.  Should a register be allocated to
> > kwatch only when no user process needs it?  Should we really go about
> > checking the requirements of every single process whenever a kwatch
> > allocation request comes in?  What if the processes which need a
> > particular register aren't running -- should the register then be given to
> > kwatch?  What if one of those processes then does start running on one
> > CPU?
> 
> To "go about checking the requirements of every single process" is not so
> hard as it sounds when they're recorded as a single global use count per
> slot, as your original code does.  When you mentioned a "your allocation is
> available" callback, I was thinking it might come to that being called
> inside context switch.  It's all rather tricky, indeed.  
> 
> The obvious answer is to start simple.  If any user process anywhere uses
> drN, kwatch has to give it up for all CPUs (watchpoints with less than
> "break ptrace" priority do).  If anyone really cares about more flexibility
> than that, we can change or extend it.  Some copious comments in the
> interface descriptions can lead them in the right direction if the
> situation comes up.  Probably with systemtap support in a while, we'll get
> a lot more concrete uses of watchpoints and people finding out what really 
> matters to them.

It's still more complicated than you might think.  Let's say two user
processes each have dr1 allocated, one with low priority and the other
with high priority.  The kernel has to be aware of the high-priority
allocation, so that it can refuse intermediate-priority kwatch allocation
attempts.  Now suppose the second process exits.  dr1 is still allocated
to the first user process but only with low priority, so now
intermediate-priority kwatch allocation attempts should succeed.

In order for this to work, when the second process gives up its allocation 
I would have to either scan though all tasks to see the first process, or 
else keep several global use counts for each slot -- in fact, one use 
count for each priority level.  That's doable if there are only a few 
levels, but not if there are many.

How do you suggest this be handled?  Maybe we should just keep track of a
maximum user priority level for each slot, allowing it to go up but not
down until all user processes have given up the slot.  (I.e., in the
example above the later kwatch requests would still fail because we would
continue to remember the high user priority level so long as the first
process maintained its allocation.)  That would be overly pessimistic, but
it would at least be safe.

Alan Stern


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-09 15:54     ` Alan Stern
@ 2007-02-09 23:31       ` Roland McGrath
  2007-02-10  4:32         ` Alan Stern
  2007-02-21 20:35         ` Alan Stern
  0 siblings, 2 replies; 25+ messages in thread
From: Roland McGrath @ 2007-02-09 23:31 UTC (permalink / raw)
  To: Alan Stern; +Cc: Prasanna S Panchamukhi, Kernel development list

> Yes.  In fact, the current existing code does not handle dr6 correctly.  
> It never clears the register, which means you're likely to get into 
> trouble when multiple breakpoints (or watchpoints) are enabled.

This is a subtle change from the existing ABI, in which userland has to
clear %dr6 via ptrace itself.  But gdb never does that AFAICT.  So it's in
fact subject to confusion when two watchpoints are set and the second hits
after the first.  So gdb ought to be fixed to clear dr6 via ptrace, to work
with existing and older kernels.

I don't think I really object to the ABI change of clearing %dr6 after an
exception so that it does not accumulate multiple results.  But first I'll
have to convince myself that we never actually do want to accumulate
multiple results.  Hmm, I think we can, so maybe I do object.  If you set
two watchpoints inside a user buffer and then do a system call that touches
both those addresses (e.g. read), then you will go through do_debug (to
send_sigtrap) twice before returning to user mode.  When the syscall is
done, you'll have a pending SIGTRAP for the debugger to handle.  By looking
at your %dr6 the debugger can see that both watchpoints hit.  (gdb does not
handle this case, but it should.)  Am I wrong?

So this gets to the more complicated view of %dr6 handling that I had first
had in mind yesterday.  Each allocation "owns" one of the low 4 bits in
%dr6 too.  Only the dr6 bits owned by the userland "raw" allocation
(i.e. ptrace/utrace_regset) should appear nonzero in thread.debugreg[6].
So when kwatch swallows a debug exception, it should mask off its bit from
%dr6 in the CPU, but not clear %dr6 completely.  That way you can have a
sequence of user dr0 hit, kwatch dr3 hit, user dr1 hit, all inside one
system call (including interrupt handlers), and when it gets to the
userland debugger examining dr6 it sees the low 2 bits both set.

> It's really quite a tricky matter.  Should a register be allocated to
> kwatch only when no user process needs it?  Should we really go about
> checking the requirements of every single process whenever a kwatch
> allocation request comes in?  What if the processes which need a
> particular register aren't running -- should the register then be given to
> kwatch?  What if one of those processes then does start running on one
> CPU?

To "go about checking the requirements of every single process" is not so
hard as it sounds when they're recorded as a single global use count per
slot, as your original code does.  When you mentioned a "your allocation is
available" callback, I was thinking it might come to that being called
inside context switch.  It's all rather tricky, indeed.  

The obvious answer is to start simple.  If any user process anywhere uses
drN, kwatch has to give it up for all CPUs (watchpoints with less than
"break ptrace" priority do).  If anyone really cares about more flexibility
than that, we can change or extend it.  Some copious comments in the
interface descriptions can lead them in the right direction if the
situation comes up.  Probably with systemtap support in a while, we'll get
a lot more concrete uses of watchpoints and people finding out what really 
matters to them.


Thanks,
Roland


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-09 10:21   ` Roland McGrath
@ 2007-02-09 15:54     ` Alan Stern
  2007-02-09 23:31       ` Roland McGrath
  0 siblings, 1 reply; 25+ messages in thread
From: Alan Stern @ 2007-02-09 15:54 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Prasanna S Panchamukhi, Kernel development list

On Fri, 9 Feb 2007, Roland McGrath wrote:

> > All right.  However this means thread_struct will have to grow in order to
> > hold each task's debug-register allocations and priorities.  Would that be
> > acceptable?  (This might be a good reason to keep the number of bits
> > down.)
> 
> Well, noone seems to mind the wasted debugreg[5..6] words. ;-) 
> 
> I'm inclined to make thread_struct smaller than it is now by making it
> indirect (debugreg[8] -> one pointer).  It feels like this would be pretty
> safe now that we have TIF_DEBUG anyway.  Already nothing should be looking
> at the field when TIF_DEBUG isn't set.

I had the same thought.

> > Another question: How can a program using the ptrace API ever give up a
> > debug-register allocation?  Disabling the register isn't sufficient; a
> > user program should be able to store a watchpoint address in dr1, enable
> > it in dr7, disable it in dr7, and then re-enable it in the expectation
> > that the address stored in dr1 hasn't been overwritten.  As far as I can
> > see, ptrace-type allocations have to be permanent (until the task exits or
> > execs, or uses some other to-be-determined API to do the de-allocation.)
> 
> I hadn't really thought about this before, but it's pretty straightforward.
> Each allocation is for one of %dr[0-3] and for its associated bits in %dr7.
> %dr0 and bits 0,1,16-19; %dr1 and bits 2,3,20-23; etc.
> %dr7 & (0x000f0003 << N) for %drN, I guess it is.
> ((((1 << DR_CONTROL_SIZE) - 1) << DR_CONTROL_SHIFT) |
>  ((1 << DR_ENABLE_SIZE) - 1)) << N, I should say.
> 
> Each allocation owns those 38 bits (70 bits on x86_64).  When all those
> bits are zero, the allocation is not active and might as well not be there
> (except for whatever semantics you might want to have about an allocation's
> lifetime as distinct from the event of resetting its contents).  

Okay, that makes sense.

> Your question made me think about the %dr6 issue, too, which I also hadn't
> thought about before.  It looks like your patch handles this correctly, but
> it's a subtle point that I think warrants some comments in the code.  When
> userland inserts a watchpoint and it's hit, it gets a SIGTRAP via do_debug
> and eventually looks at dr6 (via ptrace) to see what happened.  Kernel
> watchpoints that come along after the user signal is generated can clobber
> the CPU %dr6 with new hits, but userland should not perceive this.  This
> works out because what userland sees is thread.debugreg[6], the only place
> that sets it is do_debug, and a kwatch hit bails out before changing it.
> Any other new users of the debugreg sharing interface need to be cognizant
> of the %dr6 issue to avoid stepping on old ptrace uses.

Yes.  In fact, the current existing code does not handle dr6 correctly.  
It never clears the register, which means you're likely to get into 
trouble when multiple breakpoints (or watchpoints) are enabled.


Here's another complication -- and this is one I can't figure out any easy
solutions for.  The type of API we've been discussing will work well
enough on UP systems, but what about SMP?  I don't see any value in having
a kernel watchpoint enabled on some CPUs and not others.  But then what
should happen when a debug register is in use by kwatch and a ptrace
request on one CPU usurps it (leaving no other register in which to put
it)?  Not to mention the difficulties of keeping track of everything when
the same kwatch watchpoint is stored in different debug registers on
different CPUs.

It's really quite a tricky matter.  Should a register be allocated to
kwatch only when no user process needs it?  Should we really go about
checking the requirements of every single process whenever a kwatch
allocation request comes in?  What if the processes which need a
particular register aren't running -- should the register then be given to
kwatch?  What if one of those processes then does start running on one
CPU?

Alan Stern


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-07 19:22 ` Alan Stern
  2007-02-07 22:08   ` Bob Copeland
@ 2007-02-09 10:21   ` Roland McGrath
  2007-02-09 15:54     ` Alan Stern
  1 sibling, 1 reply; 25+ messages in thread
From: Roland McGrath @ 2007-02-09 10:21 UTC (permalink / raw)
  To: Alan Stern; +Cc: Prasanna S Panchamukhi, Kernel development list

> All right.  However this means thread_struct will have to grow in order to
> hold each task's debug-register allocations and priorities.  Would that be
> acceptable?  (This might be a good reason to keep the number of bits
> down.)

Well, noone seems to mind the wasted debugreg[5..6] words. ;-) 

I'm inclined to make thread_struct smaller than it is now by making it
indirect (debugreg[8] -> one pointer).  It feels like this would be pretty
safe now that we have TIF_DEBUG anyway.  Already nothing should be looking
at the field when TIF_DEBUG isn't set.

> Another question: How can a program using the ptrace API ever give up a
> debug-register allocation?  Disabling the register isn't sufficient; a
> user program should be able to store a watchpoint address in dr1, enable
> it in dr7, disable it in dr7, and then re-enable it in the expectation
> that the address stored in dr1 hasn't been overwritten.  As far as I can
> see, ptrace-type allocations have to be permanent (until the task exits or
> execs, or uses some other to-be-determined API to do the de-allocation.)

I hadn't really thought about this before, but it's pretty straightforward.
Each allocation is for one of %dr[0-3] and for its associated bits in %dr7.
%dr0 and bits 0,1,16-19; %dr1 and bits 2,3,20-23; etc.
%dr7 & (0x000f0003 << N) for %drN, I guess it is.
((((1 << DR_CONTROL_SIZE) - 1) << DR_CONTROL_SHIFT) |
 ((1 << DR_ENABLE_SIZE) - 1)) << N, I should say.

Each allocation owns those 38 bits (70 bits on x86_64).  When all those
bits are zero, the allocation is not active and might as well not be there
(except for whatever semantics you might want to have about an allocation's
lifetime as distinct from the event of resetting its contents).  

gdb already works this way: when it removes a watchpoint, it clears drN to
zero when it zeros all the corresponding bits in dr7.  (In fact it's in a
separate call after changing dr7, because of the ptrace interface.)

Your question made me think about the %dr6 issue, too, which I also hadn't
thought about before.  It looks like your patch handles this correctly, but
it's a subtle point that I think warrants some comments in the code.  When
userland inserts a watchpoint and it's hit, it gets a SIGTRAP via do_debug
and eventually looks at dr6 (via ptrace) to see what happened.  Kernel
watchpoints that come along after the user signal is generated can clobber
the CPU %dr6 with new hits, but userland should not perceive this.  This
works out because what userland sees is thread.debugreg[6], the only place
that sets it is do_debug, and a kwatch hit bails out before changing it.
Any other new users of the debugreg sharing interface need to be cognizant
of the %dr6 issue to avoid stepping on old ptrace uses.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-07 19:22 ` Alan Stern
@ 2007-02-07 22:08   ` Bob Copeland
  2007-02-09 10:21   ` Roland McGrath
  1 sibling, 0 replies; 25+ messages in thread
From: Bob Copeland @ 2007-02-07 22:08 UTC (permalink / raw)
  To: Alan Stern
  Cc: Roland McGrath, Prasanna S Panchamukhi, Kernel development list

On 2/7/07, Alan Stern <stern@rowland.harvard.edu> wrote:
> I wonder where this convention of using lower numbers to indicate higher
> priorities comes from...  It seems to be quite prevalent even though it's
> obviously backwards.

I agree but at least in the case of 'nice' it works in the sense that
the value increases with increasing niceness.  Done the other way,
they would have to call it 'mean,' then someone would wonder why 'mean
10 20' prints 'No such file or directory' instead of '15'...

Bob

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
       [not found] <20070207025008.1B11118005D@magilla.sf.frob.com>
@ 2007-02-07 19:22 ` Alan Stern
  2007-02-07 22:08   ` Bob Copeland
  2007-02-09 10:21   ` Roland McGrath
  0 siblings, 2 replies; 25+ messages in thread
From: Alan Stern @ 2007-02-07 19:22 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Prasanna S Panchamukhi, Kernel development list

On Tue, 6 Feb 2007, Roland McGrath wrote:

> > So for the sake of argument, let's assume that debug registers can be 
> > assigned with priority values ranging from 0 to 7 (overkill, but who 
> > cares?).  By fiat, ptrace assignments use priority 4.  Then kwatch callers 
> > can request whatever priority they like.  The well-behaved cases you've 
> > been discussing will use priority 0, and the invasive cases can use 
> > priority 7.  (With appropriate symbolic names instead of raw numeric 
> > values, naturally.)
> 
> Sure.  Or make it signed with lower value wins, have ptrace use -1 and the
> average bear use 0 or something especially unobtrusive use >0, and
> something very obtrusive use -many.

I wonder where this convention of using lower numbers to indicate higher 
priorities comes from...  It seems to be quite prevalent even though it's 
obviously backwards.

>  Unless you are really going to pack it
> into a few bits somewhere, I'd make it an arbitrary int rather than a
> special small range; it's just for sort order comparison.  Bottom line, I
> don't really care about the numerology.  Just so "break ptrace", "don't
> break ptrace", and "readily get out of the way on demand" can be expressed.
> We can always fine-tune it later as there are more concrete users.

Okay; I'm not fixated on any particular size.

> > Or maybe that's too complicated.  Perhaps all userspace assignments should 
> > always use the same priority level.  
> 
> No, I want priorities among user-mode watchpoint users too.  ptrace is
> rigid, but newer facilities can coexist with ptrace on the same thread and
> with kwatch, and do fancy new things to fall back when there is debugreg
> allocation pressure.  Future user facilities might be able to do VM tricks
> that are harder to make workable for kernel mode, for example.  

All right.  However this means thread_struct will have to grow in order to
hold each task's debug-register allocations and priorities.  Would that be
acceptable?  (This might be a good reason to keep the number of bits
down.)

Another question: How can a program using the ptrace API ever give up a
debug-register allocation?  Disabling the register isn't sufficient; a
user program should be able to store a watchpoint address in dr1, enable
it in dr7, disable it in dr7, and then re-enable it in the expectation
that the address stored in dr1 hasn't been overwritten.  As far as I can
see, ptrace-type allocations have to be permanent (until the task exits or
execs, or uses some other to-be-determined API to do the de-allocation.)

Alan Stern


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-02-06 19:58 ` Alan Stern
@ 2007-02-07  2:56   ` Roland McGrath
  0 siblings, 0 replies; 25+ messages in thread
From: Roland McGrath @ 2007-02-07  2:56 UTC (permalink / raw)
  To: Alan Stern; +Cc: Prasanna S Panchamukhi, Kernel development list

> That's good.  So I'll assume an updated version of kwatch can be submitted 
> without regard to the progress of utrace (other than minor conflicts over 
> the exact location of the ptrace code to change).

Indeed.

> Right.  I had been thinking in terms of a developer using kwatch to track 
> down some particularly nasty problem, something that would happen rather 
> infrequently, where one wouldn't care about side effects on user programs.  
> But of course those side effects might alter an important aspect of the 
> kernel problem being debugged...

This is indeed a way it might reasonably be used.  As I said, it's fine for
an individual use to be that way.  But think also of using it for
performance measurement (i.e. "how hot is this counter") in something like
systemtap, where you might have long-running instrumentation over arbitrary
workloads.

> It's also true that the current kwatch version affects the user experience
> even when no kernel debugging is going on, as it forcibly prevents ptrace
> calls from setting the Global-Enable bits in dr7.  That at least can be
> fixed quite easily.  (On the other hand, userspace should never do 
> anything other than a Local Enable.)

The distinction between local and global here never matters on Linux.  We
don't use hardware task switching at all, and if we did it would be part of
context switch, which already switches in debug register values.  

The local vs global distinction you have in debugreg allocation (when one
Linux task_struct is on the CPU vs always on every CPU) is a
machine-independent notion at the level of your debugreg sharing
abstraction, and has nothing to do with particular %dr7 bit values
(just with the allocation of all the bits in %dr7 that correspond to a
particular allocated %drN).

> How about a pair of callbacks: One to notify whenever the watchpoint is 
> enabled and one to notify whenever it is disabled?

That sounds fine.  You'll want to make sure it's structured so it doesn't
get too hairy when a caller wants to just give up and unregister when its
slot is unavailable (hopefully shouldn't lead to calling unregister from
the callback made inside the register call and such twists).

> So for the sake of argument, let's assume that debug registers can be 
> assigned with priority values ranging from 0 to 7 (overkill, but who 
> cares?).  By fiat, ptrace assignments use priority 4.  Then kwatch callers 
> can request whatever priority they like.  The well-behaved cases you've 
> been discussing will use priority 0, and the invasive cases can use 
> priority 7.  (With appropriate symbolic names instead of raw numeric 
> values, naturally.)

Sure.  Or make it signed with lower value wins, have ptrace use -1 and the
average bear use 0 or something especially unobtrusive use >0, and
something very obtrusive use -many.  Unless you are really going to pack it
into a few bits somewhere, I'd make it an arbitrary int rather than a
special small range; it's just for sort order comparison.  Bottom line, I
don't really care about the numerology.  Just so "break ptrace", "don't
break ptrace", and "readily get out of the way on demand" can be expressed.
We can always fine-tune it later as there are more concrete users.

> Or maybe that's too complicated.  Perhaps all userspace assignments should 
> always use the same priority level.  

No, I want priorities among user-mode watchpoint users too.  ptrace is
rigid, but newer facilities can coexist with ptrace on the same thread and
with kwatch, and do fancy new things to fall back when there is debugreg
allocation pressure.  Future user facilities might be able to do VM tricks
that are harder to make workable for kernel mode, for example.  

> For now I would prefer to avoid that.  It's true that kwatch is intended
> _only_ for kernelspace watchpoints, not userspace.  But I'd rather leave
> the complications up to someone else.

Understood.  If you constrain the kwatch interface so it cannot be used
with user addresses (checks < TASK_SIZE or whatever), then the problem will
be clearly defined as the slightly simpler one whenever someone does come
along in need of more complications.

> It seems likely that the interfaces added by kwatch will need to be
> generalized in various ways in order to handle the requirements of other
> architectures.  However I don't know what those requirements might be, so
> it seems best to start out small with x86 only and leave more refinements
> for the future.

Agreed, just to keep it in mind.  I think the features on other machines
are roughly similar except for not offering size choices other than
"anywhere in this aligned word".  

> If I update the patch, adding a priority level and the callback 
> notifications, do you think it would then be acceptable?

I expect so.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
       [not found] <20070206042153.66AB418005D@magilla.sf.frob.com>
@ 2007-02-06 19:58 ` Alan Stern
  2007-02-07  2:56   ` Roland McGrath
  0 siblings, 1 reply; 25+ messages in thread
From: Alan Stern @ 2007-02-06 19:58 UTC (permalink / raw)
  To: Roland McGrath; +Cc: Prasanna S Panchamukhi, Kernel development list

On Mon, 5 Feb 2007, Roland McGrath wrote:

> Sorry I've been slow in giving you feedback on kwatch.

No problem (I have plenty of other things to work on!), and thanks for the 
detailed reply.

> > I'll be happy to move this over to the utrace setting, once it is merged.  
> > Do you think it would be better to include the current version of kwatch 
> > now or to wait for utrace?
> > 
> > Roland, is there a schedule for when you plan to get utrace into -mm?
> 
> Since you've asked, I'll mention that I've been discussing this with Andrew
> lately and we plan to work on merging it into -mm as soon as we can manage.
> 
> The kwatch implementation is pretty much orthogonal to the utrace patch as
> it is so far.  As you've noted, it doesn't change the nature of the setting
> of the debug registers; it only moves around the existing code for setting
> them in raw form.  Hence it doesn't much matter which order the work is
> merged at this stage.  There's no reason to withhold kwatch waiting for utrace.

That's good.  So I'll assume an updated version of kwatch can be submitted 
without regard to the progress of utrace (other than minor conflicts over 
the exact location of the ptrace code to change).

> I do have a problem with kwatch, however.  The existing user ABI includes
> setting all of the debug registers, and this interface has never before
> expressed a situation where you can set some but not all of them.  Having
> ptrace suddenly fail with EBUSY when it never did before is not OK.  No
> well-behaved kernel-mode tracing/debugging facility should perturb the user
> experience in this way.  It is certainly understandable that one will
> sometimes want to do invasive kernel-mode debugging and on special
> occasions choose to be ill-behaved in this way (you might know your
> userland work load doesn't include running gdb with watchpoints).  
> But kwatch as it stands does not even make it possible to write a
> well-behaved facility.

Right.  I had been thinking in terms of a developer using kwatch to track 
down some particularly nasty problem, something that would happen rather 
infrequently, where one wouldn't care about side effects on user programs.  
But of course those side effects might alter an important aspect of the 
kernel problem being debugged...

It's also true that the current kwatch version affects the user experience
even when no kernel debugging is going on, as it forcibly prevents ptrace
calls from setting the Global-Enable bits in dr7.  That at least can be
fixed quite easily.  (On the other hand, userspace should never do 
anything other than a Local Enable.)

> I am all in favor of a facility to manage shared use of the debug
> registers, such as your debugreg.h additions.  I just think it needs to be
> a little more flexible.  An unobtrusive kernel facility has to get out of
> the way when user-mode decides to use all its debug registers.  It's not
> immediately important what it's going to about it when contention arises,
> but there has to be a way for the user-mode facilities to say they need to
> allocate debugregs with priority and evict other squatters.  So, something
> like code allocating a debugreg can supply a callback that's made when its
> allocation has to taken by something with higher priority.  

How about a pair of callbacks: One to notify whenever the watchpoint is 
enabled and one to notify whenever it is disabled?

> Even after utrace, there will always be the possibility of a traditional
> uncoordinated user of the raw debug registers, if nothing else ptrace
> compatibility will always be there for old users.  So anything new and
> fancy needs to be prepared to back out of the way gracefully.  In the case
> of kwatch, it can just have a handler function given by the caller to start
> with.  It's OK if individual callers can specially declare "I am not
> well-behaved" and eat debugregs so that well-behaved high-priority users
> like ptrace just have to lose (breaking compatibility).  But no
> well-behaved caller of kwatch will do that.  

No doubt the future userspace API will include some sort of priority 
facility.  For now, though, ptrace doesn't have anything like it.  We just 
have to assign it an arbitrary intermediate priority.

So for the sake of argument, let's assume that debug registers can be 
assigned with priority values ranging from 0 to 7 (overkill, but who 
cares?).  By fiat, ptrace assignments use priority 4.  Then kwatch callers 
can request whatever priority they like.  The well-behaved cases you've 
been discussing will use priority 0, and the invasive cases can use 
priority 7.  (With appropriate symbolic names instead of raw numeric 
values, naturally.)

Or maybe that's too complicated.  Perhaps all userspace assignments should 
always use the same priority level.  After all, it's possible for multiple 
tasks to allocate the same debug register at the same time -- if they had 
differing priorities that would make it much more difficult to keep things 
straight.  Then there would be only three effective priority levels: 0 = 
well-behaved kernel, 1 = all userspace, and 2 = invasive kernel.

> As a later improvement, kwatch could try a thing or two to stave off giving
> up and telling its caller the watchpoint couldn't stay for the current
> task.  For example, if a watchpoint is in kernel memory, you could switch
> in your debugreg settings on entering the kernel and restore the user
> watchpoints before returning to user mode.  Then you'd need to make
> get_user et al somehow observe the user-mode watchpoints.  But it could be
> investigated if the need arises.

For now I would prefer to avoid that.  It's true that kwatch is intended
_only_ for kernelspace watchpoints, not userspace.  But I'd rather leave
the complications up to someone else.

>  Note that you can already silently do
> something simple like juggling your kwatch debugreg assignments around if
> the higher-priority consumer evicting you has left some other debugregs unused.

Yes, I might add that in.

> I certainly intend for later features based on utrace to include
> higher-level treatment of watchpoints so that user debugging facilities can
> also become responsive to debugreg allocation pressure.  (Eventually, the
> user facilities might have easier ways of falling back to other methods and
> getting out of the way of kernel debugreg consumers, than can be done for
> the kernel-mode-tracing facilities.)  To that end, I'd like to see a clear
> and robust interface for debugreg sharing, below the level of kwatch.  I'd
> also like to see a thin layer on that giving a machine-independent kernel
> source API for talking about watchpoints, which you pretty much have rolled
> into the kwatch interface now.  But these are further refinements, not
> barriers to including kwatch.

It seems likely that the interfaces added by kwatch will need to be
generalized in various ways in order to handle the requirements of other
architectures.  However I don't know what those requirements might be, so
it seems best to start out small with x86 only and leave more refinements
for the future.

> Also, an unrelated minor point.  I think it's error-prone to have an
> integer argument to unregister_kwatch.  I think it makes most sense to have
> the caller provide the space and call register/unregister with a pointer,
> in the style of kprobes.

In fact, something like that would be necessary if the debug register 
assignment could be changed silently as need arises.

If I update the patch, adding a priority level and the callback 
notifications, do you think it would then be acceptable?

Alan Stern


^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2007-02-24  0:08 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-01-16 16:55 [PATCH] Kwatch: kernel watchpoints using CPU debug registers Alan Stern
2007-01-16 23:35 ` Christoph Hellwig
2007-01-17 16:33   ` Alan Stern
2007-01-17  9:44 ` Ingo Molnar
2007-01-17 16:17   ` Alan Stern
2007-01-18  0:12     ` Christoph Hellwig
2007-01-18  7:31       ` Ingo Molnar
2007-01-18 15:37         ` Alan Stern
2007-01-18 22:33         ` Christoph Hellwig
2007-01-22 16:54           ` [PATCH - revised] " Alan Stern
2007-02-06  4:25     ` [PATCH] " Roland McGrath
     [not found] <20070206042153.66AB418005D@magilla.sf.frob.com>
2007-02-06 19:58 ` Alan Stern
2007-02-07  2:56   ` Roland McGrath
     [not found] <20070207025008.1B11118005D@magilla.sf.frob.com>
2007-02-07 19:22 ` Alan Stern
2007-02-07 22:08   ` Bob Copeland
2007-02-09 10:21   ` Roland McGrath
2007-02-09 15:54     ` Alan Stern
2007-02-09 23:31       ` Roland McGrath
2007-02-10  4:32         ` Alan Stern
2007-02-18  3:03           ` Roland McGrath
2007-02-21 20:35         ` Alan Stern
2007-02-22 11:43           ` S. P. Prasanna
2007-02-23  2:19           ` Roland McGrath
2007-02-23 16:55             ` Alan Stern
2007-02-24  0:08               ` Roland McGrath

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.