linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] i386: improve double fault handling
@ 2008-07-18 12:30 Jan Beulich
  2008-07-18 23:24 ` H. Peter Anvin
  2008-07-23 21:43 ` Joerg Roedel
  0 siblings, 2 replies; 15+ messages in thread
From: Jan Beulich @ 2008-07-18 12:30 UTC (permalink / raw)
  To: mingo, tglx, hpa; +Cc: Andi Kleen, linux-kernel

Make the double fault handler use CPU-specific stacks. Add some
abstraction to simplify future change of other exception handlers to go
through task gates.
Add a new notification of the event through the die notifier chain,
also providing some environmental adjustments so that various
infrastructural things work independently of the fact that the fault and
the callbacks are running on a stack other than the normal kernel stack.

Signed-Off-By: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>

---
 arch/x86/kernel/cpu/common.c     |   17 +++++--
 arch/x86/kernel/doublefault_32.c |   86 ++++++++++++++++++++++++---------------
 arch/x86/kernel/smpboot.c        |   44 +++++++++++++++++++
 arch/x86/kernel/traps_32.c       |   51 ++++++++++++++++++++++-
 drivers/lguest/segments.c        |    3 -
 include/asm-x86/kdebug.h         |    1 
 include/asm-x86/processor.h      |    7 ++-
 include/asm-x86/segment.h        |   15 ++++--
 include/asm-x86/thread_info_32.h |    9 +++-
 9 files changed, 187 insertions(+), 46 deletions(-)

--- linux-2.6.26/arch/x86/kernel/cpu/common.c	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/cpu/common.c	2008-06-25 14:43:16.000000000 +0200
@@ -650,6 +650,13 @@ void switch_to_new_gdt(void)
 	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
 }
 
+static void *__init_refok alloc_boot_stack(void)
+{
+	BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+	return __alloc_bootmem(EXCEPTION_STACK_SIZE, THREAD_SIZE,
+			       __pa(MAX_DMA_ADDRESS));
+}
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
@@ -690,10 +697,12 @@ void __cpuinit cpu_init(void)
 	load_TR_desc();
 	load_LDT(&init_mm.context);
 
-#ifdef CONFIG_DOUBLEFAULT
-	/* Set up doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
+	if (cpu == 0) {
+		unsigned i;
+
+		for (i = 0; i < N_EXCEPTION_TSS; ++i)
+			setup_exception_tss(cpu, i, alloc_boot_stack);
+	}
 
 	/* Clear %gs. */
 	asm volatile ("mov %0, %%gs" : : "r" (0));
--- linux-2.6.26/arch/x86/kernel/doublefault_32.c	2008-04-17 04:49:44.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/doublefault_32.c	2008-06-25 14:43:16.000000000 +0200
@@ -3,69 +3,89 @@
 #include <linux/init.h>
 #include <linux/init_task.h>
 #include <linux/fs.h>
+#include <linux/kdebug.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
 
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+#define ptr_ok(x, l) ((x) >= PAGE_OFFSET && (x) + (l) < (unsigned long)high_memory)
 
-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+#define THREAD_INFO_FROM(x) ((struct thread_info *)((x) & ~(THREAD_SIZE - 1)))
 
-static void doublefault_fn(void)
+register const struct x86_hw_tss *self __asm__("ebx");
+
+void doublefault_fn(void)
 {
-	struct desc_ptr gdt_desc = {0, 0};
+	struct desc_ptr gdt_desc;
 	unsigned long gdt, tss;
 
 	store_gdt(&gdt_desc);
 	gdt = gdt_desc.address;
 
-	printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+	printk(KERN_EMERG "PANIC: double fault on CPU#%lu, gdt at %08lx [%d bytes]\n",
+	       self->sp2, gdt, gdt_desc.size + 1);
 
-	if (ptr_ok(gdt)) {
+	if (ptr_ok(gdt, gdt_desc.size)) {
 		gdt += GDT_ENTRY_TSS << 3;
 		tss = *(u16 *)(gdt+2);
 		tss += *(u8 *)(gdt+4) << 16;
 		tss += *(u8 *)(gdt+7) << 24;
 		printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
 
-		if (ptr_ok(tss)) {
-			struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+		if (ptr_ok(tss, *(u16 *)gdt)) {
+			const struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+			struct {
+				struct pt_regs common;
+				struct {
+					unsigned long es;
+					unsigned long ds;
+					unsigned long fs;
+					unsigned long gs;
+				} vm86;
+			} regs;
+
+			/* for current/current_thread_info to work... */
+			*THREAD_INFO_FROM(self->sp) = *THREAD_INFO_FROM(t->sp0 - 1);
 
 			printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
 			       t->ip, t->sp);
 
 			printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
 				t->ax, t->bx, t->cx, t->dx);
-			printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
-				t->si, t->di);
+			printk(KERN_EMERG "esi = %08lx, edi = %08lx, ebp = %08lx\n",
+				t->si, t->di, t->bp);
+
+			regs.common.bx = t->bx;
+			regs.common.cx = t->cx;
+			regs.common.dx = t->dx;
+			regs.common.si = t->si;
+			regs.common.di = t->di;
+			regs.common.bp = t->bp;
+			regs.common.ax = t->ax;
+			regs.common.ds = t->ds;
+			regs.common.es = t->es;
+			regs.common.fs = t->fs;
+			regs.common.orig_ax = -1;
+			regs.common.ip = t->ip;
+			regs.common.cs = t->cs;
+			regs.common.flags = t->flags;
+			regs.common.sp = t->sp;
+			regs.common.ss = t->ss;
+			if (t->flags & X86_EFLAGS_VM) {
+				regs.common.ds = 0;
+				regs.common.es = 0;
+				regs.common.fs = 0;
+				regs.vm86.es = t->es;
+				regs.vm86.ds = t->ds;
+				regs.vm86.fs = t->fs;
+				regs.vm86.gs = t->gs;
+			}
+			notify_die(DIE_DOUBLE_FAULT, "double fault", &regs.common, 0, 8, SIGKILL);
 		}
 	}
 
 	for (;;)
 		cpu_relax();
 }
-
-struct tss_struct doublefault_tss __cacheline_aligned = {
-	.x86_tss = {
-		.sp0		= STACK_START,
-		.ss0		= __KERNEL_DS,
-		.ldt		= 0,
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
-
-		.ip		= (unsigned long) doublefault_fn,
-		/* 0x2 bit is always set */
-		.flags		= X86_EFLAGS_SF | 0x2,
-		.sp		= STACK_START,
-		.es		= __USER_DS,
-		.cs		= __KERNEL_CS,
-		.ss		= __KERNEL_DS,
-		.ds		= __USER_DS,
-		.fs		= __KERNEL_PERCPU,
-
-		.__cr3		= __pa(swapper_pg_dir)
-	}
-};
--- linux-2.6.26/arch/x86/kernel/smpboot.c	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/smpboot.c	2008-06-25 14:46:27.000000000 +0200
@@ -832,6 +832,45 @@ static void __cpuinit do_fork_idle(struc
 	complete(&c_idle->done);
 }
 
+#ifdef CONFIG_X86_32
+static int __cpuinit map_exception_stack(pte_t *pte, struct page *pmd_page,
+					 unsigned long addr, void *data)
+{
+	struct page **pages = data;
+
+	*pte = mk_pte(pages[(addr >> PAGE_SHIFT)
+			    & ((1 << EXCEPTION_STACK_ORDER) - 1)],
+		      PAGE_KERNEL);
+	return 0;
+}
+
+static void *__cpuinit alloc_exception_stack(void)
+{
+	struct vm_struct *area;
+	void *stack;
+	unsigned int i;
+	struct page *pages[1 << EXCEPTION_STACK_ORDER];
+
+	BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+	/* Try not wasting virtual space. */
+	for (i = EXCEPTION_STACK_SIZE; i < 2 * THREAD_SIZE; i += PAGE_SIZE) {
+		area = get_vm_area(i, 0);
+		BUG_ON(!area);
+		stack = PTR_ALIGN(area->addr, THREAD_SIZE);
+		if (stack + EXCEPTION_STACK_SIZE <= area->addr + i)
+			break;
+		free_vm_area(area);
+	}
+	for (i = 0; !(i >> EXCEPTION_STACK_ORDER); ++i) {
+		pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+		BUG_ON(!pages[i]);
+	}
+	apply_to_page_range(&init_mm, (unsigned long)stack,
+			    EXCEPTION_STACK_SIZE, map_exception_stack, pages);
+	return stack;
+}
+#endif
+
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -906,6 +945,11 @@ do_rest:
 	per_cpu(current_task, cpu) = c_idle.idle;
 	init_gdt(cpu);
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+#define i start_ip
+	for (i = 0; i < N_EXCEPTION_TSS; ++i)
+		setup_exception_tss(cpu, i, alloc_exception_stack);
+	vmalloc_sync_all();
+#undef i
 	c_idle.idle->thread.ip = (unsigned long) start_secondary;
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	stack_start.sp = (void *) c_idle.idle->thread.sp;
--- linux-2.6.26/arch/x86/kernel/traps_32.c	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/traps_32.c	2008-06-25 14:49:20.000000000 +0200
@@ -67,6 +67,29 @@ EXPORT_SYMBOL_GPL(used_vectors);
 
 asmlinkage int system_call(void);
 
+#if N_EXCEPTION_TSS
+void doublefault_fn(void);
+
+static DEFINE_PER_CPU(struct x86_hw_tss[N_EXCEPTION_TSS], exception_tss) =
+{
+	[0 ... N_EXCEPTION_TSS-1] =
+	{
+		.cs       = __KERNEL_CS,
+		.ss       = __KERNEL_DS,
+		.ss0      = __KERNEL_DS,
+		.__cr3    = __pa(swapper_pg_dir),
+		.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+		.ds       = __USER_DS,
+		.es       = __USER_DS,
+		.fs       = __KERNEL_PERCPU,
+		.flags	  = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */
+	},
+#ifdef CONFIG_DOUBLEFAULT
+	[DOUBLEFAULT_TSS].ip = (unsigned long)doublefault_fn
+#endif
+};
+#endif
+
 /* Do we ignore FPU interrupts ? */
 char ignore_fpu_irq;
 
@@ -1184,6 +1207,30 @@ asmlinkage void math_emulate(long arg)
 
 #endif /* CONFIG_MATH_EMULATION */
 
+#if N_EXCEPTION_TSS
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+				   void *(*alloc_stack)(void))
+{
+	struct x86_hw_tss *tss = per_cpu(exception_tss, cpu) + idx;
+
+	/* Set up exception handling TSS. */
+	tss->bx = (unsigned long)tss;
+	tss->sp2 = cpu;
+
+	/* Set up exception handling stack. */
+	if (!tss->sp) {
+		char *stack;
+
+		stack = alloc_stack() + EXCEPTION_STACK_SIZE;
+		tss->sp = (unsigned long)stack;
+		tss->sp0 = (unsigned long)stack;
+	}
+
+	/* Set up exception handling TSS pointer in the GDT. */
+	__set_tss_desc(cpu, GDT_ENTRY_EXCEPTION_TSS + idx, tss);
+}
+#endif
+
 void __init trap_init(void)
 {
 	int i;
@@ -1207,7 +1254,9 @@ void __init trap_init(void)
 	set_trap_gate(5,  &bounds);
 	set_trap_gate(6,  &invalid_op);
 	set_trap_gate(7,  &device_not_available);
-	set_task_gate(8,  GDT_ENTRY_DOUBLEFAULT_TSS);
+#ifdef DOUBLEFAULT_TSS
+	set_task_gate(8,  GDT_ENTRY_EXCEPTION_TSS + DOUBLEFAULT_TSS);
+#endif
 	set_trap_gate(9,  &coprocessor_segment_overrun);
 	set_trap_gate(10, &invalid_TSS);
 	set_trap_gate(11, &segment_not_present);
--- linux-2.6.26/drivers/lguest/segments.c	2008-04-17 04:49:44.000000000 +0200
+++ 2.6.26-i386-double-fault/drivers/lguest/segments.c	2008-06-25 14:43:16.000000000 +0200
@@ -50,7 +50,8 @@ static int ignored_gdt(unsigned int num)
 	return (num == GDT_ENTRY_TSS
 		|| num == GDT_ENTRY_LGUEST_CS
 		|| num == GDT_ENTRY_LGUEST_DS
-		|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
+		|| (num >= GDT_ENTRY_EXCEPTION_TSS
+		    && num < GDT_ENTRY_EXCEPTION_TSS + N_EXCEPTION_TSS));
 }
 
 /*H:630 Once the Guest gave us new GDT entries, we fix them up a little.  We
--- linux-2.6.26/include/asm-x86/kdebug.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/kdebug.h	2008-06-25 14:50:04.000000000 +0200
@@ -20,6 +20,7 @@ enum die_val {
 	DIE_CALL,
 	DIE_NMI_IPI,
 	DIE_PAGE_FAULT,
+	DIE_DOUBLE_FAULT,
 	DIE_NMIUNKNOWN,
 };
 
--- linux-2.6.26/include/asm-x86/processor.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/processor.h	2008-06-25 14:52:11.000000000 +0200
@@ -128,7 +128,6 @@ struct cpuinfo_x86 {
 extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
-extern struct tss_struct	doublefault_tss;
 extern __u32			cleared_cpu_caps[NCAPINTS];
 
 #ifdef CONFIG_SMP
@@ -841,6 +840,12 @@ static inline void spin_lock_prefetch(co
 	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },	  \
 }
 
+#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_SIZE (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+				   void *(*alloc_stack)(void));
+
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 #define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
--- linux-2.6.26/include/asm-x86/segment.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/segment.h	2008-06-25 14:43:16.000000000 +0200
@@ -55,7 +55,7 @@
  *  28 - unused
  *  29 - unused
  *  30 - unused
- *  31 - TSS for double fault handler
+ *  31+  TSSes for exception handlers
  */
 #define GDT_ENTRY_TLS_MIN	6
 #define GDT_ENTRY_TLS_MAX 	(GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
@@ -90,12 +90,19 @@
 #define __KERNEL_PERCPU 0
 #endif
 
-#define GDT_ENTRY_DOUBLEFAULT_TSS	31
+#define GDT_ENTRY_EXCEPTION_TSS	31
+#ifdef CONFIG_DOUBLEFAULT
+#define DOUBLEFAULT_TSS 0
+#define N_EXCEPTION_TSS 1
+#else
+#undef GDT_ENTRY_EXCEPTION_TSS
+#define N_EXCEPTION_TSS 0
+#endif
 
 /*
- * The GDT has 32 entries
+ * The GDT has 31+ entries
  */
-#define GDT_ENTRIES 32
+#define GDT_ENTRIES (31 + N_EXCEPTION_TSS)
 
 /* The PnP BIOS entries in the GDT */
 #define GDT_ENTRY_PNPBIOS_CS32		(GDT_ENTRY_PNPBIOS_BASE + 0)
--- linux-2.6.26/include/asm-x86/thread_info_32.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/thread_info_32.h	2008-06-25 14:43:16.000000000 +0200
@@ -53,9 +53,14 @@ struct thread_info {
 
 #define PREEMPT_ACTIVE		0x10000000
 #ifdef CONFIG_4KSTACKS
-#define THREAD_SIZE            (4096)
+#define THREAD_ORDER 0
 #else
-#define THREAD_SIZE		(8192)
+#define THREAD_ORDER 1
+#endif
+#ifndef __ASSEMBLY__
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#else
+#define THREAD_SIZE (PAGE_SIZE_asm << THREAD_ORDER)
 #endif
 
 #define STACK_WARN             (THREAD_SIZE/8)



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-18 12:30 [PATCH] i386: improve double fault handling Jan Beulich
@ 2008-07-18 23:24 ` H. Peter Anvin
  2008-07-21  8:54   ` Jan Beulich
  2008-07-23 21:43 ` Joerg Roedel
  1 sibling, 1 reply; 15+ messages in thread
From: H. Peter Anvin @ 2008-07-18 23:24 UTC (permalink / raw)
  To: Jan Beulich; +Cc: mingo, tglx, Andi Kleen, linux-kernel

Jan Beulich wrote:
> Make the double fault handler use CPU-specific stacks. Add some
> abstraction to simplify future change of other exception handlers to go
> through task gates.
> Add a new notification of the event through the die notifier chain,
> also providing some environmental adjustments so that various
> infrastructural things work independently of the fact that the fault and
> the callbacks are running on a stack other than the normal kernel stack.
> 
> Signed-Off-By: Jan Beulich <jbeulich@novell.com>
> Cc: Andi Kleen <andi@firstfloor.org>

This patch doesn't apply for me to the extent that I'm hesitant to fix 
it up manually.  Could you please refresh it against current -linus?

	-hpa

P.S. All your patches came through QP-damaged, which made them more 
difficult to deal with manually.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-18 23:24 ` H. Peter Anvin
@ 2008-07-21  8:54   ` Jan Beulich
  2008-07-21 11:05     ` Ingo Molnar
  0 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2008-07-21  8:54 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: mingo, Andi Kleen, tglx, linux-kernel

>>> "H. Peter Anvin" <hpa@zytor.com> 19.07.08 01:24 >>>
>Jan Beulich wrote:
>> Make the double fault handler use CPU-specific stacks. Add some
>> abstraction to simplify future change of other exception handlers to go
>> through task gates.
>> Add a new notification of the event through the die notifier chain,
>> also providing some environmental adjustments so that various
>> infrastructural things work independently of the fact that the fault and
>> the callbacks are running on a stack other than the normal kernel stack.
>> 
>> Signed-Off-By: Jan Beulich <jbeulich@novell.com>
>> Cc: Andi Kleen <andi@firstfloor.org>
>
>This patch doesn't apply for me to the extent that I'm hesitant to fix 
>it up manually.  Could you please refresh it against current -linus?

Make the double fault handler use CPU-specific stacks. Add some
abstraction to simplify future change of other exception handlers to go
through task gates.
Add a new notification of the event through the die notifier chain,
also providing some environmental adjustments so that various
infrastructural things work independently of the fact that the fault and
the callbacks are running on a stack other than the normal kernel stack.

Signed-Off-By: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>

---
 arch/x86/kernel/cpu/common.c     |   17 +++++--
 arch/x86/kernel/doublefault_32.c |   86 ++++++++++++++++++++++++---------------
 arch/x86/kernel/smpboot.c        |   44 +++++++++++++++++++
 arch/x86/kernel/traps_32.c       |   51 ++++++++++++++++++++++-
 drivers/lguest/segments.c        |    3 -
 include/asm-x86/kdebug.h         |    1 
 include/asm-x86/processor.h      |    7 ++-
 include/asm-x86/segment.h        |   15 ++++--
 8 files changed, 180 insertions(+), 44 deletions(-)

--- linux-2.6.26/arch/x86/kernel/cpu/common.c	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/cpu/common.c	2008-06-25 14:43:16.000000000 +0200
@@ -650,6 +650,13 @@ void switch_to_new_gdt(void)
 	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
 }
 
+static void *__init_refok alloc_boot_stack(void)
+{
+	BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+	return __alloc_bootmem(EXCEPTION_STACK_SIZE, THREAD_SIZE,
+			       __pa(MAX_DMA_ADDRESS));
+}
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
@@ -690,10 +697,12 @@ void __cpuinit cpu_init(void)
 	load_TR_desc();
 	load_LDT(&init_mm.context);
 
-#ifdef CONFIG_DOUBLEFAULT
-	/* Set up doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
+	if (cpu == 0) {
+		unsigned i;
+
+		for (i = 0; i < N_EXCEPTION_TSS; ++i)
+			setup_exception_tss(cpu, i, alloc_boot_stack);
+	}
 
 	/* Clear %gs. */
 	asm volatile ("mov %0, %%gs" : : "r" (0));
--- linux-2.6.26/arch/x86/kernel/doublefault_32.c	2008-04-17 04:49:44.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/doublefault_32.c	2008-06-25 14:43:16.000000000 +0200
@@ -3,69 +3,89 @@
 #include <linux/init.h>
 #include <linux/init_task.h>
 #include <linux/fs.h>
+#include <linux/kdebug.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
 
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+#define ptr_ok(x, l) ((x) >= PAGE_OFFSET && (x) + (l) < (unsigned long)high_memory)
 
-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+#define THREAD_INFO_FROM(x) ((struct thread_info *)((x) & ~(THREAD_SIZE - 1)))
 
-static void doublefault_fn(void)
+register const struct x86_hw_tss *self __asm__("ebx");
+
+void doublefault_fn(void)
 {
-	struct desc_ptr gdt_desc = {0, 0};
+	struct desc_ptr gdt_desc;
 	unsigned long gdt, tss;
 
 	store_gdt(&gdt_desc);
 	gdt = gdt_desc.address;
 
-	printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+	printk(KERN_EMERG "PANIC: double fault on CPU#%lu, gdt at %08lx [%d bytes]\n",
+	       self->sp2, gdt, gdt_desc.size + 1);
 
-	if (ptr_ok(gdt)) {
+	if (ptr_ok(gdt, gdt_desc.size)) {
 		gdt += GDT_ENTRY_TSS << 3;
 		tss = *(u16 *)(gdt+2);
 		tss += *(u8 *)(gdt+4) << 16;
 		tss += *(u8 *)(gdt+7) << 24;
 		printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
 
-		if (ptr_ok(tss)) {
-			struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+		if (ptr_ok(tss, *(u16 *)gdt)) {
+			const struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+			struct {
+				struct pt_regs common;
+				struct {
+					unsigned long es;
+					unsigned long ds;
+					unsigned long fs;
+					unsigned long gs;
+				} vm86;
+			} regs;
+
+			/* for current/current_thread_info to work... */
+			*THREAD_INFO_FROM(self->sp) = *THREAD_INFO_FROM(t->sp0 - 1);
 
 			printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
 			       t->ip, t->sp);
 
 			printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
 				t->ax, t->bx, t->cx, t->dx);
-			printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
-				t->si, t->di);
+			printk(KERN_EMERG "esi = %08lx, edi = %08lx, ebp = %08lx\n",
+				t->si, t->di, t->bp);
+
+			regs.common.bx = t->bx;
+			regs.common.cx = t->cx;
+			regs.common.dx = t->dx;
+			regs.common.si = t->si;
+			regs.common.di = t->di;
+			regs.common.bp = t->bp;
+			regs.common.ax = t->ax;
+			regs.common.ds = t->ds;
+			regs.common.es = t->es;
+			regs.common.fs = t->fs;
+			regs.common.orig_ax = -1;
+			regs.common.ip = t->ip;
+			regs.common.cs = t->cs;
+			regs.common.flags = t->flags;
+			regs.common.sp = t->sp;
+			regs.common.ss = t->ss;
+			if (t->flags & X86_EFLAGS_VM) {
+				regs.common.ds = 0;
+				regs.common.es = 0;
+				regs.common.fs = 0;
+				regs.vm86.es = t->es;
+				regs.vm86.ds = t->ds;
+				regs.vm86.fs = t->fs;
+				regs.vm86.gs = t->gs;
+			}
+			notify_die(DIE_DOUBLE_FAULT, "double fault", &regs.common, 0, 8, SIGKILL);
 		}
 	}
 
 	for (;;)
 		cpu_relax();
 }
-
-struct tss_struct doublefault_tss __cacheline_aligned = {
-	.x86_tss = {
-		.sp0		= STACK_START,
-		.ss0		= __KERNEL_DS,
-		.ldt		= 0,
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
-
-		.ip		= (unsigned long) doublefault_fn,
-		/* 0x2 bit is always set */
-		.flags		= X86_EFLAGS_SF | 0x2,
-		.sp		= STACK_START,
-		.es		= __USER_DS,
-		.cs		= __KERNEL_CS,
-		.ss		= __KERNEL_DS,
-		.ds		= __USER_DS,
-		.fs		= __KERNEL_PERCPU,
-
-		.__cr3		= __pa(swapper_pg_dir)
-	}
-};
--- linux-2.6.26/arch/x86/kernel/smpboot.c	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/smpboot.c	2008-06-25 14:46:27.000000000 +0200
@@ -832,6 +832,45 @@ static void __cpuinit do_fork_idle(struc
 	complete(&c_idle->done);
 }
 
+#ifdef CONFIG_X86_32
+static int __cpuinit map_exception_stack(pte_t *pte, struct page *pmd_page,
+					 unsigned long addr, void *data)
+{
+	struct page **pages = data;
+
+	*pte = mk_pte(pages[(addr >> PAGE_SHIFT)
+			    & ((1 << EXCEPTION_STACK_ORDER) - 1)],
+		      PAGE_KERNEL);
+	return 0;
+}
+
+static void *__cpuinit alloc_exception_stack(void)
+{
+	struct vm_struct *area;
+	void *stack;
+	unsigned int i;
+	struct page *pages[1 << EXCEPTION_STACK_ORDER];
+
+	BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+	/* Try not wasting virtual space. */
+	for (i = EXCEPTION_STACK_SIZE; i < 2 * THREAD_SIZE; i += PAGE_SIZE) {
+		area = get_vm_area(i, 0);
+		BUG_ON(!area);
+		stack = PTR_ALIGN(area->addr, THREAD_SIZE);
+		if (stack + EXCEPTION_STACK_SIZE <= area->addr + i)
+			break;
+		free_vm_area(area);
+	}
+	for (i = 0; !(i >> EXCEPTION_STACK_ORDER); ++i) {
+		pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+		BUG_ON(!pages[i]);
+	}
+	apply_to_page_range(&init_mm, (unsigned long)stack,
+			    EXCEPTION_STACK_SIZE, map_exception_stack, pages);
+	return stack;
+}
+#endif
+
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -906,6 +945,11 @@ do_rest:
 	per_cpu(current_task, cpu) = c_idle.idle;
 	init_gdt(cpu);
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+#define i start_ip
+	for (i = 0; i < N_EXCEPTION_TSS; ++i)
+		setup_exception_tss(cpu, i, alloc_exception_stack);
+	vmalloc_sync_all();
+#undef i
 	c_idle.idle->thread.ip = (unsigned long) start_secondary;
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	stack_start.sp = (void *) c_idle.idle->thread.sp;
--- linux-2.6.26/arch/x86/kernel/traps_32.c	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/traps_32.c	2008-06-25 14:49:20.000000000 +0200
@@ -67,6 +67,29 @@ EXPORT_SYMBOL_GPL(used_vectors);
 
 asmlinkage int system_call(void);
 
+#if N_EXCEPTION_TSS
+void doublefault_fn(void);
+
+static DEFINE_PER_CPU(struct x86_hw_tss[N_EXCEPTION_TSS], exception_tss) =
+{
+	[0 ... N_EXCEPTION_TSS-1] =
+	{
+		.cs       = __KERNEL_CS,
+		.ss       = __KERNEL_DS,
+		.ss0      = __KERNEL_DS,
+		.__cr3    = __pa(swapper_pg_dir),
+		.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+		.ds       = __USER_DS,
+		.es       = __USER_DS,
+		.fs       = __KERNEL_PERCPU,
+		.flags	  = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */
+	},
+#ifdef CONFIG_DOUBLEFAULT
+	[DOUBLEFAULT_TSS].ip = (unsigned long)doublefault_fn
+#endif
+};
+#endif
+
 /* Do we ignore FPU interrupts ? */
 char ignore_fpu_irq;
 
@@ -1184,6 +1207,30 @@ asmlinkage void math_emulate(long arg)
 
 #endif /* CONFIG_MATH_EMULATION */
 
+#if N_EXCEPTION_TSS
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+				   void *(*alloc_stack)(void))
+{
+	struct x86_hw_tss *tss = per_cpu(exception_tss, cpu) + idx;
+
+	/* Set up exception handling TSS. */
+	tss->bx = (unsigned long)tss;
+	tss->sp2 = cpu;
+
+	/* Set up exception handling stack. */
+	if (!tss->sp) {
+		char *stack;
+
+		stack = alloc_stack() + EXCEPTION_STACK_SIZE;
+		tss->sp = (unsigned long)stack;
+		tss->sp0 = (unsigned long)stack;
+	}
+
+	/* Set up exception handling TSS pointer in the GDT. */
+	__set_tss_desc(cpu, GDT_ENTRY_EXCEPTION_TSS + idx, tss);
+}
+#endif
+
 void __init trap_init(void)
 {
 	int i;
@@ -1207,7 +1254,9 @@ void __init trap_init(void)
 	set_trap_gate(5,  &bounds);
 	set_trap_gate(6,  &invalid_op);
 	set_trap_gate(7,  &device_not_available);
-	set_task_gate(8,  GDT_ENTRY_DOUBLEFAULT_TSS);
+#ifdef DOUBLEFAULT_TSS
+	set_task_gate(8,  GDT_ENTRY_EXCEPTION_TSS + DOUBLEFAULT_TSS);
+#endif
 	set_trap_gate(9,  &coprocessor_segment_overrun);
 	set_trap_gate(10, &invalid_TSS);
 	set_trap_gate(11, &segment_not_present);
--- linux-2.6.26/drivers/lguest/segments.c	2008-04-17 04:49:44.000000000 +0200
+++ 2.6.26-i386-double-fault/drivers/lguest/segments.c	2008-06-25 14:43:16.000000000 +0200
@@ -50,7 +50,8 @@ static int ignored_gdt(unsigned int num)
 	return (num == GDT_ENTRY_TSS
 		|| num == GDT_ENTRY_LGUEST_CS
 		|| num == GDT_ENTRY_LGUEST_DS
-		|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
+		|| (num >= GDT_ENTRY_EXCEPTION_TSS
+		    && num < GDT_ENTRY_EXCEPTION_TSS + N_EXCEPTION_TSS));
 }
 
 /*H:630 Once the Guest gave us new GDT entries, we fix them up a little.  We
--- linux-2.6.26/include/asm-x86/kdebug.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/kdebug.h	2008-06-25 14:50:04.000000000 +0200
@@ -20,6 +20,7 @@ enum die_val {
 	DIE_CALL,
 	DIE_NMI_IPI,
 	DIE_PAGE_FAULT,
+	DIE_DOUBLE_FAULT,
 	DIE_NMIUNKNOWN,
 };
 
--- linux-2.6.26/include/asm-x86/processor.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/processor.h	2008-06-25 14:52:11.000000000 +0200
@@ -128,7 +128,6 @@ struct cpuinfo_x86 {
 extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
-extern struct tss_struct	doublefault_tss;
 extern __u32			cleared_cpu_caps[NCAPINTS];
 
 #ifdef CONFIG_SMP
@@ -841,6 +840,12 @@ static inline void spin_lock_prefetch(co
 	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },	  \
 }
 
+#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_SIZE (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+				   void *(*alloc_stack)(void));
+
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 #define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
--- linux-2.6.26/include/asm-x86/segment.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/segment.h	2008-06-25 14:43:16.000000000 +0200
@@ -55,7 +55,7 @@
  *  28 - unused
  *  29 - unused
  *  30 - unused
- *  31 - TSS for double fault handler
+ *  31+  TSSes for exception handlers
  */
 #define GDT_ENTRY_TLS_MIN	6
 #define GDT_ENTRY_TLS_MAX 	(GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
@@ -90,12 +90,19 @@
 #define __KERNEL_PERCPU 0
 #endif
 
-#define GDT_ENTRY_DOUBLEFAULT_TSS	31
+#define GDT_ENTRY_EXCEPTION_TSS	31
+#ifdef CONFIG_DOUBLEFAULT
+#define DOUBLEFAULT_TSS 0
+#define N_EXCEPTION_TSS 1
+#else
+#undef GDT_ENTRY_EXCEPTION_TSS
+#define N_EXCEPTION_TSS 0
+#endif
 
 /*
- * The GDT has 32 entries
+ * The GDT has 31+ entries
  */
-#define GDT_ENTRIES 32
+#define GDT_ENTRIES (31 + N_EXCEPTION_TSS)
 
 /* The PnP BIOS entries in the GDT */
 #define GDT_ENTRY_PNPBIOS_CS32		(GDT_ENTRY_PNPBIOS_BASE + 0)
--- linux-2.6.26/include/asm-x86/thread_info_32.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/thread_info_32.h	2008-06-25 14:43:16.000000000 +0200
@@ -53,9 +53,14 @@ struct thread_info {
 
 #define PREEMPT_ACTIVE		0x10000000
 #ifdef CONFIG_4KSTACKS
-#define THREAD_SIZE            (4096)
+#define THREAD_ORDER 0
 #else
-#define THREAD_SIZE		(8192)
+#define THREAD_ORDER 1
+#endif
+#ifndef __ASSEMBLY__
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#else
+#define THREAD_SIZE (PAGE_SIZE_asm << THREAD_ORDER)
 #endif
 
 #define STACK_WARN             (THREAD_SIZE/8)



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-21  8:54   ` Jan Beulich
@ 2008-07-21 11:05     ` Ingo Molnar
  2008-07-22 10:13       ` Jan Beulich
  0 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2008-07-21 11:05 UTC (permalink / raw)
  To: Jan Beulich; +Cc: H. Peter Anvin, Andi Kleen, tglx, linux-kernel


* Jan Beulich <jbeulich@novell.com> wrote:

> > This patch doesn't apply for me to the extent that I'm hesitant to 
> > fix it up manually.  Could you please refresh it against current 
> > -linus?
> 
> Make the double fault handler use CPU-specific stacks. Add some 
> abstraction to simplify future change of other exception handlers to 
> go through task gates. Add a new notification of the event through the 
> die notifier chain, also providing some environmental adjustments so 
> that various infrastructural things work independently of the fact that 
> the fault and the callbacks are running on a stack other than the normal 
> kernel stack.

This still doesn't apply to latest -git (or tip/master).

	Ingo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-21 11:05     ` Ingo Molnar
@ 2008-07-22 10:13       ` Jan Beulich
  2008-07-28 13:42         ` Ingo Molnar
  0 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2008-07-22 10:13 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, tglx, linux-kernel, H. Peter Anvin

>>> Ingo Molnar <mingo@elte.hu> 21.07.08 13:05 >>>
>This still doesn't apply to latest -git (or tip/master).

Indeed, tip/master had a __pa -> __phys_addr_const conversion, with which
I have now synced the patch (without another round of testing):

Make the double fault handler use CPU-specific stacks. Add some
abstraction to simplify future change of other exception handlers to go
through task gates.
Add a new notification of the event through the die notifier chain,
also providing some environmental adjustments so that various
infrastructural things work independently of the fact that the fault and
the callbacks are running on a stack other than the normal kernel stack.

Signed-Off-By: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>

---
 arch/x86/kernel/cpu/common.c     |   17 +++++--
 arch/x86/kernel/doublefault_32.c |   86 ++++++++++++++++++++++++---------------
 arch/x86/kernel/smpboot.c        |   44 +++++++++++++++++++
 arch/x86/kernel/traps_32.c       |   51 ++++++++++++++++++++++-
 drivers/lguest/segments.c        |    3 -
 include/asm-x86/kdebug.h         |    1 
 include/asm-x86/processor.h      |    7 ++-
 include/asm-x86/segment.h        |   15 ++++--
 include/asm-x86/thread_info_32.h |    9 +++-
 9 files changed, 187 insertions(+), 46 deletions(-)

--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -650,6 +650,13 @@ void switch_to_new_gdt(void)
 	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
 }
 
+static void *__init_refok alloc_boot_stack(void)
+{
+	BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+	return __alloc_bootmem(EXCEPTION_STACK_SIZE, THREAD_SIZE,
+			       __phys_addr_const(MAX_DMA_ADDRESS));
+}
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
@@ -690,10 +697,12 @@ void __cpuinit cpu_init(void)
 	load_TR_desc();
 	load_LDT(&init_mm.context);
 
-#ifdef CONFIG_DOUBLEFAULT
-	/* Set up doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
+	if (cpu == 0) {
+		unsigned i;
+
+		for (i = 0; i < N_EXCEPTION_TSS; ++i)
+			setup_exception_tss(cpu, i, alloc_boot_stack);
+	}
 
 	/* Clear %gs. */
 	asm volatile ("mov %0, %%gs" : : "r" (0));
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -3,69 +3,89 @@
 #include <linux/init.h>
 #include <linux/init_task.h>
 #include <linux/fs.h>
+#include <linux/kdebug.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
 
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+#define ptr_ok(x, l) ((x) >= PAGE_OFFSET && (x) + (l) < (unsigned long)high_memory)
 
-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+#define THREAD_INFO_FROM(x) ((struct thread_info *)((x) & ~(THREAD_SIZE - 1)))
 
-static void doublefault_fn(void)
+register const struct x86_hw_tss *self __asm__("ebx");
+
+void doublefault_fn(void)
 {
-	struct desc_ptr gdt_desc = {0, 0};
+	struct desc_ptr gdt_desc;
 	unsigned long gdt, tss;
 
 	store_gdt(&gdt_desc);
 	gdt = gdt_desc.address;
 
-	printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+	printk(KERN_EMERG "PANIC: double fault on CPU#%lu, gdt at %08lx [%d bytes]\n",
+	       self->sp2, gdt, gdt_desc.size + 1);
 
-	if (ptr_ok(gdt)) {
+	if (ptr_ok(gdt, gdt_desc.size)) {
 		gdt += GDT_ENTRY_TSS << 3;
 		tss = *(u16 *)(gdt+2);
 		tss += *(u8 *)(gdt+4) << 16;
 		tss += *(u8 *)(gdt+7) << 24;
 		printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
 
-		if (ptr_ok(tss)) {
-			struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+		if (ptr_ok(tss, *(u16 *)gdt)) {
+			const struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+			struct {
+				struct pt_regs common;
+				struct {
+					unsigned long es;
+					unsigned long ds;
+					unsigned long fs;
+					unsigned long gs;
+				} vm86;
+			} regs;
+
+			/* for current/current_thread_info to work... */
+			*THREAD_INFO_FROM(self->sp) = *THREAD_INFO_FROM(t->sp0 - 1);
 
 			printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
 			       t->ip, t->sp);
 
 			printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
 				t->ax, t->bx, t->cx, t->dx);
-			printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
-				t->si, t->di);
+			printk(KERN_EMERG "esi = %08lx, edi = %08lx, ebp = %08lx\n",
+				t->si, t->di, t->bp);
+
+			regs.common.bx = t->bx;
+			regs.common.cx = t->cx;
+			regs.common.dx = t->dx;
+			regs.common.si = t->si;
+			regs.common.di = t->di;
+			regs.common.bp = t->bp;
+			regs.common.ax = t->ax;
+			regs.common.ds = t->ds;
+			regs.common.es = t->es;
+			regs.common.fs = t->fs;
+			regs.common.orig_ax = -1;
+			regs.common.ip = t->ip;
+			regs.common.cs = t->cs;
+			regs.common.flags = t->flags;
+			regs.common.sp = t->sp;
+			regs.common.ss = t->ss;
+			if (t->flags & X86_EFLAGS_VM) {
+				regs.common.ds = 0;
+				regs.common.es = 0;
+				regs.common.fs = 0;
+				regs.vm86.es = t->es;
+				regs.vm86.ds = t->ds;
+				regs.vm86.fs = t->fs;
+				regs.vm86.gs = t->gs;
+			}
+			notify_die(DIE_DOUBLE_FAULT, "double fault", &regs.common, 0, 8, SIGKILL);
 		}
 	}
 
 	for (;;)
 		cpu_relax();
 }
-
-struct tss_struct doublefault_tss __cacheline_aligned = {
-	.x86_tss = {
-		.sp0		= STACK_START,
-		.ss0		= __KERNEL_DS,
-		.ldt		= 0,
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
-
-		.ip		= (unsigned long) doublefault_fn,
-		/* 0x2 bit is always set */
-		.flags		= X86_EFLAGS_SF | 0x2,
-		.sp		= STACK_START,
-		.es		= __USER_DS,
-		.cs		= __KERNEL_CS,
-		.ss		= __KERNEL_DS,
-		.ds		= __USER_DS,
-		.fs		= __KERNEL_PERCPU,
-
-		.__cr3		= __phys_addr_const((unsigned long)swapper_pg_dir)
-	}
-};
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -762,6 +762,45 @@ static void __cpuinit do_fork_idle(struc
 	complete(&c_idle->done);
 }
 
+#ifdef CONFIG_X86_32
+static int __cpuinit map_exception_stack(pte_t *pte, struct page *pmd_page,
+					 unsigned long addr, void *data)
+{
+	struct page **pages = data;
+
+	*pte = mk_pte(pages[(addr >> PAGE_SHIFT)
+			    & ((1 << EXCEPTION_STACK_ORDER) - 1)],
+		      PAGE_KERNEL);
+	return 0;
+}
+
+static void *__cpuinit alloc_exception_stack(void)
+{
+	struct vm_struct *area;
+	void *stack;
+	unsigned int i;
+	struct page *pages[1 << EXCEPTION_STACK_ORDER];
+
+	BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+	/* Try not wasting virtual space. */
+	for (i = EXCEPTION_STACK_SIZE; i < 2 * THREAD_SIZE; i += PAGE_SIZE) {
+		area = get_vm_area(i, 0);
+		BUG_ON(!area);
+		stack = PTR_ALIGN(area->addr, THREAD_SIZE);
+		if (stack + EXCEPTION_STACK_SIZE <= area->addr + i)
+			break;
+		free_vm_area(area);
+	}
+	for (i = 0; !(i >> EXCEPTION_STACK_ORDER); ++i) {
+		pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+		BUG_ON(!pages[i]);
+	}
+	apply_to_page_range(&init_mm, (unsigned long)stack,
+			    EXCEPTION_STACK_SIZE, map_exception_stack, pages);
+	return stack;
+}
+#endif
+
 #ifdef CONFIG_X86_64
 /*
  * Allocate node local memory for the AP pda.
@@ -862,6 +901,11 @@ do_rest:
 	init_gdt(cpu);
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
+#define i start_ip
+	for (i = 0; i < N_EXCEPTION_TSS; ++i)
+		setup_exception_tss(cpu, i, alloc_exception_stack);
+	vmalloc_sync_all();
+#undef i
 #else
 	cpu_pda(cpu)->pcurrent = c_idle.idle;
 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -66,6 +66,29 @@ EXPORT_SYMBOL_GPL(used_vectors);
 
 asmlinkage int system_call(void);
 
+#if N_EXCEPTION_TSS
+void doublefault_fn(void);
+
+static DEFINE_PER_CPU(struct x86_hw_tss[N_EXCEPTION_TSS], exception_tss) =
+{
+	[0 ... N_EXCEPTION_TSS-1] =
+	{
+		.cs       = __KERNEL_CS,
+		.ss       = __KERNEL_DS,
+		.ss0      = __KERNEL_DS,
+		.__cr3    = __phys_addr_const((unsigned long)swapper_pg_dir),
+		.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+		.ds       = __USER_DS,
+		.es       = __USER_DS,
+		.fs       = __KERNEL_PERCPU,
+		.flags	  = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */
+	},
+#ifdef CONFIG_DOUBLEFAULT
+	[DOUBLEFAULT_TSS].ip = (unsigned long)doublefault_fn
+#endif
+};
+#endif
+
 /* Do we ignore FPU interrupts ? */
 char ignore_fpu_irq;
 
@@ -1185,6 +1208,30 @@ asmlinkage void math_emulate(long arg)
 
 #endif /* CONFIG_MATH_EMULATION */
 
+#if N_EXCEPTION_TSS
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+				   void *(*alloc_stack)(void))
+{
+	struct x86_hw_tss *tss = per_cpu(exception_tss, cpu) + idx;
+
+	/* Set up exception handling TSS. */
+	tss->bx = (unsigned long)tss;
+	tss->sp2 = cpu;
+
+	/* Set up exception handling stack. */
+	if (!tss->sp) {
+		char *stack;
+
+		stack = alloc_stack() + EXCEPTION_STACK_SIZE;
+		tss->sp = (unsigned long)stack;
+		tss->sp0 = (unsigned long)stack;
+	}
+
+	/* Set up exception handling TSS pointer in the GDT. */
+	__set_tss_desc(cpu, GDT_ENTRY_EXCEPTION_TSS + idx, tss);
+}
+#endif
+
 void __init trap_init(void)
 {
 	int i;
@@ -1205,7 +1252,9 @@ void __init trap_init(void)
 	set_trap_gate(5, &bounds);
 	set_trap_gate(6, &invalid_op);
 	set_trap_gate(7, &device_not_available);
-	set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+#ifdef DOUBLEFAULT_TSS
+	set_task_gate(8, GDT_ENTRY_EXCEPTION_TSS + DOUBLEFAULT_TSS);
+#endif
 	set_trap_gate(9, &coprocessor_segment_overrun);
 	set_trap_gate(10, &invalid_TSS);
 	set_trap_gate(11, &segment_not_present);
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -50,7 +50,8 @@ static int ignored_gdt(unsigned int num)
 	return (num == GDT_ENTRY_TSS
 		|| num == GDT_ENTRY_LGUEST_CS
 		|| num == GDT_ENTRY_LGUEST_DS
-		|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
+		|| (num >= GDT_ENTRY_EXCEPTION_TSS
+		    && num < GDT_ENTRY_EXCEPTION_TSS + N_EXCEPTION_TSS));
 }
 
 /*H:630 Once the Guest gave us new GDT entries, we fix them up a little.  We
--- a/include/asm-x86/kdebug.h
+++ b/include/asm-x86/kdebug.h
@@ -20,6 +20,7 @@ enum die_val {
 	DIE_CALL,
 	DIE_NMI_IPI,
 	DIE_PAGE_FAULT,
+	DIE_DOUBLE_FAULT,
 	DIE_NMIUNKNOWN,
 };
 
--- linux-2.6.26/include/asm-x86/processor.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/processor.h	2008-06-25 14:52:11.000000000 +0200
@@ -128,7 +128,6 @@ struct cpuinfo_x86 {
 extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
-extern struct tss_struct	doublefault_tss;
 extern __u32			cleared_cpu_caps[NCAPINTS];
 
 #ifdef CONFIG_SMP
@@ -838,6 +837,12 @@ static inline void spin_lock_prefetch(co
 	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },	  \
 }
 
+#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_SIZE (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+				   void *(*alloc_stack)(void));
+
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 #define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
--- linux-2.6.26/include/asm-x86/segment.h	2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/segment.h	2008-06-25 14:43:16.000000000 +0200
@@ -55,7 +55,7 @@
  *  28 - unused
  *  29 - unused
  *  30 - unused
- *  31 - TSS for double fault handler
+ *  31+  TSSes for exception handlers
  */
 #define GDT_ENTRY_TLS_MIN	6
 #define GDT_ENTRY_TLS_MAX 	(GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
@@ -86,12 +86,19 @@
 #define __KERNEL_PERCPU 0
 #endif
 
-#define GDT_ENTRY_DOUBLEFAULT_TSS	31
+#define GDT_ENTRY_EXCEPTION_TSS	31
+#ifdef CONFIG_DOUBLEFAULT
+#define DOUBLEFAULT_TSS 0
+#define N_EXCEPTION_TSS 1
+#else
+#undef GDT_ENTRY_EXCEPTION_TSS
+#define N_EXCEPTION_TSS 0
+#endif
 
 /*
- * The GDT has 32 entries
+ * The GDT has 31+ entries
  */
-#define GDT_ENTRIES 32
+#define GDT_ENTRIES (31 + N_EXCEPTION_TSS)
 
 /* The PnP BIOS entries in the GDT */
 #define GDT_ENTRY_PNPBIOS_CS32		(GDT_ENTRY_PNPBIOS_BASE + 0)




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-18 12:30 [PATCH] i386: improve double fault handling Jan Beulich
  2008-07-18 23:24 ` H. Peter Anvin
@ 2008-07-23 21:43 ` Joerg Roedel
  2008-07-24  7:08   ` Jan Beulich
  1 sibling, 1 reply; 15+ messages in thread
From: Joerg Roedel @ 2008-07-23 21:43 UTC (permalink / raw)
  To: Jan Beulich; +Cc: mingo, tglx, hpa, Andi Kleen, linux-kernel

On Fri, Jul 18, 2008 at 01:30:42PM +0100, Jan Beulich wrote:
> Make the double fault handler use CPU-specific stacks. Add some
> abstraction to simplify future change of other exception handlers to go
> through task gates.

What is the benefit of exception handlers going through task gates?
Hardware task switches are not very well supported in virtualization
(e.g. it has issues in KVM and has also not been in Xen for a long time).

Joerg


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-23 21:43 ` Joerg Roedel
@ 2008-07-24  7:08   ` Jan Beulich
  2008-07-24 13:24     ` H. Peter Anvin
  0 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2008-07-24  7:08 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: mingo, Andi Kleen, tglx, linux-kernel, hpa

>>> Joerg Roedel <joro@8bytes.org> 23.07.08 23:43 >>>
>On Fri, Jul 18, 2008 at 01:30:42PM +0100, Jan Beulich wrote:
>> Make the double fault handler use CPU-specific stacks. Add some
>> abstraction to simplify future change of other exception handlers to go
>> through task gates.
>
>What is the benefit of exception handlers going through task gates?
>Hardware task switches are not very well supported in virtualization
>(e.g. it has issues in KVM and has also not been in Xen for a long time).

The main goal is to get to a different stack. While at present this is done
only for the double fault, I think generally NMI and MCE should also do
so, as they may be caused by a stack access (see x86-64, which runs
them on IST stacks), and hence continuing to run on that same stack
may not allow the exception to be handled.

Jan


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-24  7:08   ` Jan Beulich
@ 2008-07-24 13:24     ` H. Peter Anvin
  0 siblings, 0 replies; 15+ messages in thread
From: H. Peter Anvin @ 2008-07-24 13:24 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Joerg Roedel, mingo, Andi Kleen, tglx, linux-kernel

Jan Beulich wrote:
>>>> Joerg Roedel <joro@8bytes.org> 23.07.08 23:43 >>>
>> On Fri, Jul 18, 2008 at 01:30:42PM +0100, Jan Beulich wrote:
>>> Make the double fault handler use CPU-specific stacks. Add some
>>> abstraction to simplify future change of other exception handlers to go
>>> through task gates.
>> What is the benefit of exception handlers going through task gates?
>> Hardware task switches are not very well supported in virtualization
>> (e.g. it has issues in KVM and has also not been in Xen for a long time).
> 
> The main goal is to get to a different stack. While at present this is done
> only for the double fault, I think generally NMI and MCE should also do
> so, as they may be caused by a stack access (see x86-64, which runs
> them on IST stacks), and hence continuing to run on that same stack
> may not allow the exception to be handled.

NMI, MCE and #DF are the obvious candidates.

Now, keep in mind TSSes have to be prepared per-CPU, since they get 
marked "busy" when in use, so it's a bit of a nontrivial undertaking.

	-hpa


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-22 10:13       ` Jan Beulich
@ 2008-07-28 13:42         ` Ingo Molnar
  2008-07-28 13:45           ` H. Peter Anvin
                             ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Ingo Molnar @ 2008-07-28 13:42 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andi Kleen, tglx, linux-kernel, H. Peter Anvin, Linus Torvalds,
	Joerg Roedel


* Jan Beulich <jbeulich@novell.com> wrote:

> >>> Ingo Molnar <mingo@elte.hu> 21.07.08 13:05 >>>
> >this still doesn't apply to latest -git. (or tip/master)
> 
> Indeed, tip/master had a __pa -> __phys_addr_const conversion that I 
> now sync-ed the patch with (without another round of testing):
> 
> Make the double fault handler use CPU-specific stacks. Add some 
> abstraction to simplify future change of other exception handlers to 
> go through task gates. Add a new notification of the event through the 
> die notifier chain, also providing some environmental adjustments so 
> that various infrastructural things work independent of the fact that 
> the fault and the callbacks are running on other than the normal 
> kernel stack.
> 
> Signed-Off-By: Jan Beulich <jbeulich@novell.com>
> Cc: Andi Kleen <andi@firstfloor.org>
> 
> ---
>  arch/x86/kernel/cpu/common.c     |   17 +++++--
>  arch/x86/kernel/doublefault_32.c |   86 ++++++++++++++++++++++++---------------
>  arch/x86/kernel/smpboot.c        |   44 +++++++++++++++++++
>  arch/x86/kernel/traps_32.c       |   51 ++++++++++++++++++++++-
>  drivers/lguest/segments.c        |    3 -
>  include/asm-x86/kdebug.h         |    1 
>  include/asm-x86/processor.h      |    7 ++-
>  include/asm-x86/segment.h        |   15 ++++--
>  include/asm-x86/thread_info_32.h |    9 +++-
>  9 files changed, 187 insertions(+), 46 deletions(-)

I don't know.

All CPUs hitting a double fault simultaneously and corrupting each 
others' kernel stack is a theoretical possibility - but is handling it 
worth the complexity? It appears to me that a lock plus a short stub 
function that takes the lock (with no stack usage) would handle that 
much better.

Also, you seem to be setting things up to turn NMIs and MCEs into task 
gates too, right?

So i'm really uneasy about all this. Breakage in such rarely used code 
gets found very late, and has thus a high risk of losing debug 
information when we need it the most. (i.e. it works in the exact 
_opposite_ way of the intended goal of making things more robust - it 
makes things less robust)

Firstly, 64-bit does not use a task gate for double faults anymore. (but 
uses a separate IST stack for double faults)

Secondly, task gates are really a relic that should not be proliferated. 
Besides the complications in virtualized environments (if more common 
things like Big Real Mode are not supported well in virtual mode what do 
we expect of more esoteric features as task gates?) it does not get 
nearly as much testing on real silicon as other, more mainstream CPU 
features.

Thirdly, NMI based profiling is quite common, so by turning NMIs into 
task gates we'd slow that down quite a lot.

Also, the change to doublefault_fn is quite ugly - that inner block 
should be split out into a separate function.

Plus the notifier - why do we care about that? It's not like we can 
sanely kexec into a safe kernel from double faulting kernels in most 
cases. In real cases where i've seen double faults it was due to us 
corrupting kernel pagetables - kexec has no chance there. To recover 
from that we'd have to set up the TSS with a safe(r) cr3 as well - but 
your patch leaves _that_ untouched. (nor do we want to waste extra 
unswappable memory on such remote possibilities i think)

        Ingo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-28 13:42         ` Ingo Molnar
@ 2008-07-28 13:45           ` H. Peter Anvin
  2008-07-28 13:59           ` Jan Beulich
  2008-07-28 22:00           ` Chuck Ebbert
  2 siblings, 0 replies; 15+ messages in thread
From: H. Peter Anvin @ 2008-07-28 13:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jan Beulich, Andi Kleen, tglx, linux-kernel, Linus Torvalds,
	Joerg Roedel

Ingo Molnar wrote:
> 
> Secondly, task gates are really a relic that should not be proliferated. 
> Besides the complications in virtualized environments (if more common 
> things like Big Real Mode are not supported well in virtual mode what do 
> we expect of more esoteric features as task gates?) it does not get 
> nearly as much testing on real silicon as other, more mainstream CPU 
> features.
> 

I think that using it as a bailout mechanism is going to remain 
supported and tested.  It's just never going to be fast.

	-hpa


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-28 13:42         ` Ingo Molnar
  2008-07-28 13:45           ` H. Peter Anvin
@ 2008-07-28 13:59           ` Jan Beulich
  2008-07-28 14:02             ` H. Peter Anvin
  2008-07-28 22:00           ` Chuck Ebbert
  2 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2008-07-28 13:59 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Joerg Roedel, Andi Kleen, tglx, Linus Torvalds, linux-kernel,
	H. Peter Anvin

>Also, you seem to be setting things up to turn NMIs and MCEs into task 
>gates too, right?

Yes, at the very minimum I'd like to have the possibility to do so. Perhaps
under a default-off config option.

>So i'm really uneasy about all this. Breakage in such rarely used code 
>gets found very late, and has thus a high risk of losing debug 
>information when we need it the most. (i.e. it works in the exact 
>_opposite_ way of the intended goal of making things more robust - it 
>makes things less robust)

I realize this aspect, but think that either way has its advantages and
disadvantages.

>Firstly, 64-bit does not use a task gate for double faults anymore. (but 
>uses a separate IST stack for double faults)

Sure - because there are no task gates on 64-bit.

>Secondly, task gates are really a relic that should not be proliferated. 
>Besides the complications in virtualized environments (if more common 
>things like Big Real Mode are not supported well in virtual mode what do 
>we expect of more esoteric features as task gates?) it does not get 
>nearly as much testing on real silicon as other, more mainstream CPU 
>features.
>
>Thirdly, NMI based profiling is quite common, so by turning NMIs into 
>task gates we'd slow that down quite a lot.

As said above, I'd like to allow the option of doing so. Profiling via
NMI certainly will not want this. I'm really uncertain whether modern
machines can report any hardware issue through NMI (no chipset spec
I read 'recently' [covering quite a number of years] was really explicit
about this) - if it can't, MCE would be the only candidate unless
running on really old hardware.

>Also, the change to doublefault_fn is quite ugly - that inner block 
>should be split out into a separate function.

That's certainly doable - if the whole thing is acceptable apart from
that issue, which it doesn't seem it is...

>Plus the notifier - why do we care about that? It's not like we can 

In order to let a kernel debugger take control.

>sanely kexec into a safe kernel from double faulting kernels in most 
>cases. In real cases where i've seen double faults it was due to us 
>corrupting kernel pagetables - kexec has no chance there. To recover 
>from that we'd have to set up the TSS with a safe(r) cr3 as well - but 
>your patch leaves _that_ untouched. (nor do we want to waste extra 
>unswappable memory on such remote possibilities i think)

I've seen double faults due to causes other than page table corruption,
but I do understand that if it is the page tables that caused it, handling
the condition is almost impossible without a second complete set of
(kernel) page tables.

Jan


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-28 13:59           ` Jan Beulich
@ 2008-07-28 14:02             ` H. Peter Anvin
  2008-07-28 16:28               ` Ingo Molnar
  0 siblings, 1 reply; 15+ messages in thread
From: H. Peter Anvin @ 2008-07-28 14:02 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Ingo Molnar, Joerg Roedel, Andi Kleen, tglx, Linus Torvalds,
	linux-kernel

Jan Beulich wrote:
> 
>> Firstly, 64-bit does not use a task gate for double faults anymore. (but 
>> uses a separate IST stack for double faults)
> 
> Sure - because there are no task gates on 64-bit.
> 

What we're doing here is really using task gates to emulate IST anyway.

	-hpa

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-28 14:02             ` H. Peter Anvin
@ 2008-07-28 16:28               ` Ingo Molnar
  0 siblings, 0 replies; 15+ messages in thread
From: Ingo Molnar @ 2008-07-28 16:28 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jan Beulich, Joerg Roedel, Andi Kleen, tglx, Linus Torvalds,
	linux-kernel


* H. Peter Anvin <hpa@zytor.com> wrote:

> Jan Beulich wrote:
>>
>>> Firstly, 64-bit does not use a task gate for double faults anymore. 
>>> (but uses a separate IST stack for double faults)
>>
>> Sure - because there are no task gates on 64-bit.
>>
>
> What we're doing here is really using task gates to emulate IST 
> anyway.

yes, because we don't use the main feature that differentiates task gates 
from ISTs: a different cr3 entry. (the rest of the differences are really 
just fluff)

	Ingo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-28 13:42         ` Ingo Molnar
  2008-07-28 13:45           ` H. Peter Anvin
  2008-07-28 13:59           ` Jan Beulich
@ 2008-07-28 22:00           ` Chuck Ebbert
  2008-07-31 10:46             ` Ingo Molnar
  2 siblings, 1 reply; 15+ messages in thread
From: Chuck Ebbert @ 2008-07-28 22:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jan Beulich, Andi Kleen, tglx, linux-kernel, H. Peter Anvin,
	Linus Torvalds, Joerg Roedel

Ingo Molnar wrote:
> 
> All CPUs hitting a double fault simultaneously and corrupting each 
> others' kernel stack is a theoretical possibility - but is handling it 
> worth the complexity? It appears to me that a lock plus a short stub 
> function that takes the lock (with no stack usage) would handle that 
> much better.

That can't happen now because the TSS gets marked busy so we will get a
triple fault instead. One thing we might want to do in the current code
is unset the busy flag after handling the fault and before we start looping
at the end of the handler so we can handle another fault later.

> 
> So i'm really uneasy about all this. Breakage in such rarely used code 
> gets found very late, and has thus a high risk of losing debug 
> information when we need it the most. (i.e. it works in the exact 
> _opposite_ way of the intended goal of making things more robust - it 
> makes things less robust)
> 

Also how much bloat does this cause, having a per-CPU TSS and stack for every
fault handler that uses this method?


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] i386: improve double fault handling
  2008-07-28 22:00           ` Chuck Ebbert
@ 2008-07-31 10:46             ` Ingo Molnar
  0 siblings, 0 replies; 15+ messages in thread
From: Ingo Molnar @ 2008-07-31 10:46 UTC (permalink / raw)
  To: Chuck Ebbert
  Cc: Jan Beulich, Andi Kleen, tglx, linux-kernel, H. Peter Anvin,
	Linus Torvalds, Joerg Roedel


* Chuck Ebbert <cebbert@redhat.com> wrote:

> Ingo Molnar wrote:
>>
>> All CPUs hitting a double fault simultaneously and corrupting each  
>> others' kernel stack is a theoretical possibility - but is handling it  
>> worth the complexity? It appears to me that a lock plus a short stub  
>> function that takes the lock (with no stack usage) would handle that  
>> much better.
>
> That can't happen now because the TSS gets marked busy so we will get 
> a triple fault instead. One thing we might want to do in the current 
> code is unset the busy flag after handling the fault and before we 
> start looping at the end of the handler so we can handle another fault 
> later.

that would be a nice improvement.

	Ingo

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2008-07-31 10:47 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-18 12:30 [PATCH] i386: improve double fault handling Jan Beulich
2008-07-18 23:24 ` H. Peter Anvin
2008-07-21  8:54   ` Jan Beulich
2008-07-21 11:05     ` Ingo Molnar
2008-07-22 10:13       ` Jan Beulich
2008-07-28 13:42         ` Ingo Molnar
2008-07-28 13:45           ` H. Peter Anvin
2008-07-28 13:59           ` Jan Beulich
2008-07-28 14:02             ` H. Peter Anvin
2008-07-28 16:28               ` Ingo Molnar
2008-07-28 22:00           ` Chuck Ebbert
2008-07-31 10:46             ` Ingo Molnar
2008-07-23 21:43 ` Joerg Roedel
2008-07-24  7:08   ` Jan Beulich
2008-07-24 13:24     ` H. Peter Anvin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).