linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch] x86: fix ESP corruption CPU bug
@ 2005-01-03 23:39 Stas Sergeev
  2005-01-04  0:01 ` Linus Torvalds
  0 siblings, 1 reply; 25+ messages in thread
From: Stas Sergeev @ 2005-01-03 23:39 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linus Torvalds, Linux kernel, Petr Vandrovec

[-- Attachment #1: Type: text/plain, Size: 744 bytes --]

Hi Andrew.

Attached patch works around the corruption
of the high word of the ESP register, which
is the official bug of x86 CPUs. The bug
triggers only when one is using a
16bit stack segment.
Patch helps running many apps under dosemu,
and, according to the comments found in
Wine sources, also helps Wine.

The patch defines per-CPU 16bit stacks,
and every time a process that uses a 16bit
stack returns to userspace, we switch
to our 16bit stack and preload the high word
of ESP.
This also closes the "informational leak",
which is that the user process is not
supposed to know the kernel's ESP value.

Can this please be applied?

Acked-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Stas Sergeev <stsp@aknet.ru>


[-- Attachment #2: linux-2.6.10-mm1-stk4.diff --]
[-- Type: text/x-patch, Size: 11529 bytes --]

diff -urN linux-2.6.10-mm1/arch/i386/kernel/cpu/common.c linux-2.6.10-mm1-stk/arch/i386/kernel/cpu/common.c
--- linux-2.6.10-mm1/arch/i386/kernel/cpu/common.c	2005-01-03 22:56:03.000000000 +0300
+++ linux-2.6.10-mm1-stk/arch/i386/kernel/cpu/common.c	2005-01-03 23:13:21.000000000 +0300
@@ -16,6 +16,10 @@
 DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_table);
 
+unsigned char cpu_16bit_stack[CPU_16BIT_STACK_SIZE];
+DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
+EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
+
 static int cachesize_override __initdata = -1;
 static int disable_x86_fxsr __initdata = 0;
 static int disable_x86_serial_nr __initdata = 1;
@@ -505,6 +509,7 @@
 	int cpu = smp_processor_id();
 	struct tss_struct * t = &per_cpu(init_tss, cpu);
 	struct thread_struct *thread = &current->thread;
+	__u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
 
 	if (test_and_set_bit(cpu, &cpu_initialized)) {
 		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -527,6 +532,13 @@
 	 */
 	memcpy(&per_cpu(cpu_gdt_table, cpu), cpu_gdt_table,
 	       GDT_SIZE);
+
+	/* Set up GDT entry for 16bit stack */
+	*(__u64 *)&(per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_ESPFIX_SS]) |=
+		((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
+		((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
+		(CPU_16BIT_STACK_SIZE - 1);
+
 	cpu_gdt_descr[cpu].size = GDT_SIZE - 1;
 	cpu_gdt_descr[cpu].address =
 	    (unsigned long)&per_cpu(cpu_gdt_table, cpu);
diff -urN linux-2.6.10-mm1/arch/i386/kernel/entry.S linux-2.6.10-mm1-stk/arch/i386/kernel/entry.S
--- linux-2.6.10-mm1/arch/i386/kernel/entry.S	2005-01-03 22:56:03.000000000 +0300
+++ linux-2.6.10-mm1-stk/arch/i386/kernel/entry.S	2005-01-04 00:47:03.000000000 +0300
@@ -47,6 +47,7 @@
 #include <asm/segment.h>
 #include <asm/smp.h>
 #include <asm/page.h>
+#include <asm/desc.h>
 #include "irq_vectors.h"
         /* We do not recover from a stack overflow, but at least
          * we know it happened and should be able to track it down.
@@ -90,7 +91,7 @@
 #define preempt_stop		cli
 #else
 #define preempt_stop
-#define resume_kernel		restore_all
+#define resume_kernel		resume_kernelX
 #endif
 
 #define SAVE_ALL \
@@ -135,24 +136,6 @@
 .previous
 
 
-#define RESTORE_ALL	\
-	RESTORE_REGS	\
-	addl $4, %esp;	\
-1:	iret;		\
-.section .fixup,"ax";   \
-2:	sti;		\
-	movl $(__USER_DS), %edx; \
-	movl %edx, %ds; \
-	movl %edx, %es; \
-	movl $11,%eax;	\
-	call do_exit;	\
-.previous;		\
-.section __ex_table,"a";\
-	.align 4;	\
-	.long 1b,2b;	\
-.previous
-
-
 ENTRY(ret_from_fork)
 	pushl %eax
 	call schedule_tail
@@ -176,7 +159,7 @@
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
 	testl $(VM_MASK | 3), %eax
-	jz resume_kernel		# returning to kernel or vm86-space
+	jz resume_kernel
 ENTRY(resume_userspace)
  	cli				# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -190,7 +173,7 @@
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
-	jnz restore_all
+	jnz resume_kernelX
 need_resched:
 	movl TI_flags(%ebp), %ecx	# need_resched set ?
 	testb $_TIF_NEED_RESCHED, %cl
@@ -204,6 +187,31 @@
 	jmp need_resched
 #endif
 
+ldt_ss:
+	larl OLDSS(%esp), %eax
+	jnz resume_kernelX
+	testl $0x00400000, %eax		# returning to 32bit stack?
+	jnz resume_kernelX		# allright, normal return
+	/* If returning to userspace with 16bit stack,
+	 * try to fix the higher word of ESP, as the CPU
+	 * won't restore it.
+	 * This is an "official" bug of all the x86-compatible
+	 * CPUs, which we can try to work around to make
+	 * dosemu and wine happy. */
+	subl $8, %esp		# reserve space for switch16 pointer
+	cli
+	movl %esp, %eax
+	/* Set up the 16bit stack frame with switch32 pointer on top,
+	 * and a switch16 pointer on top of the current frame. */
+	call setup_x86_bogus_stack
+	RESTORE_REGS
+	lss 20+4(%esp), %esp	# The magic pointer above iret frame
+1:	iret
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
+
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
 
@@ -273,21 +281,40 @@
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	jne syscall_exit_work
+
 restore_all:
-#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
-	testl $(VM_MASK | 3), %eax
-	jz resume_kernelX		# returning to kernel or vm86-space
+	andl $(VM_MASK | 3), %eax
+	cmpl $3, %eax
+	jne resume_kernelX		# returning to kernel or vm86-space
 
+#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS
 	cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
-	jz resume_kernelX
-
+	jz 47f
         int $3
+47:
+#endif
 
+	testl $4, OLDSS(%esp)
+	jnz ldt_ss
 resume_kernelX:
-#endif
-	RESTORE_ALL
+	RESTORE_REGS
+	addl $4, %esp
+1:	iret
+.section .fixup,"ax"
+iret_exc:
+	sti
+	movl $(__USER_DS), %edx
+	movl %edx, %ds
+	movl %edx, %es
+	movl $11,%eax
+	call do_exit
+.previous
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
 
 	# perform work that needs to be done immediately before resumption
 	ALIGN
@@ -363,6 +390,25 @@
 	movl $-ENOSYS,EAX(%esp)
 	jmp resume_userspace
 
+/* Check if we are on 16bit stack. Can happen either on iret of ESPFIX,
+ * or in an exception handler after that iret... */
+#define FIXUP_ESPFIX_STACK \
+	movl %esp, %eax; \
+	/* Magic pointer is at the top of the 16bit stack */ \
+	lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
+	call fixup_x86_bogus_stack; \
+	movl %eax, %esp;
+#define UNWIND_ESPFIX_STACK \
+	pushl %eax; \
+	movl %ss, %eax; \
+	cmpw $__ESPFIX_SS, %ax; \
+	jne 28f; \
+	movl $(__KERNEL_DS), %edx; \
+	movl %edx, %ds; \
+	movl %edx, %es; \
+	FIXUP_ESPFIX_STACK \
+28:	popl %eax;
+
 /*
  * Build the entry stubs and pointer table with
  * some assembler magic.
@@ -427,7 +473,9 @@
 	pushl %ecx
 	pushl %ebx
 	cld
-	movl %es, %ecx
+	pushl %es
+	UNWIND_ESPFIX_STACK
+	popl %ecx
 	movl ES(%esp), %edi		# get the function address
 	movl ORIG_EAX(%esp), %edx	# get the error code
 	movl %eax, ORIG_EAX(%esp)
@@ -509,6 +557,11 @@
  * fault happened on the sysenter path.
  */
 ENTRY(nmi)
+	pushl %eax
+	movl %ss, %eax
+	cmpw $__ESPFIX_SS, %ax
+	popl %eax
+	je nmi_16bit_stack
 	cmpl $sysenter_entry,(%esp)
 	je nmi_stack_fixup
 	pushl %eax
@@ -528,7 +581,7 @@
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_nmi
-	RESTORE_ALL
+	jmp restore_all
 
 nmi_stack_fixup:
 	FIX_STACK(12,nmi_stack_correct, 1)
@@ -544,6 +597,29 @@
 	FIX_STACK(24,nmi_stack_correct, 1)
 	jmp nmi_stack_correct
 
+nmi_16bit_stack:
+	/* create the pointer to lss back */
+	pushl %ss
+	pushl %esp
+	andl $0xffff, %esp
+	addw $4, (%esp)
+	/* copy the iret frame of 12 bytes */
+	.rept 3
+	pushl 16(%esp)
+	.endr
+	pushl %eax
+	SAVE_ALL
+	FIXUP_ESPFIX_STACK		# %eax == %esp
+	xorl %edx,%edx			# zero error code
+	call do_nmi
+	RESTORE_REGS
+	lss 12+4(%esp), %esp		# back to 16bit stack
+1:	iret
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
+
 ENTRY(int3)
 	pushl $-1			# mark this as an int
 	SAVE_ALL
diff -urN linux-2.6.10-mm1/arch/i386/kernel/head.S linux-2.6.10-mm1-stk/arch/i386/kernel/head.S
--- linux-2.6.10-mm1/arch/i386/kernel/head.S	2005-01-03 22:56:03.000000000 +0300
+++ linux-2.6.10-mm1-stk/arch/i386/kernel/head.S	2005-01-03 23:13:21.000000000 +0300
@@ -512,7 +512,7 @@
 	.quad 0x00009a0000000000	/* 0xc0 APM CS 16 code (16 bit) */
 	.quad 0x0040920000000000	/* 0xc8 APM DS    data */
 
-	.quad 0x0000000000000000	/* 0xd0 - unused */
+	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
 	.quad 0x0000000000000000	/* 0xd8 - unused */
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
diff -urN linux-2.6.10-mm1/arch/i386/kernel/traps.c linux-2.6.10-mm1-stk/arch/i386/kernel/traps.c
--- linux-2.6.10-mm1/arch/i386/kernel/traps.c	2005-01-03 22:56:03.000000000 +0300
+++ linux-2.6.10-mm1-stk/arch/i386/kernel/traps.c	2005-01-03 23:13:21.000000000 +0300
@@ -972,6 +972,51 @@
 #endif
 }
 
+fastcall void setup_x86_bogus_stack(unsigned char * stk)
+{
+	unsigned long *switch16_ptr, *switch32_ptr;
+	struct pt_regs *regs;
+	unsigned long stack_top, stack_bot;
+	unsigned short iret_frame16_off;
+	int cpu = smp_processor_id();
+	/* reserve the space on 32bit stack for the magic switch16 pointer */
+	memmove(stk, stk + 8, sizeof(struct pt_regs));
+	switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
+	regs = (struct pt_regs *)stk;
+	/* now the switch32 on 16bit stack */
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot +	CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);
+	iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
+	/* copy iret frame on 16bit stack */
+	memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
+	/* fill in the switch pointers */
+	switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
+	switch16_ptr[1] = __ESPFIX_SS;
+	switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
+		8 - CPU_16BIT_STACK_SIZE;
+	switch32_ptr[1] = __KERNEL_DS;
+}
+
+fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
+{
+	unsigned long *switch32_ptr;
+	unsigned char *stack16, *stack32;
+	unsigned long stack_top, stack_bot;
+	int len;
+	int cpu = smp_processor_id();
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot +	CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);
+	/* copy the data from 16bit stack to 32bit stack */
+	len = CPU_16BIT_STACK_SIZE - 8 - sp;
+	stack16 = (unsigned char *)(stack_bot + sp);
+	stack32 = (unsigned char *)
+		(switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
+	memcpy(stack32, stack16, len);
+	return stack32;
+}
+
 /*
  *  'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
diff -urN linux-2.6.10-mm1/include/asm-i386/desc.h linux-2.6.10-mm1-stk/include/asm-i386/desc.h
--- linux-2.6.10-mm1/include/asm-i386/desc.h	2005-01-03 22:55:08.000000000 +0300
+++ linux-2.6.10-mm1-stk/include/asm-i386/desc.h	2005-01-03 23:13:21.000000000 +0300
@@ -4,6 +4,8 @@
 #include <asm/ldt.h>
 #include <asm/segment.h>
 
+#define CPU_16BIT_STACK_SIZE 1024
+
 #ifndef __ASSEMBLY__
 
 #include <linux/preempt.h>
@@ -15,6 +17,9 @@
 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
 DECLARE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
 
+extern unsigned char cpu_16bit_stack[CPU_16BIT_STACK_SIZE];
+DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
+
 struct Xgt_desc_struct {
 	unsigned short size;
 	unsigned long address __attribute__((packed));
diff -urN linux-2.6.10-mm1/include/asm-i386/segment.h linux-2.6.10-mm1-stk/include/asm-i386/segment.h
--- linux-2.6.10-mm1/include/asm-i386/segment.h	2005-01-03 22:56:32.000000000 +0300
+++ linux-2.6.10-mm1-stk/include/asm-i386/segment.h	2005-01-03 23:13:21.000000000 +0300
@@ -38,7 +38,7 @@
  *  24 - APM BIOS support
  *  25 - APM BIOS support 
  *
- *  26 - unused
+ *  26 - ESPFIX small SS
  *  27 - unused
  *  28 - unused
  *  29 - unused
@@ -71,6 +71,9 @@
 #define GDT_ENTRY_PNPBIOS_BASE		(GDT_ENTRY_KERNEL_BASE + 6)
 #define GDT_ENTRY_APMBIOS_BASE		(GDT_ENTRY_KERNEL_BASE + 11)
 
+#define GDT_ENTRY_ESPFIX_SS		(GDT_ENTRY_KERNEL_BASE + 14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+
 #define GDT_ENTRY_DOUBLEFAULT_TSS	31
 
 /*

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-01-03 23:39 [patch] x86: fix ESP corruption CPU bug Stas Sergeev
@ 2005-01-04  0:01 ` Linus Torvalds
  2005-01-04  1:58   ` Stas Sergeev
  0 siblings, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2005-01-04  0:01 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Andrew Morton, Linux kernel, Petr Vandrovec



On Tue, 4 Jan 2005, Stas Sergeev wrote:
> 
> Can this please be applied?

Please don't do it like this - you made the patch now depend on the 
ugliest code in the universe, namely that horribly crappy kgdb-ga sh*t 
("Don't hold back, Linus, tell us how you really feel").

The 16-bit stack code may not be the prettiest either, but it doesn't hold 
a candle to the asm-crap that is entry.S after kgdb-ga.

"resume_kernelX"? What crud.

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-01-04  0:01 ` Linus Torvalds
@ 2005-01-04  1:58   ` Stas Sergeev
  0 siblings, 0 replies; 25+ messages in thread
From: Stas Sergeev @ 2005-01-04  1:58 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Linux kernel, Petr Vandrovec

[-- Attachment #1: Type: text/plain, Size: 1075 bytes --]

Hi Linus.

Linus Torvalds wrote:
> Please don't do it like this - you made the patch now depend on the 
> ugliest code in the universe, namely that horribly crappy kgdb-ga sh*t 
I didn't do that. I just re-targeted the
patch to -mm tree and it clashed with the
kgdb-ga patch. And somehow it happened
that a few lines I would have to insert
myself otherwise anyway, appeared to be
already there for me to re-use. I wouldn't
call that a dependency. Only 3 lines are
re-used in fact.

> The 16-bit stack code may not be the prettiest either, but it doesn't hold 
> a candle to the asm-crap that is entry.S after kgdb-ga.
From what I can see, kgdb-ga has only
3 small hunks in entry.S, so the crap is
probably very dense there.
Clashing into one of these hunks looks
unavoidable for my needs.

> "resume_kernelX"? What crud.
Does "restore_nocheck" sound better?
Yes, maybe, but then I don't see the way
to provide my patch for -mm. So the attached
one is for plain 2.6.10. I don't know how
Andrew can apply it, so maybe you will?


Signed-off-by: Stas Sergeev <stsp@aknet.ru>


[-- Attachment #2: linux-2.6.10-stk5.diff --]
[-- Type: text/x-patch, Size: 11056 bytes --]

diff -urN linux-2.6.10/arch/i386/kernel/cpu/common.c linux-2.6.10-stk/arch/i386/kernel/cpu/common.c
--- linux-2.6.10/arch/i386/kernel/cpu/common.c	2004-10-21 21:35:21.000000000 +0400
+++ linux-2.6.10-stk/arch/i386/kernel/cpu/common.c	2004-12-31 11:16:28.000000000 +0300
@@ -16,6 +16,10 @@
 DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_table);
 
+unsigned char cpu_16bit_stack[CPU_16BIT_STACK_SIZE];
+DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
+EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
+
 static int cachesize_override __initdata = -1;
 static int disable_x86_fxsr __initdata = 0;
 static int disable_x86_serial_nr __initdata = 1;
@@ -508,6 +512,7 @@
 	int cpu = smp_processor_id();
 	struct tss_struct * t = &per_cpu(init_tss, cpu);
 	struct thread_struct *thread = &current->thread;
+	__u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
 
 	if (test_and_set_bit(cpu, &cpu_initialized)) {
 		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -530,6 +535,13 @@
 	 */
 	memcpy(&per_cpu(cpu_gdt_table, cpu), cpu_gdt_table,
 	       GDT_SIZE);
+
+	/* Set up GDT entry for 16bit stack */
+	*(__u64 *)&(per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_ESPFIX_SS]) |=
+		((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
+		((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
+		(CPU_16BIT_STACK_SIZE - 1);
+
 	cpu_gdt_descr[cpu].size = GDT_SIZE - 1;
 	cpu_gdt_descr[cpu].address =
 	    (unsigned long)&per_cpu(cpu_gdt_table, cpu);
diff -urN linux-2.6.10/arch/i386/kernel/entry.S linux-2.6.10-stk/arch/i386/kernel/entry.S
--- linux-2.6.10/arch/i386/kernel/entry.S	2004-12-26 00:37:55.000000000 +0300
+++ linux-2.6.10-stk/arch/i386/kernel/entry.S	2004-12-31 11:30:18.000000000 +0300
@@ -47,6 +47,7 @@
 #include <asm/segment.h>
 #include <asm/smp.h>
 #include <asm/page.h>
+#include <asm/desc.h>
 #include "irq_vectors.h"
 
 #define nr_syscalls ((syscall_table_size)/4)
@@ -78,7 +79,7 @@
 #define preempt_stop		cli
 #else
 #define preempt_stop
-#define resume_kernel		restore_all
+#define resume_kernel		restore_nocheck
 #endif
 
 #define SAVE_ALL \
@@ -122,24 +123,6 @@
 .previous
 
 
-#define RESTORE_ALL	\
-	RESTORE_REGS	\
-	addl $4, %esp;	\
-1:	iret;		\
-.section .fixup,"ax";   \
-2:	sti;		\
-	movl $(__USER_DS), %edx; \
-	movl %edx, %ds; \
-	movl %edx, %es; \
-	movl $11,%eax;	\
-	call do_exit;	\
-.previous;		\
-.section __ex_table,"a";\
-	.align 4;	\
-	.long 1b,2b;	\
-.previous
-
-
 ENTRY(ret_from_fork)
 	pushl %eax
 	call schedule_tail
@@ -163,7 +146,7 @@
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
 	testl $(VM_MASK | 3), %eax
-	jz resume_kernel		# returning to kernel or vm86-space
+	jz resume_kernel
 ENTRY(resume_userspace)
  	cli				# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -177,7 +160,7 @@
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
-	jnz restore_all
+	jnz restore_nocheck
 need_resched:
 	movl TI_flags(%ebp), %ecx	# need_resched set ?
 	testb $_TIF_NEED_RESCHED, %cl
@@ -192,6 +175,31 @@
 	jmp need_resched
 #endif
 
+ldt_ss:
+	larl OLDSS(%esp), %eax
+	jnz restore_nocheck
+	testl $0x00400000, %eax		# returning to 32bit stack?
+	jnz restore_nocheck		# allright, normal return
+	/* If returning to userspace with 16bit stack,
+	 * try to fix the higher word of ESP, as the CPU
+	 * won't restore it.
+	 * This is an "official" bug of all the x86-compatible
+	 * CPUs, which we can try to work around to make
+	 * dosemu and wine happy. */
+	subl $8, %esp		# reserve space for switch16 pointer
+	cli
+	movl %esp, %eax
+	/* Set up the 16bit stack frame with switch32 pointer on top,
+	 * and a switch16 pointer on top of the current frame. */
+	call setup_x86_bogus_stack
+	RESTORE_REGS
+	lss 20+4(%esp), %esp	# The magic pointer above iret frame
+1:	iret
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
+
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
 
@@ -260,8 +268,32 @@
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	jne syscall_exit_work
+
 restore_all:
-	RESTORE_ALL
+	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
+	movb CS(%esp), %al
+	andl $(VM_MASK | 3), %eax
+	cmpl $3, %eax
+	jne restore_nocheck		# returning to kernel or vm86-space
+	testl $4, OLDSS(%esp)
+	jnz ldt_ss
+restore_nocheck:
+	RESTORE_REGS
+	addl $4, %esp
+1:	iret
+.section .fixup,"ax"
+iret_exc:
+	sti
+	movl $(__USER_DS), %edx
+	movl %edx, %ds
+	movl %edx, %es
+	movl $11,%eax
+	call do_exit
+.previous
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
 
 	# perform work that needs to be done immediately before resumption
 	ALIGN
@@ -337,6 +369,25 @@
 	movl $-ENOSYS,EAX(%esp)
 	jmp resume_userspace
 
+/* Check if we are on 16bit stack. Can happen either on iret of ESPFIX,
+ * or in an exception handler after that iret... */
+#define FIXUP_ESPFIX_STACK \
+	movl %esp, %eax; \
+	/* Magic pointer is at the top of the 16bit stack */ \
+	lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
+	call fixup_x86_bogus_stack; \
+	movl %eax, %esp;
+#define UNWIND_ESPFIX_STACK \
+	pushl %eax; \
+	movl %ss, %eax; \
+	cmpw $__ESPFIX_SS, %ax; \
+	jne 28f; \
+	movl $(__KERNEL_DS), %edx; \
+	movl %edx, %ds; \
+	movl %edx, %es; \
+	FIXUP_ESPFIX_STACK \
+28:	popl %eax;
+
 /*
  * Build the entry stubs and pointer table with
  * some assembler magic.
@@ -391,7 +442,9 @@
 	pushl %ecx
 	pushl %ebx
 	cld
-	movl %es, %ecx
+	pushl %es
+	UNWIND_ESPFIX_STACK
+	popl %ecx
 	movl ES(%esp), %edi		# get the function address
 	movl ORIG_EAX(%esp), %edx	# get the error code
 	movl %eax, ORIG_EAX(%esp)
@@ -473,6 +526,11 @@
  * fault happened on the sysenter path.
  */
 ENTRY(nmi)
+	pushl %eax
+	movl %ss, %eax
+	cmpw $__ESPFIX_SS, %ax
+	popl %eax
+	je nmi_16bit_stack
 	cmpl $sysenter_entry,(%esp)
 	je nmi_stack_fixup
 	pushl %eax
@@ -492,7 +550,7 @@
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_nmi
-	RESTORE_ALL
+	jmp restore_all
 
 nmi_stack_fixup:
 	FIX_STACK(12,nmi_stack_correct, 1)
@@ -508,6 +566,29 @@
 	FIX_STACK(24,nmi_stack_correct, 1)
 	jmp nmi_stack_correct
 
+nmi_16bit_stack:
+	/* create the pointer to lss back */
+	pushl %ss
+	pushl %esp
+	andl $0xffff, %esp
+	addw $4, (%esp)
+	/* copy the iret frame of 12 bytes */
+	.rept 3
+	pushl 16(%esp)
+	.endr
+	pushl %eax
+	SAVE_ALL
+	FIXUP_ESPFIX_STACK		# %eax == %esp
+	xorl %edx,%edx			# zero error code
+	call do_nmi
+	RESTORE_REGS
+	lss 12+4(%esp), %esp		# back to 16bit stack
+1:	iret
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
+
 ENTRY(int3)
 	pushl $-1			# mark this as an int
 	SAVE_ALL
diff -urN linux-2.6.10/arch/i386/kernel/head.S linux-2.6.10-stk/arch/i386/kernel/head.S
--- linux-2.6.10/arch/i386/kernel/head.S	2004-10-21 21:35:21.000000000 +0400
+++ linux-2.6.10-stk/arch/i386/kernel/head.S	2005-01-01 13:02:39.000000000 +0300
@@ -514,7 +514,7 @@
 	.quad 0x00009a0000000000	/* 0xc0 APM CS 16 code (16 bit) */
 	.quad 0x0040920000000000	/* 0xc8 APM DS    data */
 
-	.quad 0x0000000000000000	/* 0xd0 - unused */
+	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
 	.quad 0x0000000000000000	/* 0xd8 - unused */
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
diff -urN linux-2.6.10/arch/i386/kernel/traps.c linux-2.6.10-stk/arch/i386/kernel/traps.c
--- linux-2.6.10/arch/i386/kernel/traps.c	2004-12-26 00:37:31.000000000 +0300
+++ linux-2.6.10-stk/arch/i386/kernel/traps.c	2004-12-31 11:27:02.000000000 +0300
@@ -904,6 +904,51 @@
 #endif
 }
 
+fastcall void setup_x86_bogus_stack(unsigned char * stk)
+{
+	unsigned long *switch16_ptr, *switch32_ptr;
+	struct pt_regs *regs;
+	unsigned long stack_top, stack_bot;
+	unsigned short iret_frame16_off;
+	int cpu = smp_processor_id();
+	/* reserve the space on 32bit stack for the magic switch16 pointer */
+	memmove(stk, stk + 8, sizeof(struct pt_regs));
+	switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
+	regs = (struct pt_regs *)stk;
+	/* now the switch32 on 16bit stack */
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot +	CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);
+	iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
+	/* copy iret frame on 16bit stack */
+	memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
+	/* fill in the switch pointers */
+	switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
+	switch16_ptr[1] = __ESPFIX_SS;
+	switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
+		8 - CPU_16BIT_STACK_SIZE;
+	switch32_ptr[1] = __KERNEL_DS;
+}
+
+fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
+{
+	unsigned long *switch32_ptr;
+	unsigned char *stack16, *stack32;
+	unsigned long stack_top, stack_bot;
+	int len;
+	int cpu = smp_processor_id();
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot +	CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);
+	/* copy the data from 16bit stack to 32bit stack */
+	len = CPU_16BIT_STACK_SIZE - 8 - sp;
+	stack16 = (unsigned char *)(stack_bot + sp);
+	stack32 = (unsigned char *)
+		(switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
+	memcpy(stack32, stack16, len);
+	return stack32;
+}
+
 /*
  *  'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
diff -urN linux-2.6.10/include/asm-i386/desc.h linux-2.6.10-stk/include/asm-i386/desc.h
--- linux-2.6.10/include/asm-i386/desc.h	2004-10-21 21:35:55.000000000 +0400
+++ linux-2.6.10-stk/include/asm-i386/desc.h	2005-01-01 13:02:39.000000000 +0300
@@ -4,6 +4,8 @@
 #include <asm/ldt.h>
 #include <asm/segment.h>
 
+#define CPU_16BIT_STACK_SIZE 1024
+
 #ifndef __ASSEMBLY__
 
 #include <linux/preempt.h>
@@ -15,6 +17,9 @@
 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
 DECLARE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
 
+extern unsigned char cpu_16bit_stack[CPU_16BIT_STACK_SIZE];
+DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
+
 struct Xgt_desc_struct {
 	unsigned short size;
 	unsigned long address __attribute__((packed));
diff -urN linux-2.6.10/include/asm-i386/segment.h linux-2.6.10-stk/include/asm-i386/segment.h
--- linux-2.6.10/include/asm-i386/segment.h	2004-01-09 09:59:19.000000000 +0300
+++ linux-2.6.10-stk/include/asm-i386/segment.h	2005-01-01 13:02:39.000000000 +0300
@@ -38,7 +38,7 @@
  *  24 - APM BIOS support
  *  25 - APM BIOS support 
  *
- *  26 - unused
+ *  26 - ESPFIX small SS
  *  27 - unused
  *  28 - unused
  *  29 - unused
@@ -71,6 +71,9 @@
 #define GDT_ENTRY_PNPBIOS_BASE		(GDT_ENTRY_KERNEL_BASE + 6)
 #define GDT_ENTRY_APMBIOS_BASE		(GDT_ENTRY_KERNEL_BASE + 11)
 
+#define GDT_ENTRY_ESPFIX_SS		(GDT_ENTRY_KERNEL_BASE + 14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+
 #define GDT_ENTRY_DOUBLEFAULT_TSS	31
 
 /*

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-14 19:24                 ` Brian Gerst
@ 2005-03-14 20:21                   ` Stas Sergeev
  0 siblings, 0 replies; 25+ messages in thread
From: Stas Sergeev @ 2005-03-14 20:21 UTC (permalink / raw)
  To: Brian Gerst
  Cc: linux-os, Jakob Eriksson, Pavel Machek, Linux kernel, wine-devel

Hello.

Brian Gerst wrote:
>> Can you tell me how the invisible high-word (invisible in VM-86, and
>> in real mode) could possibly harm something running in VM-86 or
>> read-mode ???  I don't even think it's a BUG. If the transition
>> into and out of VM-86 doesn't handle the fact that the high-word
>> of the stack hasn't been used in VM-86, then that piece of code
>> is bad (the SP isn't even the same stack, BTW).
> Because even in 16-bit mode (real, vm86 or 16-bit protected) you can use 
> 32-bit instructions, with an operand and/or address size override 
> prefix.
And the real problem is when the pure
32bit code is starting to use the 16bit
stack for some strange reasons. Looks like
the common technique for the early dos4gw
-based apps...


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-14 17:03               ` linux-os
  2005-03-14 17:10                 ` Pavel Machek
@ 2005-03-14 19:24                 ` Brian Gerst
  2005-03-14 20:21                   ` Stas Sergeev
  1 sibling, 1 reply; 25+ messages in thread
From: Brian Gerst @ 2005-03-14 19:24 UTC (permalink / raw)
  To: linux-os
  Cc: Jakob Eriksson, Andi Kleen, Stas Sergeev, Pavel Machek, Alan Cox,
	Linux kernel, Petr Vandrovec, Denis Vlasenko, wine-devel,
	torvalds

linux-os wrote:
> On Mon, 14 Mar 2005, Jakob Eriksson wrote:
> 
>> Andi Kleen wrote:
>>
>>> Stas Sergeev <stsp@aknet.ru> writes:
>>>
>>>>> Another way of saying the same thing: I absolutely hate seeing
>>>>> patches that fix some theoretical issue that no Linux apps will ever
>>>>> care about.
>>>>>
>>>> No, it is not theoretical, but it is mainly
>>>> about a DOS games and an MS linker, as for
>>>> me. The things I'd like to get working, but
>>>> the ones you may not care too much about:)
>>>> The particular game I want to get working,
>>>> is "Master of Orion 2" for DOS.
>>>>
>>>
>>> How about you just run it in dosbox instead of dosemu ?
>>>
>>
>> Yes, that's a solution of course, but it is a bit like saying why
>> not use Open Office instead of MS Word.
>>
>> A long term goal of wine is to support DOS apps to. Of course
>> it's not a priority, but it's there.
>>
>> regards,
>> Jakob
>>
> 
> Can you tell me how the invisible high-word (invisible in VM-86, and
> in real mode) could possibly harm something running in VM-86 or
> read-mode ???  I don't even think it's a BUG. If the transition
> into and out of VM-86 doesn't handle the fact that the high-word
> of the stack hasn't been used in VM-86, then that piece of code
> is bad (the SP isn't even the same stack, BTW).

Because even in 16-bit mode (real, vm86 or 16-bit protected) you can use 
32-bit instructions, with an operand and/or address size override 
prefix.  Of course this only works on 386 or later.

--
				Brian Gerst

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-14 15:21             ` Jakob Eriksson
  2005-03-14 17:03               ` linux-os
@ 2005-03-14 18:02               ` Stas Sergeev
  1 sibling, 0 replies; 25+ messages in thread
From: Stas Sergeev @ 2005-03-14 18:02 UTC (permalink / raw)
  To: Jakob Eriksson
  Cc: Andi Kleen, Pavel Machek, Alan Cox, Linux kernel, Petr Vandrovec,
	Denis Vlasenko, wine-devel, torvalds, the3dfxdude

Hi,

Jakob Eriksson wrote:
> A long term goal of wine is to support DOS apps to. Of course
> it's not a priority, but it's there.
Yes, that's exactly what I was hoping
for, thanks!
Even if no Windows apps do such a thing
(which wasn't confirmed yet), Wine may
still need that fix for the DOS support
in the future, and dosemu seems to be in
need of that fix already and for long
(that's where I am getting the use of it).
And we haven't heard a word from VMware
yet, and I think they may appreciate that
too.
Also, since the first version of the patch,
I've been contacted by a few people asking
me to provide an updated version for 2.6.9
and 2.6.10. I don't know the reason, but
I know it was used (I think they were more
concerned about an aforementioned "information
leak", rather than about the %esp corruption).
So if the last problem with that patch was
that it is not really needed, I think it
no longer stands:)


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-14  9:34           ` Andi Kleen
  2005-03-14 15:21             ` Jakob Eriksson
@ 2005-03-14 17:29             ` Stas Sergeev
  1 sibling, 0 replies; 25+ messages in thread
From: Stas Sergeev @ 2005-03-14 17:29 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Linux kernel, torvalds

Hello.

Andi Kleen wrote:
>> The particular game I want to get working,
>> is "Master of Orion 2" for DOS.
> How about you just run it in dosbox instead of dosemu ?
Way too slow, and there are other reasons
too, like the better networking support on
dosemu side (not for Orion, but for other
games it is important), but that might be
off-topic here:)


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-14 17:03               ` linux-os
@ 2005-03-14 17:10                 ` Pavel Machek
  2005-03-14 19:24                 ` Brian Gerst
  1 sibling, 0 replies; 25+ messages in thread
From: Pavel Machek @ 2005-03-14 17:10 UTC (permalink / raw)
  To: linux-os
  Cc: Jakob Eriksson, Andi Kleen, Stas Sergeev, Alan Cox, Linux kernel,
	Petr Vandrovec, Denis Vlasenko, wine-devel, torvalds

Hi!

> Can you tell me how the invisible high-word (invisible in VM-86, and
> in real mode) could possibly harm something running in VM-86 or
> real-mode ???  I don't even think it's a BUG. If the transition

You can have protected-mode application running in dosemu with 16-bit
stack segment.
								Pavel
-- 
People were complaining that M$ turns users into beta-testers...
...jr ghea gurz vagb qrirybcref, naq gurl frrz gb yvxr vg gung jnl!

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-14 15:21             ` Jakob Eriksson
@ 2005-03-14 17:03               ` linux-os
  2005-03-14 17:10                 ` Pavel Machek
  2005-03-14 19:24                 ` Brian Gerst
  2005-03-14 18:02               ` Stas Sergeev
  1 sibling, 2 replies; 25+ messages in thread
From: linux-os @ 2005-03-14 17:03 UTC (permalink / raw)
  To: Jakob Eriksson
  Cc: Andi Kleen, Stas Sergeev, Pavel Machek, Alan Cox, Linux kernel,
	Petr Vandrovec, Denis Vlasenko, wine-devel, torvalds

On Mon, 14 Mar 2005, Jakob Eriksson wrote:

> Andi Kleen wrote:
>
>> Stas Sergeev <stsp@aknet.ru> writes:
>> 
>>>> Another way of saying the same thing: I absolutely hate seeing
>>>> patches that fix some theoretical issue that no Linux apps will ever
>>>> care about.
>>>> 
>>> No, it is not theoretical, but it is mainly
>>> about a DOS games and an MS linker, as for
>>> me. The things I'd like to get working, but
>>> the ones you may not care too much about:)
>>> The particular game I want to get working,
>>> is "Master of Orion 2" for DOS.
>>> 
>> 
>> How about you just run it in dosbox instead of dosemu ?
>> 
>
> Yes, that's a solution of course, but it is a bit like saying why
> not use Open Office instead of MS Word.
>
> A long term goal of wine is to support DOS apps too. Of course
> it's not a priority, but it's there.
>
> regards,
> Jakob
>

Can you tell me how the invisible high-word (invisible in VM-86, and
in real mode) could possibly harm something running in VM-86 or
real-mode ???  I don't even think it's a BUG. If the transition
into and out of VM-86 doesn't handle the fact that the high-word
of the stack hasn't been used in VM-86, then that piece of code
is bad (the SP isn't even the same stack, BTW).


Cheers,
Dick Johnson
Penguin : Linux version 2.6.11 on an i686 machine (5537.79 BogoMips).
  Notice : All mail here is now cached for review by Dictator Bush.
                  98.36% of all statistics are fiction.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-14  9:34           ` Andi Kleen
@ 2005-03-14 15:21             ` Jakob Eriksson
  2005-03-14 17:03               ` linux-os
  2005-03-14 18:02               ` Stas Sergeev
  2005-03-14 17:29             ` Stas Sergeev
  1 sibling, 2 replies; 25+ messages in thread
From: Jakob Eriksson @ 2005-03-14 15:21 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Stas Sergeev, Pavel Machek, Alan Cox, Linux kernel,
	Petr Vandrovec, Denis Vlasenko, wine-devel, torvalds

Andi Kleen wrote:

>Stas Sergeev <stsp@aknet.ru> writes:
>  
>
>>>Another way of saying the same thing: I absolutely hate seeing
>>>patches that fix some theoretical issue that no Linux apps will ever
>>>care about.
>>>      
>>>
>>No, it is not theoretical, but it is mainly
>>about a DOS games and an MS linker, as for
>>me. The things I'd like to get working, but
>>the ones you may not care too much about:)
>>The particular game I want to get working,
>>is "Master of Orion 2" for DOS.
>>    
>>
>
>How about you just run it in dosbox instead of dosemu ?
>  
>

Yes, that's a solution of course, but it is a bit like saying why
not use Open Office instead of MS Word.

A long term goal of wine is to support DOS apps too. Of course
it's not a priority, but it's there.

regards,
Jakob


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
@ 2005-03-14 11:10 Zoltan Boszormenyi
  0 siblings, 0 replies; 25+ messages in thread
From: Zoltan Boszormenyi @ 2005-03-14 11:10 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel

> Stas Sergeev <stsp@aknet.ru> writes:
>>
>>> Another way of saying the same thing: I absolutely hate seeing
>>> patches that fix some theoretical issue that no Linux apps will ever
>>> care about.
>> No, it is not theoretical, but it is mainly
>> about a DOS games and an MS linker, as for
>> me. The things I'd like to get working, but
>> the ones you may not care too much about:)
>> The particular game I want to get working,
>> is "Master of Orion 2" for DOS.
> 
> How about you just run it in dosbox instead of dosemu ?
> 
> -Andi

Nah, don't insult a DOSemu developer. ;-) Stas is one of them...

Best regards,
Zoltán Böszörményi


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-14  4:52         ` Stas Sergeev
@ 2005-03-14  9:34           ` Andi Kleen
  2005-03-14 15:21             ` Jakob Eriksson
  2005-03-14 17:29             ` Stas Sergeev
  0 siblings, 2 replies; 25+ messages in thread
From: Andi Kleen @ 2005-03-14  9:34 UTC (permalink / raw)
  To: Stas Sergeev
  Cc: Pavel Machek, Alan Cox, Linux kernel, Petr Vandrovec,
	Denis Vlasenko, wine-devel, torvalds

Stas Sergeev <stsp@aknet.ru> writes:
>
>> Another way of saying the same thing: I absolutely hate seeing
>> patches that fix some theoretical issue that no Linux apps will ever
>> care about.
> No, it is not theoretical, but it is mainly
> about a DOS games and an MS linker, as for
> me. The things I'd like to get working, but
> the ones you may not care too much about:)
> The particular game I want to get working,
> is "Master of Orion 2" for DOS.

How about you just run it in dosbox instead of dosemu ?

-Andi

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-14  0:16       ` Linus Torvalds
@ 2005-03-14  4:52         ` Stas Sergeev
  2005-03-14  9:34           ` Andi Kleen
  0 siblings, 1 reply; 25+ messages in thread
From: Stas Sergeev @ 2005-03-14  4:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Pavel Machek, Alan Cox, Linux kernel, Petr Vandrovec,
	Denis Vlasenko, wine-devel

Hi,

Linus Torvalds wrote:
> Btw, Stas, one thing I'd really like to see is even a partial list of 
> anything that actually cares about this. Ie, if there is some known 
> Windows app where Wine works better or something like that, just adding
I am not using Wine too much, but I've
found this:
http://cvs.winehq.org/cvsweb/~checkout~/wine/dlls/winedos/int31.c?rev=1.41&content-type=text/plain
---
/* due to a flaw in some CPUs (at least mine), it is best to mark stack segments as 32-bit if they
   can be used in 32-bit code. Otherwise, these CPUs may not set the high word of esp during a
   ring transition (from kernel code) to the 16-bit stack, and this causes trouble if executing
   32-bit code using this stack. */
---
I added wine-devel to CC, maybe people there
can tell if that patch has any value for them
or not.
The reference to the original patch:
http://www.uwsg.iu.edu/hypermail/linux/kernel/0503.1/1794.html

Dosemu looks a little better on that, the
whole chapter of the docs is dedicated to
that problem:
http://dosemu.sourceforge.net/docs/EMUfailure/t1.html#AEN55
There you can find a (relatively small)
list of the programs that are affected,
but I personally have the old Microsoft
linker that crashes, and a few more DOS
games.

> Another way of saying the same thing: I absolutely hate seeing patches 
> that fix some theoretical issue that no Linux apps will ever care about. 
No, it is not theoretical, but it is mainly
about a DOS games and an MS linker, as for
me. The things I'd like to get working, but
the ones you may not care too much about:)
The particular game I want to get working,
is "Master of Orion 2" for DOS.

> So I'd like to have a bit more of a case for this patch, since I know what 
> the case against it is ;)
Yep, and the informational leak it closes,
looks also rather minor.
So it is only a matter of how much you care
about dosemu and DOS games under
Linux. Considering the amount of the
dosemu-related code in vm86.c, I guess you
care:)
And uhm, adding the list of the DOS games
to the comments of the Linux kernel code,
doesn't sound like a good idea to me:)


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 21:13     ` Linus Torvalds
  2005-03-13 23:17       ` Pavel Machek
@ 2005-03-14  0:16       ` Linus Torvalds
  2005-03-14  4:52         ` Stas Sergeev
  1 sibling, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2005-03-14  0:16 UTC (permalink / raw)
  To: Stas Sergeev
  Cc: Pavel Machek, Alan Cox, Linux kernel, Petr Vandrovec, Denis Vlasenko



On Sun, 13 Mar 2005, Linus Torvalds wrote:
> 
> That said, the "ldt_ss" case should be moved _after_ the conditional
> tests, since most CPU's out there will do static prediction based on
> forward/backwards direction 

Btw, Stas, one thing I'd really like to see is even a partial list of 
anything that actually cares about this. Ie, if there is some known 
Windows app where Wine works better or something like that, just adding 
that information to the comments would be hugely appreciated. 

Another way of saying the same thing: I absolutely hate seeing patches 
that fix some theoretical issue that no Linux apps will ever care about. 
So I'd like to have a bit more of a case for this patch, since I know what 
the case against it is ;)

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 23:17       ` Pavel Machek
@ 2005-03-13 23:54         ` Linus Torvalds
  0 siblings, 0 replies; 25+ messages in thread
From: Linus Torvalds @ 2005-03-13 23:54 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Stas Sergeev, Alan Cox, Linux kernel, Petr Vandrovec, Denis Vlasenko



On Mon, 14 Mar 2005, Pavel Machek wrote:
> 
> What about flag similar to _TIF_SYSCALL_TRACE (call it
> _TIF_THIS_BEAST_USES_V86 or something), and only do the tests in the
> slowpath if it is set? As normal applications do not use v86, we could
> make this 0 instructions in syscall fast path...

It wouldn't help you. You'd need to mix in two of the values anyway, so at 
most you'd save one instruction. And the cost would be that anything that 
has ever used vm86 mode (can you say "X server"?) would be slower. Not a 
good trade-off.

Oh, I guess you could clear the flag when you know there's no vm86 state
anywhere (easy enough, those things never nest), but then it still comes
back to "extra complexity that you can get wrong, just to save a single
"mov" instruction - that "mov" may have partial-register-stall issues, 
but still..).

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 21:13     ` Linus Torvalds
@ 2005-03-13 23:17       ` Pavel Machek
  2005-03-13 23:54         ` Linus Torvalds
  2005-03-14  0:16       ` Linus Torvalds
  1 sibling, 1 reply; 25+ messages in thread
From: Pavel Machek @ 2005-03-13 23:17 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Stas Sergeev, Alan Cox, Linux kernel, Petr Vandrovec, Denis Vlasenko

On Ne 13-03-05 13:13:16, Linus Torvalds wrote:
> 
> 
> On Sun, 13 Mar 2005, Stas Sergeev wrote:
> >
> > Such an optimization will cost three more
> > instructions, one of which is a "taken"
> > jump.
> 
> I think Pavel missed the fact that you have to check the VM86 bit in
> eflags before you check SS, since otherwise SS doesn't mean anything
> special at all (ie checking for just the normal SS isn't correct: you
> could have a 16-bit SS that looks normal, but is actually a vm86 segment).
> 
> Pavel: for the same reason you have to check the low two bits of CS too, 
> since if they are zero, then SS hasn't been saved on the stack at all, so 
> comparing it against some normal value is meaningless.

Yes, I missed that one, thanks.

What about flag similar to _TIF_SYSCALL_TRACE (call it
_TIF_THIS_BEAST_USES_V86 or something), and only do the tests in the
slowpath if it is set? As normal applications do not use v86, we could
make this 0 instructions in syscall fast path...
								Pavel
-- 
People were complaining that M$ turns users into beta-testers...
...jr ghea gurz vagb qrirybcref, naq gurl frrz gb yvxr vg gung jnl!

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 20:55   ` Stas Sergeev
@ 2005-03-13 21:13     ` Linus Torvalds
  2005-03-13 23:17       ` Pavel Machek
  2005-03-14  0:16       ` Linus Torvalds
  0 siblings, 2 replies; 25+ messages in thread
From: Linus Torvalds @ 2005-03-13 21:13 UTC (permalink / raw)
  To: Stas Sergeev
  Cc: Pavel Machek, Alan Cox, Linux kernel, Petr Vandrovec, Denis Vlasenko



On Sun, 13 Mar 2005, Stas Sergeev wrote:
>
> Such an optimization will cost three more
> instructions, one of which is a "taken"
> jump.

I think Pavel missed the fact that you have to check the VM86 bit in
eflags before you check SS, since otherwise SS doesn't mean anything
special at all (ie checking for just the normal SS isn't correct: you
could have a 16-bit SS that looks normal, but is actually a vm86 segment).

Pavel: for the same reason you have to check the low two bits of CS too, 
since if they are zero, then SS hasn't been saved on the stack at all, so 
comparing it against some normal value is meaningless.

That said, the "ldt_ss" case should be moved _after_ the conditional
tests, since most CPU's out there will do static prediction based on
forward/backwards direction if the branch predictor isn't primed. And so
since it's an unusual case, the branch should be a forward branch, which
is usually the not-taken one (this depends on the core, of course, and you
could also add the prefix byte to mark it explicitly predict-not-taken for 
the newer cores that support it).

			Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 20:10 ` Pavel Machek
@ 2005-03-13 20:55   ` Stas Sergeev
  2005-03-13 21:13     ` Linus Torvalds
  0 siblings, 1 reply; 25+ messages in thread
From: Stas Sergeev @ 2005-03-13 20:55 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Alan Cox, Linux kernel, Linus Torvalds, Petr Vandrovec, Denis Vlasenko

Hi.

Pavel Machek wrote:
>> +	andl $(VM_MASK | (4 << 8) | 3), %eax
>> +	cmpl $((4 << 8) | 3), %eax
>> +	je ldt_ss			# returning to user-space with LDT SS
> All common linux apps use same %ss, no? Perhaps it would be more
> efficient to just check if %ss == 0x7b, and proceed directly to
> restore_nocheck if not?
Such an optimization will cost three more
instructions, one of which is a "taken"
jump. It seems like the "taken" jump on
a fast path is not good, while now it is
only 5 instructions with a not-taken jump.
I am not sure here, but I think the current
solution is better (depends on how bad the
"taken" jump is, and how bad it is to have
the three extra insns for that optimization
purpose).

> Or perhaps we could only enable this code
> after application loads custom ldt?
The good thing here is that the code
actually does what you say, i.e. it jumps
to ldt_ss only when the app has loaded
the custom ldt and loaded that selector
to %ss. The way it is implemented, is
probably different from what you mean,
I assume you mean the new per-thread flag?
But I don't see how it can be more optimal,
i.e. you propose to check whether or not
the app altered the ldt (which can just be
the old glibc I think), while the current
solution is to also check whether it was
loaded to %ss (so the glibc case is avoided,
IIRC glibc used to load %gs with LDT selector).
I.e. since right now we jump to ldt_ss only
when the %ss is loaded with an LDT selector,
I think the extra checks are not needed.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 18:20 Stas Sergeev
  2005-03-13 18:52 ` Grzegorz Kulewski
@ 2005-03-13 20:10 ` Pavel Machek
  2005-03-13 20:55   ` Stas Sergeev
  1 sibling, 1 reply; 25+ messages in thread
From: Pavel Machek @ 2005-03-13 20:10 UTC (permalink / raw)
  To: Stas Sergeev
  Cc: Alan Cox, Linux kernel, Linus Torvalds, Petr Vandrovec, Denis Vlasenko

Hi!

> @@ -257,8 +265,31 @@
>  	movl TI_flags(%ebp), %ecx
>  	testw $_TIF_ALLWORK_MASK, %cx	# current->work
>  	jne syscall_exit_work
> +
>  restore_all:
> -	RESTORE_ALL
> +	movl EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
> +	movb OLDSS(%esp), %ah
> +	movb CS(%esp), %al
> +	andl $(VM_MASK | (4 << 8) | 3), %eax
> +	cmpl $((4 << 8) | 3), %eax
> +	je ldt_ss			# returning to user-space with LDT SS

All common linux apps use same %ss, no? Perhaps it would be more
efficient to just check if %ss == 0x7b, and proceed directly to
restore_nocheck if not?

Or perhaps we could only enable this code after application loads
custom ldt?

								Pavel

-- 
People were complaining that M$ turns users into beta-testers...
...jr ghea gurz vagb qrirybcref, naq gurl frrz gb yvxr vg gung jnl!

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 18:52 ` Grzegorz Kulewski
  2005-03-13 19:11   ` Stas Sergeev
@ 2005-03-13 20:02   ` Pavel Machek
  1 sibling, 0 replies; 25+ messages in thread
From: Pavel Machek @ 2005-03-13 20:02 UTC (permalink / raw)
  To: Grzegorz Kulewski
  Cc: Stas Sergeev, Alan Cox, Linux kernel, Linus Torvalds,
	Petr Vandrovec, Denis Vlasenko

Hi!

> >Attached patch works around the corruption
> >of the high word of the ESP register, which
> >is the official bug of x86 CPUs. The bug
> >triggers only when the one is using the
> >16bit stack segment, and is described here:
> >http://www.intel.com/design/intarch/specupdt/27287402.PDF
> 
> > Does the bug also exist on AMD CPU's? Does the patch add anything to 
> kernels compiled for AMD CPU's?

Yes, same workaround is needed on AMDs, Cyrixes, ...

								Pavel

-- 
People were complaining that M$ turns users into beta-testers...
...jr ghea gurz vagb qrirybcref, naq gurl frrz gb yvxr vg gung jnl!

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 19:37     ` Ondrej Zary
@ 2005-03-13 19:46       ` Stas Sergeev
  0 siblings, 0 replies; 25+ messages in thread
From: Stas Sergeev @ 2005-03-13 19:46 UTC (permalink / raw)
  To: Ondrej Zary; +Cc: Grzegorz Kulewski, Linux kernel, Petr Vandrovec

Hello.

Ondrej Zary wrote:
> I've just ran that on my Cyrix MII PR300 and the bug is present:<>
> UMC U5SX/33 in my router - also present:
Thanks, now I know for sure that it exists
everywhere.
Now you can apply the patch and make sure
the bug goes away:)


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 19:11   ` Stas Sergeev
@ 2005-03-13 19:37     ` Ondrej Zary
  2005-03-13 19:46       ` Stas Sergeev
  0 siblings, 1 reply; 25+ messages in thread
From: Ondrej Zary @ 2005-03-13 19:37 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Grzegorz Kulewski, Linux kernel, Petr Vandrovec

Stas Sergeev wrote:
> Hello.
> 
> Grzegorz Kulewski wrote:
> 
>> Does the bug also exist on AMD CPU's?
> 
> Yes. As well as the ones of a Transmeta etc.
> I just haven't tested the old Cyrixes, that
> AFAIK were trying to ignore some Intel bugs.
> The test-case for the bug is here:
> http://www.ussg.iu.edu/hypermail/linux/kernel/0409.2/0690.html

I've just run that on my Cyrix MII PR300 and the bug is present:
old_ss=0x7b new_ss=0x7f
In sighandler: esp=bffff780
old_esp=0xbffff780 new_esp=0xc1a6f780
BUG!

I have also an older Cyrix CPU - 6x86 PR166 - but can't test it now as 
it's sitting in a plastic box on the shelf :-)

UMC U5SX/33 in my router - also present:
old_ss=0x7b new_ss=0x7f
In sighandler: esp=bffff820
old_esp=0xbffff820 new_esp=0xc003f820
BUG!


-- 
Ondrej Zary

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 18:52 ` Grzegorz Kulewski
@ 2005-03-13 19:11   ` Stas Sergeev
  2005-03-13 19:37     ` Ondrej Zary
  2005-03-13 20:02   ` Pavel Machek
  1 sibling, 1 reply; 25+ messages in thread
From: Stas Sergeev @ 2005-03-13 19:11 UTC (permalink / raw)
  To: Grzegorz Kulewski; +Cc: Linux kernel, Petr Vandrovec

Hello.

Grzegorz Kulewski wrote:
> Does the bug also exist on AMD CPU's?
Yes. As well as the ones of a Transmeta etc.
I just haven't tested the old Cyrixes, that
AFAIK were trying to ignore some Intel bugs.
The test-case for the bug is here:
http://www.ussg.iu.edu/hypermail/linux/kernel/0409.2/0690.html

> Does the patch add anything to 
> kernels compiled for AMD CPU's?
Same as for the Intel ones - unless you are
a dosemu or, to a lesser extent, Wine user -
nothing except for fixing the small "information
leak".
If you are the dosemu user however, then this
patch adds a lot. Whether or not it adds
something for the VMWare users I can't say, since
I am not the one of them, but my guess is that
it can help with the DOS programs under it. 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch] x86: fix ESP corruption CPU bug
  2005-03-13 18:20 Stas Sergeev
@ 2005-03-13 18:52 ` Grzegorz Kulewski
  2005-03-13 19:11   ` Stas Sergeev
  2005-03-13 20:02   ` Pavel Machek
  2005-03-13 20:10 ` Pavel Machek
  1 sibling, 2 replies; 25+ messages in thread
From: Grzegorz Kulewski @ 2005-03-13 18:52 UTC (permalink / raw)
  To: Stas Sergeev
  Cc: Alan Cox, Linux kernel, Linus Torvalds, Petr Vandrovec,
	Denis Vlasenko, Pavel Machek

On Sun, 13 Mar 2005, Stas Sergeev wrote:

> Hi Alan.
>
> Attached patch works around the corruption
> of the high word of the ESP register, which
> is the official bug of x86 CPUs. The bug
> triggers only when the one is using the
> 16bit stack segment, and is described here:
> http://www.intel.com/design/intarch/specupdt/27287402.PDF

Does the bug also exist on AMD CPU's? Does the patch add anything to 
kernels compiled for AMD CPU's?


Thanks,

Grzegorz Kulewski

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [patch] x86: fix ESP corruption CPU bug
@ 2005-03-13 18:20 Stas Sergeev
  2005-03-13 18:52 ` Grzegorz Kulewski
  2005-03-13 20:10 ` Pavel Machek
  0 siblings, 2 replies; 25+ messages in thread
From: Stas Sergeev @ 2005-03-13 18:20 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linux kernel, Linus Torvalds, Petr Vandrovec, Denis Vlasenko,
	Pavel Machek

[-- Attachment #1: Type: text/plain, Size: 1817 bytes --]

Hi Alan.

Attached patch works around the corruption
of the high word of the ESP register, which
is an official bug of x86 CPUs. The bug
triggers only when one is using a
16bit stack segment, and is described here:
http://www.intel.com/design/intarch/specupdt/27287402.PDF

Patch helps running many apps under dosemu,
and, according to the comments found in
Wine sources, also helps Wine.

Also the patch closes the "information leak",
which is that the half of the kernel's %esp
value gets leaked to user-space.

The initial discussion about the problem
can be found here:
http://lkml.org/lkml/2004/9/16/254
On a later stages the development of that
patch was driven mainly by Linus, but it
was in private.

This patch adds only 6 instructions for
the fast kernel-->user return path, 6
instructions on an exception path and
5 instructions on an NMI path. All the
rest of the patch gets executed only
for the two apps in that world, namely,
dosemu and wine (and VMWare?).
That's why it should be relatively easy
to make sure the patch doesn't do any
harm for the normal apps, and so it is
safe, and probably fits into an -ac tree.

How it works:
- Allocates the per-cpu data for 16bit
stacks and sets the per-cpu GDT entries
for them.
- On a return to userspace, checks whether
the SS is from LDT and is 16 bit.
- If it is, the iret frame gets copied
to the 16bit per-cpu stack and stack gets
switched to the 16bit one, while the
higher word of %esp gets preloaded with
the proper value.
- On exceptions a check is performed
to see if we are on a 16bit stack, and
if we are - switch to the 32bit one.


Alan, would it be possible to apply that
patch to an -ac tree?


Acked-by: Linus Torvalds <torvalds@osdl.org>
Acked-by: Petr Vandrovec <vandrove@vc.cvut.cz>
Signed-off-by: Stas Sergeev <stsp@aknet.ru>


[-- Attachment #2: linux-2.6.11-ac2-stk7.diff --]
[-- Type: text/x-patch, Size: 11163 bytes --]

diff -ur linux-2.6.11-ac2/arch/i386/kernel/cpu/common.c linux-2.6.11-ac2-stk/arch/i386/kernel/cpu/common.c
--- linux-2.6.11-ac2/arch/i386/kernel/cpu/common.c	2005-03-13 17:59:45.000000000 +0300
+++ linux-2.6.11-ac2-stk/arch/i386/kernel/cpu/common.c	2005-03-13 18:17:47.000000000 +0300
@@ -21,6 +21,10 @@
 DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_table);
 
+unsigned char cpu_16bit_stack[CPU_16BIT_STACK_SIZE];
+DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
+EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
+
 static int cachesize_override __initdata = -1;
 static int disable_x86_fxsr __initdata = 0;
 static int disable_x86_serial_nr __initdata = 1;
@@ -557,6 +561,7 @@
 	int cpu = smp_processor_id();
 	struct tss_struct * t = &per_cpu(init_tss, cpu);
 	struct thread_struct *thread = &current->thread;
+	__u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
 
 	if (cpu_test_and_set(cpu, cpu_initialized)) {
 		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -579,6 +584,13 @@
 	 */
 	memcpy(&per_cpu(cpu_gdt_table, cpu), cpu_gdt_table,
 	       GDT_SIZE);
+
+	/* Set up GDT entry for 16bit stack */
+	*(__u64 *)&(per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_ESPFIX_SS]) |=
+		((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
+		((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
+		(CPU_16BIT_STACK_SIZE - 1);
+
 	cpu_gdt_descr[cpu].size = GDT_SIZE - 1;
 	cpu_gdt_descr[cpu].address =
 	    (unsigned long)&per_cpu(cpu_gdt_table, cpu);
diff -ur linux-2.6.11-ac2/arch/i386/kernel/entry.S linux-2.6.11-ac2-stk/arch/i386/kernel/entry.S
--- linux-2.6.11-ac2/arch/i386/kernel/entry.S	2005-03-13 17:59:45.000000000 +0300
+++ linux-2.6.11-ac2-stk/arch/i386/kernel/entry.S	2005-03-13 18:17:47.000000000 +0300
@@ -47,6 +47,7 @@
 #include <asm/segment.h>
 #include <asm/smp.h>
 #include <asm/page.h>
+#include <asm/desc.h>
 #include "irq_vectors.h"
 
 #define nr_syscalls ((syscall_table_size)/4)
@@ -78,7 +79,7 @@
 #define preempt_stop		cli
 #else
 #define preempt_stop
-#define resume_kernel		restore_all
+#define resume_kernel		restore_nocheck
 #endif
 
 #define SAVE_ALL \
@@ -122,24 +123,6 @@
 .previous
 
 
-#define RESTORE_ALL	\
-	RESTORE_REGS	\
-	addl $4, %esp;	\
-1:	iret;		\
-.section .fixup,"ax";   \
-2:	sti;		\
-	movl $(__USER_DS), %edx; \
-	movl %edx, %ds; \
-	movl %edx, %es; \
-	movl $11,%eax;	\
-	call do_exit;	\
-.previous;		\
-.section __ex_table,"a";\
-	.align 4;	\
-	.long 1b,2b;	\
-.previous
-
-
 ENTRY(ret_from_fork)
 	pushl %eax
 	call schedule_tail
@@ -163,7 +146,7 @@
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
 	testl $(VM_MASK | 3), %eax
-	jz resume_kernel		# returning to kernel or vm86-space
+	jz resume_kernel
 ENTRY(resume_userspace)
  	cli				# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -178,7 +161,7 @@
 ENTRY(resume_kernel)
 	cli
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
-	jnz restore_all
+	jnz restore_nocheck
 need_resched:
 	movl TI_flags(%ebp), %ecx	# need_resched set ?
 	testb $_TIF_NEED_RESCHED, %cl
@@ -189,6 +172,31 @@
 	jmp need_resched
 #endif
 
+ldt_ss:
+	larl OLDSS(%esp), %eax
+	jnz restore_nocheck
+	testl $0x00400000, %eax		# returning to 32bit stack?
+	jnz restore_nocheck		# allright, normal return
+	/* If returning to userspace with 16bit stack,
+	 * try to fix the higher word of ESP, as the CPU
+	 * won't restore it.
+	 * This is an "official" bug of all the x86-compatible
+	 * CPUs, which we can try to work around to make
+	 * dosemu and wine happy. */
+	subl $8, %esp		# reserve space for switch16 pointer
+	cli
+	movl %esp, %eax
+	/* Set up the 16bit stack frame with switch32 pointer on top,
+	 * and a switch16 pointer on top of the current frame. */
+	call setup_x86_bogus_stack
+	RESTORE_REGS
+	lss 20+4(%esp), %esp	# switch to 16bit stack
+1:	iret
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
+
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
 
@@ -257,8 +265,31 @@
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	jne syscall_exit_work
+
 restore_all:
-	RESTORE_ALL
+	movl EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
+	movb OLDSS(%esp), %ah
+	movb CS(%esp), %al
+	andl $(VM_MASK | (4 << 8) | 3), %eax
+	cmpl $((4 << 8) | 3), %eax
+	je ldt_ss			# returning to user-space with LDT SS
+restore_nocheck:
+	RESTORE_REGS
+	addl $4, %esp
+1:	iret
+.section .fixup,"ax"
+iret_exc:
+	sti
+	movl $__USER_DS, %edx
+	movl %edx, %ds
+	movl %edx, %es
+	movl $11,%eax
+	call do_exit
+.previous
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
 
 	# perform work that needs to be done immediately before resumption
 	ALIGN
@@ -334,6 +365,27 @@
 	movl $-ENOSYS,EAX(%esp)
 	jmp resume_userspace
 
+#define FIXUP_ESPFIX_STACK \
+	movl %esp, %eax; \
+	/* switch to 32bit stack using the pointer on top of 16bit stack */ \
+	lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
+	/* copy data from 16bit stack to 32bit stack */ \
+	call fixup_x86_bogus_stack; \
+	/* put ESP to the proper location */ \
+	movl %eax, %esp;
+#define UNWIND_ESPFIX_STACK \
+	pushl %eax; \
+	movl %ss, %eax; \
+	/* see if on 16bit stack */ \
+	cmpw $__ESPFIX_SS, %ax; \
+	jne 28f; \
+	movl $__KERNEL_DS, %edx; \
+	movl %edx, %ds; \
+	movl %edx, %es; \
+	/* switch to 32bit stack */ \
+	FIXUP_ESPFIX_STACK \
+28:	popl %eax;
+
 /*
  * Build the entry stubs and pointer table with
  * some assembler magic.
@@ -388,7 +440,9 @@
 	pushl %ecx
 	pushl %ebx
 	cld
-	movl %es, %ecx
+	pushl %es
+	UNWIND_ESPFIX_STACK
+	popl %ecx
 	movl ES(%esp), %edi		# get the function address
 	movl ORIG_EAX(%esp), %edx	# get the error code
 	movl %eax, ORIG_EAX(%esp)
@@ -470,6 +524,11 @@
  * fault happened on the sysenter path.
  */
 ENTRY(nmi)
+	pushl %eax
+	movl %ss, %eax
+	cmpw $__ESPFIX_SS, %ax
+	popl %eax
+	je nmi_16bit_stack
 	cmpl $sysenter_entry,(%esp)
 	je nmi_stack_fixup
 	pushl %eax
@@ -489,7 +548,7 @@
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_nmi
-	RESTORE_ALL
+	jmp restore_all
 
 nmi_stack_fixup:
 	FIX_STACK(12,nmi_stack_correct, 1)
@@ -505,6 +564,29 @@
 	FIX_STACK(24,nmi_stack_correct, 1)
 	jmp nmi_stack_correct
 
+nmi_16bit_stack:
+	/* create the pointer to lss back */
+	pushl %ss
+	pushl %esp
+	movzwl %sp, %esp
+	addw $4, (%esp)
+	/* copy the iret frame of 12 bytes */
+	.rept 3
+	pushl 16(%esp)
+	.endr
+	pushl %eax
+	SAVE_ALL
+	FIXUP_ESPFIX_STACK		# %eax == %esp
+	xorl %edx,%edx			# zero error code
+	call do_nmi
+	RESTORE_REGS
+	lss 12+4(%esp), %esp		# back to 16bit stack
+1:	iret
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
+
 ENTRY(int3)
 	pushl $-1			# mark this as an int
 	SAVE_ALL
diff -ur linux-2.6.11-ac2/arch/i386/kernel/head.S linux-2.6.11-ac2-stk/arch/i386/kernel/head.S
--- linux-2.6.11-ac2/arch/i386/kernel/head.S	2005-03-13 17:56:19.000000000 +0300
+++ linux-2.6.11-ac2-stk/arch/i386/kernel/head.S	2005-03-13 18:17:47.000000000 +0300
@@ -512,7 +512,7 @@
 	.quad 0x00009a0000000000	/* 0xc0 APM CS 16 code (16 bit) */
 	.quad 0x0040920000000000	/* 0xc8 APM DS    data */
 
-	.quad 0x0000000000000000	/* 0xd0 - unused */
+	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
 	.quad 0x0000000000000000	/* 0xd8 - unused */
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
diff -ur linux-2.6.11-ac2/arch/i386/kernel/traps.c linux-2.6.11-ac2-stk/arch/i386/kernel/traps.c
--- linux-2.6.11-ac2/arch/i386/kernel/traps.c	2005-03-13 17:59:45.000000000 +0300
+++ linux-2.6.11-ac2-stk/arch/i386/kernel/traps.c	2005-03-13 18:17:47.000000000 +0300
@@ -895,6 +895,51 @@
 #endif
 }
 
+fastcall void setup_x86_bogus_stack(unsigned char * stk)
+{	/* stage the iret frame on this CPU's 16bit stack and fill in both stack-switch pointers */
+	unsigned long *switch16_ptr, *switch32_ptr;	/* each is an {offset, selector} pair */
+	struct pt_regs *regs;
+	unsigned long stack_top, stack_bot;
+	unsigned short iret_frame16_off;
+	int cpu = smp_processor_id();
+	/* slide pt_regs down by 8 bytes, freeing 8 bytes right above it on the 32bit stack for the switch16 pointer */
+	memmove(stk, stk + 8, sizeof(struct pt_regs));
+	switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
+	regs = (struct pt_regs *)stk;
+	/* the switch32 pointer occupies the top 8 bytes of this CPU's 16bit stack */
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot +	CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);
+	iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;	/* 20-byte frame sits just below the switch32 pointer */
+	/* copy the 20 bytes starting at regs->eip (the iret frame) onto the 16bit stack */
+	memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
+	/* fill in the switch pointers: switch16 = high word of regs->esp | 16bit frame offset, with the ESPFIX selector */
+	switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
+	switch16_ptr[1] = __ESPFIX_SS;
+	switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
+		8 - CPU_16BIT_STACK_SIZE;	/* biased by -CPU_16BIT_STACK_SIZE; fixup_x86_bogus_stack() adds it back */
+	switch32_ptr[1] = __KERNEL_DS;
+}
+
+fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
+{	/* copy the in-use part of this CPU's 16bit stack to the 32bit stack; sp is an offset into the 16bit stack */
+	unsigned long *switch32_ptr;
+	unsigned char *stack16, *stack32;
+	unsigned long stack_top, stack_bot;
+	int len;
+	int cpu = smp_processor_id();
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot +	CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);	/* stored in the top 8 bytes of the 16bit stack */
+	/* copy everything between sp and the switch32 pointer from the 16bit stack to the 32bit stack */
+	len = CPU_16BIT_STACK_SIZE - 8 - sp;
+	stack16 = (unsigned char *)(stack_bot + sp);
+	stack32 = (unsigned char *)
+		(switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);	/* undoes the bias applied in setup_x86_bogus_stack() */
+	memcpy(stack32, stack16, len);
+	return stack32;	/* start of the copied data on the 32bit stack */
+}
+
 /*
  *  'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
diff -ur linux-2.6.11-ac2/include/asm-i386/desc.h linux-2.6.11-ac2-stk/include/asm-i386/desc.h
--- linux-2.6.11-ac2/include/asm-i386/desc.h	2005-03-13 17:57:12.000000000 +0300
+++ linux-2.6.11-ac2-stk/include/asm-i386/desc.h	2005-03-13 18:17:47.000000000 +0300
@@ -4,6 +4,8 @@
 #include <asm/ldt.h>
 #include <asm/segment.h>
 
+#define CPU_16BIT_STACK_SIZE 1024
+
 #ifndef __ASSEMBLY__
 
 #include <linux/preempt.h>
@@ -15,6 +17,9 @@
 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
 DECLARE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
 
+extern unsigned char cpu_16bit_stack[CPU_16BIT_STACK_SIZE];
+DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
+
 struct Xgt_desc_struct {
 	unsigned short size;
 	unsigned long address __attribute__((packed));
diff -ur linux-2.6.11-ac2/include/asm-i386/segment.h linux-2.6.11-ac2-stk/include/asm-i386/segment.h
--- linux-2.6.11-ac2/include/asm-i386/segment.h	2005-01-04 03:59:37.000000000 +0300
+++ linux-2.6.11-ac2-stk/include/asm-i386/segment.h	2005-03-13 18:17:47.000000000 +0300
@@ -38,7 +38,7 @@
  *  24 - APM BIOS support
  *  25 - APM BIOS support 
  *
- *  26 - unused
+ *  26 - ESPFIX small SS
  *  27 - unused
  *  28 - unused
  *  29 - unused
@@ -71,6 +71,9 @@
 #define GDT_ENTRY_PNPBIOS_BASE		(GDT_ENTRY_KERNEL_BASE + 6)
 #define GDT_ENTRY_APMBIOS_BASE		(GDT_ENTRY_KERNEL_BASE + 11)
 
+#define GDT_ENTRY_ESPFIX_SS		(GDT_ENTRY_KERNEL_BASE + 14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+
 #define GDT_ENTRY_DOUBLEFAULT_TSS	31
 
 /*

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2005-03-14 20:24 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-01-03 23:39 [patch] x86: fix ESP corruption CPU bug Stas Sergeev
2005-01-04  0:01 ` Linus Torvalds
2005-01-04  1:58   ` Stas Sergeev
2005-03-13 18:20 Stas Sergeev
2005-03-13 18:52 ` Grzegorz Kulewski
2005-03-13 19:11   ` Stas Sergeev
2005-03-13 19:37     ` Ondrej Zary
2005-03-13 19:46       ` Stas Sergeev
2005-03-13 20:02   ` Pavel Machek
2005-03-13 20:10 ` Pavel Machek
2005-03-13 20:55   ` Stas Sergeev
2005-03-13 21:13     ` Linus Torvalds
2005-03-13 23:17       ` Pavel Machek
2005-03-13 23:54         ` Linus Torvalds
2005-03-14  0:16       ` Linus Torvalds
2005-03-14  4:52         ` Stas Sergeev
2005-03-14  9:34           ` Andi Kleen
2005-03-14 15:21             ` Jakob Eriksson
2005-03-14 17:03               ` linux-os
2005-03-14 17:10                 ` Pavel Machek
2005-03-14 19:24                 ` Brian Gerst
2005-03-14 20:21                   ` Stas Sergeev
2005-03-14 18:02               ` Stas Sergeev
2005-03-14 17:29             ` Stas Sergeev
2005-03-14 11:10 Zoltan Boszormenyi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).