All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch] Memory Error Handling Improvement
@ 2005-06-23 17:30 Russ Anderson
  2005-06-23 17:52 ` David Mosberger
                   ` (17 more replies)
  0 siblings, 18 replies; 19+ messages in thread
From: Russ Anderson @ 2005-06-23 17:30 UTC (permalink / raw)
  To: linux-ia64

	[patch] Memory Error Handling Improvement

With the current memory recovery code, if the MCA surfaces while the CPU is
in privilage mode, the code does not try to recover.  There are cases where
a read is launched in user context, but an interrupt starts a context
switch causing the MCA to surface early in the context switch code.  This
patch add a check to see if the MCA surfaced in the interrupt code while
saving user state, and if so, the MCA was due to a useland read and can
be recovered.

Jack Steiner wrote the significant parts of this code.

Testing: The test program allocates memory, does a SAL call to change the 
	ECC on that memory to create a hardware uncorrectable error, then 
	consumes the data, causing the MCA.  With other interrupt traffic 
	(to force context switches) the 2.6.12 code will not recover after
	8-100 passes.  With this patch, the test consistently reaches the
	1024 recovery limit.  Analysis of the recovered MCA records shows 
	7-10% are in the interrupt code saving user state.  Those MCAs
	become recovered with this patch.

Signed-off-by: Russ Anderson (rja@sgi.com)

----------------------------------------------------------------------
Index: linux-2.6/arch/ia64/kernel/mca_drv.c
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv.c	2005-06-22 10:27:30.006898406 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv.c	2005-06-23 10:58:30.145638400 -0500
@@ -118,10 +118,11 @@
  */
 
 void
-mca_handler_bh(unsigned long paddr)
+mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
 {
-	printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
-		current->pid, current->comm);
+	printk(KERN_DEBUG "OS_MCA: process [cpu %d, pid: %d, uid: %d, iip: %p, psr: 0x%lx, paddr: 0x%lx](%s) encounters MCA.\n",
+		smp_processor_id(), current->pid, current->uid, iip, ipsr, paddr, current->
+comm);
 
 	spin_lock(&mca_bh_lock);
 	if (mca_page_isolate(paddr) = ISOLATE_OK) {
@@ -394,6 +395,7 @@
 	pal_min_state_area_t *pmsa;
 	struct ia64_psr *psr1, *psr2;
 	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
+	extern void *interrupt, *interrupt_pnr;
 
 	/* Is target address valid? */
 	if (!pbci->tv)
@@ -419,16 +421,19 @@
 	 *  Check the privilege level of interrupted context.
 	 *   If it is user-mode, then terminate affected process.
 	 */
-	if (psr1->cpl != 0) {
+	pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
+	if (psr1->cpl != 0 || (pmsa->pmsa_iip >= (unsigned long)&interrupt &&
+			       pmsa->pmsa_iip <  (unsigned long)&interrupt_pnr)) {
 		smei = peidx_bus_check(peidx, 0);
 		if (smei->valid.target_identifier) {
 			/*
 			 *  setup for resume to bottom half of MCA,
 			 * "mca_handler_bhhook"
 			 */
-			pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
-			/* pass to bhhook as 1st argument (gr8) */
+			/* pass to bhhook as argument (gr8, ...) */
 			pmsa->pmsa_gr[8-1] = smei->target_identifier;
+			pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
+			pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
 			/* set interrupted return address (but no use) */
 			pmsa->pmsa_br0 = pmsa->pmsa_iip;
 			/* change resume address to bottom half */
@@ -438,6 +443,7 @@
 			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
 			psr2->cpl = 0;
 			psr2->ri  = 0;
+			psr2->bn  = 1;
 			psr2->i  = 0;
 
 			return 1;
Index: linux-2.6/arch/ia64/kernel/mca_drv_asm.S
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv_asm.S	2005-06-22 10:27:30.006898406 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv_asm.S	2005-06-22 15:30:35.506339996 -0500
@@ -19,7 +19,7 @@
 	;;						//
 	clrrrb						//
 	;;						
-	alloc		r16=ar.pfs,0,2,1,0		// make a new frame
+	alloc		r16=ar.pfs,0,2,3,0		// make a new frame
 	;;
 	mov		ar.rsc=0
 	;;
@@ -40,11 +40,13 @@
 	movl		loc1=mca_handler_bh		// recovery C function
 	;;
 	mov		out0=r8				// poisoned address
+	mov		out1=r9				// iip
+	mov		out2=r10			// psr
 	mov		b6=loc1
 	;;
 	mov		loc1=rp
 	;;
-	ssm		psr.i
+	ssm		psr.i | psr.ic
 	;;
 	br.call.sptk.many    rp¶			// does not return ...
 	;;
Index: linux-2.6/arch/ia64/kernel/ia64_ksyms.c
=================================--- linux-2.6.orig/arch/ia64/kernel/ia64_ksyms.c	2005-06-22 10:27:30.006898406 -0500
+++ linux-2.6/arch/ia64/kernel/ia64_ksyms.c	2005-06-22 15:30:35.507316546 -0500
@@ -123,5 +123,9 @@
 # endif
 #endif
 
+extern char interrupt, interrupt_pnr;
+EXPORT_SYMBOL(interrupt);
+EXPORT_SYMBOL(interrupt_pnr);
+
 extern char ia64_ivt[];
 EXPORT_SYMBOL(ia64_ivt);
Index: linux-2.6/arch/ia64/kernel/ivt.S
=================================--- linux-2.6.orig/arch/ia64/kernel/ivt.S	2005-06-22 10:27:30.007874957 -0500
+++ linux-2.6/arch/ia64/kernel/ivt.S	2005-06-22 15:30:35.508293097 -0500
@@ -768,7 +768,7 @@
 	.org ia64_ivt+0x3000
 /////////////////////////////////////////////////////////////////////////////////////////
 // 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
-ENTRY(interrupt)
+GLOBAL_ENTRY(interrupt)
 	DBG_FAULT(12)
 	mov r31=pr		// prepare to save predicates
 	;;
@@ -780,6 +780,8 @@
 	;;
 	SAVE_REST
 	;;
+	.global	interrupt_pnr
+interrupt_pnr:
 	alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
 	mov out0=cr.ivr		// pass cr.ivr as first arg
 	add out1\x16,sp		// pass pointer to pt_regs as second arg

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
@ 2005-06-23 17:52 ` David Mosberger
  2005-06-23 19:30 ` Russ Anderson
                   ` (16 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: David Mosberger @ 2005-06-23 17:52 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Thu, 23 Jun 2005 12:30:23 -0500 (CDT), Russ Anderson <rja@sgi.com> said:

  Russ> -ENTRY(interrupt)
  Russ> +GLOBAL_ENTRY(interrupt)

Please don't create such name-space pollution.

AFAICS, you should be able to use ia64_ivt and the architected offset
for the external-interrupt handler instead.

	--david

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
  2005-06-23 17:52 ` David Mosberger
@ 2005-06-23 19:30 ` Russ Anderson
  2005-06-23 22:11 ` Andreas Schwab
                   ` (15 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Russ Anderson @ 2005-06-23 19:30 UTC (permalink / raw)
  To: linux-ia64

David Mosberger wrote:
> 
>   Russ> -ENTRY(interrupt)
>   Russ> +GLOBAL_ENTRY(interrupt)
> 
> Please don't create such name-space pollution.
> 
> AFAICS, you should be able to use ia64_ivt and the architected offset
> for the external-interrupt handler instead.

I assume you mean usage like in arch/ia64/oprofile/backtrace.c, so the
usage in mca_drv.c would be along the lines of:

	extern char ia64_ivt[];

	if (psr1->cpl != 0 || (pmsa->pmsa_iip >= (unsigned long)ia64_ivt+0x3000 &&

Do you have the same objection for interrupt_pnr?  If so, what is 
the best way to calculate the offset in ivt.S (which looks hardcoded for
other routines)?


The updated patch with GLOBAL_ENTRY(interrupt) removed:

--------------------------------------------------------------------------
Index: linux-2.6/arch/ia64/kernel/mca_drv.c
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv.c	2005-06-23 14:04:13.741379508 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv.c	2005-06-23 14:04:18.584089395 -0500
@@ -118,10 +118,11 @@
  */
 
 void
-mca_handler_bh(unsigned long paddr)
+mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
 {
-	printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
-		current->pid, current->comm);
+	printk(KERN_DEBUG "OS_MCA: process [cpu %d, pid: %d, uid: %d, iip: %p, psr: 0x%lx, paddr: 0x%lx](%s) encounters MCA.\n",
+		smp_processor_id(), current->pid, current->uid, iip, ipsr, paddr, current->
+comm);
 
 	spin_lock(&mca_bh_lock);
 	if (mca_page_isolate(paddr) = ISOLATE_OK) {
@@ -394,6 +395,8 @@
 	pal_min_state_area_t *pmsa;
 	struct ia64_psr *psr1, *psr2;
 	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
+	extern char ia64_ivt[];
+	extern void *interrupt_pnr;
 
 	/* Is target address valid? */
 	if (!pbci->tv)
@@ -419,16 +422,19 @@
 	 *  Check the privilege level of interrupted context.
 	 *   If it is user-mode, then terminate affected process.
 	 */
-	if (psr1->cpl != 0) {
+	pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
+	if (psr1->cpl != 0 || (pmsa->pmsa_iip >= (unsigned long)ia64_ivt+0x3000 &&
+			       pmsa->pmsa_iip <  (unsigned long)&interrupt_pnr)) {
 		smei = peidx_bus_check(peidx, 0);
 		if (smei->valid.target_identifier) {
 			/*
 			 *  setup for resume to bottom half of MCA,
 			 * "mca_handler_bhhook"
 			 */
-			pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
-			/* pass to bhhook as 1st argument (gr8) */
+			/* pass to bhhook as argument (gr8, ...) */
 			pmsa->pmsa_gr[8-1] = smei->target_identifier;
+			pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
+			pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
 			/* set interrupted return address (but no use) */
 			pmsa->pmsa_br0 = pmsa->pmsa_iip;
 			/* change resume address to bottom half */
@@ -438,6 +444,7 @@
 			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
 			psr2->cpl = 0;
 			psr2->ri  = 0;
+			psr2->bn  = 1;
 			psr2->i  = 0;
 
 			return 1;
Index: linux-2.6/arch/ia64/kernel/mca_drv_asm.S
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv_asm.S	2005-06-23 14:04:13.742356057 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv_asm.S	2005-06-23 14:04:18.585065944 -0500
@@ -19,7 +19,7 @@
 	;;						//
 	clrrrb						//
 	;;						
-	alloc		r16=ar.pfs,0,2,1,0		// make a new frame
+	alloc		r16=ar.pfs,0,2,3,0		// make a new frame
 	;;
 	mov		ar.rsc=0
 	;;
@@ -40,11 +40,13 @@
 	movl		loc1=mca_handler_bh		// recovery C function
 	;;
 	mov		out0=r8				// poisoned address
+	mov		out1=r9				// iip
+	mov		out2=r10			// psr
 	mov		b6=loc1
 	;;
 	mov		loc1=rp
 	;;
-	ssm		psr.i
+	ssm		psr.i | psr.ic
 	;;
 	br.call.sptk.many    rp¶			// does not return ...
 	;;
Index: linux-2.6/arch/ia64/kernel/ia64_ksyms.c
=================================--- linux-2.6.orig/arch/ia64/kernel/ia64_ksyms.c	2005-06-23 14:04:13.742356057 -0500
+++ linux-2.6/arch/ia64/kernel/ia64_ksyms.c	2005-06-23 14:04:18.585065944 -0500
@@ -123,5 +123,8 @@
 # endif
 #endif
 
+extern char interrupt_pnr;
+EXPORT_SYMBOL(interrupt_pnr);
+
 extern char ia64_ivt[];
 EXPORT_SYMBOL(ia64_ivt);
Index: linux-2.6/arch/ia64/kernel/ivt.S
=================================--- linux-2.6.orig/arch/ia64/kernel/ivt.S	2005-06-23 14:04:13.742356057 -0500
+++ linux-2.6/arch/ia64/kernel/ivt.S	2005-06-23 14:04:18.586042494 -0500
@@ -780,6 +780,8 @@
 	;;
 	SAVE_REST
 	;;
+	.global	interrupt_pnr
+interrupt_pnr:
 	alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
 	mov out0=cr.ivr		// pass cr.ivr as first arg
 	add out1\x16,sp		// pass pointer to pt_regs as second arg

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
  2005-06-23 17:52 ` David Mosberger
  2005-06-23 19:30 ` Russ Anderson
@ 2005-06-23 22:11 ` Andreas Schwab
  2005-06-23 22:18 ` David Mosberger
                   ` (14 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Andreas Schwab @ 2005-06-23 22:11 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson <rja@sgi.com> writes:

> @@ -394,6 +395,8 @@
>  	pal_min_state_area_t *pmsa;
>  	struct ia64_psr *psr1, *psr2;
>  	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
> +	extern char ia64_ivt[];
> +	extern void *interrupt_pnr;

Can this be moved to some header?

Andreas.

-- 
Andreas Schwab, SuSE Labs, schwab@suse.de
SuSE Linux Products GmbH, Maxfeldstraße 5, 90409 Nürnberg, Germany
Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (2 preceding siblings ...)
  2005-06-23 22:11 ` Andreas Schwab
@ 2005-06-23 22:18 ` David Mosberger
  2005-06-23 22:22 ` Russ Anderson
                   ` (13 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: David Mosberger @ 2005-06-23 22:18 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Thu, 23 Jun 2005 14:30:20 -0500 (CDT), Russ Anderson <rja@sgi.com> said:

  Russ> David Mosberger wrote:
  >>
  Russ> -ENTRY(interrupt) +GLOBAL_ENTRY(interrupt)
  >>  Please don't create such name-space pollution.

  >> AFAICS, you should be able to use ia64_ivt and the architected
  >> offset for the external-interrupt handler instead.

  Russ> I assume you mean usage like in
  Russ> arch/ia64/oprofile/backtrace.c, so the usage in mca_drv.c
  Russ> would be along the lines of:

  Russ> 	extern char ia64_ivt[];

  Russ> 	if (psr1->cpl != 0 || (pmsa->pmsa_iip >= (unsigned
  Russ> long)ia64_ivt+0x3000 &&

  Russ> Do you have the same objection for interrupt_pnr?  If so, what
  Russ> is the best way to calculate the offset in ivt.S (which looks
  Russ> hardcoded for other routines)?

I thought interrupt_pnr was at the end of the vector.  Now that I look
closer, the whole thing looks rather doubious/fragile to me.  What
exactly are you trying to do there?

	--david

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (3 preceding siblings ...)
  2005-06-23 22:18 ` David Mosberger
@ 2005-06-23 22:22 ` Russ Anderson
  2005-06-23 22:54 ` Andreas Schwab
                   ` (12 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Russ Anderson @ 2005-06-23 22:22 UTC (permalink / raw)
  To: linux-ia64

Andreas Schwab wrote:
> Russ Anderson <rja@sgi.com> writes:
> 
> > @@ -394,6 +395,8 @@
> >  	pal_min_state_area_t *pmsa;
> >  	struct ia64_psr *psr1, *psr2;
> >  	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
> > +	extern char ia64_ivt[];
> > +	extern void *interrupt_pnr;
> 
> Can this be moved to some header?

Sure.  I'll put it in mca_drv.h.

The updated patch:
-------------------------------------------------------------------
Index: linux-2.6/arch/ia64/kernel/mca_drv.c
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv.c	2005-06-23 14:04:13.741379508 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv.c	2005-06-23 17:20:42.995242422 -0500
@@ -118,10 +118,11 @@
  */
 
 void
-mca_handler_bh(unsigned long paddr)
+mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
 {
-	printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
-		current->pid, current->comm);
+	printk(KERN_DEBUG "OS_MCA: process [cpu %d, pid: %d, uid: %d, iip: %p, psr: 0x%lx, paddr: 0x%lx](%s) encounters MCA.\n",
+		smp_processor_id(), current->pid, current->uid, iip, ipsr, paddr, current->
+comm);
 
 	spin_lock(&mca_bh_lock);
 	if (mca_page_isolate(paddr) = ISOLATE_OK) {
@@ -419,16 +420,19 @@
 	 *  Check the privilege level of interrupted context.
 	 *   If it is user-mode, then terminate affected process.
 	 */
-	if (psr1->cpl != 0) {
+	pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
+	if (psr1->cpl != 0 || (pmsa->pmsa_iip >= (unsigned long)ia64_ivt+0x3000 &&
+			       pmsa->pmsa_iip <  (unsigned long)&interrupt_pnr)) {
 		smei = peidx_bus_check(peidx, 0);
 		if (smei->valid.target_identifier) {
 			/*
 			 *  setup for resume to bottom half of MCA,
 			 * "mca_handler_bhhook"
 			 */
-			pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
-			/* pass to bhhook as 1st argument (gr8) */
+			/* pass to bhhook as argument (gr8, ...) */
 			pmsa->pmsa_gr[8-1] = smei->target_identifier;
+			pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
+			pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
 			/* set interrupted return address (but no use) */
 			pmsa->pmsa_br0 = pmsa->pmsa_iip;
 			/* change resume address to bottom half */
@@ -438,6 +442,7 @@
 			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
 			psr2->cpl = 0;
 			psr2->ri  = 0;
+			psr2->bn  = 1;
 			psr2->i  = 0;
 
 			return 1;
Index: linux-2.6/arch/ia64/kernel/mca_drv_asm.S
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv_asm.S	2005-06-23 14:04:13.742356057 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv_asm.S	2005-06-23 14:04:18.585065944 -0500
@@ -19,7 +19,7 @@
 	;;						//
 	clrrrb						//
 	;;						
-	alloc		r16=ar.pfs,0,2,1,0		// make a new frame
+	alloc		r16=ar.pfs,0,2,3,0		// make a new frame
 	;;
 	mov		ar.rsc=0
 	;;
@@ -40,11 +40,13 @@
 	movl		loc1=mca_handler_bh		// recovery C function
 	;;
 	mov		out0=r8				// poisoned address
+	mov		out1=r9				// iip
+	mov		out2=r10			// psr
 	mov		b6=loc1
 	;;
 	mov		loc1=rp
 	;;
-	ssm		psr.i
+	ssm		psr.i | psr.ic
 	;;
 	br.call.sptk.many    rp¶			// does not return ...
 	;;
Index: linux-2.6/arch/ia64/kernel/ia64_ksyms.c
=================================--- linux-2.6.orig/arch/ia64/kernel/ia64_ksyms.c	2005-06-23 14:04:13.742356057 -0500
+++ linux-2.6/arch/ia64/kernel/ia64_ksyms.c	2005-06-23 14:04:18.585065944 -0500
@@ -123,5 +123,8 @@
 # endif
 #endif
 
+extern char interrupt_pnr;
+EXPORT_SYMBOL(interrupt_pnr);
+
 extern char ia64_ivt[];
 EXPORT_SYMBOL(ia64_ivt);
Index: linux-2.6/arch/ia64/kernel/ivt.S
=================================--- linux-2.6.orig/arch/ia64/kernel/ivt.S	2005-06-23 14:04:13.742356057 -0500
+++ linux-2.6/arch/ia64/kernel/ivt.S	2005-06-23 14:04:18.586042494 -0500
@@ -780,6 +780,8 @@
 	;;
 	SAVE_REST
 	;;
+	.global	interrupt_pnr
+interrupt_pnr:
 	alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
 	mov out0=cr.ivr		// pass cr.ivr as first arg
 	add out1\x16,sp		// pass pointer to pt_regs as second arg
Index: linux-2.6/arch/ia64/kernel/mca_drv.h
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv.h	2005-06-21 15:01:11.940147599 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv.h	2005-06-23 17:20:29.779596431 -0500
@@ -111,3 +111,5 @@
 	slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\
 	__count; })
 
+extern char ia64_ivt[];
+extern void *interrupt_pnr;

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (4 preceding siblings ...)
  2005-06-23 22:22 ` Russ Anderson
@ 2005-06-23 22:54 ` Andreas Schwab
  2005-06-24  1:12 ` Hidetoshi Seto
                   ` (11 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Andreas Schwab @ 2005-06-23 22:54 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson <rja@sgi.com> writes:

> Index: linux-2.6/arch/ia64/kernel/ia64_ksyms.c
> =================================> --- linux-2.6.orig/arch/ia64/kernel/ia64_ksyms.c	2005-06-23 14:04:13.742356057 -0500
> +++ linux-2.6/arch/ia64/kernel/ia64_ksyms.c	2005-06-23 14:04:18.585065944 -0500
> @@ -123,5 +123,8 @@
>  # endif
>  #endif
>  
> +extern char interrupt_pnr;
> +EXPORT_SYMBOL(interrupt_pnr);
> +
>  extern char ia64_ivt[];
>  EXPORT_SYMBOL(ia64_ivt);

This should use the header as well.

Andreas.

-- 
Andreas Schwab, SuSE Labs, schwab@suse.de
SuSE Linux Products GmbH, Maxfeldstraße 5, 90409 Nürnberg, Germany
Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (5 preceding siblings ...)
  2005-06-23 22:54 ` Andreas Schwab
@ 2005-06-24  1:12 ` Hidetoshi Seto
  2005-06-24 20:11 ` Russ Anderson
                   ` (10 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Hidetoshi Seto @ 2005-06-24  1:12 UTC (permalink / raw)
  To: linux-ia64

Hi Russ,

Russ Anderson wrote:
>  void
> -mca_handler_bh(unsigned long paddr)
> +mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
>  {
> -	printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
> -		current->pid, current->comm);
> +	printk(KERN_DEBUG "OS_MCA: process [cpu %d, pid: %d, uid: %d, iip: %p, psr: 0x%lx, paddr: 0x%lx](%s) encounters MCA.\n",
> +		smp_processor_id(), current->pid, current->uid, iip, ipsr, paddr, current->
> +comm);

Printing detail would be good.

> +	pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
> +	if (psr1->cpl != 0 || (pmsa->pmsa_iip >= (unsigned long)&interrupt &&
> +			       pmsa->pmsa_iip <  (unsigned long)&interrupt_pnr)) {

It seems that the latter state just means "this was in early of interrupt."
You shoud also make sure that the interrupt has happen in user context, right?

Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (6 preceding siblings ...)
  2005-06-24  1:12 ` Hidetoshi Seto
@ 2005-06-24 20:11 ` Russ Anderson
  2005-06-24 20:18 ` Russ Anderson
                   ` (9 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Russ Anderson @ 2005-06-24 20:11 UTC (permalink / raw)
  To: linux-ia64

David Mosberger wrote:
>
>   Russ> I assume you mean usage like in
>   Russ> arch/ia64/oprofile/backtrace.c, so the usage in mca_drv.c
>   Russ> would be along the lines of:
>
>   Russ>       extern char ia64_ivt[];
>
>   Russ>       if (psr1->cpl != 0 || (pmsa->pmsa_iip >= (unsigned
>   Russ> long)ia64_ivt+0x3000 &&
>
>   Russ> Do you have the same objection for interrupt_pnr?  If so, what
>   Russ> is the best way to calculate the offset in ivt.S (which looks
>   Russ> hardcoded for other routines)?
>
> I thought interrupt_pnr was at the end of the vector.  Now that I look
> closer, the whole thing looks rather doubious/fragile to me.  What
> exactly are you trying to do there?

The intent is to deal with cases where the user application does
a load of memory with bad ECC, but the MCA surfaces in the interrupt
(kernel) code.   

Testing with error injection showed a significant number of cases
where the MCA surfaced early in the interrupt routine, even though
the load of the bad data was launched from a user process.  Adding 
the second condition to look for these cases allowed them to be 
recovered.  Analysis of the recovered MCA records showed 7-10%
of the recoverys were this condition, when running the error
recovery code with other activity that caused interrupts.

Previously, if the MCA surfaced while the cpu was in privilage 
mode the code would not try to recover.  This change adds a second 
condition, to see if the kernel is early in the interrupt
routine.  It does this by checking the instruction range.  As
Hidetoshi Seto points out, the check should also make sure
the interrupted process was in user mode.  That has been added 
to the patch and tested.

A major concern is verifying that the correct process is killed.  
The name of the process killed is logged to /var/log/messages.
The test also verifies that the correct process is killed.  Early
on there was a bug in the error injection routine, that resulted
in changing the ECC on the wrong physical address.  This was 
identified because the error injection test completed without 
being killed.  If the recovery code killed the wrong process, 
it would be obvious because the error injection test would not 
get killed. 

We have also been testing the recovery code in manufacturing 
with real bad DIMMs.

The updated patch.

Signed-off-by: Russ Anderson (rja@sgi.com)

-----------------------------------------------------------
Index: linux-2.6/arch/ia64/kernel/mca_drv.c
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv.c	2005-06-24 14:24:08.100951706 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv.c	2005-06-24 14:31:03.253211915 -0500
@@ -118,10 +118,11 @@
  */
 
 void
-mca_handler_bh(unsigned long paddr)
+mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
 {
-	printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
-		current->pid, current->comm);
+	printk(KERN_DEBUG "OS_MCA: process [cpu %d, pid: %d, uid: %d, iip: %p, psr: 0x%lx, paddr: 0x%lx](%s) encounters MCA.\n",
+		smp_processor_id(), current->pid, current->uid, iip, ipsr, paddr, current->
+comm);
 
 	spin_lock(&mca_bh_lock);
 	if (mca_page_isolate(paddr) = ISOLATE_OK) {
@@ -414,21 +415,27 @@
 	 */
 
 	psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
+	psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr);
 
 	/*
 	 *  Check the privilege level of interrupted context.
 	 *   If it is user-mode, then terminate affected process.
 	 */
-	if (psr1->cpl != 0) {
+	pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
+
+	if (psr1->cpl != 0 || ((psr2->cpl != 0) &&
+			       (pmsa->pmsa_iip >= (unsigned long)ia64_ivt+0x3000 &&
+			        pmsa->pmsa_iip <  (unsigned long)&interrupt_pnr))) {
 		smei = peidx_bus_check(peidx, 0);
 		if (smei->valid.target_identifier) {
 			/*
 			 *  setup for resume to bottom half of MCA,
 			 * "mca_handler_bhhook"
 			 */
-			pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
-			/* pass to bhhook as 1st argument (gr8) */
+			/* pass to bhhook as argument (gr8, ...) */
 			pmsa->pmsa_gr[8-1] = smei->target_identifier;
+			pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
+			pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
 			/* set interrupted return address (but no use) */
 			pmsa->pmsa_br0 = pmsa->pmsa_iip;
 			/* change resume address to bottom half */
@@ -438,6 +445,7 @@
 			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
 			psr2->cpl = 0;
 			psr2->ri  = 0;
+			psr2->bn  = 1;
 			psr2->i  = 0;
 
 			return 1;
Index: linux-2.6/arch/ia64/kernel/mca_drv_asm.S
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv_asm.S	2005-06-24 14:24:08.100951706 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv_asm.S	2005-06-24 14:31:03.254188466 -0500
@@ -19,7 +19,7 @@
 	;;						//
 	clrrrb						//
 	;;						
-	alloc		r16=ar.pfs,0,2,1,0		// make a new frame
+	alloc		r16=ar.pfs,0,2,3,0		// make a new frame
 	;;
 	mov		ar.rsc=0
 	;;
@@ -40,11 +40,13 @@
 	movl		loc1=mca_handler_bh		// recovery C function
 	;;
 	mov		out0=r8				// poisoned address
+	mov		out1=r9				// iip
+	mov		out2=r10			// psr
 	mov		b6=loc1
 	;;
 	mov		loc1=rp
 	;;
-	ssm		psr.i
+	ssm		psr.i | psr.ic
 	;;
 	br.call.sptk.many    rp¶			// does not return ...
 	;;
Index: linux-2.6/arch/ia64/kernel/ivt.S
=================================--- linux-2.6.orig/arch/ia64/kernel/ivt.S	2005-06-24 14:24:08.097045503 -0500
+++ linux-2.6/arch/ia64/kernel/ivt.S	2005-06-24 14:31:03.255165017 -0500
@@ -785,6 +785,8 @@
 	;;
 	SAVE_REST
 	;;
+	.global	interrupt_pnr
+interrupt_pnr:
 	alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
 	mov out0=cr.ivr		// pass cr.ivr as first arg
 	add out1\x16,sp		// pass pointer to pt_regs as second arg
Index: linux-2.6/arch/ia64/kernel/mca_drv.h
=================================--- linux-2.6.orig/arch/ia64/kernel/mca_drv.h	2005-06-24 14:24:08.100951706 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv.h	2005-06-24 14:31:03.256141568 -0500
@@ -111,3 +111,6 @@
 	slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\
 	__count; })
 
+extern char ia64_ivt[];
+extern void *interrupt_pnr;
+EXPORT_SYMBOL(interrupt_pnr);

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (7 preceding siblings ...)
  2005-06-24 20:11 ` Russ Anderson
@ 2005-06-24 20:18 ` Russ Anderson
  2005-06-24 20:36 ` David Mosberger
                   ` (8 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Russ Anderson @ 2005-06-24 20:18 UTC (permalink / raw)
  To: linux-ia64

Hidetoshi Seto wrote:
> Russ Anderson wrote:
> 
> > +	pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
> > +	if (psr1->cpl != 0 || (pmsa->pmsa_iip >= (unsigned long)&interrupt &&
> > +			       pmsa->pmsa_iip <  (unsigned long)&interrupt_pnr)) {
> 
> It seems that the latter state just means "this was in early of interrupt."
> You shoud also make sure that the interrupt has happen in user context, right?

Yes, that is a good point.   The updated patch has that change.

------------------------------------------------------------------------------
        psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
+       psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr);

        /*
         *  Check the privilege level of interrupted context.
         *   If it is user-mode, then terminate affected process.
         */
-       if (psr1->cpl != 0) {
+       pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
+
+       if (psr1->cpl != 0 || ((psr2->cpl != 0) &&
+                              (pmsa->pmsa_iip >= (unsigned long)ia64_ivt+0x3000 &&
+                               pmsa->pmsa_iip <  (unsigned long)&interrupt_pnr))) {
                smei = peidx_bus_check(peidx, 0);


-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (8 preceding siblings ...)
  2005-06-24 20:18 ` Russ Anderson
@ 2005-06-24 20:36 ` David Mosberger
  2005-06-24 21:05 ` Luck, Tony
                   ` (7 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: David Mosberger @ 2005-06-24 20:36 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Fri, 24 Jun 2005 15:11:09 -0500 (CDT), Russ Anderson <rja@sgi.com> said:

  Russ> Testing with error injection showed a significant number of
  Russ> cases where the MCA surfaced early in the interrupt routine,
  Russ> even though the load of the bad data was launched from a user
  Russ> process.  Adding the second condition to look for these cases
  Russ> allowed them to be recovered.  Analysis of the recovered MCA
  Russ> records showed 7-10% of the recoverys were this condition,
  Russ> when running the error recovery code with other activity that
  Russ> caused interrupts.

  Russ> Previously, if the MCA surfaced while the cpu was in privilage
  Russ> mode the code would not try to recover.  This change adds a
  Russ> second condition, to see if the kernel is early in the
  Russ> interrupt routine.  It does this by checking the instruction
  Russ> range.  As Hidetoshi Seto points out, the check should also
  Russ> make sure the interrupted process was in user mode.  That has
  Russ> been added to the patch and tested.

Sorry, but this doesn't make any sense to me.  If an application
spends a lot of time handling TLB faults or unaligned access faults,
it's very likely the MCA will hit in those handlers and your patch
will not help at all.  Furthermore, if MCA delivery timing changes for
some reason, the user-triggered MCA might show up much later, i.e.,
pretty much anywhere in the kernel no?

Isn't there a more reliably method to handle this?  What if you just
_assumed_ that MCAs are triggered by user-level (unless you can prove
that it was kernel-only memory, perhaps).

	--david


^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (9 preceding siblings ...)
  2005-06-24 20:36 ` David Mosberger
@ 2005-06-24 21:05 ` Luck, Tony
  2005-06-24 21:11 ` David Mosberger
                   ` (6 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Luck, Tony @ 2005-06-24 21:05 UTC (permalink / raw)
  To: linux-ia64


>will not help at all.  Furthermore, if MCA delivery timing changes for
>some reason, the user-triggered MCA might show up much later, i.e.,
>pretty much anywhere in the kernel no?

Not really.  Suppose there is some location in memory that has a multi-bit
ECC error, and the user reads this location:

	ld8 r17=[r18]

Now the MCA mechanism is asynchronous ... so nothing may happen right away,
but there is a guarantee that at the very latest the MCA will be delivered
before the poisoned data is consumed.  So suppose that we happen to try to
enter the kernel for some reason before the MCA is delivered.  Since the
kernel saves all the users registers, it will attempt to consume the data,
and so the MCA will be delivered.

For the user-mode MCA to survive to an arbitrary point in the kernel would
mean that we didn't save some user mode register.

-Tony

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (10 preceding siblings ...)
  2005-06-24 21:05 ` Luck, Tony
@ 2005-06-24 21:11 ` David Mosberger
  2005-06-24 21:20 ` Luck, Tony
                   ` (5 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: David Mosberger @ 2005-06-24 21:11 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Fri, 24 Jun 2005 14:05:00 -0700, "Luck, Tony" <tony.luck@intel.com> said:

  >> will not help at all.  Furthermore, if MCA delivery timing changes for
  >> some reason, the user-triggered MCA might show up much later, i.e.,
  >> pretty much anywhere in the kernel no?

  Tony> Not really.  Suppose there is some location in memory that has a multi-bit
  Tony> ECC error, and the user reads this location:

  Tony> ld8 r17=[r18]

  Tony> Now the MCA mechanism is asynchronous ... so nothing may
  Tony> happen right away, but there is a guarantee that at the very
  Tony> latest the MCA will be delivered before the poisoned data is
  Tony> consumed.  So suppose that we happen to try to enter the
  Tony> kernel for some reason before the MCA is delivered.  Since the
  Tony> kernel saves all the users registers, it will attempt to
  Tony> consume the data, and so the MCA will be delivered.

Fine, but what about stores?

Furthermore, there are many ways to enter the kernel, so it still
makes no sense to me to consider external interrupts only.  System
calls for one are certainly quite common, too.

  Tony> For the user-mode MCA to survive to an arbitrary point in the
  Tony> kernel would mean that we didn't save some user mode register.

Again, what about stores?

	--david


^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (11 preceding siblings ...)
  2005-06-24 21:11 ` David Mosberger
@ 2005-06-24 21:20 ` Luck, Tony
  2005-06-24 21:25 ` David Mosberger
                   ` (4 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Luck, Tony @ 2005-06-24 21:20 UTC (permalink / raw)
  To: linux-ia64


>Fine, but what about stores?

Stores have too many extra levels of buffering to have much hope.  But just
because we can't help the store case doesn't mean that we shouldn't do
something about the load case.

>Furthermore, there are many ways to enter the kernel, so it still
>makes no sense to me to consider external interrupts only.  System
>calls for one are certainly quite common, too.

Agreed.  It would be much better if we had some clean & easy way to tag
all of the kernel entry sequences for a check like this.  The cost of
checking to see whether we are in any of these code regions could be
quite high ... but we only pay it if there is an error, so as long as
it isn't outrageous it should be okay.

-Tony

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (12 preceding siblings ...)
  2005-06-24 21:20 ` Luck, Tony
@ 2005-06-24 21:25 ` David Mosberger
  2005-06-24 21:31 ` Luck, Tony
                   ` (3 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: David Mosberger @ 2005-06-24 21:25 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Fri, 24 Jun 2005 14:20:17 -0700, "Luck, Tony" <tony.luck@intel.com> said:

  >> Fine, but what about stores?

  Tony> Stores have too many extra levels of buffering to have much
  Tony> hope.  But just because we can't help the store case doesn't
  Tony> mean that we shouldn't do something about the load case.

Fine, but what about my suggestion of just presuming that the access
came for user-level unless you can prove otherwise?

	--david


^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (13 preceding siblings ...)
  2005-06-24 21:25 ` David Mosberger
@ 2005-06-24 21:31 ` Luck, Tony
  2005-06-24 21:36 ` Russ Anderson
                   ` (2 subsequent siblings)
  17 siblings, 0 replies; 19+ messages in thread
From: Luck, Tony @ 2005-06-24 21:31 UTC (permalink / raw)
  To: linux-ia64

>Fine, but what about my suggestion of just presuming that the access
>came for user-level unless you can prove otherwise?

If the presumption is wrong, then you'll kill an innocent user process.
Worse, you will let a corrupted kernel carry on running.

-Tony

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (14 preceding siblings ...)
  2005-06-24 21:31 ` Luck, Tony
@ 2005-06-24 21:36 ` Russ Anderson
  2005-06-24 21:36 ` David Mosberger
  2005-06-24 21:53 ` David Mosberger
  17 siblings, 0 replies; 19+ messages in thread
From: Russ Anderson @ 2005-06-24 21:36 UTC (permalink / raw)
  To: linux-ia64

David Mosberger wrote:
> 
>   >> Fine, but what about stores?
> 
>   Tony> Stores have too many extra levels of buffering to have much
>   Tony> hope.  But just because we can't help the store case doesn't
>   Tony> mean that we shouldn't do something about the load case.
> 
> Fine, but what about my suggestion of just presuming that the access
> came for user-level unless you can prove otherwise?

The key question is how to prove otherwise. 

Ironicly, my concern was this patch would be seen as too aggressive and
therefore risky.  So the patch is limited to the interrupt case, which
seemed to be a heavy hitter in testing.  I hadn't expected pushback for 
not being aggressive enough.  :-)

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (15 preceding siblings ...)
  2005-06-24 21:36 ` Russ Anderson
@ 2005-06-24 21:36 ` David Mosberger
  2005-06-24 21:53 ` David Mosberger
  17 siblings, 0 replies; 19+ messages in thread
From: David Mosberger @ 2005-06-24 21:36 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Fri, 24 Jun 2005 14:31:28 -0700, "Luck, Tony" <tony.luck@Intel.com> said:

  >> Fine, but what about my suggestion of just presuming that the access
  >> came for user-level unless you can prove otherwise?

  Tony> If the presumption is wrong, then you'll kill an innocent user process.

As opposed to panic'ing the kernel?  That doesn't strike me as a big
problem. ;-)

  Tony> Worse, you will let a corrupted kernel carry on running.

It'd boil down to the question of what the likelihood is that the
kernel would be touching a user-mapped page whose contents it depends
on for correct operation. (If it is not a user-mapped page, you'd
obviously _not_ conclude that the MCA was triggered by user-level.)

	--david

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch] Memory Error Handling Improvement
  2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
                   ` (16 preceding siblings ...)
  2005-06-24 21:36 ` David Mosberger
@ 2005-06-24 21:53 ` David Mosberger
  17 siblings, 0 replies; 19+ messages in thread
From: David Mosberger @ 2005-06-24 21:53 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Fri, 24 Jun 2005 16:36:02 -0500 (CDT), Russ Anderson <rja@sgi.com> said:

  Russ> David Mosberger wrote:

  >> >> Fine, but what about stores?

  Tony> Stores have too many extra levels of buffering to have much
  Tony> hope.  But just because we can't help the store case doesn't
  Tony> mean that we shouldn't do something about the load case.

  >> Fine, but what about my suggestion of just presuming that the access
  >> came for user-level unless you can prove otherwise?

  Russ> The key question is how to prove otherwise.

  Russ> Ironicly, my concern was this patch would be seen as too
  Russ> aggressive and therefore risky.  So the patch is limited to
  Russ> the interrupt case, which seemed to be a heavy hitter in
  Russ> testing.  I hadn't expected pushback for not being aggressive
  Russ> enough.  :-)

I'm not complaining that it's not aggressive enough, but doing
"random" address-range checks just seems fragile.  I'm sure with a bit
of thought, a better (and more general scheme) can be thought of.

One thing that you could definitely do is to use an
exception-table-like approach (see uacess.h), where you'd tag each
instruction which consumes a user-level value and mark it as
potentially MCA triggering.  If you defined a suitable macro (e.g.,
st8.mca), this could be even reasonable and would then capture all
interesting cases reliably.

	--david

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2005-06-24 21:53 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
2005-06-23 17:52 ` David Mosberger
2005-06-23 19:30 ` Russ Anderson
2005-06-23 22:11 ` Andreas Schwab
2005-06-23 22:18 ` David Mosberger
2005-06-23 22:22 ` Russ Anderson
2005-06-23 22:54 ` Andreas Schwab
2005-06-24  1:12 ` Hidetoshi Seto
2005-06-24 20:11 ` Russ Anderson
2005-06-24 20:18 ` Russ Anderson
2005-06-24 20:36 ` David Mosberger
2005-06-24 21:05 ` Luck, Tony
2005-06-24 21:11 ` David Mosberger
2005-06-24 21:20 ` Luck, Tony
2005-06-24 21:25 ` David Mosberger
2005-06-24 21:31 ` Luck, Tony
2005-06-24 21:36 ` Russ Anderson
2005-06-24 21:36 ` David Mosberger
2005-06-24 21:53 ` David Mosberger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.