All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] fusion: hold off error recovery while alternate ioc is initializing
@ 2010-02-10 20:32 Michael Reed
  2010-02-11  9:27 ` Desai, Kashyap
  0 siblings, 1 reply; 6+ messages in thread
From: Michael Reed @ 2010-02-10 20:32 UTC (permalink / raw)
  To: linux-scsi, kashyap.desai, Prakash, Sathya
  Cc: Moore, Eric, Jeremy Higdon, Robin Holt

After discussing this patch with LSI, I resubmitting with a recommended
40 second wait for the alternate ioc's initialization to complete.
--
Fusion FC chips are two function with some shared resources.  During
initialization of one function its driver inhibits the ability of the
other function's driver to allocate message frames by clearing its
"active" flag.  Should mid-layer error recovery be initiated for a
scsi command during this initialization (which can take up to 40 seconds)
error recovery will escalate to the level of host reset.  This host
reset might fail (as the other function is resetting) resulting in
all connected targets being taken offline.

This patch holds off mid-layer error recovery for up to 40 seconds
to permit initialization of the other function to complete.

Applies to scsi-misc.

Signed-off-by: Michael Reed <mdr@sgi.com>

==

--- scsi-misc-2.6/drivers/message/fusion/mptfc.c	2010-02-08 11:19:47.000000000 -0600
+++ scsi-misc-2.6-2010_02_08-modified/drivers/message/fusion/mptfc.c	2010-02-10 12:40:23.184510802 -0600
@@ -195,29 +195,34 @@ mptfc_block_error_handler(struct scsi_cm
 	unsigned long		flags;
 	int			ready;
 	MPT_ADAPTER 		*ioc;
+	int			loops = 40;	/* seconds */
 
 	hd = shost_priv(SCpnt->device->host);
 	ioc = hd->ioc;
 	spin_lock_irqsave(shost->host_lock, flags);
-	while ((ready = fc_remote_port_chkready(rport) >> 16) == DID_IMM_RETRY) {
+	while ((ready = fc_remote_port_chkready(rport) >> 16) == DID_IMM_RETRY
+	 || (loops > 0 && ioc->active == 0)) {
 		spin_unlock_irqrestore(shost->host_lock, flags);
 		dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT
 			"mptfc_block_error_handler.%d: %d:%d, port status is "
-			"DID_IMM_RETRY, deferring %s recovery.\n",
+			"%x, active flag %d, deferring %s recovery.\n",
 			ioc->name, ioc->sh->host_no,
-			SCpnt->device->id, SCpnt->device->lun, caller));
+			SCpnt->device->id, SCpnt->device->lun,
+			ready, ioc->active, caller));
 		msleep(1000);
 		spin_lock_irqsave(shost->host_lock, flags);
+		loops --;
 	}
 	spin_unlock_irqrestore(shost->host_lock, flags);
 
-	if (ready == DID_NO_CONNECT || !SCpnt->device->hostdata) {
+	if (ready == DID_NO_CONNECT || !SCpnt->device->hostdata
+	 || ioc->active == 0) {
 		dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT
 			"%s.%d: %d:%d, failing recovery, "
-			"port state %d, vdevice %p.\n", caller,
+			"port state %x, active %d, vdevice %p.\n", caller,
 			ioc->name, ioc->sh->host_no,
 			SCpnt->device->id, SCpnt->device->lun, ready,
-			SCpnt->device->hostdata));
+			ioc->active, SCpnt->device->hostdata));
 		return FAILED;
 	}
 	dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH 1/1] fusion: hold off error recovery while alternate ioc is initializing
  2010-02-10 20:32 [PATCH 1/1] fusion: hold off error recovery while alternate ioc is initializing Michael Reed
@ 2010-02-11  9:27 ` Desai, Kashyap
  2010-02-11 17:53   ` Bernd Schubert
  0 siblings, 1 reply; 6+ messages in thread
From: Desai, Kashyap @ 2010-02-11  9:27 UTC (permalink / raw)
  To: Michael Reed, linux-scsi, Prakash, Sathya
  Cc: Moore, Eric, Jeremy Higdon, Robin Holt, James.Bottomley

Please consider this patch as an ACKed.

Thanks,
Kashyap

> -----Original Message-----
> From: Michael Reed [mailto:mdr@sgi.com]
> Sent: Thursday, February 11, 2010 2:02 AM
> To: linux-scsi; Desai, Kashyap; Prakash, Sathya
> Cc: Moore, Eric; Jeremy Higdon; Robin Holt
> Subject: [PATCH 1/1] fusion: hold off error recovery while alternate
> ioc is initializing
> 
> After discussing this patch with LSI, I resubmitting with a recommended
> 40 second wait for the alternate ioc's initialization to complete.
> --
> Fusion FC chips are two function with some shared resources.  During
> initialization of one function its driver inhibits the ability of the
> other function's driver to allocate message frames by clearing its
> "active" flag.  Should mid-layer error recovery be initiated for a
> scsi command during this initialization (which can take up to 40
> seconds)
> error recovery will escalate to the level of host reset.  This host
> reset might fail (as the other function is resetting) resulting in
> all connected targets being taken offline.
> 
> This patch holds off mid-layer error recovery for up to 40 seconds
> to permit initialization of the other function to complete.
> 
> Applies to scsi-misc.
> 
> Signed-off-by: Michael Reed <mdr@sgi.com>
> 
> ==
> 
> --- scsi-misc-2.6/drivers/message/fusion/mptfc.c	2010-02-08
> 11:19:47.000000000 -0600
> +++ scsi-misc-2.6-2010_02_08-modified/drivers/message/fusion/mptfc.c
> 	2010-02-10 12:40:23.184510802 -0600
> @@ -195,29 +195,34 @@ mptfc_block_error_handler(struct scsi_cm
>  	unsigned long		flags;
>  	int			ready;
>  	MPT_ADAPTER 		*ioc;
> +	int			loops = 40;	/* seconds */
> 
>  	hd = shost_priv(SCpnt->device->host);
>  	ioc = hd->ioc;
>  	spin_lock_irqsave(shost->host_lock, flags);
> -	while ((ready = fc_remote_port_chkready(rport) >> 16) ==
> DID_IMM_RETRY) {
> +	while ((ready = fc_remote_port_chkready(rport) >> 16) ==
> DID_IMM_RETRY
> +	 || (loops > 0 && ioc->active == 0)) {
>  		spin_unlock_irqrestore(shost->host_lock, flags);
>  		dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT
>  			"mptfc_block_error_handler.%d: %d:%d, port status is
> "
> -			"DID_IMM_RETRY, deferring %s recovery.\n",
> +			"%x, active flag %d, deferring %s recovery.\n",
>  			ioc->name, ioc->sh->host_no,
> -			SCpnt->device->id, SCpnt->device->lun, caller));
> +			SCpnt->device->id, SCpnt->device->lun,
> +			ready, ioc->active, caller));
>  		msleep(1000);
>  		spin_lock_irqsave(shost->host_lock, flags);
> +		loops --;
>  	}
>  	spin_unlock_irqrestore(shost->host_lock, flags);
> 
> -	if (ready == DID_NO_CONNECT || !SCpnt->device->hostdata) {
> +	if (ready == DID_NO_CONNECT || !SCpnt->device->hostdata
> +	 || ioc->active == 0) {
>  		dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT
>  			"%s.%d: %d:%d, failing recovery, "
> -			"port state %d, vdevice %p.\n", caller,
> +			"port state %x, active %d, vdevice %p.\n", caller,
>  			ioc->name, ioc->sh->host_no,
>  			SCpnt->device->id, SCpnt->device->lun, ready,
> -			SCpnt->device->hostdata));
> +			ioc->active, SCpnt->device->hostdata));
>  		return FAILED;
>  	}
>  	dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] fusion: hold off error recovery while alternate ioc is initializing
  2010-02-11  9:27 ` Desai, Kashyap
@ 2010-02-11 17:53   ` Bernd Schubert
  2010-02-11 18:01     ` James Bottomley
  0 siblings, 1 reply; 6+ messages in thread
From: Bernd Schubert @ 2010-02-11 17:53 UTC (permalink / raw)
  To: Desai, Kashyap
  Cc: Michael Reed, linux-scsi, Prakash, Sathya, Moore, Eric,
	Jeremy Higdon, Robin Holt, James.Bottomley@HansenPartnership.com

Shouldn't we create a similar patch for scsi and sas as well? This issue might 
explain why any hard-reset (for scsi systems of my former employer) always 
caused a complete failure of both ports.


Thanks,
Bernd

On Thursday 11 February 2010, Desai, Kashyap wrote:
> Please consider this patch as an ACKed.
> 
> Thanks,
> Kashyap
> 
> > -----Original Message-----
> > From: Michael Reed [mailto:mdr@sgi.com]
> > Sent: Thursday, February 11, 2010 2:02 AM
> > To: linux-scsi; Desai, Kashyap; Prakash, Sathya
> > Cc: Moore, Eric; Jeremy Higdon; Robin Holt
> > Subject: [PATCH 1/1] fusion: hold off error recovery while alternate
> > ioc is initializing
> >
> > After discussing this patch with LSI, I resubmitting with a recommended
> > 40 second wait for the alternate ioc's initialization to complete.
> > --
> > Fusion FC chips are two function with some shared resources.  During
> > initialization of one function its driver inhibits the ability of the
> > other function's driver to allocate message frames by clearing its
> > "active" flag.  Should mid-layer error recovery be initiated for a
> > scsi command during this initialization (which can take up to 40
> > seconds)
> > error recovery will escalate to the level of host reset.  This host
> > reset might fail (as the other function is resetting) resulting in
> > all connected targets being taken offline.
> >
> > This patch holds off mid-layer error recovery for up to 40 seconds
> > to permit initialization of the other function to complete.
> >
> > Applies to scsi-misc.
> >
> > Signed-off-by: Michael Reed <mdr@sgi.com>
> >
> > ==
> >
> > --- scsi-misc-2.6/drivers/message/fusion/mptfc.c	2010-02-08
> > 11:19:47.000000000 -0600
> > +++ scsi-misc-2.6-2010_02_08-modified/drivers/message/fusion/mptfc.c
> > 	2010-02-10 12:40:23.184510802 -0600
> > @@ -195,29 +195,34 @@ mptfc_block_error_handler(struct scsi_cm
> >  	unsigned long		flags;
> >  	int			ready;
> >  	MPT_ADAPTER 		*ioc;
> > +	int			loops = 40;	/* seconds */
> >
> >  	hd = shost_priv(SCpnt->device->host);
> >  	ioc = hd->ioc;
> >  	spin_lock_irqsave(shost->host_lock, flags);
> > -	while ((ready = fc_remote_port_chkready(rport) >> 16) ==
> > DID_IMM_RETRY) {
> > +	while ((ready = fc_remote_port_chkready(rport) >> 16) ==
> > DID_IMM_RETRY
> > +	 || (loops > 0 && ioc->active == 0)) {
> >  		spin_unlock_irqrestore(shost->host_lock, flags);
> >  		dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT
> >  			"mptfc_block_error_handler.%d: %d:%d, port status is
> > "
> > -			"DID_IMM_RETRY, deferring %s recovery.\n",
> > +			"%x, active flag %d, deferring %s recovery.\n",
> >  			ioc->name, ioc->sh->host_no,
> > -			SCpnt->device->id, SCpnt->device->lun, caller));
> > +			SCpnt->device->id, SCpnt->device->lun,
> > +			ready, ioc->active, caller));
> >  		msleep(1000);
> >  		spin_lock_irqsave(shost->host_lock, flags);
> > +		loops --;
> >  	}
> >  	spin_unlock_irqrestore(shost->host_lock, flags);
> >
> > -	if (ready == DID_NO_CONNECT || !SCpnt->device->hostdata) {
> > +	if (ready == DID_NO_CONNECT || !SCpnt->device->hostdata
> > +	 || ioc->active == 0) {
> >  		dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT
> >  			"%s.%d: %d:%d, failing recovery, "
> > -			"port state %d, vdevice %p.\n", caller,
> > +			"port state %x, active %d, vdevice %p.\n", caller,
> >  			ioc->name, ioc->sh->host_no,
> >  			SCpnt->device->id, SCpnt->device->lun, ready,
> > -			SCpnt->device->hostdata));
> > +			ioc->active, SCpnt->device->hostdata));
> >  		return FAILED;
> >  	}
> >  	dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] fusion: hold off error recovery while alternate ioc is initializing
  2010-02-11 17:53   ` Bernd Schubert
@ 2010-02-11 18:01     ` James Bottomley
  2010-02-11 18:11       ` Bernd Schubert
  0 siblings, 1 reply; 6+ messages in thread
From: James Bottomley @ 2010-02-11 18:01 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Desai, Kashyap, Michael Reed, linux-scsi, Prakash, Sathya, Moore,
	Eric, Jeremy Higdon, Robin Holt

On Thu, 2010-02-11 at 18:53 +0100, Bernd Schubert wrote:
> Shouldn't we create a similar patch for scsi and sas as well? This issue might 
> explain why any hard-reset (for scsi systems of my former employer) always 
> caused a complete failure of both ports.

It can't really be done generically.  Firstly, the attachments look like
two separate hosts, the mid layer won't even know if they're related,
and secondly, even if we work out they are different functions of the
same PCI device, depending on how the device is implemented, there may
not be this problematic interdependency ... in fact it's more the
exception than the norm.

James



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] fusion: hold off error recovery while alternate ioc is initializing
  2010-02-11 18:01     ` James Bottomley
@ 2010-02-11 18:11       ` Bernd Schubert
  0 siblings, 0 replies; 6+ messages in thread
From: Bernd Schubert @ 2010-02-11 18:11 UTC (permalink / raw)
  To: James Bottomley
  Cc: Desai, Kashyap, Michael Reed, linux-scsi, Prakash, Sathya, Moore,
	Eric, Jeremy Higdon, Robin Holt

On Thursday 11 February 2010, James Bottomley wrote:
> On Thu, 2010-02-11 at 18:53 +0100, Bernd Schubert wrote:
> > Shouldn't we create a similar patch for scsi and sas as well? This issue
> > might explain why any hard-reset (for scsi systems of my former employer)
> > always caused a complete failure of both ports.
> 
> It can't really be done generically.  Firstly, the attachments look like
> two separate hosts, the mid layer won't even know if they're related,

Yes, that is also true for those LSI53C1030 based devices we used (and which 
I'm still involved in to maintain, any kernel update is extremely difficult, 
as most of my patches never went upstream, but without spurious failures would 
happen at a high rate...).

> and secondly, even if we work out they are different functions of the
> same PCI device, depending on how the device is implemented, there may
> not be this problematic interdependency ... in fact it's more the
> exception than the norm.

I see your point, but some days I also would like to see a vanilla or 
mainstream distribution kernel to work with those scsi devices. Hmm, if those 
patches are further delayed, the problem will solve itself - parallel scsi is 
outdated now. At least with LSI SAS we never run into those problems.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/1] fusion: hold off error recovery while alternate ioc is initializing
@ 2009-12-16 21:20 Michael Reed
  0 siblings, 0 replies; 6+ messages in thread
From: Michael Reed @ 2009-12-16 21:20 UTC (permalink / raw)
  To: linux-scsi; +Cc: Moore, Eric, Jeremy Higdon, Robin Holt, kashyap.desai

Fusion FC chips are two function with some shared resources.  During
initialization of one function its driver inhibits the ability of the
other function's driver to allocate message frames by clearing its
"active" flag.  Should mid-layer error recovery be initiated for a
scsi command during this initialization (which can take many seconds)
error recovery will escalate to the level of host reset.  This host
reset might fail resulting in all connected targets being taken offline.

This patch holds off mid-layer error recovery for up to 60 seconds
to permit initialization of the other function to complete.

Applies to scsi-misc.

Signed-off-by: Michael Reed <mdr@sgi.com>

==

--- a/drivers/message/fusion/mptfc.c	2009-12-16 15:09:22.817382765 -0600
+++ b/drivers/message/fusion/mptfc.c	2009-12-16 15:10:31.949380663 -0600
@@ -195,29 +195,31 @@ mptfc_block_error_handler(struct scsi_cm
 	unsigned long		flags;
 	int			ready;
 	MPT_ADAPTER 		*ioc;
+	int			sleep_interval = 1000;
+	int			loops = 60 * sleep_interval;
 
 	hd = shost_priv(SCpnt->device->host);
 	ioc = hd->ioc;
 	spin_lock_irqsave(shost->host_lock, flags);
-	while ((ready = fc_remote_port_chkready(rport) >> 16) == DID_IMM_RETRY) {
+
+	while ((loops > 0 && ioc->active == 0)
+	 || (ready = fc_remote_port_chkready(rport) >> 16) == DID_IMM_RETRY) {
+
 		spin_unlock_irqrestore(shost->host_lock, flags);
-		dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT
-			"mptfc_block_error_handler.%d: %d:%d, port status is "
-			"DID_IMM_RETRY, deferring %s recovery.\n",
-			ioc->name, ioc->sh->host_no,
-			SCpnt->device->id, SCpnt->device->lun, caller));
-		msleep(1000);
+		msleep(sleep_interval);
+		loops -= sleep_interval;
 		spin_lock_irqsave(shost->host_lock, flags);
 	}
 	spin_unlock_irqrestore(shost->host_lock, flags);
 
-	if (ready == DID_NO_CONNECT || !SCpnt->device->hostdata) {
+	if (ioc->active == 0
+	 || ready == DID_NO_CONNECT || !SCpnt->device->hostdata) {
 		dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT
-			"%s.%d: %d:%d, failing recovery, "
+			"%s.%d: %d:%d, failing recovery, active %d, "
 			"port state %d, vdevice %p.\n", caller,
 			ioc->name, ioc->sh->host_no,
-			SCpnt->device->id, SCpnt->device->lun, ready,
-			SCpnt->device->hostdata));
+			SCpnt->device->id, SCpnt->device->lun,
+			ioc->active, ready, SCpnt->device->hostdata));
 		return FAILED;
 	}
 	dfcprintk (ioc, printk(MYIOC_s_DEBUG_FMT

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2010-02-11 18:11 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-02-10 20:32 [PATCH 1/1] fusion: hold off error recovery while alternate ioc is initializing Michael Reed
2010-02-11  9:27 ` Desai, Kashyap
2010-02-11 17:53   ` Bernd Schubert
2010-02-11 18:01     ` James Bottomley
2010-02-11 18:11       ` Bernd Schubert
  -- strict thread matches above, loose matches on Subject: below --
2009-12-16 21:20 Michael Reed

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.