All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] scsi: Allow error handling timeout to be specified
@ 2013-05-10  3:11 Martin K. Petersen
  2013-05-10  6:23 ` Bart Van Assche
  2013-05-10 12:43 ` Ewan Milne
  0 siblings, 2 replies; 26+ messages in thread
From: Martin K. Petersen @ 2013-05-10  3:11 UTC (permalink / raw)
  To: linux-scsi; +Cc: Ewan Milne, Hannes Reinecke, michaelc


Introduce eh_timeout which can be used for error handling purposes. This
was previously hardcoded to 10 seconds in the SCSI error handling
code. However, for some fast-fail scenarios it is necessary to be able
to tune this as it can take several iterations (bus device, target, bus,
controller) before we give up.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index c1b05a8..91adc52 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -44,8 +44,6 @@
 
 static void scsi_eh_done(struct scsi_cmnd *scmd);
 
-#define SENSE_TIMEOUT		(10*HZ)
-
 /*
  * These should *probably* be handled by the host itself.
  * Since it is allowed to sleep, it probably should.
@@ -864,7 +862,7 @@ static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, unsigned char *cmnd,
  */
 static int scsi_request_sense(struct scsi_cmnd *scmd)
 {
-	return scsi_send_eh_cmnd(scmd, NULL, 0, SENSE_TIMEOUT, ~0);
+	return scsi_send_eh_cmnd(scmd, NULL, 0, scmd->device->eh_timeout, ~0);
 }
 
 /**
@@ -965,7 +963,8 @@ static int scsi_eh_tur(struct scsi_cmnd *scmd)
 	int retry_cnt = 1, rtn;
 
 retry_tur:
-	rtn = scsi_send_eh_cmnd(scmd, tur_command, 6, SENSE_TIMEOUT, 0);
+	rtn = scsi_send_eh_cmnd(scmd, tur_command, 6,
+				scmd->device->eh_timeout, 0);
 
 	SCSI_LOG_ERROR_RECOVERY(3, printk("%s: scmd %p rtn %x\n",
 		__func__, scmd, rtn));
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index b9e39e0..7f7bd1f 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -945,6 +945,7 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
 	}
 
 	sdev->max_queue_depth = sdev->queue_depth;
+	sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
 
 	/*
 	 * Ok, the device is now all set up, we can
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 931a7d9..38db310 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -560,6 +560,35 @@ sdev_store_timeout (struct device *dev, struct device_attribute *attr,
 static DEVICE_ATTR(timeout, S_IRUGO | S_IWUSR, sdev_show_timeout, sdev_store_timeout);
 
 static ssize_t
+sdev_show_eh_timeout (struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct scsi_device *sdev;
+	sdev = to_scsi_device(dev);
+	return snprintf(buf, 20, "%u\n", sdev->eh_timeout / HZ);
+}
+
+static ssize_t
+sdev_store_eh_timeout (struct device *dev, struct device_attribute *attr,
+		    const char *buf, size_t count)
+{
+	struct scsi_device *sdev;
+	unsigned int eh_timeout;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	sdev = to_scsi_device(dev);
+	err = kstrtouint(buf, 10, &eh_timeout);
+	if (err)
+		return err;
+	sdev->eh_timeout = eh_timeout * HZ;
+
+	return count;
+}
+static DEVICE_ATTR(eh_timeout, S_IRUGO | S_IWUSR, sdev_show_eh_timeout, sdev_store_eh_timeout);
+
+static ssize_t
 store_rescan_field (struct device *dev, struct device_attribute *attr,
 		    const char *buf, size_t count)
 {
@@ -723,6 +752,7 @@ static struct attribute *scsi_sdev_attrs[] = {
 	&dev_attr_delete.attr,
 	&dev_attr_state.attr,
 	&dev_attr_timeout.attr,
+	&dev_attr_eh_timeout.attr,
 	&dev_attr_iocounterbits.attr,
 	&dev_attr_iorequest_cnt.attr,
 	&dev_attr_iodone_cnt.attr,
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 66216c1..4b87d99 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -10,9 +10,14 @@
 
 #include <linux/types.h>
 #include <linux/scatterlist.h>
+#include <linux/kernel.h>
 
 struct scsi_cmnd;
 
+enum scsi_timeouts {
+	SCSI_DEFAULT_EH_TIMEOUT		= 10 * HZ,
+};
+
 /*
  * The maximum number of SG segments that we will put inside a
  * scatterlist (unless chaining is used). Should ideally fit inside a
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index a7f9cba..7eb9b20 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -113,6 +113,7 @@ struct scsi_device {
 				 * scsi_devinfo.[hc]. For now used only to
 				 * pass settings from slave_alloc to scsi
 				 * core. */
+	unsigned int eh_timeout; /* Error handling timeout */
 	unsigned writeable:1;
 	unsigned removable:1;
 	unsigned changed:1;	/* Data invalid due to media change */

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10  3:11 [PATCH] scsi: Allow error handling timeout to be specified Martin K. Petersen
@ 2013-05-10  6:23 ` Bart Van Assche
  2013-05-10 14:36   ` Martin K. Petersen
  2013-05-10 12:43 ` Ewan Milne
  1 sibling, 1 reply; 26+ messages in thread
From: Bart Van Assche @ 2013-05-10  6:23 UTC (permalink / raw)
  To: Martin K. Petersen; +Cc: linux-scsi, Ewan Milne, Hannes Reinecke, michaelc

On 05/10/13 05:11, Martin K. Petersen wrote:
> Introduce eh_timeout which can be used for error handling purposes. This
> was previously hardcoded to 10 seconds in the SCSI error handling
> code. However, for some fast-fail scenarios it is necessary to be able
> to tune this as it can take several iterations (bus device, target, bus,
> controller) before we give up.

Hello Martin,

  	sdev->max_queue_depth = sdev->queue_depth;
+	sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;

Have you considered moving the eh_timeout assignment statement to just
before the transport_configure_device() and slave_configure() calls?
That would allow transport drivers and LLD drivers to override the
default eh_timeout value.

Bart.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10  3:11 [PATCH] scsi: Allow error handling timeout to be specified Martin K. Petersen
  2013-05-10  6:23 ` Bart Van Assche
@ 2013-05-10 12:43 ` Ewan Milne
  2013-05-10 12:55   ` Hannes Reinecke
                     ` (3 more replies)
  1 sibling, 4 replies; 26+ messages in thread
From: Ewan Milne @ 2013-05-10 12:43 UTC (permalink / raw)
  To: Martin K. Petersen; +Cc: linux-scsi, Hannes Reinecke, michaelc

On Thu, 2013-05-09 at 23:11 -0400, Martin K. Petersen wrote:
> Introduce eh_timeout which can be used for error handling purposes. This
> was previously hardcoded to 10 seconds in the SCSI error handling
> code. However, for some fast-fail scenarios it is necessary to be able
> to tune this as it can take several iterations (bus device, target, bus,
> controller) before we give up.
> 
> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> 

Thanks for posting this.  It will be very helpful to have this
capability, particularly when alternate paths to the device exist.

Acked-by: Ewan D. Milne <emilne@redhat.com>



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 12:43 ` Ewan Milne
@ 2013-05-10 12:55   ` Hannes Reinecke
  2013-05-10 13:09   ` Bryn M. Reeves
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Hannes Reinecke @ 2013-05-10 12:55 UTC (permalink / raw)
  To: emilne; +Cc: Martin K. Petersen, linux-scsi, michaelc

On 05/10/2013 02:43 PM, Ewan Milne wrote:
> On Thu, 2013-05-09 at 23:11 -0400, Martin K. Petersen wrote:
>> Introduce eh_timeout which can be used for error handling purposes. This
>> was previously hardcoded to 10 seconds in the SCSI error handling
>> code. However, for some fast-fail scenarios it is necessary to be able
>> to tune this as it can take several iterations (bus device, target, bus,
>> controller) before we give up.
>>
>> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
>>
> 
> Thanks for posting this.  It will be very helpful to have this
> capability, particularly when alternate paths to the device exist.
> 
> Acked-by: Ewan D. Milne <emilne@redhat.com>
> 
> 
Seconded.

Acked-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 12:43 ` Ewan Milne
  2013-05-10 12:55   ` Hannes Reinecke
@ 2013-05-10 13:09   ` Bryn M. Reeves
  2013-05-10 13:22   ` Baruch Even
       [not found]   ` <CAC9+anJ9Y-SnCOK6EOCavTNJwx=xhAbL_X__MsEsL7DroawaJg@mail.gmail.com>
  3 siblings, 0 replies; 26+ messages in thread
From: Bryn M. Reeves @ 2013-05-10 13:09 UTC (permalink / raw)
  To: emilne; +Cc: Martin K. Petersen, linux-scsi, Hannes Reinecke, michaelc

On 05/10/2013 01:43 PM, Ewan Milne wrote:
> On Thu, 2013-05-09 at 23:11 -0400, Martin K. Petersen wrote:
>> Introduce eh_timeout which can be used for error handling purposes. This
>> was previously hardcoded to 10 seconds in the SCSI error handling
>> code. However, for some fast-fail scenarios it is necessary to be able
>> to tune this as it can take several iterations (bus device, target, bus,
>> controller) before we give up.
>>
>> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
>>
>
> Thanks for posting this.  It will be very helpful to have this
> capability, particularly when alternate paths to the device exist.

Ack - this is definitely a step forward but until we have better eh 
behaviour for FC the benefits are pretty limited. This is especially the 
case with large LU counts and certain LLDDs since some impose much 
longer timeouts (e.g. lpfc's 60s TMF timeout).

With 5 LUs presented and a single dd driving IO on lpfc I see a time to 
fail an IO of 10-11m when inducing a fabric fault that blackholes all 
traffic to a particular target port on my test setup.

Looking at where the time is being spent in this example there's around 
200s of TUR waits (3m20) and >500s waiting on TMF timeouts (foreach 
device, BDR, foreach target, etc.):

http://paste.fedoraproject.org/11473/81911241/

Environments with 100s of devices can easily spend an hour or more 
waiting for the eh to do its thing.

Regards,
Bryn.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 12:43 ` Ewan Milne
  2013-05-10 12:55   ` Hannes Reinecke
  2013-05-10 13:09   ` Bryn M. Reeves
@ 2013-05-10 13:22   ` Baruch Even
  2013-05-10 14:01     ` Ewan Milne
       [not found]   ` <CAC9+anJ9Y-SnCOK6EOCavTNJwx=xhAbL_X__MsEsL7DroawaJg@mail.gmail.com>
  3 siblings, 1 reply; 26+ messages in thread
From: Baruch Even @ 2013-05-10 13:22 UTC (permalink / raw)
  To: emilne; +Cc: Martin K. Petersen, linux-scsi, Hannes Reinecke, michaelc

On Fri, May 10, 2013 at 3:43 PM, Ewan Milne <emilne@redhat.com> wrote:
>
> On Thu, 2013-05-09 at 23:11 -0400, Martin K. Petersen wrote:
> > Introduce eh_timeout which can be used for error handling purposes. This
> > was previously hardcoded to 10 seconds in the SCSI error handling
> > code. However, for some fast-fail scenarios it is necessary to be able
> > to tune this as it can take several iterations (bus device, target, bus,
> > controller) before we give up.
> >
> > Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> >
>
> Thanks for posting this.  It will be very helpful to have this
> capability, particularly when alternate paths to the device exist.
>
> Acked-by: Ewan D. Milne <emilne@redhat.com>


I would argue that waiting for the eh to timeout before you switch to
another path is most likely to be wrong. If you did the first pass of
error recovery (task abort) and that failed the
path/hba/logical-device is doomed. If you will switch to another path
it will either work (meaning the path/hba were bad) or not (logical
device was the culprit).

Actually reducing the timeouts is probably not a good approach since
it will cause the host to take a more radical approach without waiting
sufficiently for a potential recovery. In addition the more radical
error handlings such as host reset will destroy other paths for
completely unrelated devices/links, from my experience a host reset is
usually not required and the Linux kernel currently reaches to this
big hammer too fast.

Not that I have any qualms about the patch itself, I've been down this
path myself and was proven wrong by real life. Though my experience
was mostly on the SAS network rather than the FC network.

Baruch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 13:22   ` Baruch Even
@ 2013-05-10 14:01     ` Ewan Milne
  2013-05-10 14:24       ` Hannes Reinecke
  2013-05-10 17:51       ` Baruch Even
  0 siblings, 2 replies; 26+ messages in thread
From: Ewan Milne @ 2013-05-10 14:01 UTC (permalink / raw)
  To: Baruch Even; +Cc: Martin K. Petersen, linux-scsi, Hannes Reinecke, michaelc

On Fri, 2013-05-10 at 16:22 +0300, Baruch Even wrote:
> On Fri, May 10, 2013 at 3:43 PM, Ewan Milne <emilne@redhat.com> wrote:
> >
> > On Thu, 2013-05-09 at 23:11 -0400, Martin K. Petersen wrote:
> > > Introduce eh_timeout which can be used for error handling purposes. This
> > > was previously hardcoded to 10 seconds in the SCSI error handling
> > > code. However, for some fast-fail scenarios it is necessary to be able
> > > to tune this as it can take several iterations (bus device, target, bus,
> > > controller) before we give up.
> > >
> > > Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> > >
> >
> > Thanks for posting this.  It will be very helpful to have this
> > capability, particularly when alternate paths to the device exist.
> >
> > Acked-by: Ewan D. Milne <emilne@redhat.com>
> 
> 
> I would argue that waiting for the eh to timeout before you switch to
> another path is most likely to be wrong. If you did the first pass of
> error recovery (task abort) and that failed the
> path/hba/logical-device is doomed. If you will switch to another path
> it will either work (meaning the path/hba were bad) or not (logical
> device was the culprit).

It is necessary to either know the disposition of a command or
else wait for a defined amount of time before retrying the command on
another path.  Otherwise you run the risk that the command will
eventually complete on the first path.  So yes, we need to do the abort
(and its timeout).

> 
> Actually reducing the timeouts is probably not a good approach since
> it will cause the host to take a more radical approach without waiting
> sufficiently for a potential recovery. In addition the more radical
> error handlings such as host reset will destroy other paths for
> completely unrelated devices/links, from my experience a host reset is
> usually not required and the Linux kernel currently reaches to this
> big hammer too fast.

I believe that Hannes is working on a better error handling algorithm
that e.g. does not cause an emulated bus reset in an FC environment
by resetting all the targets (and affecting I/O to unrelated targets in
the process).

> 
> Not that I have any qualms about the patch itself, I've been down this
> path myself and was proven wrong by real life. Though my experience
> was mostly on the SAS network rather than the FC network.
> 
> Baruch




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 14:01     ` Ewan Milne
@ 2013-05-10 14:24       ` Hannes Reinecke
  2013-05-10 14:31         ` Bryn M. Reeves
  2013-05-10 16:59         ` Ewan Milne
  2013-05-10 17:51       ` Baruch Even
  1 sibling, 2 replies; 26+ messages in thread
From: Hannes Reinecke @ 2013-05-10 14:24 UTC (permalink / raw)
  To: emilne; +Cc: Baruch Even, Martin K. Petersen, linux-scsi, michaelc

On 05/10/2013 04:01 PM, Ewan Milne wrote:
> On Fri, 2013-05-10 at 16:22 +0300, Baruch Even wrote:
>> On Fri, May 10, 2013 at 3:43 PM, Ewan Milne <emilne@redhat.com> wrote:
>>>
>>> On Thu, 2013-05-09 at 23:11 -0400, Martin K. Petersen wrote:
>>>> Introduce eh_timeout which can be used for error handling purposes. This
>>>> was previously hardcoded to 10 seconds in the SCSI error handling
>>>> code. However, for some fast-fail scenarios it is necessary to be able
>>>> to tune this as it can take several iterations (bus device, target, bus,
>>>> controller) before we give up.
>>>>
>>>> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
>>>>
>>>
>>> Thanks for posting this.  It will be very helpful to have this
>>> capability, particularly when alternate paths to the device exist.
>>>
>>> Acked-by: Ewan D. Milne <emilne@redhat.com>
>>
>>
>> I would argue that waiting for the eh to timeout before you switch to
>> another path is most likely to be wrong. If you did the first pass of
>> error recovery (task abort) and that failed the
>> path/hba/logical-device is doomed. If you will switch to another path
>> it will either work (meaning the path/hba were bad) or not (logical
>> device was the culprit).
> 
> It is necessary to either know the disposition of a command or
> else wait for a defined amount of time before retrying the command on
> another path.  Otherwise you run the risk that the command will
> eventually complete on the first path.  So yes, we need to do the abort
> (and its timeout).
> 
Strictly speaking that's not true.
Yes, we do need to wait for a certain amount of time for the command
completion to come in.

However, this time is only defined _on the initiator_.
The specification does _NOT_ have any fixed timeout values for _any_
command. As such it could in theory (and does, if you happen to run
against certain arrays under certain conditions) take several
minutes to return a completion.

So we have to accept that a command completion might happen in
between the time we take between deciding that a command abort has
to be sent and the actual submission of the command abort by the
HBA. Which is totally independent of any command timeout we set.
It's just that a short command timeout increases the likelihood of
the race to happen; the race itself is always present.

>>
>> Actually reducing the timeouts is probably not a good approach since
>> it will cause the host to take a more radical approach without waiting
>> sufficiently for a potential recovery. In addition the more radical
>> error handlings such as host reset will destroy other paths for
>> completely unrelated devices/links, from my experience a host reset is
>> usually not required and the Linux kernel currently reaches to this
>> big hammer too fast.
> 
> I believe that Hannes is working on a better error handling algorithm
> that e.g. does not cause an emulated bus reset in an FC environment
> by resetting all the targets (and affecting I/O to unrelated targets in
> the process).
> 
Yes, that was the idea.
Which I'll get down to eventually; if only customers wouldn't have
all these obnoxious issues no-one has ever seen...

And there is nothing wrong with reducing the timeout per se. It's
just that the current error recovery strategy isn't well equipped to
handle it :-)

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 14:24       ` Hannes Reinecke
@ 2013-05-10 14:31         ` Bryn M. Reeves
  2013-05-10 16:59         ` Ewan Milne
  1 sibling, 0 replies; 26+ messages in thread
From: Bryn M. Reeves @ 2013-05-10 14:31 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: emilne, Baruch Even, Martin K. Petersen, linux-scsi, michaelc

On 05/10/2013 03:24 PM, Hannes Reinecke wrote:
> However, this time is only defined _on the initiator_.
> The specification does _NOT_ have any fixed timeout values for _any_
> command. As such it could in theory (and does, if you happen to run
> against certain arrays under certain conditions) take several
> minutes to return a completion.

That's my understanding too - in a multipath configuration we're 
waiting only for our own fast_io_fail_tmo (if set), which is essentially 
an arbitrary, administrator-controlled interval. You can tune it between 
extremes of rapid fault identification vs. paths twitching at every 
transient glitch.

> Yes, that was the idea.
> Which I'll get down to eventually; if only customers wouldn't have
> all these obnoxious issues no-one has ever seen...

The class I've been looking at is really very easy to reproduce and 
we've seen it at least a half dozen times at different sites with 
different FC switches (so it's certainly not that unusual).

To recreate it artificially you just need a target, a host, and a switch 
that can block RSCN propagation on a per-port basis. I've been using 
brocades with the rscnsupr portcfg attribute.

It's important that you block a port on the switch<->target side 
otherwise the host will see a link event which short-circuits everything.

E.g. if you have one port of an array attached to port 1 on a brocade 
the following two commands will set up this scenario:

portcfg rscnsupr 1 --enable
portdisable 1

Regards,
Bryn.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10  6:23 ` Bart Van Assche
@ 2013-05-10 14:36   ` Martin K. Petersen
  0 siblings, 0 replies; 26+ messages in thread
From: Martin K. Petersen @ 2013-05-10 14:36 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Martin K. Petersen, linux-scsi, Ewan Milne, Hannes Reinecke, michaelc

>>>>> "Bart" == Bart Van Assche <bvanassche@acm.org> writes:

Bart> Have you considered to move the eh_timeout assignment statement to
Bart> just before the transport_configure_device() and slave_configure()
Bart> calls ?  That would allow transport drivers and LLD drivers to
Bart> override the default eh_timeout value.

I'm ok with that...


scsi: Allow error handling timeout to be specified

Introduce eh_timeout which can be used for error handling purposes. This
was previously hardcoded to 10 seconds in the SCSI error handling
code. However, for some fast-fail scenarios it is necessary to be able
to tune this as it can take several iterations (bus device, target, bus,
controller) before we give up.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index c1b05a8..91adc52 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -44,8 +44,6 @@
 
 static void scsi_eh_done(struct scsi_cmnd *scmd);
 
-#define SENSE_TIMEOUT		(10*HZ)
-
 /*
  * These should *probably* be handled by the host itself.
  * Since it is allowed to sleep, it probably should.
@@ -864,7 +862,7 @@ static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, unsigned char *cmnd,
  */
 static int scsi_request_sense(struct scsi_cmnd *scmd)
 {
-	return scsi_send_eh_cmnd(scmd, NULL, 0, SENSE_TIMEOUT, ~0);
+	return scsi_send_eh_cmnd(scmd, NULL, 0, scmd->device->eh_timeout, ~0);
 }
 
 /**
@@ -965,7 +963,8 @@ static int scsi_eh_tur(struct scsi_cmnd *scmd)
 	int retry_cnt = 1, rtn;
 
 retry_tur:
-	rtn = scsi_send_eh_cmnd(scmd, tur_command, 6, SENSE_TIMEOUT, 0);
+	rtn = scsi_send_eh_cmnd(scmd, tur_command, 6,
+				scmd->device->eh_timeout, 0);
 
 	SCSI_LOG_ERROR_RECOVERY(3, printk("%s: scmd %p rtn %x\n",
 		__func__, scmd, rtn));
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index b9e39e0..9091d6d 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -927,6 +927,8 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
 	if (*bflags & BLIST_SKIP_VPD_PAGES)
 		sdev->skip_vpd_pages = 1;
 
+	sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
+
 	transport_configure_device(&sdev->sdev_gendev);
 
 	if (sdev->host->hostt->slave_configure) {
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 931a7d9..38db310 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -560,6 +560,35 @@ sdev_store_timeout (struct device *dev, struct device_attribute *attr,
 static DEVICE_ATTR(timeout, S_IRUGO | S_IWUSR, sdev_show_timeout, sdev_store_timeout);
 
 static ssize_t
+sdev_show_eh_timeout (struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct scsi_device *sdev;
+	sdev = to_scsi_device(dev);
+	return snprintf(buf, 20, "%u\n", sdev->eh_timeout / HZ);
+}
+
+static ssize_t
+sdev_store_eh_timeout (struct device *dev, struct device_attribute *attr,
+		    const char *buf, size_t count)
+{
+	struct scsi_device *sdev;
+	unsigned int eh_timeout;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	sdev = to_scsi_device(dev);
+	err = kstrtouint(buf, 10, &eh_timeout);
+	if (err)
+		return err;
+	sdev->eh_timeout = eh_timeout * HZ;
+
+	return count;
+}
+static DEVICE_ATTR(eh_timeout, S_IRUGO | S_IWUSR, sdev_show_eh_timeout, sdev_store_eh_timeout);
+
+static ssize_t
 store_rescan_field (struct device *dev, struct device_attribute *attr,
 		    const char *buf, size_t count)
 {
@@ -723,6 +752,7 @@ static struct attribute *scsi_sdev_attrs[] = {
 	&dev_attr_delete.attr,
 	&dev_attr_state.attr,
 	&dev_attr_timeout.attr,
+	&dev_attr_eh_timeout.attr,
 	&dev_attr_iocounterbits.attr,
 	&dev_attr_iorequest_cnt.attr,
 	&dev_attr_iodone_cnt.attr,
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 66216c1..4b87d99 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -10,9 +10,14 @@
 
 #include <linux/types.h>
 #include <linux/scatterlist.h>
+#include <linux/kernel.h>
 
 struct scsi_cmnd;
 
+enum scsi_timeouts {
+	SCSI_DEFAULT_EH_TIMEOUT		= 10 * HZ,
+};
+
 /*
  * The maximum number of SG segments that we will put inside a
  * scatterlist (unless chaining is used). Should ideally fit inside a
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index a7f9cba..7eb9b20 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -113,6 +113,7 @@ struct scsi_device {
 				 * scsi_devinfo.[hc]. For now used only to
 				 * pass settings from slave_alloc to scsi
 				 * core. */
+	unsigned int eh_timeout; /* Error handling timeout */
 	unsigned writeable:1;
 	unsigned removable:1;
 	unsigned changed:1;	/* Data invalid due to media change */

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
       [not found]   ` <CAC9+anJ9Y-SnCOK6EOCavTNJwx=xhAbL_X__MsEsL7DroawaJg@mail.gmail.com>
@ 2013-05-10 14:53     ` Martin K. Petersen
  2013-05-10 15:27       ` Martin K. Petersen
  2013-05-10 17:55       ` Baruch Even
  0 siblings, 2 replies; 26+ messages in thread
From: Martin K. Petersen @ 2013-05-10 14:53 UTC (permalink / raw)
  To: Baruch Even
  Cc: emilne, Martin K. Petersen, linux-scsi, Hannes Reinecke, michaelc

>>>>> "Baruch" == Baruch Even <baruch@ev-en.org> writes:

Baruch> Actually reducing the timeouts is probably not a good approach
Baruch> since it will cause the host to take a more radical approach
Baruch> without waiting sufficiently for a potential recovery.

Reducing the eh timeout is a requirement in many clustered setups. We've
been shipping a predecessor to this patch in our kernels for a long
time.


Baruch> In addition the more radical error handlings such as host reset
Baruch> will destroy other paths for completely unrelated devices/links,
Baruch> from my experience a host reset is usually not required and the
Baruch> Linux kernel currently reaches to this big hammer too fast.

I'm also working on a patch to add some heuristics to avoid the HBA and
bus resets if I/O is completing successfully on other attached
targets. But that's an orthogonal issue.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 14:53     ` Martin K. Petersen
@ 2013-05-10 15:27       ` Martin K. Petersen
  2013-05-10 17:55       ` Baruch Even
  1 sibling, 0 replies; 26+ messages in thread
From: Martin K. Petersen @ 2013-05-10 15:27 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Baruch Even, emilne, linux-scsi, Hannes Reinecke, michaelc

>>>>> "Martin" == Martin K Petersen <martin.petersen@oracle.com> writes:

Martin> I'm also working on a patch to add some heuristics to avoid the
Martin> HBA and bus resets

Or rather: Defer the HBA and bus resets...

Martin> if I/O is completing successfully on other attached targets. But
Martin> that's an orthogonal issue.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 14:24       ` Hannes Reinecke
  2013-05-10 14:31         ` Bryn M. Reeves
@ 2013-05-10 16:59         ` Ewan Milne
  2013-05-13 15:16           ` Elliott, Robert (Server Storage)
  1 sibling, 1 reply; 26+ messages in thread
From: Ewan Milne @ 2013-05-10 16:59 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Baruch Even, Martin K. Petersen, linux-scsi, michaelc

On Fri, 2013-05-10 at 16:24 +0200, Hannes Reinecke wrote: 
> On 05/10/2013 04:01 PM, Ewan Milne wrote:
> > On Fri, 2013-05-10 at 16:22 +0300, Baruch Even wrote:
> >> On Fri, May 10, 2013 at 3:43 PM, Ewan Milne <emilne@redhat.com> wrote:
> >>>
> >>> On Thu, 2013-05-09 at 23:11 -0400, Martin K. Petersen wrote:
> >>>> Introduce eh_timeout which can be used for error handling purposes. This
> >>>> was previously hardcoded to 10 seconds in the SCSI error handling
> >>>> code. However, for some fast-fail scenarios it is necessary to be able
> >>>> to tune this as it can take several iterations (bus device, target, bus,
> >>>> controller) before we give up.
> >>>>
> >>>> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> >>>>
> >>>
> >>> Thanks for posting this.  It will be very helpful to have this
> >>> capability, particularly when alternate paths to the device exist.
> >>>
> >>> Acked-by: Ewan D. Milne <emilne@redhat.com>
> >>
> >>
> >> I would argue that waiting for the eh to timeout before you switch to
> >> another path is most likely to be wrong. If you did the first pass of
> >> error recovery (task abort) and that failed the
> >> path/hba/logical-device is doomed. If you will switch to another path
> >> it will either work (meaning the path/hba were bad) or not (logical
> >> device was the culprit).
> > 
> > It is necessary to either know the disposition of a command or
> > else wait for a defined amount of time before retrying the command on
> > another path.  Otherwise you run the risk that the command will
> > eventually complete on the first path.  So yes, we need to do the abort
> > (and its timeout).
> > 
> Strictly speaking that's not true.
> Yes, we do need to wait for a certain amount of time for the command
> completion to come in.
> 
> However, this time is only defined _on the initiator_.
> The specification does _NOT_ have any fixed timeout values for _any_
> command. As such it could in theory (and does, if you happen to run
> against certain arrays under certain conditions) take several
> minutes to return a completion.

Granted.  (e.g. in the case of WRITE SAME, it could be a while before
the command completes, and retrying it on another path too quickly,
followed by other WRITE commands could be a disaster).  So the timeout
used for the original command has to be appropriate for the command.
Reducing that timeout and issuing an abort / lun reset / target reset
to try to fail over to another path earlier won't work if the device
never gets the abort / lun reset / target reset and the command is still
executing.

In the case of commands / TMFs issued by the error handling, the timeout
needs to be long enough to account for the delay in the driver / HBA,
switches (i.e. in an FC environment), and the target's device server.
But this time might very well be much shorter than the worst case for
other commands.  So I think allowing EH timeouts to be specified is a
good thing.  They just have to be set properly, the same as timeouts
for other commands (which can already be adjusted, but are overridden
for SYNCHRONIZE CACHE and WRITE SAME).

> 
> So we have to accept that a command completion might happen in
> between the time we take between deciding that a command abort has
> to be sent and the actual submission of the command abort by the
> HBA. Which is totally independent of any command timeout we set.
> It's just that a short command timeout increases the likelihood of
> the race to happen; the race itself is always present.
> 
> >>
> >> Actually reducing the timeouts is probably not a good approach since
> >> it will cause the host to take a more radical approach without waiting
> >> sufficiently for a potential recovery. In addition the more radical
> >> error handlings such as host reset will destroy other paths for
> >> completely unrelated devices/links, from my experience a host reset is
> >> usually not required and the Linux kernel currently reaches to this
> >> big hammer too fast.
> > 
> > I believe that Hannes is working on a better error handling algorithm
> > that e.g. does not cause an emulated bus reset in an FC environment
> > by resetting all the targets (and affecting I/O to unrelated targets in
> > the process).
> > 
> Yes, that was the idea.
> Which I'll get down to eventually; if only customers wouldn't have
> all these obnoxious issues no-one has ever seen...
> 
> And there is nothing wrong with reducing the timeout per se. It's
> just that the current error recovery strategy isn't well equipped to
> handle it :-)
> 
> Cheers,
> 
> Hannes



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 14:01     ` Ewan Milne
  2013-05-10 14:24       ` Hannes Reinecke
@ 2013-05-10 17:51       ` Baruch Even
  2013-05-10 20:18         ` Hannes Reinecke
  1 sibling, 1 reply; 26+ messages in thread
From: Baruch Even @ 2013-05-10 17:51 UTC (permalink / raw)
  To: emilne; +Cc: Martin K. Petersen, linux-scsi, Hannes Reinecke, michaelc

On Fri, May 10, 2013 at 5:01 PM, Ewan Milne <emilne@redhat.com> wrote:
> On Fri, 2013-05-10 at 16:22 +0300, Baruch Even wrote:
>> On Fri, May 10, 2013 at 3:43 PM, Ewan Milne <emilne@redhat.com> wrote:
>> >
>> > On Thu, 2013-05-09 at 23:11 -0400, Martin K. Petersen wrote:
>> > > Introduce eh_timeout which can be used for error handling purposes. This
>> > > was previously hardcoded to 10 seconds in the SCSI error handling
>> > > code. However, for some fast-fail scenarios it is necessary to be able
>> > > to tune this as it can take several iterations (bus device, target, bus,
>> > > controller) before we give up.
>> > >
>> > > Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
>> > >
>> >
>> > Thanks for posting this.  It will be very helpful to have this
>> > capability, particularly when alternate paths to the device exist.
>> >
>> > Acked-by: Ewan D. Milne <emilne@redhat.com>
>>
>>
>> I would argue that waiting for the eh to timeout before you switch to
>> another path is most likely to be wrong. If you did the first pass of
>> error recovery (task abort) and that failed the
>> path/hba/logical-device is doomed. If you will switch to another path
>> it will either work (meaning the path/hba were bad) or not (logical
>> device was the culprit).
>
> It is necessary to either know the disposition of a command or
> else wait for a defined amount of time before retrying the command on
> another path.  Otherwise you run the risk that the command will
> eventually complete on the first path.  So yes, we need to do the abort
> (and its timeout).
>
>>
>> Actually reducing the timeouts is probably not a good approach since
>> it will cause the host to take a more radical approach without waiting
>> sufficiently for a potential recovery. In addition the more radical
>> error handlings such as host reset will destroy other paths for
>> completely unrelated devices/links, from my experience a host reset is
>> usually not required and the Linux kernel currently reaches to this
>> big hammer too fast.
>
> I believe that Hannes is working on a better error handling algorithm
> that e.g. does not cause an emulated bus reset in an FC environment
> by resetting all the targets (and affecting I/O to unrelated targets in
> the process).

The error handling I have in mind (admittedly, not fully thought out)
should work for both FC and SAS. Currently the error recovery
progresses at the host level regardless of whether the errors are on one
device or all of them; it also stops the IOs on all devices and LUNs.
It would be nice if that was taken into account. My ideas may be more
suitable to the environment I work in (enterprise storage devices
rather than hosts) but I believe the same approach would benefit the
hosts as well.

It would be interesting to see what approach the new error handling will take.

Baruch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 14:53     ` Martin K. Petersen
  2013-05-10 15:27       ` Martin K. Petersen
@ 2013-05-10 17:55       ` Baruch Even
  1 sibling, 0 replies; 26+ messages in thread
From: Baruch Even @ 2013-05-10 17:55 UTC (permalink / raw)
  To: Martin K. Petersen; +Cc: emilne, linux-scsi, Hannes Reinecke, michaelc

On Fri, May 10, 2013 at 5:53 PM, Martin K. Petersen
<martin.petersen@oracle.com> wrote:
>>>>>> "Baruch" == Baruch Even <baruch@ev-en.org> writes:
>
> Baruch> Actually reducing the timeouts is probably not a good approach
> Baruch> since it will cause the host to take a more radical approach
> Baruch> without waiting sufficiently for a potential recovery.
>
> Reducing the eh timeout is a requirement in many clustered setups. We've
> been shipping a predecessor to this patch in our kernels for a long
> time.

> Baruch> In addition the more radical error handlings such as host reset
> Baruch> will destroy other paths for completely unrelated devices/links,
> Baruch> from my experience a host reset is usually not required and the
> Baruch> Linux kernel currently reaches to this big hammer too fast.
>
> I'm also working on a patch to add some heuristics to avoid the HBA and
> bus resets if I/O is completing successfully on other attached
> targets. But that's an orthogonal issue.

Why?

In my experience (again, SAS based inside a storage device) the
reduced eh timeout is more likely to cause escalated problems rather
than resolve the issue.

I actually find that the higher level should have a small timeout of
its own to do its own recovery work, which normally entails going to
other copies of the data where available and let the device try to get
the IO done if possible. Not sure how applicable it is to the kernel
itself but I do feel it could be relevant.

Baruch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 20:18         ` Hannes Reinecke
@ 2013-05-10 19:27           ` Baruch Even
  2013-05-13  5:46             ` Hannes Reinecke
  0 siblings, 1 reply; 26+ messages in thread
From: Baruch Even @ 2013-05-10 19:27 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: emilne, Martin K. Petersen, linux-scsi, michaelc

On Fri, May 10, 2013 at 11:18 PM, Hannes Reinecke <hare@suse.de> wrote:
> On 05/10/2013 07:51 PM, Baruch Even wrote:
>>
>> The error handling I have in mind (admittedly, not fully thought out)
>> should work for both FC and SAS. Currently the error recovery
>> progresses at the host level regardless of if the errors are on one
>> device or all of them, it also stops the IOs on all devices and LUNs.
>> It would be nice if that was taken into account. My ideas may be more
>> suitable to the environment I work in (enterprise storage devices
>> rather than hosts) but I believe the same approach would benefit the
>> hosts as well.
>>
>> It would be interesting to see what approach the new error handling will
>> take.
>>
> So, my general idea is this:
>
> 1) Send command aborts from scsi_times_out(). There is no requirement
>    on stopping I/O on the host simply because a single command times
>    out. And as scsi_times_out() is run from a separate thread anyway
>    we should be able to send ABORT TASK TMFs without a problem
> 2) Modify recovery sequence.
>    One of the major pitfalls of the current scsi_eh is that it
>    spills over onto unrelated LUNs for higher levels. So for the
>    new EH we should be using a sequence of
>    - ABORT TASK
>    - ABORT TASK SET
>    - (Terminate I_T nexus)
>    - (Host reset)
>    'Terminate I_T nexus' for FibreChannel is equivalent to a LOGO.
>    'Host reset' is the current host reset function.
> 3) Finegrained recovery setting.
>    There is no need to stop the entire host when doing a recovery;
>    it should be sufficient to stop I/O to the unit
>    (LUN, I_T nexus, host) when the error recovery is at the
>    respective level.

This looks great and much in line with what I'm thinking.

What about not going to the higher level if not everything at that
level had failed?
I mean that if at the target not all LUNs failed it will be quite
troublesome to other LUNs if I-T-Nexus is terminated and that at the
host level if there are still targets that are functioning it will
kill them too to reset the host.

Will this replace all of EH or just FC EH?

Baruch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 17:51       ` Baruch Even
@ 2013-05-10 20:18         ` Hannes Reinecke
  2013-05-10 19:27           ` Baruch Even
  0 siblings, 1 reply; 26+ messages in thread
From: Hannes Reinecke @ 2013-05-10 20:18 UTC (permalink / raw)
  To: Baruch Even; +Cc: emilne, Martin K. Petersen, linux-scsi, michaelc

On 05/10/2013 07:51 PM, Baruch Even wrote:
> On Fri, May 10, 2013 at 5:01 PM, Ewan Milne <emilne@redhat.com> wrote:
>> On Fri, 2013-05-10 at 16:22 +0300, Baruch Even wrote:
>>> On Fri, May 10, 2013 at 3:43 PM, Ewan Milne <emilne@redhat.com> wrote:
>>>>
>>>> On Thu, 2013-05-09 at 23:11 -0400, Martin K. Petersen wrote:
>>>>> Introduce eh_timeout which can be used for error handling purposes. This
>>>>> was previously hardcoded to 10 seconds in the SCSI error handling
>>>>> code. However, for some fast-fail scenarios it is necessary to be able
>>>>> to tune this as it can take several iterations (bus device, target, bus,
>>>>> controller) before we give up.
>>>>>
>>>>> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
>>>>>
>>>>
>>>> Thanks for posting this.  It will be very helpful to have this
>>>> capability, particularly when alternate paths to the device exist.
>>>>
>>>> Acked-by: Ewan D. Milne <emilne@redhat.com>
>>>
>>>
>>> I would argue that waiting for the eh to timeout before you switch to
>>> another path is most likely to be wrong. If you did the first pass of
>>> error recovery (task abort) and that failed the
>>> path/hba/logical-device is doomed. If you will switch to another path
>>> it will either work (meaning the path/hba were bad) or not (logical
>>> device was the culprit).
>>
>> It is necessary to either know the disposition of a command or
>> else wait for a defined amount of time before retrying the command on
>> another path.  Otherwise you run the risk that the command will
>> eventually complete on the first path.  So yes, we need to do the abort
>> (and its timeout).
>>
>>>
>>> Actually reducing the timeouts is probably not a good approach since
>>> it will cause the host to take a more radical approach without waiting
>>> sufficiently for a potential recovery. In addition the more radical
>>> error handlings such as host reset will destroy other paths for
>>> completely unrelated devices/links, from my experience a host reset is
>>> usually not required and the Linux kernel currently reaches to this
>>> big hammer too fast.
>>
>> I believe that Hannes is working on a better error handling algorithm
>> that e.g. does not cause an emulated bus reset in an FC environment
>> by resetting all the targets (and affecting I/O to unrelated targets in
>> the process).
>
> The error handling I have in mind (admittedly, not fully thought out)
> should work for both FC and SAS. Currently the error recovery
> progresses at the host level regardless of if the errors are on one
> device or all of them, it also stops the IOs on all devices and LUNs.
> It would be nice if that was taken into account. My ideas may be more
> suitable to the environment I work in (enterprise storage devices
> rather than hosts) but I believe the same approach would benefit the
> hosts as well.
>
> It would be interesting to see what approach the new error handling will take.
>
So, my general idea is this:

1) Send command aborts from scsi_times_out(). There is no requirement
    on stopping I/O on the host simply because a single command times
    out. And as scsi_times_out() is run from a separate thread anyway
    we should be able to send ABORT TASK TMFs without a problem
2) Modify recovery sequence.
    One of the major pitfalls of the current scsi_eh is that it
    spills over onto unrelated LUNs for higher levels. So for the
    new EH we should be using a sequence of
    - ABORT TASK
    - ABORT TASK SET
    - (Terminate I_T nexus)
    - (Host reset)
    'Terminate I_T nexus' for FibreChannel is equivalent to a LOGO.
    'Host reset' is the current host reset function.
3) Fine-grained recovery setting.
    There is no need to stop the entire host when doing a recovery;
    it should be sufficient to stop I/O to the unit
    (LUN, I_T nexus, host) when the error recovery is at the
    respective level.

As usual, comments are welcome.

Cheers,

Hannes


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 19:27           ` Baruch Even
@ 2013-05-13  5:46             ` Hannes Reinecke
  2013-05-13 14:40               ` Jeremy Linton
  0 siblings, 1 reply; 26+ messages in thread
From: Hannes Reinecke @ 2013-05-13  5:46 UTC (permalink / raw)
  To: Baruch Even; +Cc: emilne, Martin K. Petersen, linux-scsi, michaelc

On 05/10/2013 09:27 PM, Baruch Even wrote:
> On Fri, May 10, 2013 at 11:18 PM, Hannes Reinecke <hare@suse.de> wrote:
>> On 05/10/2013 07:51 PM, Baruch Even wrote:
>>>
>>> The error handling I have in mind (admittedly, not fully thought out)
>>> should work for both FC and SAS. Currently the error recovery
>>> progresses at the host level regardless of if the errors are on one
>>> device or all of them, it also stops the IOs on all devices and LUNs.
>>> It would be nice if that was taken into account. My ideas may be more
>>> suitable to the environment I work in (enterprise storage devices
>>> rather than hosts) but I believe the same approach would benefit the
>>> hosts as well.
>>>
>>> It would be interesting to see what approach the new error handling will
>>> take.
>>>
>> So, my general idea is this:
>>
>> 1) Send command aborts from scsi_times_out(). There is no requirement
>>    on stopping I/O on the host simply because a single command times
>>    out. And as scsi_times_out() is run from a separate thread anyway
>>    we should be able to send ABORT TASK TMFs without a problem
>> 2) Modify recovery sequence.
>>    One of the major pitfalls of the current scsi_eh is that it
>>    spills over onto unrelated LUNs for higher levels. So for the
>>    new EH we should be using a sequence of
>>    - ABORT TASK
>>    - ABORT TASK SET
>>    - (Terminate I_T nexus)
>>    - (Host reset)
>>    'Terminate I_T nexus' for FibreChannel is equivalent to a LOGO.
>>    'Host reset' is the current host reset function.
>> 3) Finegrained recovery setting.
>>    There is no need to stop the entire host when doing a recovery;
>>    it should be sufficient to stop I/O to the unit
>>    (LUN, I_T nexus, host) when the error recovery is at the
>>    respective level.
> 
> This looks great and much in line with what I'm thinking.
> 
> What about not going to the higher level if not everything at that
> level had failed?
> I mean that if at the target not all LUNs failed it will be quite
> troublesome to other LUNs if I-T-Nexus is terminated and that at the
> host level if there are still targets that are functioning it will
> kill them too to reset the host.
> 

True. But at the end of the day, we _do_ want to recover the failed
LUN. If we were to disable that faulty LUN and continue running with
the others we won't have a chance of _ever_ recovering that one LUN.

Plus we have to keep in mind that the attempted error recovery did
not succeed for totally unrelated issues (i.e. sending an ABORT TASK
SET when the link is down). So we basically _have_ to escalate it
to the next level. Even though that will mean to stop I/O to other,
hitherto unaffected instances.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-13  5:46             ` Hannes Reinecke
@ 2013-05-13 14:40               ` Jeremy Linton
  2013-05-13 15:03                 ` Hannes Reinecke
  0 siblings, 1 reply; 26+ messages in thread
From: Jeremy Linton @ 2013-05-13 14:40 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Baruch Even, emilne, Martin K. Petersen, linux-scsi, michaelc

On 5/13/2013 12:46 AM, Hannes Reinecke wrote:

> True. But and the end of the day, we _do_ want to recover the failed LUN.
> If we were to disable that faulty LUN and continue running with the others
> we won't have a chance of _ever_ recovering that one LUN.

	I don't buy this. Especially for FC devices, the vast majority of errors I see
are related to zoning, SFP and cabling problems. Once one of those happens you
tend to get a lot of shotgun debugging, which injects all kinds of
further errors.	None of these errors are fixed by the linux error recovery paths.

	That said, if the admin fixes something, for FC/SAS (and potentially others)
you _WILL_ get notification that the device is online again.


> SET when the link is down). So we basically _have_ to escalate it to the
> next level. Even though that will mean to stop I/O to other, hitherto
> unaffected instances.

	And a single failure turns into performance bubbles and further errors on
other devices. Particularly if the functional devices are stateful, and the
error recovery mechanism isn't sufficiently intelligent about that state (see
tape drives). Think about what happens when a marginal SFP on a target causes
a device to repeatedly drop off and reappear at some random point in the future.


	Anyway, it is possible to make a determination about the topology and make
decisions about the likelihood of any given portion being at fault. For
example, if one lun on a target has failed and the remainder continue to work,
then it's unlikely that if abort and lun reset fail that anything higher up in
the stack is going to succeed.

	I feel pretty strongly, at that point you're better off providing good
diagnostics about the failure and expecting user interaction rather than
muddying the waters by causing other device interruptions. If the user tries
everything and determines that a HBA reset is the right choice, provide that
option, don't do it for them.

	If every device attached to the HBA fails then resetting the HBA is a valid
choice, not before. Same for I_T.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-13 14:40               ` Jeremy Linton
@ 2013-05-13 15:03                 ` Hannes Reinecke
  2013-05-13 15:58                   ` Jeremy Linton
  0 siblings, 1 reply; 26+ messages in thread
From: Hannes Reinecke @ 2013-05-13 15:03 UTC (permalink / raw)
  To: Jeremy Linton
  Cc: Baruch Even, emilne, Martin K. Petersen, linux-scsi, michaelc

On 05/13/2013 04:40 PM, Jeremy Linton wrote:
> On 5/13/2013 12:46 AM, Hannes Reinecke wrote:
> 
>> True. But and the end of the day, we _do_ want to recover the failed LUN.
>> If we were to disable that faulty LUN and continue running with the others
>> we won't have a chance of _ever_ recovering that one LUN.
> 
> 	I don't buy this. Especially for FC devices, the vast majority of errors I see
> are related to zoning, SFP and cabling problems. Once one of those happens you
> tend to get a lot of shotgun debugging, which injects all kinds of
> further errors.	None of these errors are fixed by the linux error recovery paths.
> 
> 	That said, if the admin fixes something, for FC/SAS (and potentially others)
> you _WILL_ get notification that the device is online again.
> 

Well, yes, of course.
Sadly, these kinds of errors tend to be very erratic and very hard to
diagnose. There simply is no way of telling that the error you've had
is due to a bad cable or bad SFP.
Bad zoning is easy; then the device is simply not reachable anymore.

So for error recovery we first have to assume that the error is
fixable. And then we have a standard way of trying to fix this error.
The problem we have is that we lose all information about the error
once it's 'fixed' (ie after eh is done). Which is the main problem
with bad cabling: we're running the same sequence all over again,
without ever figuring out 'hey, I've done this already'.

sd.c has some _very_ limited support for this. But trying to
generalise things here will be _hard_.

So yeah, I see your point. In fact, I've been bitten by this, too.
But the error scenarios I've seen are far too complex to have them
modelled into something re-usable.

>> SET when the link is down). So we basically _have_ to escalate it to the
>> next level. Even though that will mean to stop I/O to other, hitherto
>> unaffected instances.
> 
> 	And a single failure, turns into performance bubbles and further errors on
> other devices. Particularly if the functional devices are stateful, and the
> error recovery mechanism isn't sufficiently intelligent about that state (see
> tape drives). Think about what happens when a marginal SFP on a target causes
> a device to repeatably drop off and reappear at some random point in the future.
> 
> 
> 	Anyway, It is possible to make a determination about the topology and make
> decisions about the likely-hood of any given portion being at fault. For
> example, if one lun on a target has failed and the remainder continue to work,
> then its unlikely that if abort and lun reset fail that anything higher up in
> the stack is going to succeed.
> 
Which is why I suggested 'ABORT TASK SET' instead of 'LUN reset'.
That will be restricted to the I_T_L nexus, and leave the rest of
the LUN alone (or so one hopes).

> 	I feel pretty strongly, at that point your better off providing good
> diagnostics about the failure and expecting user interaction rather than
> muddying the waters by causing other device interruptions. If the user tries
> everything and determines that a HBA reset is the right choice, provide that
> option, don't do it for them.
> 
> 	If every device attached to the HBA fails then resetting the HBA is a valid
> choice, not before. Same for I_T.
> 
Hmm. Really not sure.

Take the 'target not responding' case. (which is what triggered this
whole issue anyway). Say a target port went out to lunch and doesn't
respond to FC commands anymore.

With our current EH it'll take _ages_, but eventually the big hammer
hits (or the device comes back) and everything is back to normal again.
So LUN reset (or ABORT TASK SET) fails.
The other LUNs haven't reported an error. But how do you know
whether they are still okay? The other LUNs might simply be idle,
and no commands have been sent to them.
So the state's still good. Do we reset the I_T nexus or not?

If we do, we would find that the entire rport doesn't respond, so
the devloss_tmo mechanism would trigger, and eventually the rport
will disappear and we're back on normal operation.

If we don't, the LUN will be stuck forever, until someone actually
issues I/O to the other LUNs for that rport. And only when I/O is
issued to the _last_ LUN we'll decide to reset the I_T nexus.
Not a very appealing scenario.

And 'reset I_T nexus' should be a rather fast operation; with a bit
of luck the other rports wouldn't even notice.
I've had a prototype running which would just kick off the
dev_loss_tmo mechanism; that worked like a charm.
(Agreed, as James Smart indicated 'only by luck', but nevertheless)

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-10 16:59         ` Ewan Milne
@ 2013-05-13 15:16           ` Elliott, Robert (Server Storage)
  0 siblings, 0 replies; 26+ messages in thread
From: Elliott, Robert (Server Storage) @ 2013-05-13 15:16 UTC (permalink / raw)
  To: emilne, Hannes Reinecke
  Cc: Baruch Even, Martin K. Petersen, linux-scsi, michaelc



> -----Original Message-----
> From: linux-scsi-owner@vger.kernel.org [mailto:linux-scsi-
> owner@vger.kernel.org] On Behalf Of Ewan Milne
> Sent: Friday, 10 May, 2013 11:59 AM
> To: Hannes Reinecke
> Cc: Baruch Even; Martin K. Petersen; linux-scsi; michaelc
> Subject: Re: [PATCH] scsi: Allow error handling timeout to be specified
> 
> On Fri, 2013-05-10 at 16:24 +0200, Hannes Reinecke wrote:
> > On 05/10/2013 04:01 PM, Ewan Milne wrote:
> > > On Fri, 2013-05-10 at 16:22 +0300, Baruch Even wrote:
> > >> On Fri, May 10, 2013 at 3:43 PM, Ewan Milne <emilne@redhat.com>
> wrote:
> > >>
> > >>
> > >> I would argue that waiting for the eh to timeout before you switch to
> > >> another path is most likely to be wrong. If you did the first pass of
> > >> error recovery (task abort) and that failed the
> > >> path/hba/logical-device is doomed. If you will switch to another path
> > >> it will either work (meaning the path/hba were bad) or not (logical
> > >> device was the culprit).
> > >
> > > It is necessary to either know the disposition of a command or
> > > else wait for a defined amount of time before retrying the command on
> > > another path.  Otherwise you run the risk that the command will
> > > eventually complete on the first path.  So yes, we need to do the abort
> > > (and its timeout).
> > >
> > Strictly speaking that's not true.
> > Yes, we do need to wait for a certain amount of time for the command
> > completion to come in.
> >
> > However, this time is only defined _on the initiator_.
> > The specification does _NOT_ have any fixed timeout values for _any_
> > command. As such it could in theory (and does, if you happen to run
> > against certain arrays under certain conditions) take several
> > minutes to return a completion.

The REPORT SUPPORTED OPERATION CODES command (see SPC-4) 
returns nominal and recommended timeout values for each supported
command.  Similarly, REPORT SUPPORTED TASK MANAGEMENT FUNCTIONS
returns timeouts for task management functions.

Those times are from the device server's perspective, so any fabric 
overhead needs to be added.

Those commands and the command timeout descriptors are optional.
They are proposed to be mandatory in the Base feature set, though.

> Granted.  (e.g. in the case of WRITE SAME, it could be a while before
> the command completes, and retrying it on another path too quickly,
> followed by other WRITE commands could be a disaster).  So the timeout
> used for the original command has to be appropriate for the command.
> Reducing that timeout and issuing an abort / lun reset / target reset
> to try to fail over to another path earlier won't work if the device
> never gets the abort / lun reset / target reset and the command is still
> executing.

One problem with the ABORT TASK and I_T NEXUS RESET task management
functions is they must be sent down the same I_T nexus as the command(s)
that ran into timeouts.  If that I_T nexus is the source of the problem,
then they are likely to timeout as well.

The REMOVE I_T NEXUS command (standardized in March 2012 in 
SPC-4 revision 35) is designed to be sent down a different I_T nexus - the 
failover path.  It ensures that commands on the original I_T nexus won't 
suddenly resume.  That command is optional and still very new in
standards time.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-13 15:03                 ` Hannes Reinecke
@ 2013-05-13 15:58                   ` Jeremy Linton
  2013-05-13 16:50                     ` Baruch Even
  2013-05-13 20:29                     ` Martin K. Petersen
  0 siblings, 2 replies; 26+ messages in thread
From: Jeremy Linton @ 2013-05-13 15:58 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Baruch Even, emilne, Martin K. Petersen, linux-scsi, michaelc

On 5/13/2013 10:03 AM, Hannes Reinecke wrote:
> The other LUNs haven't reported an error. But how do you know whether they
> are still okay? The other LUNs might simply be idle, and no commands have
> been sent to them.

	Well, how about generating std inquiry against them if they are idle and the
given HBA has a device in error state? Then you can make a rough approximation
of what has failed, and escalate the error handling if all the devices at a
particular level have failed.

	The midlayer may not even need to send the inquiries. If the individual
device drivers (sd/st/etc) are responsible for monitoring and error recovery
then they can be tasked with determining device availability as well. I think
this solves other problems too. For example, the use of TUR in the midlayer,
is a problem because it doesn't have enough knowledge about the possible check
conditions being returned to act on them appropriately.








^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-13 15:58                   ` Jeremy Linton
@ 2013-05-13 16:50                     ` Baruch Even
  2013-05-13 20:29                     ` Martin K. Petersen
  1 sibling, 0 replies; 26+ messages in thread
From: Baruch Even @ 2013-05-13 16:50 UTC (permalink / raw)
  To: Jeremy Linton
  Cc: Hannes Reinecke, emilne, Martin K. Petersen, linux-scsi, michaelc

On Mon, May 13, 2013 at 6:58 PM, Jeremy Linton <jlinton@tributary.com> wrote:
> On 5/13/2013 10:03 AM, Hannes Reinecke wrote:
>> The other LUNs haven't reported an error. But how do you know whether they
>> are still okay? The other LUNs might simply be idle, and no commands have
>> been sent to them.
>
>         Well, how about generating std inquiry against them if they are idle and the
> given HBA has a device in error state? Then you can make a rough approximation
> of what has failed, and escalate the error handling if all the devices at a
> particular level have failed.
>
>         The midlayer may not even need to send the inquiries. If the individual
> device drivers (sd/st/etc) are responsible for monitoring and error recovery
> then they can be tasked with determining device availability as well. I think
> this solves other problems too. For example, the use of TUR in the midlayer,
> is a problem because it doesn't have enough knowledge about the possible check
> conditions being returned to act on them appropriately.

Such an approach is preferable IMO to the big hammer, especially if
we are talking about a likely condition of using multipath and having
other links over the same host that do have traffic flowing through
them. If there is traffic already on the same host there is no reason
to do a host reset. If there is no traffic and there are no other
luns, go for the big gun; it will not matter to anything else. If there
are other inactive luns, some mechanism to trigger some basic traffic
(inquiry/tur) on them is much preferable to just a plain big hammer
application.

It might be that the kernel is not the right place for all of this
diagnostic work, but then some interface for an external daemon to do
these diagnostics is preferable to just wielding the big hammer and
killing all traffic.

In my experience if the device doesn't respond it usually just
disappeared from the network, if it is on the network and the task
abort or target reset do not return successfully it is either unlikely
that the host reset will help (the host is fine, device is gone) or
that the host reset is the only way since the host is dead but then a
simple check on all other luns would reveal that quite fast. In many
cases the host controller itself is mostly dead and the driver could
detect that on its own without waiting for the traffic to time out but
that's an issue for each driver to handle.

Baruch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-13 15:58                   ` Jeremy Linton
  2013-05-13 16:50                     ` Baruch Even
@ 2013-05-13 20:29                     ` Martin K. Petersen
  2013-05-13 21:01                       ` Jeremy Linton
  1 sibling, 1 reply; 26+ messages in thread
From: Martin K. Petersen @ 2013-05-13 20:29 UTC (permalink / raw)
  To: Jeremy Linton
  Cc: Hannes Reinecke, Baruch Even, emilne, Martin K. Petersen,
	linux-scsi, michaelc

>>>>> "Jeremy" == Jeremy Linton <jlinton@tributary.com> writes:

Jeremy> Well, how about generating std inquiry against them if they are
Jeremy> idle and the given HBA has a device in error state? Then you can
Jeremy> make a rough approximation of what has failed, and escalate the
Jeremy> error handling if all the devices at a particular level have
Jeremy> failed.

It's not that simple, unfortunately. Some HBAs keep more state than
others. We see cases fairly often where a misbehaving target has
confused the HBA enough that we can not bring the device back without
doing an HBA firmware reset. Despite I/O completing successfully on
other targets connected to the same HBA.

So at some point we do need to give up and escalate to a full HBA
reset. We would just like to defer that hammer until we have run out of
other options.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-13 20:29                     ` Martin K. Petersen
@ 2013-05-13 21:01                       ` Jeremy Linton
  2013-05-14 22:21                         ` Martin K. Petersen
  0 siblings, 1 reply; 26+ messages in thread
From: Jeremy Linton @ 2013-05-13 21:01 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Hannes Reinecke, Baruch Even, emilne, linux-scsi, michaelc

On 5/13/2013 3:29 PM, Martin K. Petersen wrote:

> others. We see cases fairly often where a misbehaving target has
> confused the HBA enough that we can not bring the device back without
> doing an HBA firmware reset. Despite I/O completing successfully on
> other targets connected to the same HBA.

	This would seem to indicate a HBA/driver bug...

> So at some point we do need to give up and escalate to a full HBA
> reset. We would just like to defer that hammer until we have run out of
> other options.

	Except that I've seen the linux error recovery cause more problems than it
solves on a fairly regular basis. I would rather have a solution designed to
isolate failures, than one that makes a lot of mistakes and causes further
problems (sometimes with other machines). I'm pretty convinced that attempting
everything possible to recover a device when the underlying problem is unknown
is a bad strategy.

	I think maybe it's a perspective difference. If the device that is failing is an
OS disk, then giving up is tantamount to crashing the machine. On the other hand,
if the failing device is some shared tape drive in a SAN with a few hundred
alternatives then killing the OS in an attempt to recover that drive is a problem.

	Maybe, the super aggressive recovery paths should be reserved for devices
marked critical to system operation.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] scsi: Allow error handling timeout to be specified
  2013-05-13 21:01                       ` Jeremy Linton
@ 2013-05-14 22:21                         ` Martin K. Petersen
  0 siblings, 0 replies; 26+ messages in thread
From: Martin K. Petersen @ 2013-05-14 22:21 UTC (permalink / raw)
  To: Jeremy Linton
  Cc: Martin K. Petersen, Hannes Reinecke, Baruch Even, emilne,
	linux-scsi, michaelc

>>>>> "Jeremy" == Jeremy Linton <jlinton@tributary.com> writes:

>> others. We see cases fairly often where a misbehaving target has
>> confused the HBA enough that we can not bring the device back without
>> doing an HBA firmware reset. Despite I/O completing successfully on
>> other targets connected to the same HBA.

Jeremy> 	This would seem to indicate a HBA/driver bug...

Yep. It's not just targets that go bad!


Jeremy> Except that I've seen the linux error recovery cause more
Jeremy> problems than it solves on a fairly regular basis. I would
Jeremy> rather have a solution designed to isolate failures, than one
Jeremy> that makes a lot of mistakes and causes further problems
Jeremy> (sometimes with other machines). I'm pretty convinced that
Jeremy> attempting everything possible to recover a device when the
Jeremy> underlying problem is unknown is a bad strategy.

There is no one size that fits all. Which is why we're taking steps to
make the error recovery parameters tweakable.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2013-05-14 22:21 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-10  3:11 [PATCH] scsi: Allow error handling timeout to be specified Martin K. Petersen
2013-05-10  6:23 ` Bart Van Assche
2013-05-10 14:36   ` Martin K. Petersen
2013-05-10 12:43 ` Ewan Milne
2013-05-10 12:55   ` Hannes Reinecke
2013-05-10 13:09   ` Bryn M. Reeves
2013-05-10 13:22   ` Baruch Even
2013-05-10 14:01     ` Ewan Milne
2013-05-10 14:24       ` Hannes Reinecke
2013-05-10 14:31         ` Bryn M. Reeves
2013-05-10 16:59         ` Ewan Milne
2013-05-13 15:16           ` Elliott, Robert (Server Storage)
2013-05-10 17:51       ` Baruch Even
2013-05-10 20:18         ` Hannes Reinecke
2013-05-10 19:27           ` Baruch Even
2013-05-13  5:46             ` Hannes Reinecke
2013-05-13 14:40               ` Jeremy Linton
2013-05-13 15:03                 ` Hannes Reinecke
2013-05-13 15:58                   ` Jeremy Linton
2013-05-13 16:50                     ` Baruch Even
2013-05-13 20:29                     ` Martin K. Petersen
2013-05-13 21:01                       ` Jeremy Linton
2013-05-14 22:21                         ` Martin K. Petersen
     [not found]   ` <CAC9+anJ9Y-SnCOK6EOCavTNJwx=xhAbL_X__MsEsL7DroawaJg@mail.gmail.com>
2013-05-10 14:53     ` Martin K. Petersen
2013-05-10 15:27       ` Martin K. Petersen
2013-05-10 17:55       ` Baruch Even

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.