All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
@ 2012-05-29 21:07 Karandeep Chahal
  2012-05-29 21:51 ` Michael Reed
       [not found] ` <4FC53AAA.3060203-LfVdkaOWEx8@public.gmane.org>
  0 siblings, 2 replies; 11+ messages in thread
From: Karandeep Chahal @ 2012-05-29 21:07 UTC (permalink / raw)
  To: linux-rdma, linux-kernel; +Cc: dillowda, roland, sean.hefty

[-- Attachment #1: Type: text/plain, Size: 987 bytes --]

Subject: [PATCH] Infiniband srp fast failover patch. Currently ib_srp does
  not do anything on receiving a DREQ from the target, it
  only sends a response back. Further it also does not
  monitor port (down) events. I have patched srp to remove
  scsi devices when a port down event is received or if the
  target sends a DREQ. Currently even though the target
  notifies the initiator of its intentions of going away, the
  initiator ignores that information. Later the initiator
  gets upset when the devices "suddenly" disappear resulting
  in srp initiating an error recovery process which takes a
  long time. This caused high failover latencies as compared
  to fibre channel. In my experiments with RHEL 6.0 and 6.2 I
  encountered failover time that exceeded 2 minutes and 20
  seconds (despite tweaking /etc/multipath.conf and
  /sys/block/<>/timeout). With this patch the failover takes
  30 seconds. I have tested this patch with and without a
  switch.

Yours, etc.
Karan


[-- Attachment #2: 0001-Infiniband-srp-fast-failover-patch.-Currently-ib_srp.patch --]
[-- Type: text/x-patch, Size: 4767 bytes --]

>From 4ebb453ccde59cf0b674bd4a23fb85f4a3333618 Mon Sep 17 00:00:00 2001
From: Karandeep Chahal <kchahal@ddn.com>
Date: Tue, 29 May 2012 16:48:20 -0400
Subject: [PATCH] Infiniband srp fast failover patch. Currently ib_srp does
 not do anything on receiving a DREQ from the target, it
 only sends a response back. Further it also does not
 monitor port (down) events. I have patched srp to remove
 scsi devices when a port down event is received or if the
 target sends a DREQ. Currently even though the target
 notifies the initiator of its intentions of going away, the
 initiator ignores that information. Later the initiator
 gets upset when the devices "suddenly" disappear resulting
 in srp initiating an error recovery process which takes a
 long time. This caused high failover latencies as compared
 to fibre channel. In my experiments with RHEL 6.0 and 6.2 I
 encountered failover time that exceeded 2 minutes and 20
 seconds (despite tweaking /etc/multipath.conf and
 /sys/block/<>/timeout). With this patch the failover takes
 30 seconds. I have tested this patch with and without a
 switch.


Signed-off-by: Karandeep Chahal <kchahal@ddn.com>
---
 drivers/infiniband/ulp/srp/ib_srp.c |   64 +++++++++++++++++++++++++++++++++++
 drivers/infiniband/ulp/srp/ib_srp.h |    1 +
 2 files changed, 65 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index bcbf22e..088215b 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1524,6 +1524,37 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 	}
 }
 
+static void srp_mark_all_devices_dead(int port_num, struct srp_device *srp_dev,
+				    struct ib_cm_id *cm_id)
+{
+	struct srp_host *host, *tmp_host;
+	struct srp_target_port *target;
+
+	list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) {
+		/*
+		 * Mark all target ports as removed, so we stop queueing
+		 * commands and don't try to reconnect.
+		 */
+		if ((port_num != -1) && (port_num != host->port))
+			continue;
+
+		spin_lock(&host->target_lock);
+		list_for_each_entry(target, &host->target_list, list) {
+			if (!cm_id || (target->cm_id == cm_id)) {
+
+				shost_printk(KERN_WARNING, target->scsi_host,
+				     PFX "Removing all scsi devices\n");
+				spin_lock_irq(&target->lock);
+				target->state = SRP_TARGET_DEAD;
+				INIT_WORK(&target->work, srp_remove_work);
+				queue_work(ib_wq, &target->work);
+				spin_unlock_irq(&target->lock);
+			}
+		}
+		spin_unlock(&host->target_lock);
+	}
+}
+
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 {
 	struct srp_target_port *target = cm_id->context;
@@ -1555,6 +1586,8 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 		if (ib_send_cm_drep(cm_id, NULL, 0))
 			shost_printk(KERN_ERR, target->scsi_host,
 				     PFX "Sending CM DREP failed\n");
+		srp_mark_all_devices_dead(-1, target->srp_host->srp_dev,
+				     cm_id);
 		break;
 
 	case IB_CM_TIMEWAIT_EXIT:
@@ -2284,6 +2317,31 @@ free_host:
 	return NULL;
 }
 
+static void srp_event_handler(struct ib_event_handler *handler,
+				    struct ib_event *event)
+{
+	struct srp_device *srp_dev;
+
+	switch (event->event) {
+
+	case IB_EVENT_DEVICE_FATAL:
+	case IB_EVENT_PORT_ERR:
+		srp_dev = container_of(handler, struct srp_device,
+				    event_handler);
+
+		printk(KERN_INFO PFX "%s port %d down detected\n",
+				    srp_dev->dev->name,
+				    event->element.port_num);
+
+		srp_mark_all_devices_dead(event->element.port_num,
+				    srp_dev, NULL);
+		break;
+
+	default:
+		break;
+	}
+}
+
 static void srp_add_one(struct ib_device *device)
 {
 	struct srp_device *srp_dev;
@@ -2366,6 +2424,10 @@ static void srp_add_one(struct ib_device *device)
 
 	ib_set_client_data(device, &srp_client, srp_dev);
 
+	INIT_IB_EVENT_HANDLER(&srp_dev->event_handler, device,
+				    srp_event_handler);
+	ib_register_event_handler(&srp_dev->event_handler);
+
 	goto free_attr;
 
 err_pd:
@@ -2387,6 +2449,8 @@ static void srp_remove_one(struct ib_device *device)
 
 	srp_dev = ib_get_client_data(device, &srp_client);
 
+	ib_unregister_event_handler(&srp_dev->event_handler);
+
 	list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) {
 		device_unregister(&host->dev);
 		/*
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 020caf0..e0737a1 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -97,6 +97,7 @@ struct srp_device {
 	struct ib_pd	       *pd;
 	struct ib_mr	       *mr;
 	struct ib_fmr_pool     *fmr_pool;
+	struct ib_event_handler event_handler;
 	u64			fmr_page_mask;
 	int			fmr_page_size;
 	int			fmr_max_size;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
  2012-05-29 21:07 [PATCH 1/1] ib_srp: Infiniband srp fast failover patch Karandeep Chahal
@ 2012-05-29 21:51 ` Michael Reed
  2012-05-29 22:27   ` Karandeep Chahal
       [not found] ` <4FC53AAA.3060203-LfVdkaOWEx8@public.gmane.org>
  1 sibling, 1 reply; 11+ messages in thread
From: Michael Reed @ 2012-05-29 21:51 UTC (permalink / raw)
  To: Karandeep Chahal; +Cc: linux-rdma, linux-kernel, dillowda, roland, sean.hefty

Did you subsequently reconnect the target and confirm appropriate behavior?


On 05/29/2012 02:07 PM, Karandeep Chahal wrote:
> Subject: [PATCH] Infiniband srp fast failover patch. Currently ib_srp does
>    not do anything on receiving a DREQ from the target, it
>    only sends a response back. Further it also does not
>    monitor port (down) events. I have patched srp to remove
>    scsi devices when a port down event is received or if the
>    target sends a DREQ. Currently even though the target
>    notifies the initiator of its intentions of going away, the
>    initiator ignores that information. Later the initiator
>    gets upset when the devices "suddenly" disappear resulting
>    in srp initiating an error recovery process which takes a
>    long time. This caused high failover latencies as compared
>    to fibre channel. In my experiments with RHEL 6.0 and 6.2 I
>    encountered failover time that exceeded 2 minutes and 20
>    seconds (despite tweaking /etc/multipath.conf and
>    /sys/block/<>/timeout). With this patch the failover takes
>    30 seconds. I have tested this patch with and without a
>    switch.
>
> Yours, etc.
> Karan
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
  2012-05-29 21:51 ` Michael Reed
@ 2012-05-29 22:27   ` Karandeep Chahal
       [not found]     ` <4FC54D62.3080003-LfVdkaOWEx8@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Karandeep Chahal @ 2012-05-29 22:27 UTC (permalink / raw)
  To: Michael Reed; +Cc: linux-rdma, linux-kernel, dillowda, roland, sean.hefty

Hi Michael,

Yes, I tried reconnecting the targets and removing and reinserting ib-srp.

Thanks
Karan


On 05/29/2012 05:51 PM, Michael Reed wrote:
> Did you subsequently reconnect the target and confirm appropriate behavior?
>
>
> On 05/29/2012 02:07 PM, Karandeep Chahal wrote:
>> Subject: [PATCH] Infiniband srp fast failover patch. Currently ib_srp does
>>     not do anything on receiving a DREQ from the target, it
>>     only sends a response back. Further it also does not
>>     monitor port (down) events. I have patched srp to remove
>>     scsi devices when a port down event is received or if the
>>     target sends a DREQ. Currently even though the target
>>     notifies the initiator of its intentions of going away, the
>>     initiator ignores that information. Later the initiator
>>     gets upset when the devices "suddenly" disappear resulting
>>     in srp initiating an error recovery process which takes a
>>     long time. This caused high failover latencies as compared
>>     to fibre channel. In my experiments with RHEL 6.0 and 6.2 I
>>     encountered failover time that exceeded 2 minutes and 20
>>     seconds (despite tweaking /etc/multipath.conf and
>>     /sys/block/<>/timeout). With this patch the failover takes
>>     30 seconds. I have tested this patch with and without a
>>     switch.
>>
>> Yours, etc.
>> Karan
>>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
  2012-05-29 22:27   ` Karandeep Chahal
@ 2012-05-29 22:53         ` Michael Reed
  0 siblings, 0 replies; 11+ messages in thread
From: Michael Reed @ 2012-05-29 22:53 UTC (permalink / raw)
  To: Karandeep Chahal
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, dillowda-1Heg1YXhbW8,
	roland-DgEjT+Ai2ygdnm+yROfE0A, sean.hefty-ral2JQCrhuEAvxtiuMwx3w

Thank you for clarifying!

On 05/29/2012 03:27 PM, Karandeep Chahal wrote:
> Hi Michael,
>
> Yes, I tried reconnecting the targets and removing reinserting ib-srp.
>
> Thanks
> Karan
>
>
> On 05/29/2012 05:51 PM, Michael Reed wrote:
>> Did you subsequently reconnect the target and confirm appropriate behavior?
>>
>>
>> On 05/29/2012 02:07 PM, Karandeep Chahal wrote:
>>> Subject: [PATCH] Infiniband srp fast failover patch. Currently ib_srp does
>>>      not do anything on receiving a DREQ from the target, it
>>>      only sends a response back. Further it also does not
>>>      monitor port (down) events. I have patched srp to remove
>>>      scsi devices when a port down event is received or if the
>>>      target sends a DREQ. Currently even though the target
>>>      notifies the initiator of its intentions of going away, the
>>>      initiator ignores that information. Later the initiator
>>>      gets upset when the devices "suddenly" disappear resulting
>>>      in srp initiating an error recovery process which takes a
>>>      long time. This caused high failover latencies as compared
>>>      to fibre channel. In my experiments with RHEL 6.0 and 6.2 I
>>>      encountered failover time that exceeded 2 minutes and 20
>>>      seconds (despite tweaking /etc/multipath.conf and
>>>      /sys/block/<>/timeout). With this patch the failover takes
>>>      30 seconds. I have tested this patch with and without a
>>>      switch.
>>>
>>> Yours, etc.
>>> Karan
>>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
@ 2012-05-29 22:53         ` Michael Reed
  0 siblings, 0 replies; 11+ messages in thread
From: Michael Reed @ 2012-05-29 22:53 UTC (permalink / raw)
  To: Karandeep Chahal; +Cc: linux-rdma, linux-kernel, dillowda, roland, sean.hefty

Thank you for clarifying!

On 05/29/2012 03:27 PM, Karandeep Chahal wrote:
> Hi Michael,
>
> Yes, I tried reconnecting the targets and removing reinserting ib-srp.
>
> Thanks
> Karan
>
>
> On 05/29/2012 05:51 PM, Michael Reed wrote:
>> Did you subsequently reconnect the target and confirm appropriate behavior?
>>
>>
>> On 05/29/2012 02:07 PM, Karandeep Chahal wrote:
>>> Subject: [PATCH] Infiniband srp fast failover patch. Currently ib_srp does
>>>      not do anything on receiving a DREQ from the target, it
>>>      only sends a response back. Further it also does not
>>>      monitor port (down) events. I have patched srp to remove
>>>      scsi devices when a port down event is received or if the
>>>      target sends a DREQ. Currently even though the target
>>>      notifies the initiator of its intentions of going away, the
>>>      initiator ignores that information. Later the initiator
>>>      gets upset when the devices "suddenly" disappear resulting
>>>      in srp initiating an error recovery process which takes a
>>>      long time. This caused high failover latencies as compared
>>>      to fibre channel. In my experiments with RHEL 6.0 and 6.2 I
>>>      encountered failover time that exceeded 2 minutes and 20
>>>      seconds (despite tweaking /etc/multipath.conf and
>>>      /sys/block/<>/timeout). With this patch the failover takes
>>>      30 seconds. I have tested this patch with and without a
>>>      switch.
>>>
>>> Yours, etc.
>>> Karan
>>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
  2012-05-29 21:07 [PATCH 1/1] ib_srp: Infiniband srp fast failover patch Karandeep Chahal
@ 2012-05-30  5:06     ` David Dillow
       [not found] ` <4FC53AAA.3060203-LfVdkaOWEx8@public.gmane.org>
  1 sibling, 0 replies; 11+ messages in thread
From: David Dillow @ 2012-05-30  5:06 UTC (permalink / raw)
  To: Karandeep Chahal
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	roland-DgEjT+Ai2ygdnm+yROfE0A, sean.hefty-ral2JQCrhuEAvxtiuMwx3w

On Tue, 2012-05-29 at 17:07 -0400, Karandeep Chahal wrote:
> Subject: [PATCH] Infiniband srp fast failover patch.

This conflicts with Bart's patches to improve failover; it will be much
better to use his approach to block the target rather than remove it
wholesale -- we could have lost connectivity as a transient and may get
it back quickly if someone grabbed the wrong cable, etc.

Also, we should only kill the one target on DREQ, and we already have a
pointer to it from the CM context -- no need to search.

It is a good idea to hook into the event mechanism; this is something
I've long wanted to incorporate (as Vu did in OFED). I'm looking at
getting Bart's series to a point I can merge it, and I'll pull in your
ideas -- with credit -- there.

Thanks,
-- 
Dave Dillow
National Center for Computational Science
Oak Ridge National Laboratory
(865) 241-6602 office

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
@ 2012-05-30  5:06     ` David Dillow
  0 siblings, 0 replies; 11+ messages in thread
From: David Dillow @ 2012-05-30  5:06 UTC (permalink / raw)
  To: Karandeep Chahal; +Cc: linux-rdma, linux-kernel, roland, sean.hefty

On Tue, 2012-05-29 at 17:07 -0400, Karandeep Chahal wrote:
> Subject: [PATCH] Infiniband srp fast failover patch.

This conflicts with Bart's patches to improve failover; it will be much
better to use his approach to block the target rather than remove it
wholesale -- we could have lost connectivity as a transient and may get
it back quickly if someone grabbed the wrong cable, etc.

Also, we should only kill the one target on DREQ, and we already have a
pointer to it from the CM context -- no need to search.

It is a good idea to hook into the event mechanism; this is something
I've long wanted to incorporate (as Vu did in OFED). I'm looking at
getting Bart's series to a point I can merge it, and I'll pull in your
ideas -- with credit -- there.

Thanks,
-- 
Dave Dillow
National Center for Computational Science
Oak Ridge National Laboratory
(865) 241-6602 office


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
  2012-05-30  5:06     ` David Dillow
  (?)
@ 2012-05-30 14:39     ` Karandeep Chahal
  -1 siblings, 0 replies; 11+ messages in thread
From: Karandeep Chahal @ 2012-05-30 14:39 UTC (permalink / raw)
  To: David Dillow; +Cc: linux-rdma, linux-kernel, roland, sean.hefty

Hi Dave,

As long as we get faster failover I am happy with Bart's patch.

Currently when I run IO to several luns over multipath and the preferred 
path goes down, the system hangs until the IO fails over. Even ssh'ing 
into the systems takes 20-30 seconds. I *suspect* that is because IO is 
being queued up somewhere which brings the whole system to its knees.

Thank you for looking at the patch.

Thanks
Karan

On 05/30/2012 01:06 AM, David Dillow wrote:
> On Tue, 2012-05-29 at 17:07 -0400, Karandeep Chahal wrote:
>> Subject: [PATCH] Infiniband srp fast failover patch.
> This conflicts with Bart's patches to improve failover; it will be much
> better to use his approach to block the target rather than remove it
> wholesale -- we could have lost connectivity as a transient and may get
> it back quickly if someone grabbed the wrong cable, etc.
>
> Also, we should only kill the one target on DREQ, and we already have a
> pointer to it from the CM context -- no need to search.
>
> It is a good idea to hook into the event mechanism; this is something
> I've long wanted to incorporate (as Vu did in OFED). I'm looking at
> getting Bart's series to a point I can merge it, and I'll pull in your
> ideas -- with credit -- there.
>
> Thanks,

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
       [not found]     ` <1338354377.2361.13.camel-1q1vX8mYZiGLUyTwlgNVppKKF0rrzTr+@public.gmane.org>
@ 2012-11-12 23:10       ` Or Gerlitz
       [not found]         ` <CAJZOPZL0cj6Sb_xs7prV0k8HjWkELD7Y89Bxxp-xXnnGqxvXyg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Or Gerlitz @ 2012-11-12 23:10 UTC (permalink / raw)
  To: David Dillow; +Cc: Karandeep Chahal, linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Wed, May 30, 2012 at 7:06 AM, David Dillow <dillowda-1Heg1YXhbW8@public.gmane.org> wrote:
> On Tue, 2012-05-29 at 17:07 -0400, Karandeep Chahal wrote:
>> Subject: [PATCH] Infiniband srp fast failover patch.
>
> This conflicts with Bart's patches to improve failover; it will be much
> better to use his approach to block the target rather than remove it
> wholesale -- we could have lost connectivity as a transient and may get
> it back quickly if someone grabbed the wrong cable, etc.
>
> Also, we should only kill the one target on DREQ, and we already have a
> pointer to it from the CM context -- no need to search.
>
> It is a good idea to hook into the event mechanism;

Dave,

I wonder why this hooking is the correct way to go. If the IB link
went down for a very short time, why should we care? What's missing in
relying on probes done by HA SW such as multipath and RC timeouts?

Or.

> this is something I've long wanted to incorporate (as Vu did in OFED). I'm looking at
> getting Bart's series to a point I can merge it, and I'll pull in your
> ideas -- with credit -- there.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
       [not found]         ` <CAJZOPZL0cj6Sb_xs7prV0k8HjWkELD7Y89Bxxp-xXnnGqxvXyg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2012-11-12 23:46           ` Karandeep Chahal
       [not found]             ` <50A18A3D.9070304-LfVdkaOWEx8@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Karandeep Chahal @ 2012-11-12 23:46 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: David Dillow, linux-rdma-u79uwXL29TY76Z2rM5mHXA

In my experience with ib-srp, I am not sure if there is such a thing as a 
link going down for a short time. When the link goes down, no matter how 
short the duration (< 1s), IO fails over, and this failover currently takes 
a very long time. Or, I would like to know if you can get an ib-srp 
failover time of less than a minute somehow (by tweaking OS or multipath 
settings). I was unable to get it to work even after a pretty exhaustive 
attempt.

Dave, on a separate note, with OFED-1.5.4 and RHEL-6x, I have tried 
setting ib_srp.srp_dev_loss_tmo=5 seconds, but it does not seem to help 
failover time.

Karan

On 11/12/2012 06:10 PM, Or Gerlitz wrote:
> On Wed, May 30, 2012 at 7:06 AM, David Dillow <dillowda-1Heg1YXhbW8@public.gmane.org> wrote:
>> On Tue, 2012-05-29 at 17:07 -0400, Karandeep Chahal wrote:
>>> Subject: [PATCH] Infiniband srp fast failover patch.
>> This conflicts with Bart's patches to improve failover; it will be much
>> better to use his approach to block the target rather than remove it
>> wholesale -- we could have lost connectivity as a transient and may get
>> it back quickly if someone grabbed the wrong cable, etc.
>>
>> Also, we should only kill the one target on DREQ, and we already have a
>> pointer to it from the CM context -- no need to search.
>>
>> It is a good idea to hook into the event mechanism;
> Dave,
>
> I wonder why this hooking is the correct way to go, if the IB link
> went down for very short time, why should we care, what's missing in
> relying on probes done by HA SW such as multipath and RC timeouts?
>
> Or.
>
>> this is something I've long wanted to incorporate (as Vu did in OFED). I'm looking at
>> getting Bart's series to a point I can merge it, and I'll pull in your
>> ideas -- with credit -- there.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/1] ib_srp: Infiniband srp fast failover patch.
       [not found]             ` <50A18A3D.9070304-LfVdkaOWEx8@public.gmane.org>
@ 2012-11-13 21:07               ` Or Gerlitz
  0 siblings, 0 replies; 11+ messages in thread
From: Or Gerlitz @ 2012-11-13 21:07 UTC (permalink / raw)
  To: Karandeep Chahal; +Cc: David Dillow, linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Tue, Nov 13, 2012 at 1:46 AM, Karandeep Chahal <kchahal-LfVdkaOWEx8@public.gmane.org> wrote:
> In my experience with ib-srp, I am not sure if there is such a thing as link
> going down for a short time. When the link goes down, no matter how short
> the duration (< 1s), IO fails over, this failover takes very long time currently.

But maybe IOs fail over because these patches force them to (e.g. by
disconnecting the IB RC connection and removing the related SCSI host),
even if the RC connection can survive this small link-down event?

>  Or, I would like to know if you can get a ib-srp failover time of
> less than a minute somehow (by tweaking OS or multipath settings). I was
> unable to get it to work even after a pretty exhaustive attempt.

Looking into this.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2012-11-13 21:07 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-29 21:07 [PATCH 1/1] ib_srp: Infiniband srp fast failover patch Karandeep Chahal
2012-05-29 21:51 ` Michael Reed
2012-05-29 22:27   ` Karandeep Chahal
     [not found]     ` <4FC54D62.3080003-LfVdkaOWEx8@public.gmane.org>
2012-05-29 22:53       ` Michael Reed
2012-05-29 22:53         ` Michael Reed
     [not found] ` <4FC53AAA.3060203-LfVdkaOWEx8@public.gmane.org>
2012-05-30  5:06   ` David Dillow
2012-05-30  5:06     ` David Dillow
2012-05-30 14:39     ` Karandeep Chahal
     [not found]     ` <1338354377.2361.13.camel-1q1vX8mYZiGLUyTwlgNVppKKF0rrzTr+@public.gmane.org>
2012-11-12 23:10       ` Or Gerlitz
     [not found]         ` <CAJZOPZL0cj6Sb_xs7prV0k8HjWkELD7Y89Bxxp-xXnnGqxvXyg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2012-11-12 23:46           ` Karandeep Chahal
     [not found]             ` <50A18A3D.9070304-LfVdkaOWEx8@public.gmane.org>
2012-11-13 21:07               ` Or Gerlitz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.