All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
@ 2015-01-26 13:00 Erez Shitrit
       [not found] ` <1422277227-1086-1-git-send-email-erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Erez Shitrit @ 2015-01-26 13:00 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Or Gerlitz


Following commit 016d9fb25cd9 "IPoIB: fix MCAST_FLAG_BUSY usage" both
IPv6 traffic and for the most cases all IPv4 multicast traffic aren't
working.

After this change there is no mechanism to handle the work that does the
join process for the rest of the mcg's. For example, if in the list of
all the mcg's there is a send-only request, after its processing, the
code in ipoib_mcast_sendonly_join_complete() will not requeue the
mcast task, but leaves the bit that signals this task is running,
and hence the task will never run.

Also, whenever the kernel sends multicast packet (w.o joining to this
group), we don't call ipoib_send_only_join(), the code tries to start
the mcast task but it failed because the bit IPOIB_MCAST_RUN is always
set, As a result the multicast packet will never be sent.

The fix handles all the join requests via the same logic, and call
explicitly to sendonly join whenever there is a packet from sendonly
type.

Since ipoib_mcast_sendonly_join() is now called from the driver TX flow, we
can't take mutex there. We avoid this locking by using temporary pointer when
calling ib_sa_join_multicast and testing that pointer object to be valid (not
error), if it is an error the driver knows that the completion will not be
called, and the driver can set the mcast->mc value.


Fixes: 016d9fb25cd9 ('IPoIB: fix MCAST_FLAG_BUSY usage')
Reported-by: Eyal Perry <eyalpe-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

---
Changes from V1:
1. always do clear_bit(IPOIB_MCAST_FLAG_BUSY) in
ipoib_mcast_sendonly_join_complete()
2. Sync between ipoib_mcast_sendonly_join() to
ipoib_mcast_sendonly_join_complete using
an IS_ERR_OR_NULL(mcast->mc) test

Changes from V2:
1. ipoib_mcast_sendonly_join will not  abuse mcast->mc pointer.

 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   38 ++++++++++++------------
 1 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index bc50dd0..bbb4d3d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -306,6 +306,8 @@ out:
 	clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
 	if (status)
 		mcast->mc = NULL;
+	else
+		mcast->mc = multicast;
 	complete(&mcast->done);
 	if (status == -ENETRESET)
 		status = 0;
@@ -317,6 +319,7 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
 {
 	struct net_device *dev = mcast->dev;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sa_multicast	 *mc_ret;
 	struct ib_sa_mcmember_rec rec = {
 #if 0				/* Some SMs don't support send-only yet */
 		.join_state = 4
@@ -342,20 +345,20 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
 	rec.port_gid = priv->local_gid;
 	rec.pkey     = cpu_to_be16(priv->pkey);
 
-	mutex_lock(&mcast_mutex);
 	init_completion(&mcast->done);
 	set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
-					 priv->port, &rec,
-					 IB_SA_MCMEMBER_REC_MGID	|
-					 IB_SA_MCMEMBER_REC_PORT_GID	|
-					 IB_SA_MCMEMBER_REC_PKEY	|
-					 IB_SA_MCMEMBER_REC_JOIN_STATE,
-					 GFP_ATOMIC,
-					 ipoib_mcast_sendonly_join_complete,
-					 mcast);
-	if (IS_ERR(mcast->mc)) {
-		ret = PTR_ERR(mcast->mc);
+	mc_ret = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
+				      priv->port, &rec,
+				      IB_SA_MCMEMBER_REC_MGID	|
+				      IB_SA_MCMEMBER_REC_PORT_GID	|
+				      IB_SA_MCMEMBER_REC_PKEY	|
+				      IB_SA_MCMEMBER_REC_JOIN_STATE,
+				      GFP_ATOMIC,
+				      ipoib_mcast_sendonly_join_complete,
+				      mcast);
+	if (IS_ERR(mc_ret)) {
+		mcast->mc = mc_ret;
+		ret = PTR_ERR(mc_ret);
 		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
 		complete(&mcast->done);
 		ipoib_warn(priv, "ib_sa_join_multicast for sendonly join "
@@ -364,7 +367,6 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
 		ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting "
 				"sendonly join\n", mcast->mcmember.mgid.raw);
 	}
-	mutex_unlock(&mcast_mutex);
 
 	return ret;
 }
@@ -622,10 +624,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
 			break;
 		}
 
-		if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
-			ipoib_mcast_sendonly_join(mcast);
-		else
-			ipoib_mcast_join(dev, mcast, 1);
+		ipoib_mcast_join(dev, mcast, 1);
+
 		return;
 	}
 
@@ -725,8 +725,6 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
 		memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
 		__ipoib_mcast_add(dev, mcast);
 		list_add_tail(&mcast->list, &priv->multicast_list);
-		if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
-			queue_delayed_work(priv->wq, &priv->mcast_task, 0);
 	}
 
 	if (!mcast->ah) {
@@ -740,6 +738,8 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
 		if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
 			ipoib_dbg_mcast(priv, "no address vector, "
 					"but multicast join already started\n");
+		else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+			ipoib_mcast_sendonly_join(mcast);
 
 		/*
 		 * If lookup completes between here and out:, don't
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found] ` <1422277227-1086-1-git-send-email-erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-01-26 13:16   ` Or Gerlitz
       [not found]     ` <CAJ3xEMjERaEP5d_ZT8RN5+w8Z_Hig4T7dhuq3o+1NOUuQgfJLw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Or Gerlitz @ 2015-01-26 13:16 UTC (permalink / raw)
  To: Doug Ledford, Roland Dreier
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz, Erez Shitrit,
	Amir Vadai, Eyal Perry

On Mon, Jan 26, 2015 at 3:00 PM, Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
> Following commit 016d9fb25cd9 "IPoIB: fix MCAST_FLAG_BUSY usage" both
> IPv6 traffic and for the most cases all IPv4 multicast traffic aren't
> working.


Hi Doug + Roland

Erez was very patiently reviewing and testing all the six (V0...V5)
patch series you sent to fix the 3.19-rc1 regression. Can you also
give this patch a try? We believe it addresses your concerns re the
previous versions of it and fix the RC1 regression.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]     ` <CAJ3xEMjERaEP5d_ZT8RN5+w8Z_Hig4T7dhuq3o+1NOUuQgfJLw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-01-26 19:38       ` Doug Ledford
       [not found]         ` <1422301106.2854.41.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Doug Ledford @ 2015-01-26 19:38 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz,
	Erez Shitrit, Amir Vadai, Eyal Perry

[-- Attachment #1: Type: text/plain, Size: 1074 bytes --]

On Mon, 2015-01-26 at 15:16 +0200, Or Gerlitz wrote:
> On Mon, Jan 26, 2015 at 3:00 PM, Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
> > Following commit 016d9fb25cd9 "IPoIB: fix MCAST_FLAG_BUSY usage" both
> > IPv6 traffic and for the most cases all IPv4 multicast traffic aren't
> > working.
> 
> 
> Hi Doug + Roland
> 
> Erez was very patiently reviewing and testing all the six (V0...V5)
> patch series you sent to fix the 3.19-rc1 regression.

Yes he has.

>  Can you also
> give this patch a try?

I can test it.  But I need to know how it's supposed to be applied.  Is
this in lieu of my entire patchset?  If so, I know there are some things
not addressed here that should be pulled in from my patchset.

>  We believe it addresses your concerns re the
> previous versions of it and fix the RC1 regression.

It might fix the regression, it might also reintroduce a race on
ifup/ifdown.  I'll test and see.

-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
              GPG KeyID: 0E572FDD



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]         ` <1422301106.2854.41.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-01-26 20:57           ` Or Gerlitz
       [not found]             ` <CAJ3xEMg3vYGbGuT+Z-XQMv5YuPws33XHQP_Wcz8gvpBbCg3TSw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Or Gerlitz @ 2015-01-26 20:57 UTC (permalink / raw)
  To: Doug Ledford
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz,
	Erez Shitrit, Amir Vadai, Eyal Perry

On Mon, Jan 26, 2015 at 9:38 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> On Mon, 2015-01-26 at 15:16 +0200, Or Gerlitz wrote:
>> On Mon, Jan 26, 2015 at 3:00 PM, Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
>> > Following commit 016d9fb25cd9 "IPoIB: fix MCAST_FLAG_BUSY usage" both
>> > IPv6 traffic and for the most cases all IPv4 multicast traffic aren't
>> > working.
>>
>>
>> Hi Doug + Roland
>>
>> Erez was very patiently reviewing and testing all the six (V0...V5)
>> patch series you sent to fix the 3.19-rc1 regression.
>
> Yes he has.


>>  Can you also give this patch a try?

> I can test it.  But I need to know how it's supposed to be applied.

just apply it on latest upstream and run whatever tests you have, simple.

> It might fix the regression, it might also reintroduce a race on
> ifup/ifdown.  I'll test and see.

Let's see it in action @ your env

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]             ` <CAJ3xEMg3vYGbGuT+Z-XQMv5YuPws33XHQP_Wcz8gvpBbCg3TSw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-01-26 22:00               ` Doug Ledford
       [not found]                 ` <1422309605.2854.62.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Doug Ledford @ 2015-01-26 22:00 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz,
	Erez Shitrit, Amir Vadai, Eyal Perry

[-- Attachment #1: Type: text/plain, Size: 3671 bytes --]

On Mon, 2015-01-26 at 22:57 +0200, Or Gerlitz wrote:
> On Mon, Jan 26, 2015 at 9:38 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > On Mon, 2015-01-26 at 15:16 +0200, Or Gerlitz wrote:
> >> On Mon, Jan 26, 2015 at 3:00 PM, Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
> >> > Following commit 016d9fb25cd9 "IPoIB: fix MCAST_FLAG_BUSY usage" both
> >> > IPv6 traffic and for the most cases all IPv4 multicast traffic aren't
> >> > working.
> >>
> >>
> >> Hi Doug + Roland
> >>
> >> Erez was very patiently reviewing and testing all the six (V0...V5)
> >> patch series you sent to fix the 3.19-rc1 regression.
> >
> > Yes he has.
> 
> 
> >>  Can you also give this patch a try?
> 
> > I can test it.  But I need to know how it's supposed to be applied.
> 
> just apply it on latest upstream and run whatever tests you have, simple.

I used the same base kernel that I used for my patchset.

> > It might fix the regression, it might also reintroduce a race on
> > ifup/ifdown.  I'll test and see.
> 
> Let's see it in action @ your env

It passed the initial IPv6 after a failed join issue that my own
patchset just finally passes.

However, I didn't get more than 5 minutes into testing before I was able
to livelock the system.  In this case, from machine A running my
patchset, I did

ping6 -I mlx4_ib0 -i .25 <machine B address>

On machine B running Erez's patch, I did:

rmmod ib_ipoib; modprobe ib_ipoib mcast_debug_level=1; sleep 2; ping6
-i .25 -c 10 -I mlx4_ib0 <machine A address>

And on the machine rdma-master, where the opensm runs, I did just a few:

systemctl restart opensm

The livelock is in the mcast flushing code.  On the machine that
livelocked, here's the dmesg tail:

[  423.189514] mlx4_ib0.8002: multicast join failed for ff12:401b:8002:0000:0000:0000:ffff:ffff, status -110
[  423.189541] mlx4_ib0.8002: deleting multicast group ff12:401b:8002:0000:0000:0000:0000:0001
[  423.189545] mlx4_ib0.8002: deleting multicast group ff12:601b:8002:0000:0000:0000:0000:0001
[  423.189547] mlx4_ib0.8002: deleting multicast group ff12:601b:8002:0000:0000:0001:ff7b:e1b1
[  423.189549] mlx4_ib0.8002: deleting multicast group ff12:401b:8002:0000:0000:0000:0000:00fb
[  423.189551] mlx4_ib0.8002: deleting multicast group ff12:401b:8002:0000:0000:0000:ffff:ffff
[  423.204570] mlx4_ib0.8002: stopping multicast thread
[  423.204573] mlx4_ib0.8002: flushing multicast list
[  423.213567] mlx4_ib0: stopping multicast thread
[  423.213571] mlx4_ib0: flushing multicast list

The rmmod operation is stuck in ib_sa_unregister_client (one of the
specific fixes my patchset resolves BTW).

On another machine I started another one of my tests:

On machine A:

ping6 I mlx4_ib0 -i .25 <machine C address>

On rdma-master:

while true; do sleep 4; systemctl restart opensm; done

One machine C:

passes=0; while true; do ifdown qib_ib0; ifup qib_ib0; echo "Passes $passes..."; let passes++; done

In this test Erez's patch made it through about 5 down/up cycles before
the machine oopsed.

Do I need to keep going?  I was able to crash two different machines on
two different brands of hardware within only a few test cycles.  My
patchset, while large and intrusive, now survives all of this with
flying colors, and now that I've replicated Erez's specific multicast
join failure, I've taken care of that corner case too (and will be
adding that to my long term QE setup so it doesn't regress in the
future).


-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
              GPG KeyID: 0E572FDD



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]                 ` <1422309605.2854.62.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-01-27  8:33                   ` Erez Shitrit
       [not found]                     ` <54C74D49.3080201-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  2015-01-27 13:05                   ` Or Gerlitz
  1 sibling, 1 reply; 14+ messages in thread
From: Erez Shitrit @ 2015-01-27  8:33 UTC (permalink / raw)
  To: Doug Ledford, Or Gerlitz
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz,
	Erez Shitrit, Amir Vadai, Eyal Perry

On 1/27/2015 12:00 AM, Doug Ledford wrote:
> On Mon, 2015-01-26 at 22:57 +0200, Or Gerlitz wrote:
>> On Mon, Jan 26, 2015 at 9:38 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>>> On Mon, 2015-01-26 at 15:16 +0200, Or Gerlitz wrote:
>>>> On Mon, Jan 26, 2015 at 3:00 PM, Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
>>>>> Following commit 016d9fb25cd9 "IPoIB: fix MCAST_FLAG_BUSY usage" both
>>>>> IPv6 traffic and for the most cases all IPv4 multicast traffic aren't
>>>>> working.
>>>>
>>>> Hi Doug + Roland
>>>>
>>>> Erez was very patiently reviewing and testing all the six (V0...V5)
>>>> patch series you sent to fix the 3.19-rc1 regression.
>>> Yes he has.
>>
>>>>   Can you also give this patch a try?
>>> I can test it.  But I need to know how it's supposed to be applied.
>> just apply it on latest upstream and run whatever tests you have, simple.
> I used the same base kernel that I used for my patchset.
>
>>> It might fix the regression, it might also reintroduce a race on
>>> ifup/ifdown.  I'll test and see.
>> Let's see it in action @ your env
> It passed the initial IPv6 after a failed join issue that my own
> patchset just finally passes.
>
> However, I didn't get more than 5 minutes into testing before I was able
> to livelock the system.  In this case, from machine A running my
> patchset, I did
>
> ping6 -I mlx4_ib0 -i .25 <machine B address>
>
> On machine B running Erez's patch, I did:
>
> rmmod ib_ipoib; modprobe ib_ipoib mcast_debug_level=1; sleep 2; ping6
> -i .25 -c 10 -I mlx4_ib0 <machine A address>
>
> And on the machine rdma-master, where the opensm runs, I did just a few:
>
> systemctl restart opensm
>
> The livelock is in the mcast flushing code.  On the machine that
> livelocked, here's the dmesg tail:
>
> [  423.189514] mlx4_ib0.8002: multicast join failed for ff12:401b:8002:0000:0000:0000:ffff:ffff, status -110
> [  423.189541] mlx4_ib0.8002: deleting multicast group ff12:401b:8002:0000:0000:0000:0000:0001
> [  423.189545] mlx4_ib0.8002: deleting multicast group ff12:601b:8002:0000:0000:0000:0000:0001
> [  423.189547] mlx4_ib0.8002: deleting multicast group ff12:601b:8002:0000:0000:0001:ff7b:e1b1
> [  423.189549] mlx4_ib0.8002: deleting multicast group ff12:401b:8002:0000:0000:0000:0000:00fb
> [  423.189551] mlx4_ib0.8002: deleting multicast group ff12:401b:8002:0000:0000:0000:ffff:ffff
> [  423.204570] mlx4_ib0.8002: stopping multicast thread
> [  423.204573] mlx4_ib0.8002: flushing multicast list
> [  423.213567] mlx4_ib0: stopping multicast thread
> [  423.213571] mlx4_ib0: flushing multicast list
>
> The rmmod operation is stuck in ib_sa_unregister_client (one of the
> specific fixes my patchset resolves BTW).

The patch I sent, only claims to fix the regression in multicast and 
sendonly issues, it can replace the first 3 patches and the one you sent 
me off list from your last patchset.
(probably there are more bugs, that the rest of your patchset solved, 
hence it should be tested with the rest of your patchset)

And probably there are more bugs that your patcheset didn't fix yet -:)
For example, I run your last patchset+ the last fix you sent me with:
modprobe -r ib_ipoib; modprobe  ib_ipoib;
ping6
some adding/deleting  one child interface and got the next panic:

[81209.348259] ib0: join completion for 
ff12:601b:ffff:0000:0000:0001:ff43:3bf1 (status -102)
[81209.408787] BUG: unable to handle kernel NULL pointer dereference at 
0000000000000020
[81209.416750] IP: [<ffffffffa096b399>] ipoib_mcast_join+0xa9/0x1b0 
[ib_ipoib]
[81209.423787] PGD 0
[81209.425864] Oops: 0000 [#1] SMP
[81209.429165] Modules linked in: ib_ipoib(E) ib_cm mlx4_ib ib_sa ib_mad 
ib_core ib_addr mlx4_core netconsole configfs nfsv3 nfs_acl 
rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs fscache lockd grace 
ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4 
nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT nf_reject_ipv4 
xt_CHECKSUM iptable_mangle iptable_filter ip_tables autofs4 sunrpc 
bridge stp llc ipv6 dm_mirror dm_region_hash dm_log dm_mod vhost_net 
macvtap macvlan vhost tun kvm_intel kvm iTCO_wdt iTCO_vendor_support 
dcdbas microcode pcspkr serio_raw wmi sg lpc_ich mfd_core i7core_edac 
edac_core bnx2 ext3(E) jbd(E) mbcache(E) sr_mod(E) cdrom(E) sd_mod(E) 
pata_acpi(E) ata_generic(E) ata_piix(E) megaraid_sas(E) [last unloaded: 
ib_cm]
[81209.495358] CPU: 10 PID: 7655 Comm: kworker/u64:0 Tainted: 
G            E  3.18.0+ #1
[81209.503297] Hardware name: Dell Inc. PowerEdge R710/0MD99X, BIOS 
6.4.0 07/23/2013
[81209.510905] Workqueue: ipoib_wq ipoib_mcast_join_task [ib_ipoib]
[81209.516975] task: ffff88041bb64050 ti: ffff88041b968000 task.ti: 
ffff88041b968000
[81209.524566] RIP: 0010:[<ffffffffa096b399>] [<ffffffffa096b399>] 
ipoib_mcast_join+0xa9/0x1b0 [ib_ipoib]
[81209.534079] RSP: 0018:ffff88041b96bcf8  EFLAGS: 00010202
[81209.539447] RAX: 0000000000000000 RBX: ffff8803bd92e8c0 RCX: 
0000000000000000
[81209.546636] RDX: ffff88082fcaea38 RSI: ffff88082fcad238 RDI: 
ffff88082fcad238
[81209.553833] RBP: ffff88041b96bd78 R08: 0000000000000000 R09: 
00000000000060f0
[81209.561027] R10: 0000000000000000 R11: 0000000000000001 R12: 
ffff88041bec0700
[81209.568217] R13: ffff88041b96bd08 R14: 0000000000000001 R15: 
f773010000000000
[81209.575411] FS:  0000000000000000(0000) GS:ffff88082fca0000(0000) 
knlGS:0000000000000000
[81209.583608] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[81209.589411] CR2: 0000000000000020 CR3: 0000000001a14000 CR4: 
00000000000007e0
[81209.596599] Stack:
[81209.598664]  ffff8803bd92ecc0 0000000100000000 000001801b6012ff 
f13b43ff01000000
[81209.606216]  00000000000080fe f13b430003c90200 0000000000000000 
0000000001800000
[81209.613768]  0000000000000000 0000000000010000 ffff8803bd92ea60 
ffff8803bd92ea60
[81209.621328] Call Trace:
[81209.623832]  [<ffffffffa096b7d5>] ipoib_mcast_join_task+0x195/0x370 
[ib_ipoib]
[81209.631172]  [<ffffffff8106d6cd>] process_one_work+0x14d/0x430
[81209.637059]  [<ffffffff8106dad0>] worker_thread+0x120/0x3c0
[81209.642689]  [<ffffffff815b7e35>] ? __schedule+0x355/0x6d0
[81209.648227]  [<ffffffff8106d9b0>] ? process_one_work+0x430/0x430
[81209.654290]  [<ffffffff8107290e>] kthread+0xce/0xf0
[81209.659222]  [<ffffffff81072840>] ? 
kthread_freezable_should_stop+0x70/0x70
[81209.666242]  [<ffffffff815bbb2c>] ret_from_fork+0x7c/0xb0
[81209.671698]  [<ffffffff81072840>] ? 
kthread_freezable_should_stop+0x70/0x70
[81209.678713] Code: 00 00 48 89 45 a8 0f b7 83 ca 03 00 00 66 c1 c0 08 
45 85 f6 66 89 45 ba 74 48 48 8b 83 78 01 00 00 49 bf 00 00 00 00 00 01 
73 f7 <8b> 50 20 c6 45 b6 02 89 55 b0 0f b6 50 27 88 55 b7 0f b6 50 28
[81209.698543] RIP  [<ffffffffa096b399>] ipoib_mcast_join+0xa9/0x1b0 
[ib_ipoib]
[81209.705661]  RSP <ffff88041b96bcf8>
[81209.709201] CR2: 0000000000000020
[81209.712896] ---[ end trace 02ca131660e82eb4 ]---
[81209.728390] BUG: unable to handle kernel paging request at 
ffffffffffffffd8
[81209.735555] IP: [<ffffffff81072230>] kthread_data+0x10/0x20
[81209.741280] PGD 1a15067 PUD 1a17067 PMD 0
[81209.745626] Oops: 0000 [#2] SMP
[81209.749049] Modules linked in: ib_ipoib(E) ib_cm mlx4_ib ib_sa ib_mad 
ib_core ib_addr mlx4_core netconsole configfs nfsv3 nfs_acl 
rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs fscache lockd grace 
ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4 
nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT nf_reject_ipv4 
xt_CHECKSUM iptable_mangle iptable_filter ip_tables autofs4 sunrpc 
bridge stp llc ipv6 dm_mirror dm_region_hash dm_log dm_mod vhost_net 
macvtap macvlan vhost tun kvm_intel kvm iTCO_wdt iTCO_vendor_support 
dcdbas microcode pcspkr serio_raw wmi sg lpc_ich mfd_core i7core_edac 
edac_core bnx2 ext3(E) jbd(E) mbcache(E) sr_mod(E) cdrom(E) sd_mod(E) 
pata_acpi(E) ata_generic(E) ata_piix(E) megaraid_sas(E) [last unloaded: 
ib_cm]
[81209.818287] CPU: 22 PID: 7655 Comm: kworker/u64:0 Tainted: G D     E  
3.18.0+ #1
[81209.826266] Hardware name: Dell Inc. PowerEdge R710/0MD99X, BIOS 
6.4.0 07/23/2013
[81209.833901] task: ffff88041bb64050 ti: ffff88041b968000 task.ti: 
ffff88041b968000
[81209.841545] RIP: 0010:[<ffffffff81072230>] [<ffffffff81072230>] 
kthread_data+0x10/0x20
[81209.849750] RSP: 0018:ffff88041b96b958  EFLAGS: 00010092
[81209.855158] RAX: 0000000000000000 RBX: 0000000000000016 RCX: 
ffffffff81d627a0
[81209.862386] RDX: ffff88041bb64050 RSI: 0000000000000016 RDI: 
ffff88041bb64050
[81209.869615] RBP: ffff88041b96b958 R08: ffff88041bb640e0 R09: 
dead000000200200
[81209.876842] R10: dead000000200200 R11: 0000000000000007 R12: 
0000000000000016
[81209.884074] R13: ffff88041bb64960 R14: 0000000000000001 R15: 
0000000000000092
[81209.891302] FS:  0000000000000000(0000) GS:ffff88082fd60000(0000) 
knlGS:0000000000000000
[81209.899538] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[81209.905377] CR2: 0000000000000028 CR3: 0000000001a14000 CR4: 
00000000000007e0
[81209.912606] Stack:
[81209.914713]  ffff88041b96b978 ffffffff8106b025 ffff88041b96b978 
ffff88082fd72a40
[81209.922469]  ffff88041b96b9d8 ffffffff815b7ffa ffff88041b968010 
0000000000012a40
[81209.930219]  ffff88041bb64050 ffff88041bb64050 ffff88041b96b9e8 
ffff88041bb64050
[81209.937968] Call Trace:
[81209.940509]  [<ffffffff8106b025>] wq_worker_sleeping+0x15/0xb0
[81209.946435]  [<ffffffff815b7ffa>] __schedule+0x51a/0x6d0
[81209.951844]  [<ffffffff815b82e9>] schedule+0x29/0x70
[81209.956904]  [<ffffffff810585ba>] do_exit+0x2da/0x490
[81209.962051]  [<ffffffff81007840>] oops_end+0xa0/0xe0
[81209.967111]  [<ffffffff81047ec5>] no_context+0x125/0x200
[81209.972517]  [<ffffffff810480bd>] __bad_area_nosemaphore+0x11d/0x220
[81209.978966]  [<ffffffff810481d3>] bad_area_nosemaphore+0x13/0x20
[81209.985067]  [<ffffffff81048792>] __do_page_fault+0x322/0x4b0
[81209.990908]  [<ffffffff81097a4f>] ? up+0x2f/0x50
[81209.995624]  [<ffffffff8112ad8b>] ? irq_work_queue+0x9b/0xd0
[81210.002583]  [<ffffffff810a47d2>] ? wake_up_klogd+0x32/0x40
[81210.008251]  [<ffffffff810a56b0>] ? console_unlock+0x2a0/0x2e0
[81210.014179]  [<ffffffff810489fc>] do_page_fault+0xc/0x10
[81210.019585]  [<ffffffff815bd542>] page_fault+0x22/0x30
[81210.024818]  [<ffffffffa096b399>] ? ipoib_mcast_join+0xa9/0x1b0 
[ib_ipoib]
[81210.031795]  [<ffffffffa096b451>] ? ipoib_mcast_join+0x161/0x1b0 
[ib_ipoib]
[81210.038853]  [<ffffffffa096b7d5>] ipoib_mcast_join_task+0x195/0x370 
[ib_ipoib]
[81210.046226]  [<ffffffff8106d6cd>] process_one_work+0x14d/0x430
[81210.052157]  [<ffffffff8106dad0>] worker_thread+0x120/0x3c0
[81210.057824]  [<ffffffff815b7e35>] ? __schedule+0x355/0x6d0
[81210.063403]  [<ffffffff8106d9b0>] ? process_one_work+0x430/0x430
[81210.069502]  [<ffffffff8107290e>] kthread+0xce/0xf0
[81210.074473]  [<ffffffff81072840>] ? 
kthread_freezable_should_stop+0x70/0x70
[81210.081535]  [<ffffffff815bbb2c>] ret_from_fork+0x7c/0xb0
[81210.087028]  [<ffffffff81072840>] ? 
kthread_freezable_should_stop+0x70/0x70
[81210.094084] Code: b8 08 00 00 48 8b 40 c8 c9 48 c1 e8 02 83 e0 01 c3 
66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 66 66 66 66 90 48 8b 87 b8 08 
00 00 <48> 8b 40 d8 c9 c3 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 66
[81210.116583] RIP  [<ffffffff81072230>] kthread_data+0x10/0x20
[81210.122388]  RSP <ffff88041b96b958>
[81210.125971] CR2: ffffffffffffffd8

> On another machine I started another one of my tests:
>
> On machine A:
>
> ping6 I mlx4_ib0 -i .25 <machine C address>
>
> On rdma-master:
>
> while true; do sleep 4; systemctl restart opensm; done
>
> One machine C:
>
> passes=0; while true; do ifdown qib_ib0; ifup qib_ib0; echo "Passes $passes..."; let passes++; done
>
> In this test Erez's patch made it through about 5 down/up cycles before
> the machine oopsed.
>
> Do I need to keep going?  I was able to crash two different machines on
> two different brands of hardware within only a few test cycles.  My
> patchset, while large and intrusive, now survives all of this with
> flying colors, and now that I've replicated Erez's specific multicast
> join failure, I've taken care of that corner case too (and will be
> adding that to my long term QE setup so it doesn't regress in the
> future).
>

Doug, there are bugs that probably will be found all the time with any 
patchset, my point was only according to the way sendonly need to be 
handled.

Thanks, Erez


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]                 ` <1422309605.2854.62.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2015-01-27  8:33                   ` Erez Shitrit
@ 2015-01-27 13:05                   ` Or Gerlitz
       [not found]                     ` <54C78D36.7050700-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  1 sibling, 1 reply; 14+ messages in thread
From: Or Gerlitz @ 2015-01-27 13:05 UTC (permalink / raw)
  To: Doug Ledford, Roland Dreier
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Amir Vadai, Eyal Perry

On 1/27/2015 12:00 AM, Doug Ledford wrote:
> However, I didn't get more than 5 minutes into testing before I was able
> to livelock the system.  In this case, from machine A running my
> patchset, I did
>
> ping6 -I mlx4_ib0 -i .25 <machine B address>
>
> On machine B running Erez's patch, I did:
>
> rmmod ib_ipoib; modprobe ib_ipoib mcast_debug_level=1; sleep 2; ping6
> -i .25 -c 10 -I mlx4_ib0 <machine A address>
>
> And on the machine rdma-master, where the opensm runs, I did just a few:
>
> systemctl restart opensm
>
> The livelock is in the mcast flushing code.  On the machine that livelocked

Doug,

The tests you are running and the issues you are seeing fall well into a 
to-be-fixed-in-some-kernel-rc1 category but by NO means as something 
which should be an rc6 fix.

You must do the distinction between Erez's patch that fixes the 
regressions introduced on 3.19-rc1 to your attempts to fix many more 
instabilities in the IPoIB driver, which are seen under whatever nasty 
test you are running (and it's good we want to reach there).

Roland, the V3 patch solves the rc1 regression and I think we should 
pick it up, by no way we can allow to pick eleven patches @ this point.

Thoughts?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]                     ` <54C74D49.3080201-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2015-01-27 17:02                       ` Doug Ledford
       [not found]                         ` <1422378130.2854.119.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Doug Ledford @ 2015-01-27 17:02 UTC (permalink / raw)
  To: Erez Shitrit
  Cc: Or Gerlitz, Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Or Gerlitz, Erez Shitrit, Amir Vadai, Eyal Perry

[-- Attachment #1: Type: text/plain, Size: 4504 bytes --]

On Tue, 2015-01-27 at 10:33 +0200, Erez Shitrit wrote:
> The patch I sent, only claims to fix the regression in multicast and 
> sendonly issues, it can replace the first 3 patches and the one you sent 
> me off list from your last patchset.

OK.  But my patchset resolve those issues too, and as you point out,
it's not really my entire patchset that resolves just this regression.
It would be easy enough to squash just those patches in my patchset
required to solve just this regression down to just one patch if patch
count bothers people (I find that to be a silly thing to be concerned
with, but oh well).

> (probably there are more bugs, that the rest of your patchset solved, 
> hence it should be tested with the rest of your patchset)

Yes, there are.  But Or has been arguing that we should take your patch
to resolve the regression and leave the rest out.  So, testing that
specific situation seemed to make sense in terms of finding out if that
left us with a kernel that was in reasonable shape.

If we want to look at your patch combined with my patchset in lieu of
the individual patches in mine that perform the same fix, then it would
take some work to merge them as there are differences in assumptions
made and they would not simply work together.

> And probably there are more bugs that your patcheset didn't fix yet -:)
> For example, I run your last patchset+ the last fix you sent me with:
> modprobe -r ib_ipoib; modprobe  ib_ipoib;
> ping6
> some adding/deleting  one child interface and got the next panic:
> 
> [81209.348259] ib0: join completion for 
> ff12:601b:ffff:0000:0000:0001:ff43:3bf1 (status -102)
> [81209.408787] BUG: unable to handle kernel NULL pointer dereference at 
> 0000000000000020
> [81209.416750] IP: [<ffffffffa096b399>] ipoib_mcast_join+0xa9/0x1b0 
> [ib_ipoib]

This looks exactly like the issue solved by 3ee9876fbdb (IB/ipoib: fix
race between mcast_dev_flush and mcast_join).  But without the full logs
I wouldn't know.  However, if it happened while creating/deleting
children then there might be something lurking in the child delete logic
that is not properly locked.

> Doug, there are bugs that probably will be found all the time with any 
> patchset, my point was only according to the way sendonly need to be 
> handled.

And I proved with my final patchset that sendonly does *not* have to be
handled that way.  I proved that the regression could be solved without
basically reverting the logic to the old split way of handling multicast
joins.

So you've argued several times that "sendonly conceptually should be in
the tx path because at the Ethernet layer we are emulating there is no
join for sendonly traffic, it just goes straight out".  To whit, your
patch restores that behavior.

My argument against is that:

1) We are emulating Ethernet, we are not Ethernet, and we have to have a
sendonly join that Ethernet does not.  Emulating Ethernet does not mean
we should do conceptually silly things in our lower layer driver.  We
must do a sendonly join, which is still a join, and splitting it off
from other joins makes no sense for us at the InfiniBand layer.  And in
fact, once it leaves the ipoib layer and enters the ib_sa layer, the two
code paths completely merge and are absolutely no different.  And at
that layer, they *always* get queued to a work queue and handled later.
So there really is no advantage to doing the send directly in the tx
path once you look beyond the ipoib_multicast.c file itself.

2) Having two code paths that only vary in 2 simple ways
  a) one path uses 1 for the membership type, one uses 4
  b) one attaches a QP at the end, the other does not
but which are otherwise almost identical (and could probably be joined
into a single code path, which is something I'm planning for 3.20) does
not make sense.

3) Fixing oopses and races and other issues with (needlessly) multiple
racing code paths is considerably harder than with fewer code paths.
For the sake of hardening the IPoIB driver against various situations, a
single code path is preferable to multiple code paths.

I haven't heard an argument from you yet that I believe beats the points
I've made above.  So I believe a solution that does not revert back to
having two separate code paths to be maintained is preferable to your
patch.

-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
              GPG KeyID: 0E572FDD



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]                     ` <54C78D36.7050700-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-01-27 17:51                       ` Doug Ledford
  0 siblings, 0 replies; 14+ messages in thread
From: Doug Ledford @ 2015-01-27 17:51 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit,
	Amir Vadai, Eyal Perry

[-- Attachment #1: Type: text/plain, Size: 4240 bytes --]

On Tue, 2015-01-27 at 15:05 +0200, Or Gerlitz wrote:
> On 1/27/2015 12:00 AM, Doug Ledford wrote:
> > However, I didn't get more than 5 minutes into testing before I was able
> > to livelock the system.  In this case, from machine A running my
> > patchset, I did
> >
> > ping6 -I mlx4_ib0 -i .25 <machine B address>
> >
> > On machine B running Erez's patch, I did:
> >
> > rmmod ib_ipoib; modprobe ib_ipoib mcast_debug_level=1; sleep 2; ping6
> > -i .25 -c 10 -I mlx4_ib0 <machine A address>
> >
> > And on the machine rdma-master, where the opensm runs, I did just a few:
> >
> > systemctl restart opensm
> >
> > The livelock is in the mcast flushing code.  On the machine that livelocked
> 
> Doug,
> 
> The tests you are running and the issues you are seeing fall well into a 
> to-be-fixed-in-some-kernel-rc1 category but by NO means as something 
> which should be an rc6 fix.
> 
> You must do the distinction between Erez's patch that fixes the 
> regressions introduced on 3.19-rc1 to your attempts to fix many more 
> instabilities in the IPoIB driver, which are seen under whatever nasty 
> test you are running (and it's good we want to reach there).
> 
> Roland, the V3 patch solves the rc1 regression and I think we should 
> pick it up, by no way we can allow to pick eleven patches @ this point.
> 
> Thoughts?

As I said in my other email to Erez, and as Erez points out, not all 11
patches of mine are needed to resolve the specific regression you are
talking about.  However, my fix resolves the regression without
reverting to splitting the multicast joins down two separate code paths,
which I think is the wrong thing to do and something that actually makes
hardening the driver harder.  If you *really* don't want my patchset
because it's 11 patches (something I couldn't care less about, and I
don't think you should either...the content of the patches is much more
important than the count), I could certainly do some squashing.  And I
could split out just the regression fix from all the rest too.

But in a situation like this, what I'm *really* concerned about is the
final result.  And here's how it breaks down under the various options:

v3.18 plain - ifconfig down/ifconfig up on ib0 can easily lock machine

v3.18 + 8 patches for above issue - initial multicast bringup works, but
additional joins attempted later (after the multicast task had decided
it was done with the initial join set) did not.  there were multiple
symptoms of the multicast join issue, one of which was failure of ipv6
or ipv4 multicast, but another was hangs in ib_sa_unregister_client on
shutdown which could just as easily be classified as a regression as the
ipv6/ipv4 multicast support

v3.18 + 8 patches + Erez patch - subsequent multicast joins now work
again, but other symptoms of the 8 patch series not addressed at all,
including other regressions, and in adding this patch in, it reverts
part of the changes made in the original 8 patch series and quite likely
reintroduces instability on ifconfig down/ifconfig up cycles (making one
wonder if this fix is better or worse than just reverting the original 8
patch set)

v3.18 + 8 patches + 11 fix patches - multicast joins now work again,
ifconfig down/ifconfig up fix continues to work, other regressions such
as hangs in ib_sa_unregister_client on shutdown fixed, overall
considerably harder to cause the kernel to behave badly than with any of
the above alternatives.  I don't claim that it's perfect and that there
isn't additional hardening to be done, but I believe it is considerably
harder/less likely to trip this kernel up than all of the rest above

If there hadn't been a flurry of testing around my patches, then I
wouldn't suggest them at all.  But they have been getting testing.  Lots
of it.  And so have the alternatives.  And out of the bunch, regardless
of patch count, my patchset has fared best under testing.  But if we
don't want to do that, then I would probably recommend reverting the
original 8 patches and then dropping the whole bunch early into 3.20.

-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
              GPG KeyID: 0E572FDD



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]                         ` <1422378130.2854.119.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-01-29 12:51                           ` Or Gerlitz
       [not found]                             ` <54CA2CE0.30107-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Or Gerlitz @ 2015-01-29 12:51 UTC (permalink / raw)
  To: Doug Ledford
  Cc: Erez Shitrit, Or Gerlitz, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Amir Vadai,
	Eyal Perry

On 1/27/2015 7:02 PM, Doug Ledford wrote:
> [...]
> I haven't heard an argument from you yet that I believe beats the points
> I've made above.  So I believe a solution that does not revert back to
> having two separate code paths to be maintained is preferable to your patch.

Doug,

It's not going to work this way.

We should 1st and most take a decision what gonna happen with the 
3.19driver
and only then restart/resume the multi way/arguments discussion, and we 
have
very little time for 3.20-rc1too, BTW.

So lets close the 3.19 saga, again, either revert your eight patches or 
apply Erez's patch.

This way or another for 3.20 we can do things right by whatever method 
we agree on.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]                             ` <54CA2CE0.30107-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-01-29 15:34                               ` Doug Ledford
       [not found]                                 ` <1422545677.2854.260.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Doug Ledford @ 2015-01-29 15:34 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Erez Shitrit, Or Gerlitz, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Amir Vadai,
	Eyal Perry

[-- Attachment #1: Type: text/plain, Size: 2386 bytes --]

On Thu, 2015-01-29 at 14:51 +0200, Or Gerlitz wrote:
> On 1/27/2015 7:02 PM, Doug Ledford wrote:
> > [...]
> > I haven't heard an argument from you yet that I believe beats the points
> > I've made above.  So I believe a solution that does not revert back to
> > having two separate code paths to be maintained is preferable to your patch.
> 
> Doug,
> 
> It's not going to work this way.
> 
> We should 1st and most take a decision what gonna happen with the 
> 3.19driver

Yes.  And that's what my above comment was about, the 3.19 driver.  My
opinion that I stated then, and I'll reiterate now, is that we should
revert the original 8 or pull in my full patchset.

> and only then restart/resume the multi way/arguments discussion, and we 
> have
> very little time for 3.20-rc1too, BTW.
> 
> So lets close the 3.19 saga, again, either revert your eight patches

I would support that option.

>  or 
> apply Erez's patch.

I disagree with this option.  You've chosen one regression to highlight,
but ignored other regressions, and Erez's patch by itself does not
address those other regressions.  You can't go picking and choosing
which regressions to highlight and ignore if your rationale for
justification is that we don't allow regressions in releases.
Certainly, failure to unload the module or reboot due to a hand in
ib_sa_unregister_client would be considered a regression too, yes?  But
that isn't addressed by Erez's patchset.

I'm tired of arguing about this Or.  You will not change my mind.
Erez's patch is a bandaid that only solves one particular issue while
ignoring others, it doesn't have half the testing my patchset has, it
doesn't address half the issues mine does, and with it in place there
are still glaring problems left for the end user to suffer through.  As
much as it pains me to admit it, my original patchset had issues that
were pretty severe, and a bandaid *does not do the job*.  It takes some
honest to god stitching to fix that up.  If my much more complete fixup
is "too much, too late", so be it.  We revert the original 8 patches.
But a bandaid on a katana slice doesn't cut it.

> This way or another for 3.20 we can do things right by whatever method 
> we agree on.

Agreed.


-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
              GPG KeyID: 0E572FDD



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]                                 ` <1422545677.2854.260.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-01-29 19:23                                   ` Roland Dreier
       [not found]                                     ` <CAL1RGDV30SRUv0oxZCQW0e+tziO0g+iDha8DSWeM56PiWtnRwA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Roland Dreier @ 2015-01-29 19:23 UTC (permalink / raw)
  To: Doug Ledford
  Cc: Or Gerlitz, Erez Shitrit, Or Gerlitz,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Amir Vadai,
	Eyal Perry

On Thu, Jan 29, 2015 at 7:34 AM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>
> > So lets close the 3.19 saga, again, either revert your eight patches
>
> I would support that option.

I think at this point we have to do the revert, and get it right for
3.20.  There's just too much noise floating around and I can't tell
whether all the regressions are dealt with or not, and we're running
out of time.

I will ask Linus to pull in the reverts, and then let's try to agree
on a patch set for 3.20 ASAP.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]                                     ` <CAL1RGDV30SRUv0oxZCQW0e+tziO0g+iDha8DSWeM56PiWtnRwA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-01-29 19:27                                       ` Doug Ledford
  2015-01-29 20:29                                       ` Jason Gunthorpe
  1 sibling, 0 replies; 14+ messages in thread
From: Doug Ledford @ 2015-01-29 19:27 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Or Gerlitz, Erez Shitrit, Or Gerlitz,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Amir Vadai,
	Eyal Perry

[-- Attachment #1: Type: text/plain, Size: 765 bytes --]

On Thu, 2015-01-29 at 11:23 -0800, Roland Dreier wrote:
> On Thu, Jan 29, 2015 at 7:34 AM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> >
> > > So lets close the 3.19 saga, again, either revert your eight patches
> >
> > I would support that option.
> 
> I think at this point we have to do the revert, and get it right for
> 3.20.  There's just too much noise floating around and I can't tell
> whether all the regressions are dealt with or not, and we're running
> out of time.
> 
> I will ask Linus to pull in the reverts, and then let's try to agree
> on a patch set for 3.20 ASAP.

Works for me.  Thanks!

-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
              GPG KeyID: 0E572FDD



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic
       [not found]                                     ` <CAL1RGDV30SRUv0oxZCQW0e+tziO0g+iDha8DSWeM56PiWtnRwA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2015-01-29 19:27                                       ` Doug Ledford
@ 2015-01-29 20:29                                       ` Jason Gunthorpe
  1 sibling, 0 replies; 14+ messages in thread
From: Jason Gunthorpe @ 2015-01-29 20:29 UTC (permalink / raw)
  To: Roland Dreier, Yann Droneaud
  Cc: Doug Ledford, Or Gerlitz, Erez Shitrit, Or Gerlitz,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Amir Vadai,
	Eyal Perry

On Thu, Jan 29, 2015 at 11:23:22AM -0800, Roland Dreier wrote:
> On Thu, Jan 29, 2015 at 7:34 AM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> >
> > > So lets close the 3.19 saga, again, either revert your eight patches
> >
> > I would support that option.
> 
> I think at this point we have to do the revert, and get it right for
> 3.20.  There's just too much noise floating around and I can't tell
> whether all the regressions are dealt with or not, and we're running
> out of time.
> 
> I will ask Linus to pull in the reverts, and then let's try to agree
> on a patch set for 3.20 ASAP.

This discussion with Yann is around this patch, perhaps it should be
reverted till 3.20 as well?

5a77abf9a97a7ecc8fb0f6bf4ad411fb12b02f31 

IB/core: Add support for extended query device caps

Add extensible query device capabilities verb to allow adding new
features.  ib_uverbs_ex_query_device is added and
copy_query_dev_fields is used to copy capability fields to be used by
both ib_uverbs_query_device and ib_uverbs_ex_query_device.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2015-01-29 20:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-26 13:00 [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic Erez Shitrit
     [not found] ` <1422277227-1086-1-git-send-email-erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-01-26 13:16   ` Or Gerlitz
     [not found]     ` <CAJ3xEMjERaEP5d_ZT8RN5+w8Z_Hig4T7dhuq3o+1NOUuQgfJLw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-01-26 19:38       ` Doug Ledford
     [not found]         ` <1422301106.2854.41.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-01-26 20:57           ` Or Gerlitz
     [not found]             ` <CAJ3xEMg3vYGbGuT+Z-XQMv5YuPws33XHQP_Wcz8gvpBbCg3TSw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-01-26 22:00               ` Doug Ledford
     [not found]                 ` <1422309605.2854.62.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-01-27  8:33                   ` Erez Shitrit
     [not found]                     ` <54C74D49.3080201-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-01-27 17:02                       ` Doug Ledford
     [not found]                         ` <1422378130.2854.119.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-01-29 12:51                           ` Or Gerlitz
     [not found]                             ` <54CA2CE0.30107-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-01-29 15:34                               ` Doug Ledford
     [not found]                                 ` <1422545677.2854.260.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-01-29 19:23                                   ` Roland Dreier
     [not found]                                     ` <CAL1RGDV30SRUv0oxZCQW0e+tziO0g+iDha8DSWeM56PiWtnRwA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-01-29 19:27                                       ` Doug Ledford
2015-01-29 20:29                                       ` Jason Gunthorpe
2015-01-27 13:05                   ` Or Gerlitz
     [not found]                     ` <54C78D36.7050700-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-01-27 17:51                       ` Doug Ledford

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.