All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2] IB/ipoib: Leave stale send-only multicast groups
@ 2011-01-17 11:10 Moni Shoua
       [not found] ` <4D34239D.9030004-hKgKHo2Ms0F+cjeuK/JdrQ@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Moni Shoua @ 2011-01-17 11:10 UTC (permalink / raw)
  To: Roland Dreier; +Cc: linux-rdma, Yossi Etigin

Unlike with send/receive multicast groups, there is no indication for IPoIB
that a send-only multicast group is useless. Therefore, even a single packet
to a multicast destination leaves a multicast entry on the fabric until the
host interface is down. This causes an MGID leakage in the SM.

Here, a garbage-collection task will be scheduled once a minute and will leave
stale multicast groups.

V1 of the patch below was sent to the list a long ago by Yossi Etigin and from some
reason the discussion about it was stopped without a conclusion.

Link to V1:
 - http://www.mail-archive.com/general-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org/msg18928.html

Changes from V1:
 - Add a module parameter to control the amount of time that an idle send-only
   group is allowed to stay joined.

Signed-off-by: Yossi Etigin <yosefe-smomgflXvOZWk0Htik3J/w@public.gmane.org>
Signed-off-by: Moni Shoua <monis-smomgflXvOZWk0Htik3J/w@public.gmane.org>

--
 drivers/infiniband/ulp/ipoib/ipoib.h           |    8 +++-
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |    8 +++-
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   50 +++++++++++++++++++++----
 3 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index ab97f92..fb1714f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -92,6 +92,7 @@ enum {
 	IPOIB_FLAG_ADMIN_CM	  = 9,
 	IPOIB_FLAG_UMCAST	  = 10,
 	IPOIB_FLAG_CSUM		  = 11,
+	IPOIB_MCAST_RUN_GC	  = 12,
 
 	IPOIB_MAX_BACKOFF_SECONDS = 16,
 
@@ -132,6 +133,7 @@ struct ipoib_mcast {
 	struct list_head  list;
 
 	unsigned long created;
+	unsigned long used;
 	unsigned long backoff;
 
 	unsigned long flags;
@@ -283,7 +285,8 @@ struct ipoib_dev_priv {
 	struct rb_root multicast_tree;
 
 	struct delayed_work pkey_poll_task;
-	struct delayed_work mcast_task;
+	struct delayed_work mcast_join_task;
+	struct delayed_work mcast_leave_task;
 	struct work_struct carrier_on_task;
 	struct work_struct flush_light;
 	struct work_struct flush_normal;
@@ -411,6 +414,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
 
 extern struct workqueue_struct *ipoib_workqueue;
 
+extern int ipoib_mc_sendonly_timeout;
+
 /* functions */
 
 int ipoib_poll(struct napi_struct *napi, int budget);
@@ -453,6 +458,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_dev_cleanup(struct net_device *dev);
 
 void ipoib_mcast_join_task(struct work_struct *work);
+void ipoib_mcast_leave_task(struct work_struct *work);
 void ipoib_mcast_carrier_on_task(struct work_struct *work);
 void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 7a07a72..563370e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -67,6 +67,11 @@ module_param_named(debug_level, ipoib_debug_level, int, 0644);
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
 #endif
 
+int ipoib_mc_sendonly_timeout;
+
+module_param_named(mc_sendonly_timeout, ipoib_mc_sendonly_timeout, int, 0644);
+MODULE_PARM_DESC(mc_sendonly_timeout, "Enable debug tracing if > 0");
+
 struct ipoib_path_iter {
 	struct net_device *dev;
 	struct ipoib_path  path;
@@ -1020,7 +1025,8 @@ static void ipoib_setup(struct net_device *dev)
 	INIT_LIST_HEAD(&priv->multicast_list);
 
 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
-	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
+	INIT_DELAYED_WORK(&priv->mcast_join_task,   ipoib_mcast_join_task);
+	INIT_DELAYED_WORK(&priv->mcast_leave_task, ipoib_mcast_leave_task);
 	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
 	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
 	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 3871ac6..87928c1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -117,6 +117,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
 
 	mcast->dev = dev;
 	mcast->created = jiffies;
+	mcast->used = jiffies;
 	mcast->backoff = 1;
 
 	INIT_LIST_HEAD(&mcast->list);
@@ -403,7 +404,7 @@ static int ipoib_mcast_join_complete(int status,
 		mutex_lock(&mcast_mutex);
 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
 			queue_delayed_work(ipoib_workqueue,
-					   &priv->mcast_task, 0);
+					   &priv->mcast_join_task, 0);
 		mutex_unlock(&mcast_mutex);
 
 		/*
@@ -436,7 +437,7 @@ static int ipoib_mcast_join_complete(int status,
 	mutex_lock(&mcast_mutex);
 	spin_lock_irq(&priv->lock);
 	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-		queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task,
 				   mcast->backoff * HZ);
 	spin_unlock_irq(&priv->lock);
 	mutex_unlock(&mcast_mutex);
@@ -505,7 +506,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
 		mutex_lock(&mcast_mutex);
 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
 			queue_delayed_work(ipoib_workqueue,
-					   &priv->mcast_task,
+					   &priv->mcast_join_task,
 					   mcast->backoff * HZ);
 		mutex_unlock(&mcast_mutex);
 	}
@@ -514,7 +515,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
 void ipoib_mcast_join_task(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, mcast_task.work);
+		container_of(work, struct ipoib_dev_priv, mcast_join_task.work);
 	struct net_device *dev = priv->dev;
 
 	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
@@ -546,7 +547,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
 			mutex_lock(&mcast_mutex);
 			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
 				queue_delayed_work(ipoib_workqueue,
-						   &priv->mcast_task, HZ);
+						   &priv->mcast_join_task, HZ);
 			mutex_unlock(&mcast_mutex);
 			return;
 		}
@@ -610,7 +611,9 @@ int ipoib_mcast_start_thread(struct net_device *dev)
 
 	mutex_lock(&mcast_mutex);
 	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
-		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, 0);
+	if (!test_and_set_bit(IPOIB_MCAST_RUN_GC, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 0);
 	mutex_unlock(&mcast_mutex);
 
 	return 0;
@@ -624,7 +627,9 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
 
 	mutex_lock(&mcast_mutex);
 	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
-	cancel_delayed_work(&priv->mcast_task);
+	clear_bit(IPOIB_MCAST_RUN_GC, &priv->flags);
+	cancel_delayed_work(&priv->mcast_join_task);
+	cancel_delayed_work(&priv->mcast_leave_task);
 	mutex_unlock(&mcast_mutex);
 
 	if (flush)
@@ -727,7 +732,7 @@ out:
 				list_add_tail(&neigh->list, &mcast->neigh_list);
 			}
 		}
-
+		mcast->used = jiffies;
 		spin_unlock_irqrestore(&priv->lock, flags);
 		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
 		return;
@@ -888,6 +893,35 @@ void ipoib_mcast_restart_task(struct work_struct *work)
 		ipoib_mcast_start_thread(dev);
 }
 
+void ipoib_mcast_leave_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, mcast_leave_task.work);
+	struct net_device *dev = priv->dev;
+	struct ipoib_mcast *mcast, *tmcast;
+	LIST_HEAD(remove_list);
+
+	if (!test_bit(IPOIB_MCAST_RUN_GC, &priv->flags))
+		return;
+
+	if (ipoib_mc_sendonly_timeout > 0) {
+		list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+			if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+			    time_before(mcast->used, jiffies - ipoib_mc_sendonly_timeout * HZ)) {
+				rb_erase(&mcast->rb_node, &priv->multicast_tree);
+				list_move_tail(&mcast->list, &remove_list);
+			}
+		}
+
+		list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+			ipoib_mcast_leave(dev, mcast);
+			ipoib_mcast_free(mcast);
+		}
+	}
+
+	queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 60 * HZ);
+}
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 
 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH V2] IB/ipoib: Leave stale send-only multicast groups
       [not found] ` <4D34239D.9030004-hKgKHo2Ms0F+cjeuK/JdrQ@public.gmane.org>
@ 2011-01-17 11:30   ` Or Gerlitz
  2011-01-18 20:09   ` Christoph Lameter
  1 sibling, 0 replies; 5+ messages in thread
From: Or Gerlitz @ 2011-01-17 11:30 UTC (permalink / raw)
  To: Moni Shoua; +Cc: Roland Dreier, linux-rdma, Yossi Etigin

Moni Shoua wrote:
> Unlike with send/receive multicast groups, there is no indication for IPoIB
> that a send-only multicast group is useless. Therefore, even a single packet
> to a multicast destination leaves a multicast entry on the fabric until the
> host interface is down. This causes an MGID leakage in the SM.

Just few quick comments on the change-log: the SM doesn't control/manage MGIDs, its
the application (ipoib in this case), so I assume you are referring to MLIDs. I believe 
that the Linux IB stack doesn't support send-only groups ala the IB spec, so you're talking 
on "stale groups" which are practically send-only, I would add that to the change log

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH V2] IB/ipoib: Leave stale send-only multicast groups
       [not found] ` <4D34239D.9030004-hKgKHo2Ms0F+cjeuK/JdrQ@public.gmane.org>
  2011-01-17 11:30   ` Or Gerlitz
@ 2011-01-18 20:09   ` Christoph Lameter
       [not found]     ` <AANLkTiku+dkiqsv1AKUr_wCVhxWFhay0HEuSZtt7uZWr@mail.gmail.com>
  1 sibling, 1 reply; 5+ messages in thread
From: Christoph Lameter @ 2011-01-18 20:09 UTC (permalink / raw)
  To: Moni Shoua; +Cc: Roland Dreier, linux-rdma, Yossi Etigin

On Mon, 17 Jan 2011, Moni Shoua wrote:

> Unlike with send/receive multicast groups, there is no indication for IPoIB
> that a send-only multicast group is useless. Therefore, even a single packet
> to a multicast destination leaves a multicast entry on the fabric until the
> host interface is down. This causes an MGID leakage in the SM.

There is such an indication. The igmp subsystem removes the multicast
group from the grouplist for the interface when the process terminates.

The ipoib layer will then release the MGID when the multicast groups are
reprocessed in ipoib_mcast_restart_task(). The sendonly detect logic puts
it onto the remove_list() and then mcast_mcast_leave() is called at the
end to dispose of the MC group.

That is at least what our tests show. MC sendonly groups vanish when a
task terminates. Lets leave it that way.

Did you leave the task that caused the sendonly join running until
shutdown?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH V2] IB/ipoib: Leave stale send-only multicast groups
       [not found]       ` <AANLkTiku+dkiqsv1AKUr_wCVhxWFhay0HEuSZtt7uZWr-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2011-01-26 15:55         ` Christoph Lameter
       [not found]           ` <alpine.DEB.2.00.1101260954420.23080-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Christoph Lameter @ 2011-01-26 15:55 UTC (permalink / raw)
  To: Moni Shoua; +Cc: Roland Dreier, linux-rdma, Yossi Etigin

And here is my answer that also was not cced.


> Please take a look at this. It demonstrates what I claim.

Trouble is that iperf is a performance measurement tool and I am not sure
what is going on behind the scenes in regard to MC subscriptions etc. Also
I do not see when you terminate what task.

I can demonstrate using mcast that the groups go away (see below). AFAICT
behavior that you suggest would be troublesome for some of the apps we run
here.


> linux:~ # cat /proc/net/dev_mcast |grep ib0
>
> 29   ib0             1     0     00ffffffff12401bffff00000000000003070707 <---
                      ^^^ there is a reference remaining that is why the
group sticks around.

Here is the output of /sys/kernel/debug/ipoib/ib0.8030_mcg a few minutes
after an mcast test completed:

GID: ff12:401b:8030:0:0:0:0:1
  created: 4370767176
  queuelen:         0
  complete:       yes
  send_only:       no

GID: ff12:401b:8030:0:0:0:ffff:ffff
  created: 4370767176
  queuelen:         0
  complete:       yes
  send_only:       no


Running mcast -b ib0.8030 on another host
and running ./mcast -n 1 -b ib0.8030 on this host yields:

Receiver: Listening to control channel 239.0.192.1
Receiver: Subscribing to 0 MC addresses 239.0.192-254.2-254 offset 0
origin 10.2.30.180
Sender: Sending 10 msgs/ch/sec on 1 channels. Probe interval=0.001-1 sec.

While the program is running we do:

clameter@rd-gateway-deb64:/sys/kernel/debug/ipoib$ cat ib0.8030_mcg
GID: ff12:401b:8030:0:0:0:0:1
  created: 4370767176
  queuelen:         0
  complete:       yes
  send_only:       no

GID: ff12:401b:8030:0:0:0:f00:c001
  created: 4371010379
  queuelen:         0
  complete:       yes
  send_only:       no

GID: ff12:401b:8030:0:0:0:f00:c002
  created: 4371010589
  queuelen:         0
  complete:       yes
  send_only:      yes

GID: ff12:401b:8030:0:0:0:ffff:ffff
  created: 4370767176
  queuelen:         0
  complete:       yes
  send_only:       no


Terminating mcast yields:

clameter@rd-gateway-deb64:/sys/kernel/debug/ipoib$ cat ib0.8030_mcg
GID: ff12:401b:8030:0:0:0:0:1
  created: 4370767176
  queuelen:         0
  complete:       yes
  send_only:       no

GID: ff12:401b:8030:0:0:0:0:2
  created: 4371020715
  queuelen:         0
  complete:       yes
  send_only:      yes

GID: ff12:401b:8030:0:0:0:f00:c002
  created: 4371010589
  queuelen:         0
  complete:       yes
  send_only:      yes

GID: ff12:401b:8030:0:0:0:ffff:ffff
  created: 4370767176
  queuelen:         0
  complete:       yes
  send_only:       no

Wait a few minutes and then c002 will also vanish and you will have the
state above.



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH V2] IB/ipoib: Leave stale send-only multicast groups
       [not found]           ` <alpine.DEB.2.00.1101260954420.23080-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2011-01-26 18:26             ` Christoph Lameter
  0 siblings, 0 replies; 5+ messages in thread
From: Christoph Lameter @ 2011-01-26 18:26 UTC (permalink / raw)
  To: Moni Shoua; +Cc: Roland Dreier, linux-rdma, Yossi Etigin


Well turns out that my tests just proves that the patch works as
intended. After all we run Voltaire OFED which has had a similar patch to
address the issue for a long time.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2011-01-26 18:26 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-17 11:10 [PATCH V2] IB/ipoib: Leave stale send-only multicast groups Moni Shoua
     [not found] ` <4D34239D.9030004-hKgKHo2Ms0F+cjeuK/JdrQ@public.gmane.org>
2011-01-17 11:30   ` Or Gerlitz
2011-01-18 20:09   ` Christoph Lameter
     [not found]     ` <AANLkTiku+dkiqsv1AKUr_wCVhxWFhay0HEuSZtt7uZWr@mail.gmail.com>
     [not found]       ` <AANLkTiku+dkiqsv1AKUr_wCVhxWFhay0HEuSZtt7uZWr-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2011-01-26 15:55         ` Christoph Lameter
     [not found]           ` <alpine.DEB.2.00.1101260954420.23080-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2011-01-26 18:26             ` Christoph Lameter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.