From mboxrd@z Thu Jan 1 00:00:00 1970 From: Moni Shoua Subject: [PATCH V2] IB/ipoib: Leave stale send-only multicast groups Date: Mon, 17 Jan 2011 13:10:21 +0200 Message-ID: <4D34239D.9030004@Voltaire.COM> Mime-Version: 1.0 Content-Type: text/plain; charset="ISO-8859-1" Content-Transfer-Encoding: 7bit Return-path: Sender: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org To: Roland Dreier Cc: linux-rdma , Yossi Etigin List-Id: linux-rdma@vger.kernel.org Unlike with send/receive multicast groups, there is no indication for IPoIB that a send-only multicast group is useless. Therefore, even a single packet to a multicast destination leaves a multicast entry on the fabric until the host interface is brought down. This causes MGID leakage in the SM. Here, a garbage-collection task is scheduled once a minute and leaves (un-joins) stale send-only multicast groups. V1 of the patch below was sent to the list long ago by Yossi Etigin, and for some reason the discussion about it stopped without a conclusion. Link to V1: - http://www.mail-archive.com/general-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org/msg18928.html Changes from V1: - Add a module parameter to control the amount of time that an idle send-only group is allowed to stay joined. 
Signed-off-by: Yossi Etigin Signed-off-by: Moni Shoua -- drivers/infiniband/ulp/ipoib/ipoib.h | 8 +++- drivers/infiniband/ulp/ipoib/ipoib_main.c | 8 +++- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 50 +++++++++++++++++++++---- 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index ab97f92..fb1714f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -92,6 +92,7 @@ enum { IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, IPOIB_FLAG_CSUM = 11, + IPOIB_MCAST_RUN_GC = 12, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -132,6 +133,7 @@ struct ipoib_mcast { struct list_head list; unsigned long created; + unsigned long used; unsigned long backoff; unsigned long flags; @@ -283,7 +285,8 @@ struct ipoib_dev_priv { struct rb_root multicast_tree; struct delayed_work pkey_poll_task; - struct delayed_work mcast_task; + struct delayed_work mcast_join_task; + struct delayed_work mcast_leave_task; struct work_struct carrier_on_task; struct work_struct flush_light; struct work_struct flush_normal; @@ -411,6 +414,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); extern struct workqueue_struct *ipoib_workqueue; +extern int ipoib_mc_sendonly_timeout; + /* functions */ int ipoib_poll(struct napi_struct *napi, int budget); @@ -453,6 +458,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct net_device *dev); void ipoib_mcast_join_task(struct work_struct *work); +void ipoib_mcast_leave_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7a07a72..563370e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ 
-67,6 +67,11 @@ module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif +int ipoib_mc_sendonly_timeout; + +module_param_named(mc_sendonly_timeout, ipoib_mc_sendonly_timeout, int, 0644); +MODULE_PARM_DESC(mc_sendonly_timeout, "Seconds an idle send-only multicast group may stay joined before it is left (<= 0 disables)"); + struct ipoib_path_iter { struct net_device *dev; struct ipoib_path path; @@ -1020,7 +1025,8 @@ static void ipoib_setup(struct net_device *dev) INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); - INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); + INIT_DELAYED_WORK(&priv->mcast_join_task, ipoib_mcast_join_task); + INIT_DELAYED_WORK(&priv->mcast_leave_task, ipoib_mcast_leave_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 3871ac6..87928c1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -117,6 +117,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, mcast->dev = dev; mcast->created = jiffies; + mcast->used = jiffies; mcast->backoff = 1; INIT_LIST_HEAD(&mcast->list); @@ -403,7 +404,7 @@ static int ipoib_mcast_join_complete(int status, mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, 0); + &priv->mcast_join_task, 0); mutex_unlock(&mcast_mutex); /* @@ -436,7 +437,7 @@ static int ipoib_mcast_join_complete(int status, mutex_lock(&mcast_mutex); spin_lock_irq(&priv->lock); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, + queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, mcast->backoff * HZ); 
spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); @@ -505,7 +506,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, + &priv->mcast_join_task, mcast->backoff * HZ); mutex_unlock(&mcast_mutex); } @@ -514,7 +515,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, void ipoib_mcast_join_task(struct work_struct *work) { struct ipoib_dev_priv *priv = - container_of(work, struct ipoib_dev_priv, mcast_task.work); + container_of(work, struct ipoib_dev_priv, mcast_join_task.work); struct net_device *dev = priv->dev; if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) @@ -546,7 +547,7 @@ void ipoib_mcast_join_task(struct work_struct *work) mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, HZ); + &priv->mcast_join_task, HZ); mutex_unlock(&mcast_mutex); return; } @@ -610,7 +611,9 @@ int ipoib_mcast_start_thread(struct net_device *dev) mutex_lock(&mcast_mutex); if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); + queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, 0); + if (!test_and_set_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 0); mutex_unlock(&mcast_mutex); return 0; @@ -624,7 +627,9 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush) mutex_lock(&mcast_mutex); clear_bit(IPOIB_MCAST_RUN, &priv->flags); - cancel_delayed_work(&priv->mcast_task); + clear_bit(IPOIB_MCAST_RUN_GC, &priv->flags); + cancel_delayed_work(&priv->mcast_join_task); + cancel_delayed_work(&priv->mcast_leave_task); mutex_unlock(&mcast_mutex); if (flush) @@ -727,7 +732,7 @@ out: list_add_tail(&neigh->list, &mcast->neigh_list); } } - + mcast->used = jiffies; spin_unlock_irqrestore(&priv->lock, 
flags); ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); return;
@@ -888,6 +893,64 @@ void ipoib_mcast_restart_task(struct work_struct *work)
 		ipoib_mcast_start_thread(dev);
 }
 
+/*
+ * Garbage-collect stale send-only multicast groups: leave every send-only
+ * group whose last use is older than ipoib_mc_sendonly_timeout seconds,
+ * then re-arm to run again in 60 seconds.  A timeout <= 0 disables the
+ * collection pass (the task still re-arms).
+ */
+void ipoib_mcast_leave_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, mcast_leave_task.work);
+	struct net_device *dev = priv->dev;
+	struct ipoib_mcast *mcast, *tmcast;
+	/* Read once: the parameter is writable at runtime (mode 0644). */
+	int timeout = ipoib_mc_sendonly_timeout;
+	LIST_HEAD(remove_list);
+
+	if (!test_bit(IPOIB_MCAST_RUN_GC, &priv->flags))
+		return;
+
+	if (timeout > 0) {
+		/* Widen before scaling so a large timeout cannot overflow int. */
+		unsigned long expiry = jiffies - (unsigned long)timeout * HZ;
+
+		/*
+		 * mcast->used is written with priv->lock held, so scan and
+		 * unlink under the same lock.
+		 * NOTE(review): multicast_list and multicast_tree are
+		 * presumably protected by priv->lock as well -- confirm.
+		 */
+		spin_lock_irq(&priv->lock);
+		list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+			if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+			    time_before(mcast->used, expiry)) {
+				rb_erase(&mcast->rb_node, &priv->multicast_tree);
+				list_move_tail(&mcast->list, &remove_list);
+			}
+		}
+		spin_unlock_irq(&priv->lock);
+
+		/* Leave and free after dropping the lock. */
+		list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+			ipoib_mcast_leave(dev, mcast);
+			ipoib_mcast_free(mcast);
+		}
+	}
+
+	/*
+	 * Re-arm under mcast_mutex and only while the GC flag is still set,
+	 * matching how the join task is re-queued, so that a concurrent
+	 * ipoib_mcast_stop_thread() is not followed by a stale re-queue.
+	 */
+	mutex_lock(&mcast_mutex);
+	if (test_bit(IPOIB_MCAST_RUN_GC, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task,
+				   60 * HZ);
+	mutex_unlock(&mcast_mutex);
+}
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 
 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html