All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] bonding: fix workqueue re-arming races
@ 2010-08-31 17:07 Jiri Bohac
  2010-08-31 20:54 ` Jay Vosburgh
  0 siblings, 1 reply; 25+ messages in thread
From: Jiri Bohac @ 2010-08-31 17:07 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: bonding-devel, markine, jarkao2, chavey, netdev

Hi,

this is another attempt to solve the bonding workqueue re-arming
races.

The issue has been thoroughly discussed here:
http://article.gmane.org/gmane.linux.network/146949 "[PATCH]
bonding: cancel_delayed_work() -> cancel_delayed_work_sync()"
However, the only outcome was a proposal for an ugly hack with
busy-waiting on the rtnl.

The problem:
Bonding uses delayed work that automatically re-arms itself,
e.g.: bond_mii_monitor().

A dev->close() quickly followed by dev->open() on the bonding
master has a race condition. bond_close() sets kill_timers=1 and
calls cancel_delayed_work(), hoping that bond_mii_monitor() will
not re-arm again anymore.  There are two problems with this:

- bond->kill_timers is not re-checked after re-acquiring the
  bond->lock (this would be easy to fix)

- bond_open() resets bond->kill_timers to 0. If this happens
  before bond_mii_monitor() notices the flag and exits, it will
  re-arm itself. bond_open() then tries to schedule the delayed
  work, which causes a BUG.

The issue would be solved by calling cancel_delayed_work_sync(),
but this can not be done from bond_close() since it is called
under rtnl and the delayed work locks rtnl itself.

My proposal is to move all the "commit" work that requires rtnl
to a separate work and schedule it on the bonding wq. Thus, the
re-arming work does not need rtnl and can be cancelled using
cancel_delayed_work_sync().

Comments?

[note, this does not deal with bond_loadbalance_arp_mon(), where
rtnl is now taken as well in net-next; I'll do this if you think
the idea is good ]

Signed-off-by: Jiri Bohac <jbohac@suse.cz>

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 822f586..8015e12 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2119,10 +2119,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers) {
-		goto out;
-	}
-
 	//check if there are any slaves
 	if (bond->slave_cnt == 0) {
 		goto re_arm;
@@ -2166,7 +2162,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 
 re_arm:
 	queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c746b33..250d027 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1397,6 +1397,23 @@ out:
 	return NETDEV_TX_OK;
 }
 
+void bond_alb_promisc_disable(struct work_struct *work)
+{
+	struct bonding *bond = container_of(work, struct bonding,
+					    alb_promisc_disable_work);
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+
+	/*
+	 * dev_set_promiscuity requires rtnl and
+	 * nothing else.
+	 */
+	rtnl_lock();
+	dev_set_promiscuity(bond->curr_active_slave->dev, -1);
+	bond_info->primary_is_promisc = 0;
+	bond_info->rlb_promisc_timeout_counter = 0;
+	rtnl_unlock();
+}
+
 void bond_alb_monitor(struct work_struct *work)
 {
 	struct bonding *bond = container_of(work, struct bonding,
@@ -1407,10 +1424,6 @@ void bond_alb_monitor(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers) {
-		goto out;
-	}
-
 	if (bond->slave_cnt == 0) {
 		bond_info->tx_rebalance_counter = 0;
 		bond_info->lp_counter = 0;
@@ -1462,25 +1475,11 @@ void bond_alb_monitor(struct work_struct *work)
 	if (bond_info->rlb_enabled) {
 		if (bond_info->primary_is_promisc &&
 		    (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
-
-			/*
-			 * dev_set_promiscuity requires rtnl and
-			 * nothing else.
-			 */
-			read_unlock(&bond->lock);
-			rtnl_lock();
-
-			bond_info->rlb_promisc_timeout_counter = 0;
-
 			/* If the primary was set to promiscuous mode
 			 * because a slave was disabled then
 			 * it can now leave promiscuous mode.
 			 */
-			dev_set_promiscuity(bond->curr_active_slave->dev, -1);
-			bond_info->primary_is_promisc = 0;
-
-			rtnl_unlock();
-			read_lock(&bond->lock);
+			queue_work(bond->wq, &bond->alb_promisc_disable_work);
 		}
 
 		if (bond_info->rlb_rebalance) {
@@ -1505,7 +1504,6 @@ void bond_alb_monitor(struct work_struct *work)
 
 re_arm:
 	queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2cc4cfc..3e8b57e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2343,10 +2343,15 @@ static int bond_miimon_inspect(struct bonding *bond)
 	return commit;
 }
 
-static void bond_miimon_commit(struct bonding *bond)
+static void bond_miimon_commit(struct work_struct *work)
 {
 	struct slave *slave;
 	int i;
+	struct bonding *bond = container_of(work, struct bonding,
+					    miimon_commit_work);
+
+	rtnl_lock();
+	read_lock(&bond->lock);
 
 	bond_for_each_slave(bond, slave, i) {
 		switch (slave->new_link) {
@@ -2421,15 +2426,18 @@ static void bond_miimon_commit(struct bonding *bond)
 		}
 
 do_failover:
-		ASSERT_RTNL();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
 	}
 
 	bond_set_carrier(bond);
+
+	read_unlock(&bond->lock);
+	rtnl_unlock();
 }
 
+
 /*
  * bond_mii_monitor
  *
@@ -2438,14 +2446,13 @@ do_failover:
  * an acquisition of appropriate locks followed by a commit phase to
  * implement whatever link state changes are indicated.
  */
+
 void bond_mii_monitor(struct work_struct *work)
 {
 	struct bonding *bond = container_of(work, struct bonding,
 					    mii_work.work);
 
 	read_lock(&bond->lock);
-	if (bond->kill_timers)
-		goto out;
 
 	if (bond->slave_cnt == 0)
 		goto re_arm;
@@ -2462,23 +2469,14 @@ void bond_mii_monitor(struct work_struct *work)
 		read_unlock(&bond->curr_slave_lock);
 	}
 
-	if (bond_miimon_inspect(bond)) {
-		read_unlock(&bond->lock);
-		rtnl_lock();
-		read_lock(&bond->lock);
-
-		bond_miimon_commit(bond);
+	if (bond_miimon_inspect(bond))
+		queue_work(bond->wq, &bond->miimon_commit_work);
 
-		read_unlock(&bond->lock);
-		rtnl_unlock();	/* might sleep, hold no other locks */
-		read_lock(&bond->lock);
-	}
 
 re_arm:
 	if (bond->params.miimon)
 		queue_delayed_work(bond->wq, &bond->mii_work,
 				   msecs_to_jiffies(bond->params.miimon));
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -2778,9 +2776,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 
 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
-	if (bond->kill_timers)
-		goto out;
-
 	if (bond->slave_cnt == 0)
 		goto re_arm;
 
@@ -2867,7 +2862,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 re_arm:
 	if (bond->params.arp_interval)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -2949,13 +2943,19 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
 /*
  * Called to commit link state changes noted by inspection step of
  * active-backup mode ARP monitor.
- *
- * Called with RTNL and bond->lock for read.
  */
-static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
+static void bond_ab_arp_commit(struct work_struct *work)
 {
 	struct slave *slave;
 	int i;
+	int delta_in_ticks;
+	struct bonding *bond = container_of(work, struct bonding,
+					    ab_arp_commit_work);
+
+	rtnl_lock();
+	read_lock(&bond->lock);
+
+	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
 	bond_for_each_slave(bond, slave, i) {
 		switch (slave->new_link) {
@@ -3014,6 +3014,9 @@ do_failover:
 	}
 
 	bond_set_carrier(bond);
+
+	read_unlock(&bond->lock);
+	rtnl_unlock();
 }
 
 /*
@@ -3093,9 +3096,6 @@ void bond_activebackup_arp_mon(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers)
-		goto out;
-
 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
 	if (bond->slave_cnt == 0)
@@ -3113,24 +3113,15 @@ void bond_activebackup_arp_mon(struct work_struct *work)
 		read_unlock(&bond->curr_slave_lock);
 	}
 
-	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
-		read_unlock(&bond->lock);
-		rtnl_lock();
-		read_lock(&bond->lock);
+	if (bond_ab_arp_inspect(bond, delta_in_ticks))
+		queue_work(bond->wq, &bond->ab_arp_commit_work);
 
-		bond_ab_arp_commit(bond, delta_in_ticks);
-
-		read_unlock(&bond->lock);
-		rtnl_unlock();
-		read_lock(&bond->lock);
-	}
 
 	bond_ab_arp_probe(bond);
 
 re_arm:
 	if (bond->params.arp_interval)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -3720,8 +3711,6 @@ static int bond_open(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
-	bond->kill_timers = 0;
-
 	if (bond_is_lb(bond)) {
 		/* bond_alb_initialize must be called before the timer
 		 * is started.
@@ -3781,26 +3770,23 @@ static int bond_close(struct net_device *bond_dev)
 	bond->send_grat_arp = 0;
 	bond->send_unsol_na = 0;
 
-	/* signal timers not to re-arm */
-	bond->kill_timers = 1;
-
 	write_unlock_bh(&bond->lock);
 
 	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
-		cancel_delayed_work(&bond->mii_work);
+		cancel_delayed_work_sync(&bond->mii_work);
 	}
 
 	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
-		cancel_delayed_work(&bond->arp_work);
+		cancel_delayed_work_sync(&bond->arp_work);
 	}
 
 	switch (bond->params.mode) {
 	case BOND_MODE_8023AD:
-		cancel_delayed_work(&bond->ad_work);
+		cancel_delayed_work_sync(&bond->ad_work);
 		break;
 	case BOND_MODE_TLB:
 	case BOND_MODE_ALB:
-		cancel_delayed_work(&bond->alb_work);
+		cancel_delayed_work_sync(&bond->alb_work);
 		break;
 	default:
 		break;
@@ -4660,23 +4646,19 @@ static void bond_setup(struct net_device *bond_dev)
 
 static void bond_work_cancel_all(struct bonding *bond)
 {
-	write_lock_bh(&bond->lock);
-	bond->kill_timers = 1;
-	write_unlock_bh(&bond->lock);
-
 	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))
-		cancel_delayed_work(&bond->mii_work);
+		cancel_delayed_work_sync(&bond->mii_work);
 
 	if (bond->params.arp_interval && delayed_work_pending(&bond->arp_work))
-		cancel_delayed_work(&bond->arp_work);
+		cancel_delayed_work_sync(&bond->arp_work);
 
 	if (bond->params.mode == BOND_MODE_ALB &&
 	    delayed_work_pending(&bond->alb_work))
-		cancel_delayed_work(&bond->alb_work);
+		cancel_delayed_work_sync(&bond->alb_work);
 
 	if (bond->params.mode == BOND_MODE_8023AD &&
 	    delayed_work_pending(&bond->ad_work))
-		cancel_delayed_work(&bond->ad_work);
+		cancel_delayed_work_sync(&bond->ad_work);
 }
 
 /*
@@ -5094,6 +5076,9 @@ static int bond_init(struct net_device *bond_dev)
 	bond_prepare_sysfs_group(bond);
 
 	__hw_addr_init(&bond->mc_list);
+	INIT_WORK(&bond->miimon_commit_work, bond_miimon_commit);
+	INIT_WORK(&bond->ab_arp_commit_work, bond_ab_arp_commit);
+	INIT_WORK(&bond->alb_promisc_disable_work, bond_alb_promisc_disable);
 	return 0;
 }
 
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index c6fdd85..e111023 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -198,7 +198,6 @@ struct bonding {
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;
-	s8       kill_timers;
 	s8	 send_grat_arp;
 	s8	 send_unsol_na;
 	s8	 setup_by_slave;
@@ -223,6 +222,9 @@ struct bonding {
 	struct   delayed_work arp_work;
 	struct   delayed_work alb_work;
 	struct   delayed_work ad_work;
+	struct    work_struct miimon_commit_work;
+	struct    work_struct ab_arp_commit_work;
+	struct    work_struct alb_promisc_disable_work;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct   in6_addr master_ipv6;
 #endif
@@ -348,6 +350,7 @@ void bond_select_active_slave(struct bonding *bond);
 void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
 void bond_register_arp(struct bonding *);
 void bond_unregister_arp(struct bonding *);
+void bond_alb_promisc_disable(struct work_struct *work);
 
 struct bond_net {
 	struct net *		net;	/* Associated network namespace */


-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-08-31 17:07 [RFC] bonding: fix workqueue re-arming races Jiri Bohac
@ 2010-08-31 20:54 ` Jay Vosburgh
  2010-09-01 12:23   ` Jarek Poplawski
  2010-09-01 13:16   ` Jiri Bohac
  0 siblings, 2 replies; 25+ messages in thread
From: Jay Vosburgh @ 2010-08-31 20:54 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: bonding-devel, markine, jarkao2, chavey, netdev

Jiri Bohac <jbohac@suse.cz> wrote:

>Hi,
>
>this is another attempt to solve the bonding workqueue re-arming
>races.
>
>The issue has been thoroughly discussed here:
>http://article.gmane.org/gmane.linux.network/146949 "[PATCH]
>bonding: cancel_delayed_work() -> cancel_delayed_work_sync()"
>However, the only outcome was a proposal for an ugly hack with
>busy-waiting on the rtnl.
>
>The problem:
>Bonding uses delayed work that automatically re-arms itself,
>e.g.: bond_mii_monitor().
>
>A dev->close() quickly followed by dev->open() on the bonding
>master has a race condition. bond_close() sets kill_timers=1 and
>calls cancel_delayed_work(), hoping that bond_mii_monitor() will
>not re-arm again anymore.  There are two problems with this:
>
>- bond->kill_timers is not re-checked after re-acquiring the
>  bond->lock (this would be easy to fix)
>
>- bond_open() resets bond->kill_timers to 0. If this happens
>  before bond_mii_monitor() notices the flag and exits, it will
>  re-arm itself. bond_open() then tries to schedule the delayed
>  work, which causes a BUG.
>
>The issue would be solved by calling cancel_delayed_work_sync(),
>but this can not be done from bond_close() since it is called
>under rtnl and the delayed work locks rtnl itself.
>
>My proposal is to move all the "commit" work that requires rtnl
>to a separate work and schedule it on the bonding wq. Thus, the
>re-arming work does not need rtnl and can be cancelled using
>cancel_delayed_work_sync().
>
>Comments?
>
>[note, this does not deal with bond_loadbalance_arp_mon(), where
>rtnl is now taken as well in net-next; I'll do this if you think
>the idea is good ]

	I don't believe the loadbalance_arp_mon acquires RTNL in
net-next.  I recall discussing this with Andy not too long ago, but I
didn't think that change went in, and I don't see it in the tree.

>Signed-off-by: Jiri Bohac <jbohac@suse.cz>
>
>diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
>index 822f586..8015e12 100644
>--- a/drivers/net/bonding/bond_3ad.c
>+++ b/drivers/net/bonding/bond_3ad.c
>@@ -2119,10 +2119,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
>
> 	read_lock(&bond->lock);
>
>-	if (bond->kill_timers) {
>-		goto out;
>-	}
>-
> 	//check if there are any slaves
> 	if (bond->slave_cnt == 0) {
> 		goto re_arm;
>@@ -2166,7 +2162,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
>
> re_arm:
> 	queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>index c746b33..250d027 100644
>--- a/drivers/net/bonding/bond_alb.c
>+++ b/drivers/net/bonding/bond_alb.c
>@@ -1397,6 +1397,23 @@ out:
> 	return NETDEV_TX_OK;
> }
>
>+void bond_alb_promisc_disable(struct work_struct *work)
>+{
>+	struct bonding *bond = container_of(work, struct bonding,
>+					    alb_promisc_disable_work);
>+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
>+
>+	/*
>+	 * dev_set_promiscuity requires rtnl and
>+	 * nothing else.
>+	 */
>+	rtnl_lock();
>+	dev_set_promiscuity(bond->curr_active_slave->dev, -1);
>+	bond_info->primary_is_promisc = 0;
>+	bond_info->rlb_promisc_timeout_counter = 0;
>+	rtnl_unlock();
>+}

	What prevents this from deadlocking such that cpu A is in
bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
in the above function, trying to acquire RTNL?

	Also, assuming for the moment that the above isn't a problem,
curr_active_slave may be NULL if the last slave is removed between the
time bond_alb_promisc_disable is scheduled and when it runs.  I'm not
sure that the alb_bond_info can be guaranteed to be valid, either, if
the mode changed.

	-J


> void bond_alb_monitor(struct work_struct *work)
> {
> 	struct bonding *bond = container_of(work, struct bonding,
>@@ -1407,10 +1424,6 @@ void bond_alb_monitor(struct work_struct *work)
>
> 	read_lock(&bond->lock);
>
>-	if (bond->kill_timers) {
>-		goto out;
>-	}
>-
> 	if (bond->slave_cnt == 0) {
> 		bond_info->tx_rebalance_counter = 0;
> 		bond_info->lp_counter = 0;
>@@ -1462,25 +1475,11 @@ void bond_alb_monitor(struct work_struct *work)
> 	if (bond_info->rlb_enabled) {
> 		if (bond_info->primary_is_promisc &&
> 		    (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
>-
>-			/*
>-			 * dev_set_promiscuity requires rtnl and
>-			 * nothing else.
>-			 */
>-			read_unlock(&bond->lock);
>-			rtnl_lock();
>-
>-			bond_info->rlb_promisc_timeout_counter = 0;
>-
> 			/* If the primary was set to promiscuous mode
> 			 * because a slave was disabled then
> 			 * it can now leave promiscuous mode.
> 			 */
>-			dev_set_promiscuity(bond->curr_active_slave->dev, -1);
>-			bond_info->primary_is_promisc = 0;
>-
>-			rtnl_unlock();
>-			read_lock(&bond->lock);
>+			queue_work(bond->wq, &bond->alb_promisc_disable_work);
> 		}
>
> 		if (bond_info->rlb_rebalance) {
>@@ -1505,7 +1504,6 @@ void bond_alb_monitor(struct work_struct *work)
>
> re_arm:
> 	queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 2cc4cfc..3e8b57e 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -2343,10 +2343,15 @@ static int bond_miimon_inspect(struct bonding *bond)
> 	return commit;
> }
>
>-static void bond_miimon_commit(struct bonding *bond)
>+static void bond_miimon_commit(struct work_struct *work)
> {
> 	struct slave *slave;
> 	int i;
>+	struct bonding *bond = container_of(work, struct bonding,
>+					    miimon_commit_work);
>+
>+	rtnl_lock();
>+	read_lock(&bond->lock);
>
> 	bond_for_each_slave(bond, slave, i) {
> 		switch (slave->new_link) {
>@@ -2421,15 +2426,18 @@ static void bond_miimon_commit(struct bonding *bond)
> 		}
>
> do_failover:
>-		ASSERT_RTNL();
> 		write_lock_bh(&bond->curr_slave_lock);
> 		bond_select_active_slave(bond);
> 		write_unlock_bh(&bond->curr_slave_lock);
> 	}
>
> 	bond_set_carrier(bond);
>+
>+	read_unlock(&bond->lock);
>+	rtnl_unlock();
> }
>
>+
> /*
>  * bond_mii_monitor
>  *
>@@ -2438,14 +2446,13 @@ do_failover:
>  * an acquisition of appropriate locks followed by a commit phase to
>  * implement whatever link state changes are indicated.
>  */
>+
> void bond_mii_monitor(struct work_struct *work)
> {
> 	struct bonding *bond = container_of(work, struct bonding,
> 					    mii_work.work);
>
> 	read_lock(&bond->lock);
>-	if (bond->kill_timers)
>-		goto out;
>
> 	if (bond->slave_cnt == 0)
> 		goto re_arm;
>@@ -2462,23 +2469,14 @@ void bond_mii_monitor(struct work_struct *work)
> 		read_unlock(&bond->curr_slave_lock);
> 	}
>
>-	if (bond_miimon_inspect(bond)) {
>-		read_unlock(&bond->lock);
>-		rtnl_lock();
>-		read_lock(&bond->lock);
>-
>-		bond_miimon_commit(bond);
>+	if (bond_miimon_inspect(bond))
>+		queue_work(bond->wq, &bond->miimon_commit_work);
>
>-		read_unlock(&bond->lock);
>-		rtnl_unlock();	/* might sleep, hold no other locks */
>-		read_lock(&bond->lock);
>-	}
>
> re_arm:
> 	if (bond->params.miimon)
> 		queue_delayed_work(bond->wq, &bond->mii_work,
> 				   msecs_to_jiffies(bond->params.miimon));
>-out:
> 	read_unlock(&bond->lock);
> }
>
>@@ -2778,9 +2776,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
>
> 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>
>-	if (bond->kill_timers)
>-		goto out;
>-
> 	if (bond->slave_cnt == 0)
> 		goto re_arm;
>
>@@ -2867,7 +2862,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
> re_arm:
> 	if (bond->params.arp_interval)
> 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>@@ -2949,13 +2943,19 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
> /*
>  * Called to commit link state changes noted by inspection step of
>  * active-backup mode ARP monitor.
>- *
>- * Called with RTNL and bond->lock for read.
>  */
>-static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
>+static void bond_ab_arp_commit(struct work_struct *work)
> {
> 	struct slave *slave;
> 	int i;
>+	int delta_in_ticks;
>+	struct bonding *bond = container_of(work, struct bonding,
>+					    ab_arp_commit_work);
>+
>+	rtnl_lock();
>+	read_lock(&bond->lock);
>+
>+	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>
> 	bond_for_each_slave(bond, slave, i) {
> 		switch (slave->new_link) {
>@@ -3014,6 +3014,9 @@ do_failover:
> 	}
>
> 	bond_set_carrier(bond);
>+
>+	read_unlock(&bond->lock);
>+	rtnl_unlock();
> }
>
> /*
>@@ -3093,9 +3096,6 @@ void bond_activebackup_arp_mon(struct work_struct *work)
>
> 	read_lock(&bond->lock);
>
>-	if (bond->kill_timers)
>-		goto out;
>-
> 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>
> 	if (bond->slave_cnt == 0)
>@@ -3113,24 +3113,15 @@ void bond_activebackup_arp_mon(struct work_struct *work)
> 		read_unlock(&bond->curr_slave_lock);
> 	}
>
>-	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
>-		read_unlock(&bond->lock);
>-		rtnl_lock();
>-		read_lock(&bond->lock);
>+	if (bond_ab_arp_inspect(bond, delta_in_ticks))
>+		queue_work(bond->wq, &bond->ab_arp_commit_work);
>
>-		bond_ab_arp_commit(bond, delta_in_ticks);
>-
>-		read_unlock(&bond->lock);
>-		rtnl_unlock();
>-		read_lock(&bond->lock);
>-	}
>
> 	bond_ab_arp_probe(bond);
>
> re_arm:
> 	if (bond->params.arp_interval)
> 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>@@ -3720,8 +3711,6 @@ static int bond_open(struct net_device *bond_dev)
> {
> 	struct bonding *bond = netdev_priv(bond_dev);
>
>-	bond->kill_timers = 0;
>-
> 	if (bond_is_lb(bond)) {
> 		/* bond_alb_initialize must be called before the timer
> 		 * is started.
>@@ -3781,26 +3770,23 @@ static int bond_close(struct net_device *bond_dev)
> 	bond->send_grat_arp = 0;
> 	bond->send_unsol_na = 0;
>
>-	/* signal timers not to re-arm */
>-	bond->kill_timers = 1;
>-
> 	write_unlock_bh(&bond->lock);
>
> 	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
>-		cancel_delayed_work(&bond->mii_work);
>+		cancel_delayed_work_sync(&bond->mii_work);
> 	}
>
> 	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
>-		cancel_delayed_work(&bond->arp_work);
>+		cancel_delayed_work_sync(&bond->arp_work);
> 	}
>
> 	switch (bond->params.mode) {
> 	case BOND_MODE_8023AD:
>-		cancel_delayed_work(&bond->ad_work);
>+		cancel_delayed_work_sync(&bond->ad_work);
> 		break;
> 	case BOND_MODE_TLB:
> 	case BOND_MODE_ALB:
>-		cancel_delayed_work(&bond->alb_work);
>+		cancel_delayed_work_sync(&bond->alb_work);
> 		break;
> 	default:
> 		break;
>@@ -4660,23 +4646,19 @@ static void bond_setup(struct net_device *bond_dev)
>
> static void bond_work_cancel_all(struct bonding *bond)
> {
>-	write_lock_bh(&bond->lock);
>-	bond->kill_timers = 1;
>-	write_unlock_bh(&bond->lock);
>-
> 	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))
>-		cancel_delayed_work(&bond->mii_work);
>+		cancel_delayed_work_sync(&bond->mii_work);
>
> 	if (bond->params.arp_interval && delayed_work_pending(&bond->arp_work))
>-		cancel_delayed_work(&bond->arp_work);
>+		cancel_delayed_work_sync(&bond->arp_work);
>
> 	if (bond->params.mode == BOND_MODE_ALB &&
> 	    delayed_work_pending(&bond->alb_work))
>-		cancel_delayed_work(&bond->alb_work);
>+		cancel_delayed_work_sync(&bond->alb_work);
>
> 	if (bond->params.mode == BOND_MODE_8023AD &&
> 	    delayed_work_pending(&bond->ad_work))
>-		cancel_delayed_work(&bond->ad_work);
>+		cancel_delayed_work_sync(&bond->ad_work);
> }
>
> /*
>@@ -5094,6 +5076,9 @@ static int bond_init(struct net_device *bond_dev)
> 	bond_prepare_sysfs_group(bond);
>
> 	__hw_addr_init(&bond->mc_list);
>+	INIT_WORK(&bond->miimon_commit_work, bond_miimon_commit);
>+	INIT_WORK(&bond->ab_arp_commit_work, bond_ab_arp_commit);
>+	INIT_WORK(&bond->alb_promisc_disable_work, bond_alb_promisc_disable);
> 	return 0;
> }
>
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index c6fdd85..e111023 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -198,7 +198,6 @@ struct bonding {
> 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
> 	rwlock_t lock;
> 	rwlock_t curr_slave_lock;
>-	s8       kill_timers;
> 	s8	 send_grat_arp;
> 	s8	 send_unsol_na;
> 	s8	 setup_by_slave;
>@@ -223,6 +222,9 @@ struct bonding {
> 	struct   delayed_work arp_work;
> 	struct   delayed_work alb_work;
> 	struct   delayed_work ad_work;
>+	struct    work_struct miimon_commit_work;
>+	struct    work_struct ab_arp_commit_work;
>+	struct    work_struct alb_promisc_disable_work;
> #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
> 	struct   in6_addr master_ipv6;
> #endif
>@@ -348,6 +350,7 @@ void bond_select_active_slave(struct bonding *bond);
> void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
> void bond_register_arp(struct bonding *);
> void bond_unregister_arp(struct bonding *);
>+void bond_alb_promisc_disable(struct work_struct *work);
>
> struct bond_net {
> 	struct net *		net;	/* Associated network namespace */
>
>
>-- 
>Jiri Bohac <jbohac@suse.cz>
>SUSE Labs, SUSE CZ

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-08-31 20:54 ` Jay Vosburgh
@ 2010-09-01 12:23   ` Jarek Poplawski
  2010-09-01 13:30     ` Jiri Bohac
  2010-09-01 13:16   ` Jiri Bohac
  1 sibling, 1 reply; 25+ messages in thread
From: Jarek Poplawski @ 2010-09-01 12:23 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Jiri Bohac, bonding-devel, markine, chavey, netdev

On 2010-08-31 22:54, Jay Vosburgh wrote:
> Jiri Bohac <jbohac@suse.cz> wrote:
> 
>> Hi,
>>
>> this is another attempt to solve the bonding workqueue re-arming
>> races.
>>
>> The issue has been thoroughly discussed here:
>> http://article.gmane.org/gmane.linux.network/146949 "[PATCH]
>> bonding: cancel_delayed_work() -> cancel_delayed_work_sync()"
>> However, the only outcome was a proposal for an ugly hack with
>> busy-waiting on the rtnl.
>>
>> The problem:
>> Bonding uses delayed work that automatically re-arms itself,
>> e.g.: bond_mii_monitor().
>>
>> A dev->close() quickly followed by dev->open() on the bonding
>> master has a race condition. bond_close() sets kill_timers=1 and
>> calls cancel_delayed_work(), hoping that bond_mii_monitor() will
>> not re-arm again anymore.  There are two problems with this:
>>
>> - bond->kill_timers is not re-checked after re-acquiring the
>>  bond->lock (this would be easy to fix)
>>
>> - bond_open() resets bond->kill_timers to 0. If this happens
>>  before bond_mii_monitor() notices the flag and exits, it will
>>  re-arm itself. bond_open() then tries to schedule the delayed
>>  work, which causes a BUG.
>>
>> The issue would be solved by calling cancel_delayed_work_sync(),
>> but this can not be done from bond_close() since it is called
>> under rtnl and the delayed work locks rtnl itself.
>>
>> My proposal is to move all the "commit" work that requires rtnl
>> to a separate work and schedule it on the bonding wq. Thus, the
>> re-arming work does not need rtnl and can be cancelled using
>> cancel_delayed_work_sync().
>>
>> Comments?

Looks mostly OK to me, but a bit more below...

>>
>> [note, this does not deal with bond_loadbalance_arp_mon(), where
>> rtnl is now taken as well in net-next; I'll do this if you think
>> the idea is good ]
> 
> 	I don't believe the loadbalance_arp_mon acquires RTNL in
> net-next.  I recall discussing this with Andy not too long ago, but I
> didn't think that change went in, and I don't see it in the tree.
> 
>> Signed-off-by: Jiri Bohac <jbohac@suse.cz>
>>
>> diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
>> index 822f586..8015e12 100644
>> --- a/drivers/net/bonding/bond_3ad.c
>> +++ b/drivers/net/bonding/bond_3ad.c
>> @@ -2119,10 +2119,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
>>
>> 	read_lock(&bond->lock);
>>
>> -	if (bond->kill_timers) {
>> -		goto out;
>> -	}
>> -
>> 	//check if there are any slaves
>> 	if (bond->slave_cnt == 0) {
>> 		goto re_arm;
>> @@ -2166,7 +2162,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
>>
>> re_arm:
>> 	queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
>> -out:
>> 	read_unlock(&bond->lock);
>> }
>>
>> diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>> index c746b33..250d027 100644
>> --- a/drivers/net/bonding/bond_alb.c
>> +++ b/drivers/net/bonding/bond_alb.c
>> @@ -1397,6 +1397,23 @@ out:
>> 	return NETDEV_TX_OK;
>> }
>>
>> +void bond_alb_promisc_disable(struct work_struct *work)
>> +{
>> +	struct bonding *bond = container_of(work, struct bonding,
>> +					    alb_promisc_disable_work);
>> +	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
>> +
>> +	/*
>> +	 * dev_set_promiscuity requires rtnl and
>> +	 * nothing else.
>> +	 */
>> +	rtnl_lock();
>> +	dev_set_promiscuity(bond->curr_active_slave->dev, -1);
>> +	bond_info->primary_is_promisc = 0;
>> +	bond_info->rlb_promisc_timeout_counter = 0;
>> +	rtnl_unlock();
>> +}
> 
> 	What prevents this from deadlocking such that cpu A is in
> bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
> in the above function, trying to acquire RTNL?

I guess this one isn't cancelled in bond_close, so it should be safe.

> 
> 	Also, assuming for the moment that the above isn't a problem,
> curr_active_slave may be NULL if the last slave is removed between the
> time bond_alb_promisc_disable is scheduled and when it runs.  I'm not
> sure that the alb_bond_info can be guaranteed to be valid, either, if
> the mode changed.

Probably some or all of these work functions should test for closing
eg. with netif_running() or some other flag/variable under rtnl_lock().

....
>> static void bond_work_cancel_all(struct bonding *bond)
>> {
>> -	write_lock_bh(&bond->lock);
>> -	bond->kill_timers = 1;
>> -	write_unlock_bh(&bond->lock);
>> -
>> 	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))

I'd suggest to skip these delayed_work_pending() tests.

Thanks,
Jarek P.

>> -		cancel_delayed_work(&bond->mii_work);
>> +		cancel_delayed_work_sync(&bond->mii_work);
...

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-08-31 20:54 ` Jay Vosburgh
  2010-09-01 12:23   ` Jarek Poplawski
@ 2010-09-01 13:16   ` Jiri Bohac
  2010-09-01 17:14     ` Jay Vosburgh
  1 sibling, 1 reply; 25+ messages in thread
From: Jiri Bohac @ 2010-09-01 13:16 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Jiri Bohac, bonding-devel, markine, jarkao2, chavey, netdev

On Tue, Aug 31, 2010 at 01:54:23PM -0700, Jay Vosburgh wrote:
> Jiri Bohac <jbohac@suse.cz> wrote:
> >[note, this does not deal with bond_loadbalance_arp_mon(), where
> >rtnl is now taken as well in net-next; I'll do this if you think
> >the idea is good ]
> 
> 	I don't believe the loadbalance_arp_mon acquires RTNL in
> net-next.  I recall discussing this with Andy not too long ago, but I
> didn't think that change went in, and I don't see it in the tree.

Of course, you are right, I misread the e-mail thread and did not look at
the code.

> >+void bond_alb_promisc_disable(struct work_struct *work)
> >+{
> >+	struct bonding *bond = container_of(work, struct bonding,
> >+					    alb_promisc_disable_work);
> >+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
> >+
> >+	/*
> >+	 * dev_set_promiscuity requires rtnl and
> >+	 * nothing else.
> >+	 */
> >+	rtnl_lock();
> >+	dev_set_promiscuity(bond->curr_active_slave->dev, -1);
> >+	bond_info->primary_is_promisc = 0;
> >+	bond_info->rlb_promisc_timeout_counter = 0;
> >+	rtnl_unlock();
> >+}
> 
> 	What prevents this from deadlocking such that cpu A is in
> bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
> in the above function, trying to acquire RTNL?

The main idea of the patch is to move the code (the "commit"
functions) that needs rtnl to another work item. Then
cancel_delayed_work_sync() can be used to cancel the re-arming
work. But you are absolutely right, there is still a deadlock,
since I queue the "commit" work on the same workqueue. So when
cancel_delayed_work_sync() waits for the re-arming work to
finish, it can wait forever because a previously queued "commit"
work is waiting for rtnl.

The solution is to move the "commit" work items to a different
workqueue. Fixed in the new version of the patch below
(bond->wq_rtnl).

> 	Also, assuming for the moment that the above isn't a problem,
> curr_active_slave may be NULL if the last slave is removed between the
> time bond_alb_promisc_disable is scheduled and when it runs.  I'm not
> sure that the alb_bond_info can be guaranteed to be valid, either, if
> the mode changed.

Yes, there may be problems like these, but these are present
already in the current code. Because bond->lock() is released
before rtnl is taken. 

Sure, it would be good to deal with these problems, but I don't
think this patch introduces new races like these. They are
already there ... (see below)

> > void bond_alb_monitor(struct work_struct *work)
> > {
> > 	struct bonding *bond = container_of(work, struct bonding,
> >@@ -1407,10 +1424,6 @@ void bond_alb_monitor(struct work_struct *work)
> >
> > 	read_lock(&bond->lock);
> >
> >-	if (bond->kill_timers) {
> >-		goto out;
> >-	}
> >-
> > 	if (bond->slave_cnt == 0) {
> > 		bond_info->tx_rebalance_counter = 0;
> > 		bond_info->lp_counter = 0;
> >@@ -1462,25 +1475,11 @@ void bond_alb_monitor(struct work_struct *work)
> > 	if (bond_info->rlb_enabled) {
> > 		if (bond_info->primary_is_promisc &&
> > 		    (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
> >-
> >-			/*
> >-			 * dev_set_promiscuity requires rtnl and
> >-			 * nothing else.
> >-			 */
> >-			read_unlock(&bond->lock);

... e.g. here; the current slave may change/disappear, the mode
may change .... 

> >-			rtnl_lock();
> >-
> >-			bond_info->rlb_promisc_timeout_counter = 0;
> >-

I fixed both issues in this new version of the patch.

Signed-off-by: Jiri Bohac <jbohac@suse.cz>


diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 822f586..8015e12 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2119,10 +2119,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers) {
-		goto out;
-	}
-
 	//check if there are any slaves
 	if (bond->slave_cnt == 0) {
 		goto re_arm;
@@ -2166,7 +2162,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 
 re_arm:
 	queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c746b33..e4fa3a5 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1397,6 +1397,35 @@ out:
 	return NETDEV_TX_OK;
 }
 
+void bond_alb_promisc_disable(struct work_struct *work)
+{
+	struct bonding *bond = container_of(work, struct bonding,
+					    alb_promisc_disable_work);
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+
+	/*
+	 * dev_set_promiscuity requires rtnl and
+	 * nothing else.
+	 */
+	rtnl_lock();
+	read_lock(&bond->lock);
+	read_lock(&bond->curr_slave_lock);
+
+	if (!bond_is_lb(bond))
+		goto out;
+	if (!bond->curr_active_slave)
+		goto out;
+
+	dev_set_promiscuity(bond->curr_active_slave->dev, -1);
+	bond_info->primary_is_promisc = 0;
+	bond_info->rlb_promisc_timeout_counter = 0;
+
+out:
+	read_unlock(&bond->curr_slave_lock);
+	read_unlock(&bond->lock);
+	rtnl_unlock();
+}
+
 void bond_alb_monitor(struct work_struct *work)
 {
 	struct bonding *bond = container_of(work, struct bonding,
@@ -1407,10 +1436,6 @@ void bond_alb_monitor(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers) {
-		goto out;
-	}
-
 	if (bond->slave_cnt == 0) {
 		bond_info->tx_rebalance_counter = 0;
 		bond_info->lp_counter = 0;
@@ -1462,25 +1487,11 @@ void bond_alb_monitor(struct work_struct *work)
 	if (bond_info->rlb_enabled) {
 		if (bond_info->primary_is_promisc &&
 		    (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
-
-			/*
-			 * dev_set_promiscuity requires rtnl and
-			 * nothing else.
-			 */
-			read_unlock(&bond->lock);
-			rtnl_lock();
-
-			bond_info->rlb_promisc_timeout_counter = 0;
-
 			/* If the primary was set to promiscuous mode
 			 * because a slave was disabled then
 			 * it can now leave promiscuous mode.
 			 */
-			dev_set_promiscuity(bond->curr_active_slave->dev, -1);
-			bond_info->primary_is_promisc = 0;
-
-			rtnl_unlock();
-			read_lock(&bond->lock);
+			queue_work(bond->wq_rtnl, &bond->alb_promisc_disable_work);
 		}
 
 		if (bond_info->rlb_rebalance) {
@@ -1505,7 +1516,6 @@ void bond_alb_monitor(struct work_struct *work)
 
 re_arm:
 	queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2cc4cfc..aae2864 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2343,10 +2343,15 @@ static int bond_miimon_inspect(struct bonding *bond)
 	return commit;
 }
 
-static void bond_miimon_commit(struct bonding *bond)
+static void bond_miimon_commit(struct work_struct *work)
 {
 	struct slave *slave;
 	int i;
+	struct bonding *bond = container_of(work, struct bonding,
+					    miimon_commit_work);
+
+	rtnl_lock();
+	read_lock(&bond->lock);
 
 	bond_for_each_slave(bond, slave, i) {
 		switch (slave->new_link) {
@@ -2421,15 +2426,18 @@ static void bond_miimon_commit(struct bonding *bond)
 		}
 
 do_failover:
-		ASSERT_RTNL();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
 	}
 
 	bond_set_carrier(bond);
+
+	read_unlock(&bond->lock);
+	rtnl_unlock();
 }
 
+
 /*
  * bond_mii_monitor
  *
@@ -2438,14 +2446,13 @@ do_failover:
  * an acquisition of appropriate locks followed by a commit phase to
  * implement whatever link state changes are indicated.
  */
+
 void bond_mii_monitor(struct work_struct *work)
 {
 	struct bonding *bond = container_of(work, struct bonding,
 					    mii_work.work);
 
 	read_lock(&bond->lock);
-	if (bond->kill_timers)
-		goto out;
 
 	if (bond->slave_cnt == 0)
 		goto re_arm;
@@ -2462,23 +2469,14 @@ void bond_mii_monitor(struct work_struct *work)
 		read_unlock(&bond->curr_slave_lock);
 	}
 
-	if (bond_miimon_inspect(bond)) {
-		read_unlock(&bond->lock);
-		rtnl_lock();
-		read_lock(&bond->lock);
+	if (bond_miimon_inspect(bond))
+		queue_work(bond->wq_rtnl, &bond->miimon_commit_work);
 
-		bond_miimon_commit(bond);
-
-		read_unlock(&bond->lock);
-		rtnl_unlock();	/* might sleep, hold no other locks */
-		read_lock(&bond->lock);
-	}
 
 re_arm:
 	if (bond->params.miimon)
 		queue_delayed_work(bond->wq, &bond->mii_work,
 				   msecs_to_jiffies(bond->params.miimon));
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -2778,9 +2776,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 
 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
-	if (bond->kill_timers)
-		goto out;
-
 	if (bond->slave_cnt == 0)
 		goto re_arm;
 
@@ -2867,7 +2862,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 re_arm:
 	if (bond->params.arp_interval)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -2949,13 +2943,19 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
 /*
  * Called to commit link state changes noted by inspection step of
  * active-backup mode ARP monitor.
- *
- * Called with RTNL and bond->lock for read.
  */
-static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
+static void bond_ab_arp_commit(struct work_struct *work)
 {
 	struct slave *slave;
 	int i;
+	int delta_in_ticks;
+	struct bonding *bond = container_of(work, struct bonding,
+					    ab_arp_commit_work);
+
+	rtnl_lock();
+	read_lock(&bond->lock);
+
+	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
 	bond_for_each_slave(bond, slave, i) {
 		switch (slave->new_link) {
@@ -3014,6 +3014,9 @@ do_failover:
 	}
 
 	bond_set_carrier(bond);
+
+	read_unlock(&bond->lock);
+	rtnl_unlock();
 }
 
 /*
@@ -3093,9 +3096,6 @@ void bond_activebackup_arp_mon(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers)
-		goto out;
-
 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
 	if (bond->slave_cnt == 0)
@@ -3113,24 +3113,15 @@ void bond_activebackup_arp_mon(struct work_struct *work)
 		read_unlock(&bond->curr_slave_lock);
 	}
 
-	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
-		read_unlock(&bond->lock);
-		rtnl_lock();
-		read_lock(&bond->lock);
+	if (bond_ab_arp_inspect(bond, delta_in_ticks))
+		queue_work(bond->wq_rtnl, &bond->ab_arp_commit_work);
 
-		bond_ab_arp_commit(bond, delta_in_ticks);
-
-		read_unlock(&bond->lock);
-		rtnl_unlock();
-		read_lock(&bond->lock);
-	}
 
 	bond_ab_arp_probe(bond);
 
 re_arm:
 	if (bond->params.arp_interval)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -3720,8 +3711,6 @@ static int bond_open(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
-	bond->kill_timers = 0;
-
 	if (bond_is_lb(bond)) {
 		/* bond_alb_initialize must be called before the timer
 		 * is started.
@@ -3781,26 +3770,23 @@ static int bond_close(struct net_device *bond_dev)
 	bond->send_grat_arp = 0;
 	bond->send_unsol_na = 0;
 
-	/* signal timers not to re-arm */
-	bond->kill_timers = 1;
-
 	write_unlock_bh(&bond->lock);
 
 	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
-		cancel_delayed_work(&bond->mii_work);
+		cancel_delayed_work_sync(&bond->mii_work);
 	}
 
 	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
-		cancel_delayed_work(&bond->arp_work);
+		cancel_delayed_work_sync(&bond->arp_work);
 	}
 
 	switch (bond->params.mode) {
 	case BOND_MODE_8023AD:
-		cancel_delayed_work(&bond->ad_work);
+		cancel_delayed_work_sync(&bond->ad_work);
 		break;
 	case BOND_MODE_TLB:
 	case BOND_MODE_ALB:
-		cancel_delayed_work(&bond->alb_work);
+		cancel_delayed_work_sync(&bond->alb_work);
 		break;
 	default:
 		break;
@@ -4601,6 +4587,8 @@ static void bond_destructor(struct net_device *bond_dev)
 	struct bonding *bond = netdev_priv(bond_dev);
 	if (bond->wq)
 		destroy_workqueue(bond->wq);
+	if (bond->wq_rtnl)
+		destroy_workqueue(bond->wq_rtnl);
 	free_netdev(bond_dev);
 }
 
@@ -4660,23 +4648,19 @@ static void bond_setup(struct net_device *bond_dev)
 
 static void bond_work_cancel_all(struct bonding *bond)
 {
-	write_lock_bh(&bond->lock);
-	bond->kill_timers = 1;
-	write_unlock_bh(&bond->lock);
-
 	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))
-		cancel_delayed_work(&bond->mii_work);
+		cancel_delayed_work_sync(&bond->mii_work);
 
 	if (bond->params.arp_interval && delayed_work_pending(&bond->arp_work))
-		cancel_delayed_work(&bond->arp_work);
+		cancel_delayed_work_sync(&bond->arp_work);
 
 	if (bond->params.mode == BOND_MODE_ALB &&
 	    delayed_work_pending(&bond->alb_work))
-		cancel_delayed_work(&bond->alb_work);
+		cancel_delayed_work_sync(&bond->alb_work);
 
 	if (bond->params.mode == BOND_MODE_8023AD &&
 	    delayed_work_pending(&bond->ad_work))
-		cancel_delayed_work(&bond->ad_work);
+		cancel_delayed_work_sync(&bond->ad_work);
 }
 
 /*
@@ -5083,6 +5067,12 @@ static int bond_init(struct net_device *bond_dev)
 	bond->wq = create_singlethread_workqueue(bond_dev->name);
 	if (!bond->wq)
 		return -ENOMEM;
+	bond->wq_rtnl = create_singlethread_workqueue(bond_dev->name);
+	if (!bond->wq_rtnl) {
+		destroy_workqueue(bond->wq);
+		bond->wq = NULL;
+		return -ENOMEM;
+	}
 
 	bond_set_lockdep_class(bond_dev);
 
@@ -5094,6 +5084,9 @@ static int bond_init(struct net_device *bond_dev)
 	bond_prepare_sysfs_group(bond);
 
 	__hw_addr_init(&bond->mc_list);
+	INIT_WORK(&bond->miimon_commit_work, bond_miimon_commit);
+	INIT_WORK(&bond->ab_arp_commit_work, bond_ab_arp_commit);
+	INIT_WORK(&bond->alb_promisc_disable_work, bond_alb_promisc_disable);
 	return 0;
 }
 
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index c6fdd85..43ba807 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -198,7 +198,6 @@ struct bonding {
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;
-	s8       kill_timers;
 	s8	 send_grat_arp;
 	s8	 send_unsol_na;
 	s8	 setup_by_slave;
@@ -219,10 +218,14 @@ struct bonding {
 	struct   vlan_group *vlgrp;
 	struct   packet_type arp_mon_pt;
 	struct   workqueue_struct *wq;
+	struct   workqueue_struct *wq_rtnl;
 	struct   delayed_work mii_work;
 	struct   delayed_work arp_work;
 	struct   delayed_work alb_work;
 	struct   delayed_work ad_work;
+	struct    work_struct miimon_commit_work;
+	struct    work_struct ab_arp_commit_work;
+	struct    work_struct alb_promisc_disable_work;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct   in6_addr master_ipv6;
 #endif
@@ -348,6 +351,7 @@ void bond_select_active_slave(struct bonding *bond);
 void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
 void bond_register_arp(struct bonding *);
 void bond_unregister_arp(struct bonding *);
+void bond_alb_promisc_disable(struct work_struct *work);
 
 struct bond_net {
 	struct net *		net;	/* Associated network namespace */

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 12:23   ` Jarek Poplawski
@ 2010-09-01 13:30     ` Jiri Bohac
  2010-09-01 15:18       ` Jarek Poplawski
  0 siblings, 1 reply; 25+ messages in thread
From: Jiri Bohac @ 2010-09-01 13:30 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Jay Vosburgh, Jiri Bohac, bonding-devel, markine, chavey, netdev

On Wed, Sep 01, 2010 at 12:23:56PM +0000, Jarek Poplawski wrote:
> On 2010-08-31 22:54, Jay Vosburgh wrote:
> > 	What prevents this from deadlocking such that cpu A is in
> > bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
> > in the above function, trying to acquire RTNL?
> 
> I guess this one isn't cancelled in bond_close, so it should be safe.

Nah, Jay was correct. Although this work item is not explicitly
cancelled with cancel_delayed_work_sync(), it is on the same
workqueue as work items that are being cancelled with
cancel_delayed_work_sync(), so this can still cause a deadlock.
Fixed in the new version of the patch by putting these on a
separate workqueue.

> > 	Also, assuming for the moment that the above isn't a problem,
> > curr_active_slave may be NULL if the last slave is removed between the
> > time bond_alb_promisc_disable is scheduled and when it runs.  I'm not
> > sure that the alb_bond_info can be guaranteed to be valid, either, if
> > the mode changed.
> 
> Probably some or all of these work functions should test for closing
> eg. with netif_running() or some other flag/variable under rtnl_lock().
> 
> ....
> >> static void bond_work_cancel_all(struct bonding *bond)
> >> {
> >> -	write_lock_bh(&bond->lock);
> >> -	bond->kill_timers = 1;
> >> -	write_unlock_bh(&bond->lock);
> >> -
> >> 	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))
> 
> I'd suggest to skip these delayed_work_pending() tests.

I agree, these should not be needed anymore, since bond_close()
now makes sure the re-arming work is cancelled. I'll update the
patch if Jay thinks it's otherwise good.

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 13:30     ` Jiri Bohac
@ 2010-09-01 15:18       ` Jarek Poplawski
  2010-09-01 15:37         ` Jarek Poplawski
  0 siblings, 1 reply; 25+ messages in thread
From: Jarek Poplawski @ 2010-09-01 15:18 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: Jay Vosburgh, bonding-devel, markine, chavey, netdev

On Wed, Sep 01, 2010 at 03:30:56PM +0200, Jiri Bohac wrote:
> On Wed, Sep 01, 2010 at 12:23:56PM +0000, Jarek Poplawski wrote:
> > On 2010-08-31 22:54, Jay Vosburgh wrote:
> > > 	What prevents this from deadlocking such that cpu A is in
> > > bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
> > > in the above function, trying to acquire RTNL?
> > 
> > I guess this one isn't cancelled in bond_close, so it should be safe.
> 
> Nah, Jay was correct. Although this work item is not explicitly
> cancelled with cancel_delayed_work_sync(), it is on the same
> workqueue as work items that are being cancelled with
> cancel_delayed_work_sync(), so this can still cause a deadlock.
> Fixed in the new version of the patch by putting these on a
> separate workqueue.
> 

Maybe I miss something, but the same workqueue shouldn't matter here.
Similar things are done by other network code with the kernel-global
workqueue.

Jarek P.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 15:18       ` Jarek Poplawski
@ 2010-09-01 15:37         ` Jarek Poplawski
  2010-09-01 19:00           ` Jarek Poplawski
  0 siblings, 1 reply; 25+ messages in thread
From: Jarek Poplawski @ 2010-09-01 15:37 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: Jay Vosburgh, bonding-devel, markine, chavey, netdev

On Wed, Sep 01, 2010 at 05:18:56PM +0200, Jarek Poplawski wrote:
> On Wed, Sep 01, 2010 at 03:30:56PM +0200, Jiri Bohac wrote:
> > On Wed, Sep 01, 2010 at 12:23:56PM +0000, Jarek Poplawski wrote:
> > > On 2010-08-31 22:54, Jay Vosburgh wrote:
> > > > 	What prevents this from deadlocking such that cpu A is in
> > > > bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
> > > > in the above function, trying to acquire RTNL?
> > > 
> > > I guess this one isn't cancelled in bond_close, so it should be safe.
> > 
> > Nah, Jay was correct. Although this work item is not explicitly
> > cancelled with cancel_delayed_work_sync(), it is on the same
> > workqueue as work items that are being cancelled with
> > cancel_delayed_work_sync(), so this can still cause a deadlock.
> > Fixed in the new version of the patch by putting these on a
> > separate workqueue.
> > 
> 
> Maybe I miss something, but the same workqueue shouldn't matter here.

Hmm... I missed your point completely and Jay was correct!

Sorry,
Jarek P.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 13:16   ` Jiri Bohac
@ 2010-09-01 17:14     ` Jay Vosburgh
  2010-09-01 18:31       ` Jiri Bohac
  0 siblings, 1 reply; 25+ messages in thread
From: Jay Vosburgh @ 2010-09-01 17:14 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: bonding-devel, markine, jarkao2, chavey, netdev

Jiri Bohac <jbohac@suse.cz> wrote:

>On Tue, Aug 31, 2010 at 01:54:23PM -0700, Jay Vosburgh wrote:
>> Jiri Bohac <jbohac@suse.cz> wrote:
>> >[note, this does not deal with bond_loadbalance_arp_mon(), where
>> >rtnl is now taken as well in net-next; I'll do this if you think
>> >the idea is good ]
>> 
>> 	I don't believe the loadbalance_arp_mon acquires RTNL in
>> net-next.  I recall discussing this with Andy not too long ago, but I
>> didn't think that change went in, and I don't see it in the tree.
>
>Of course, you are right, I misread the e-mail thread and did not look at
>the code.
>
>> >+void bond_alb_promisc_disable(struct work_struct *work)
>> >+{
>> >+	struct bonding *bond = container_of(work, struct bonding,
>> >+					    alb_promisc_disable_work);
>> >+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
>> >+
>> >+	/*
>> >+	 * dev_set_promiscuity requires rtnl and
>> >+	 * nothing else.
>> >+	 */
>> >+	rtnl_lock();
>> >+	dev_set_promiscuity(bond->curr_active_slave->dev, -1);
>> >+	bond_info->primary_is_promisc = 0;
>> >+	bond_info->rlb_promisc_timeout_counter = 0;
>> >+	rtnl_unlock();
>> >+}
>> 
>> 	What prevents this from deadlocking such that cpu A is in
>> bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
>> in the above function, trying to acquire RTNL?
>
>The main idea of the patch is to move the code (the "commit"
>functions) that needs rtnl to another work item. Then
>cancel_delayed_work_sync() can be used to cancel the re-arming
>work. But you are absolutely right, there is still a deadlock,
>since I queue the "commit" work on the same workqueue. So when
>cancel_delayed_work_sync() waits for the re-arming work to
>finish, it can wait forever because a previously queued "commit"
>work is waiting for rtnl.
>
>The solution is to move the "commit" work items to a different
>workqueue. Fixed in the new version of the patch below
>(bond->wq_rtnl).
>
>> 	Also, assuming for the moment that the above isn't a problem,
>> curr_active_slave may be NULL if the last slave is removed between the
>> time bond_alb_promisc_disable is scheduled and when it runs.  I'm not
>> sure that the alb_bond_info can be guaranteed to be valid, either, if
>> the mode changed.
>
>Yes, there may be problems like these, but these are present
>already in the current code. Because bond->lock() is released
>before rtnl is taken. 
>
>Sure, it would be good to deal with these problems, but I don't
>think this patch introduces new races like these. They are
>already there ... (see below)

	Yep, the current code does appear to have a race in
bond_alb_monitor that should be fixed (to check curr_active_slave before
dereferencing it).

	I also thought a bit more, and in the current code, the mode
shouldn't change in the middle of one of the work functions, because a
mode change requires the bond to be closed, so the various work things
will be stopped (more or less; excepting the race under disucssion
here).

	I don't think this is true for the new wq_rtnl functions,
however, because its work items are not canceled until the workqueue
itself is freed in bond_destructor.  Does the wq_rtnl open new races,
because it's work items are not synchronized with other activities
(close in particular)?  It's possible for its work functions (which may
do things like set the active slave, carrier, etc) to be invoked after
the bond has closed, and possibly reopened, or been otherwise adjusted.

	I'm not sure this is better than one of the alternatives I
believe we discussed the last time around: having the rtnl-acquiring
work functions do a conditional acquire of rtnl, and if that fails,
reschedule themselves.

	So, e.g., bond_mii_monitor becomes something like:

void bond_mii_monitor(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    mii_work.work);

	read_lock(&bond->lock);

	if (bond->slave_cnt == 0)
		goto re_arm;

	if (bond->send_grat_arp) {
		read_lock(&bond->curr_slave_lock);
		bond_send_gratuitous_arp(bond);
		read_unlock(&bond->curr_slave_lock);
	}

	if (bond->send_unsol_na) {
		read_lock(&bond->curr_slave_lock);
		bond_send_unsolicited_na(bond);
		read_unlock(&bond->curr_slave_lock);
	}

	if (bond_miimon_inspect(bond)) {
		read_unlock(&bond->lock);
		/*
		 * Awe-inspiring comment explaining why we do this
		 */
		if (rtnl_trylock()) {
			read_lock(&bond->lock);

			bond_miimon_commit(bond);

			read_unlock(&bond->lock);
			rtnl_unlock();	/* might sleep, hold no other locks */
			read_lock(&bond->lock);
		} else {
			read_lock(&bond->lock);
			queue_work(bond->wq, &bond->mii_work);
			read_unlock(&bond->lock);
			return;
		}
	}

re_arm:
	if (bond->params.miimon)
		queue_delayed_work(bond->wq, &bond->mii_work,
				   msecs_to_jiffies(bond->params.miimon));
out:
	read_unlock(&bond->lock);
}

	Which, if I'm not missing something, does not need the
kill_timers (because I believe we can use cancel_delayed_work_sync now
that there's no deadlock against rtnl).

	It might need a "fast reschedule" flag of some sort so that the
gratutious ARPs and NAs aren't bunched up when the trylock fails, but
that's a secondary concern.

	Thoughts?

>> > void bond_alb_monitor(struct work_struct *work)
>> > {
>> > 	struct bonding *bond = container_of(work, struct bonding,
>> >@@ -1407,10 +1424,6 @@ void bond_alb_monitor(struct work_struct *work)
>> >
>> > 	read_lock(&bond->lock);
>> >
>> >-	if (bond->kill_timers) {
>> >-		goto out;
>> >-	}
>> >-
>> > 	if (bond->slave_cnt == 0) {
>> > 		bond_info->tx_rebalance_counter = 0;
>> > 		bond_info->lp_counter = 0;
>> >@@ -1462,25 +1475,11 @@ void bond_alb_monitor(struct work_struct *work)
>> > 	if (bond_info->rlb_enabled) {
>> > 		if (bond_info->primary_is_promisc &&
>> > 		    (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
>> >-
>> >-			/*
>> >-			 * dev_set_promiscuity requires rtnl and
>> >-			 * nothing else.
>> >-			 */
>> >-			read_unlock(&bond->lock);
>
>... e.g. here; the current slave may change/disappear, the mode
>may change .... 
>
>> >-			rtnl_lock();
>> >-
>> >-			bond_info->rlb_promisc_timeout_counter = 0;
>> >-
>
>I fixed both issues in this new version of the patch.
>
>Signed-off-by: Jiri Bohac <jbohac@suse.cz>
>
>
>diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
>index 822f586..8015e12 100644
>--- a/drivers/net/bonding/bond_3ad.c
>+++ b/drivers/net/bonding/bond_3ad.c
>@@ -2119,10 +2119,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
>
> 	read_lock(&bond->lock);
>
>-	if (bond->kill_timers) {
>-		goto out;
>-	}
>-
> 	//check if there are any slaves
> 	if (bond->slave_cnt == 0) {
> 		goto re_arm;
>@@ -2166,7 +2162,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
>
> re_arm:
> 	queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>index c746b33..e4fa3a5 100644
>--- a/drivers/net/bonding/bond_alb.c
>+++ b/drivers/net/bonding/bond_alb.c
>@@ -1397,6 +1397,35 @@ out:
> 	return NETDEV_TX_OK;
> }
>
>+void bond_alb_promisc_disable(struct work_struct *work)
>+{
>+	struct bonding *bond = container_of(work, struct bonding,
>+					    alb_promisc_disable_work);
>+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
>+
>+	/*
>+	 * dev_set_promiscuity requires rtnl and
>+	 * nothing else.
>+	 */
>+	rtnl_lock();
>+	read_lock(&bond->lock);
>+	read_lock(&bond->curr_slave_lock);
>+
>+	if (!bond_is_lb(bond))
>+		goto out;
>+	if (!bond->curr_active_slave)
>+		goto out;
>+
>+	dev_set_promiscuity(bond->curr_active_slave->dev, -1);
>+	bond_info->primary_is_promisc = 0;
>+	bond_info->rlb_promisc_timeout_counter = 0;
>+
>+out:
>+	read_unlock(&bond->curr_slave_lock);
>+	read_unlock(&bond->lock);
>+	rtnl_unlock();

	Should we end up using this, it is also holding too many locks
over the dev_set_promiscuity call.


>+}
>+
> void bond_alb_monitor(struct work_struct *work)
> {
> 	struct bonding *bond = container_of(work, struct bonding,
>@@ -1407,10 +1436,6 @@ void bond_alb_monitor(struct work_struct *work)
>
> 	read_lock(&bond->lock);
>
>-	if (bond->kill_timers) {
>-		goto out;
>-	}
>-
> 	if (bond->slave_cnt == 0) {
> 		bond_info->tx_rebalance_counter = 0;
> 		bond_info->lp_counter = 0;
>@@ -1462,25 +1487,11 @@ void bond_alb_monitor(struct work_struct *work)
> 	if (bond_info->rlb_enabled) {
> 		if (bond_info->primary_is_promisc &&
> 		    (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
>-
>-			/*
>-			 * dev_set_promiscuity requires rtnl and
>-			 * nothing else.
>-			 */
>-			read_unlock(&bond->lock);
>-			rtnl_lock();
>-
>-			bond_info->rlb_promisc_timeout_counter = 0;
>-
> 			/* If the primary was set to promiscuous mode
> 			 * because a slave was disabled then
> 			 * it can now leave promiscuous mode.
> 			 */
>-			dev_set_promiscuity(bond->curr_active_slave->dev, -1);
>-			bond_info->primary_is_promisc = 0;
>-
>-			rtnl_unlock();
>-			read_lock(&bond->lock);
>+			queue_work(bond->wq_rtnl, &bond->alb_promisc_disable_work);
> 		}
>
> 		if (bond_info->rlb_rebalance) {
>@@ -1505,7 +1516,6 @@ void bond_alb_monitor(struct work_struct *work)
>
> re_arm:
> 	queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 2cc4cfc..aae2864 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -2343,10 +2343,15 @@ static int bond_miimon_inspect(struct bonding *bond)
> 	return commit;
> }
>
>-static void bond_miimon_commit(struct bonding *bond)
>+static void bond_miimon_commit(struct work_struct *work)
> {
> 	struct slave *slave;
> 	int i;
>+	struct bonding *bond = container_of(work, struct bonding,
>+					    miimon_commit_work);
>+
>+	rtnl_lock();
>+	read_lock(&bond->lock);
>
> 	bond_for_each_slave(bond, slave, i) {
> 		switch (slave->new_link) {
>@@ -2421,15 +2426,18 @@ static void bond_miimon_commit(struct bonding *bond)
> 		}
>
> do_failover:
>-		ASSERT_RTNL();
> 		write_lock_bh(&bond->curr_slave_lock);
> 		bond_select_active_slave(bond);
> 		write_unlock_bh(&bond->curr_slave_lock);
> 	}
>
> 	bond_set_carrier(bond);
>+
>+	read_unlock(&bond->lock);
>+	rtnl_unlock();
> }
>
>+
> /*
>  * bond_mii_monitor
>  *
>@@ -2438,14 +2446,13 @@ do_failover:
>  * an acquisition of appropriate locks followed by a commit phase to
>  * implement whatever link state changes are indicated.
>  */
>+
> void bond_mii_monitor(struct work_struct *work)
> {
> 	struct bonding *bond = container_of(work, struct bonding,
> 					    mii_work.work);
>
> 	read_lock(&bond->lock);
>-	if (bond->kill_timers)
>-		goto out;
>
> 	if (bond->slave_cnt == 0)
> 		goto re_arm;
>@@ -2462,23 +2469,14 @@ void bond_mii_monitor(struct work_struct *work)
> 		read_unlock(&bond->curr_slave_lock);
> 	}
>
>-	if (bond_miimon_inspect(bond)) {
>-		read_unlock(&bond->lock);
>-		rtnl_lock();
>-		read_lock(&bond->lock);
>+	if (bond_miimon_inspect(bond))
>+		queue_work(bond->wq_rtnl, &bond->miimon_commit_work);
>
>-		bond_miimon_commit(bond);
>-
>-		read_unlock(&bond->lock);
>-		rtnl_unlock();	/* might sleep, hold no other locks */
>-		read_lock(&bond->lock);
>-	}
>
> re_arm:
> 	if (bond->params.miimon)
> 		queue_delayed_work(bond->wq, &bond->mii_work,
> 				   msecs_to_jiffies(bond->params.miimon));
>-out:
> 	read_unlock(&bond->lock);
> }
>
>@@ -2778,9 +2776,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
>
> 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>
>-	if (bond->kill_timers)
>-		goto out;
>-
> 	if (bond->slave_cnt == 0)
> 		goto re_arm;
>
>@@ -2867,7 +2862,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
> re_arm:
> 	if (bond->params.arp_interval)
> 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>@@ -2949,13 +2943,19 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
> /*
>  * Called to commit link state changes noted by inspection step of
>  * active-backup mode ARP monitor.
>- *
>- * Called with RTNL and bond->lock for read.
>  */
>-static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
>+static void bond_ab_arp_commit(struct work_struct *work)
> {
> 	struct slave *slave;
> 	int i;
>+	int delta_in_ticks;
>+	struct bonding *bond = container_of(work, struct bonding,
>+					    ab_arp_commit_work);
>+
>+	rtnl_lock();
>+	read_lock(&bond->lock);
>+
>+	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>
> 	bond_for_each_slave(bond, slave, i) {
> 		switch (slave->new_link) {
>@@ -3014,6 +3014,9 @@ do_failover:
> 	}
>
> 	bond_set_carrier(bond);
>+
>+	read_unlock(&bond->lock);
>+	rtnl_unlock();
> }
>
> /*
>@@ -3093,9 +3096,6 @@ void bond_activebackup_arp_mon(struct work_struct *work)
>
> 	read_lock(&bond->lock);
>
>-	if (bond->kill_timers)
>-		goto out;
>-
> 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>
> 	if (bond->slave_cnt == 0)
>@@ -3113,24 +3113,15 @@ void bond_activebackup_arp_mon(struct work_struct *work)
> 		read_unlock(&bond->curr_slave_lock);
> 	}
>
>-	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
>-		read_unlock(&bond->lock);
>-		rtnl_lock();
>-		read_lock(&bond->lock);
>+	if (bond_ab_arp_inspect(bond, delta_in_ticks))
>+		queue_work(bond->wq_rtnl, &bond->ab_arp_commit_work);
>
>-		bond_ab_arp_commit(bond, delta_in_ticks);
>-
>-		read_unlock(&bond->lock);
>-		rtnl_unlock();
>-		read_lock(&bond->lock);
>-	}
>
> 	bond_ab_arp_probe(bond);
>
> re_arm:
> 	if (bond->params.arp_interval)
> 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>@@ -3720,8 +3711,6 @@ static int bond_open(struct net_device *bond_dev)
> {
> 	struct bonding *bond = netdev_priv(bond_dev);
>
>-	bond->kill_timers = 0;
>-
> 	if (bond_is_lb(bond)) {
> 		/* bond_alb_initialize must be called before the timer
> 		 * is started.
>@@ -3781,26 +3770,23 @@ static int bond_close(struct net_device *bond_dev)
> 	bond->send_grat_arp = 0;
> 	bond->send_unsol_na = 0;
>
>-	/* signal timers not to re-arm */
>-	bond->kill_timers = 1;
>-
> 	write_unlock_bh(&bond->lock);
>
> 	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
>-		cancel_delayed_work(&bond->mii_work);
>+		cancel_delayed_work_sync(&bond->mii_work);
> 	}
>
> 	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
>-		cancel_delayed_work(&bond->arp_work);
>+		cancel_delayed_work_sync(&bond->arp_work);
> 	}
>
> 	switch (bond->params.mode) {
> 	case BOND_MODE_8023AD:
>-		cancel_delayed_work(&bond->ad_work);
>+		cancel_delayed_work_sync(&bond->ad_work);
> 		break;
> 	case BOND_MODE_TLB:
> 	case BOND_MODE_ALB:
>-		cancel_delayed_work(&bond->alb_work);
>+		cancel_delayed_work_sync(&bond->alb_work);
> 		break;
> 	default:
> 		break;
>@@ -4601,6 +4587,8 @@ static void bond_destructor(struct net_device *bond_dev)
> 	struct bonding *bond = netdev_priv(bond_dev);
> 	if (bond->wq)
> 		destroy_workqueue(bond->wq);
>+	if (bond->wq_rtnl)
>+		destroy_workqueue(bond->wq_rtnl);
> 	free_netdev(bond_dev);
> }
>
>@@ -4660,23 +4648,19 @@ static void bond_setup(struct net_device *bond_dev)
>
> static void bond_work_cancel_all(struct bonding *bond)
> {
>-	write_lock_bh(&bond->lock);
>-	bond->kill_timers = 1;
>-	write_unlock_bh(&bond->lock);
>-
> 	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))
>-		cancel_delayed_work(&bond->mii_work);
>+		cancel_delayed_work_sync(&bond->mii_work);
>
> 	if (bond->params.arp_interval && delayed_work_pending(&bond->arp_work))
>-		cancel_delayed_work(&bond->arp_work);
>+		cancel_delayed_work_sync(&bond->arp_work);
>
> 	if (bond->params.mode == BOND_MODE_ALB &&
> 	    delayed_work_pending(&bond->alb_work))
>-		cancel_delayed_work(&bond->alb_work);
>+		cancel_delayed_work_sync(&bond->alb_work);
>
> 	if (bond->params.mode == BOND_MODE_8023AD &&
> 	    delayed_work_pending(&bond->ad_work))
>-		cancel_delayed_work(&bond->ad_work);
>+		cancel_delayed_work_sync(&bond->ad_work);
> }
>
> /*
>@@ -5083,6 +5067,12 @@ static int bond_init(struct net_device *bond_dev)
> 	bond->wq = create_singlethread_workqueue(bond_dev->name);
> 	if (!bond->wq)
> 		return -ENOMEM;
>+	bond->wq_rtnl = create_singlethread_workqueue(bond_dev->name);
>+	if (!bond->wq_rtnl) {
>+		destroy_workqueue(bond->wq);
>+		bond->wq = NULL;
>+		return -ENOMEM;
>+	}
>
> 	bond_set_lockdep_class(bond_dev);
>
>@@ -5094,6 +5084,9 @@ static int bond_init(struct net_device *bond_dev)
> 	bond_prepare_sysfs_group(bond);
>
> 	__hw_addr_init(&bond->mc_list);
>+	INIT_WORK(&bond->miimon_commit_work, bond_miimon_commit);
>+	INIT_WORK(&bond->ab_arp_commit_work, bond_ab_arp_commit);
>+	INIT_WORK(&bond->alb_promisc_disable_work, bond_alb_promisc_disable);
> 	return 0;
> }
>
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index c6fdd85..43ba807 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -198,7 +198,6 @@ struct bonding {
> 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
> 	rwlock_t lock;
> 	rwlock_t curr_slave_lock;
>-	s8       kill_timers;
> 	s8	 send_grat_arp;
> 	s8	 send_unsol_na;
> 	s8	 setup_by_slave;
>@@ -219,10 +218,14 @@ struct bonding {
> 	struct   vlan_group *vlgrp;
> 	struct   packet_type arp_mon_pt;
> 	struct   workqueue_struct *wq;
>+	struct   workqueue_struct *wq_rtnl;
> 	struct   delayed_work mii_work;
> 	struct   delayed_work arp_work;
> 	struct   delayed_work alb_work;
> 	struct   delayed_work ad_work;
>+	struct    work_struct miimon_commit_work;
>+	struct    work_struct ab_arp_commit_work;
>+	struct    work_struct alb_promisc_disable_work;
> #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
> 	struct   in6_addr master_ipv6;
> #endif
>@@ -348,6 +351,7 @@ void bond_select_active_slave(struct bonding *bond);
> void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
> void bond_register_arp(struct bonding *);
> void bond_unregister_arp(struct bonding *);
>+void bond_alb_promisc_disable(struct work_struct *work);
>
> struct bond_net {
> 	struct net *		net;	/* Associated network namespace */
>
>-- 
>Jiri Bohac <jbohac@suse.cz>
>SUSE Labs, SUSE CZ

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 17:14     ` Jay Vosburgh
@ 2010-09-01 18:31       ` Jiri Bohac
  2010-09-01 20:00         ` Jay Vosburgh
  0 siblings, 1 reply; 25+ messages in thread
From: Jiri Bohac @ 2010-09-01 18:31 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Jiri Bohac, bonding-devel, markine, jarkao2, chavey, netdev

On Wed, Sep 01, 2010 at 10:14:34AM -0700, Jay Vosburgh wrote:
> 	I also thought a bit more, and in the current code, the mode
> shouldn't change in the middle of one of the work functions, because a
> mode change requires the bond to be closed, so the various work things
> will be stopped (more or less; excepting the race under disucssion
> here).
>
> 	I don't think this is true for the new wq_rtnl functions,
> however, because its work items are not canceled until the workqueue
> itself is freed in bond_destructor.  Does the wq_rtnl open new races,
> because it's work items are not synchronized with other activities
> (close in particular)?  It's possible for its work functions (which may
> do things like set the active slave, carrier, etc) to be invoked after
> the bond has closed, and possibly reopened, or been otherwise adjusted.

I don't think this patch opens new races. The current race
scenario is:

1) schedule_delayed_work(foo)
2) foo's timer expires and foo is queued on bond->wq
  (possibly, foo starts to run and either gets preempted or
  sleeps on rtnl)
3) bond_close() sets kill_timers=1 and calls
  cancel_delayed_work() which accomplishes nothing
4) bond_open() sets kill_timers=0
5) bond_open() calls schedule_delayed_work(bar)
6) foo may run the "commit" work that should not be run
7) foo re-arms
8) if (foo == bar) -> BUG	/* bond->mode did not change */

With this patch, it is:

1) schedule_delayed_work(foo)
2) foo's timer expires and foo is queued on bond->wq
3) foo may have queued foo_commit on bond->wq_rtnl
4) bond_close() cancels foo
5) bond_open()
6) foo_commit may run and it should not be run

The patch avoids the problem of 7) and 8)

I think the race in 6) remains the same. It is now easier to fix.
This could even be done with a flag (similar to kill_timers),
which would be set each time the "commit" work is queued on wq_rtnl and
cleared by bond_close(). This should avoid the races completely,
I think. The trick is that, unlike kill_timers, bond_open() would
not touch this flag.

> 	I'm not sure this is better than one of the alternatives I
> believe we discussed the last time around: having the rtnl-acquiring
> work functions do a conditional acquire of rtnl, and if that fails,
> reschedule themselves.

[...]

> 		if (rtnl_trylock()) {
> 			read_lock(&bond->lock);
> 
> 			bond_miimon_commit(bond);
> 
> 			read_unlock(&bond->lock);
> 			rtnl_unlock();	/* might sleep, hold no other locks */
> 			read_lock(&bond->lock);
> 		} else {
> 			read_lock(&bond->lock);
> 			queue_work(bond->wq, &bond->mii_work);
> 			read_unlock(&bond->lock);
> 			return;
> 		}

I actually tried the other variant suggested last time
(basically:

while (!rtnl_trylock()) {
	read_lock(&bond->lock)
	kill = bond->kill_timers;
	read_unlock(&bond->lock)
	if (kill)
		return;
})

and gave that to a customer experiencing this problem (I cannot
reproduce it myself). It was reported to lock up. I suspect some
kind of live-lock on bond->lock caused by the active waiting, but
I did not spend too much time debugging this.
[BTW, this is https://bugzilla.novell.com/show_bug.cgi?id=602969
,Novell BZ account needeed]

FWIW  this would be the only use of rtnl_trylock() in the kernel,
besides a few places that do:
	if (!rtnl_trylock()) return restart_syscall();
I think it is plain ugly -- semaphores are just not supposed to be
spinned on.

Your re-scheduling variant is more or less equivalent to active
spinning, isn't it? Anyway, if you really think it is a better approach,
are you going to apply it? I can supply the patch. (Although I
kinda don't like people seeing my name next to it ;))

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 15:37         ` Jarek Poplawski
@ 2010-09-01 19:00           ` Jarek Poplawski
  2010-09-01 19:11             ` Jiri Bohac
  0 siblings, 1 reply; 25+ messages in thread
From: Jarek Poplawski @ 2010-09-01 19:00 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: Jay Vosburgh, bonding-devel, markine, chavey, netdev

On Wed, Sep 01, 2010 at 05:37:30PM +0200, Jarek Poplawski wrote:
> On Wed, Sep 01, 2010 at 05:18:56PM +0200, Jarek Poplawski wrote:
> > On Wed, Sep 01, 2010 at 03:30:56PM +0200, Jiri Bohac wrote:
> > > On Wed, Sep 01, 2010 at 12:23:56PM +0000, Jarek Poplawski wrote:
> > > > On 2010-08-31 22:54, Jay Vosburgh wrote:
> > > > > 	What prevents this from deadlocking such that cpu A is in
> > > > > bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
> > > > > in the above function, trying to acquire RTNL?
> > > > 
> > > > I guess this one isn't cancelled in bond_close, so it should be safe.
> > > 
> > > Nah, Jay was correct. Although this work item is not explicitly
> > > cancelled with cancel_delayed_work_sync(), it is on the same
> > > workqueue as work items that are being cancelled with
> > > cancel_delayed_work_sync(), so this can still cause a deadlock.
> > > Fixed in the new version of the patch by putting these on a
> > > separate workqueue.
> > > 
> > 
> > Maybe I miss something, but the same workqueue shouldn't matter here.
> 
> Hmm... I missed your point completely and Jay was correct!

Hmm#2... Alas, after getting back my sobriety, I've to say that Jay
was wrong: the same workqueue shouldn't matter here. Similar things
are done by other network code with the kernel-global workqueue, eg.
in tg3_close(), rhine_close() etc. (cancel_work_sync instaed of
cancel_delayed_work_sync doesn't matter here).

Jarek P.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 19:00           ` Jarek Poplawski
@ 2010-09-01 19:11             ` Jiri Bohac
  2010-09-01 19:20               ` Jarek Poplawski
  0 siblings, 1 reply; 25+ messages in thread
From: Jiri Bohac @ 2010-09-01 19:11 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Jiri Bohac, Jay Vosburgh, bonding-devel, markine, chavey, netdev

On Wed, Sep 01, 2010 at 09:00:37PM +0200, Jarek Poplawski wrote:
> On Wed, Sep 01, 2010 at 05:37:30PM +0200, Jarek Poplawski wrote:
> > On Wed, Sep 01, 2010 at 05:18:56PM +0200, Jarek Poplawski wrote:
> > > On Wed, Sep 01, 2010 at 03:30:56PM +0200, Jiri Bohac wrote:
> > > > On Wed, Sep 01, 2010 at 12:23:56PM +0000, Jarek Poplawski wrote:
> > > > > On 2010-08-31 22:54, Jay Vosburgh wrote:
> > > > > > 	What prevents this from deadlocking such that cpu A is in
> > > > > > bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
> > > > > > in the above function, trying to acquire RTNL?
> > > > > 
> > > > > I guess this one isn't cancelled in bond_close, so it should be safe.
> > > > 
> > > > Nah, Jay was correct. Although this work item is not explicitly
> > > > cancelled with cancel_delayed_work_sync(), it is on the same
> > > > workqueue as work items that are being cancelled with
> > > > cancel_delayed_work_sync(), so this can still cause a deadlock.
> > > > Fixed in the new version of the patch by putting these on a
> > > > separate workqueue.
> > > > 
> > > 
> > > Maybe I miss something, but the same workqueue shouldn't matter here.
> > 
> > Hmm... I missed your point completely and Jay was correct!
> 
> Hmm#2... Alas, after getting back my sobriety, I've to say that Jay
> was wrong: the same workqueue shouldn't matter here. Similar things
> are done by other network code with the kernel-global workqueue, eg.
> in tg3_close(), rhine_close() etc. 

But these don't do rtnl_lock() inside the work item, do they?
That is the main issue here: dev_close() is called with rtnl held
and so it cannot wait for completion of work items that grab rtnl
themselves.

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 19:11             ` Jiri Bohac
@ 2010-09-01 19:20               ` Jarek Poplawski
  2010-09-01 19:38                 ` Jarek Poplawski
  2010-09-01 19:46                 ` Jay Vosburgh
  0 siblings, 2 replies; 25+ messages in thread
From: Jarek Poplawski @ 2010-09-01 19:20 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: Jay Vosburgh, bonding-devel, markine, chavey, netdev

On Wed, Sep 01, 2010 at 09:11:06PM +0200, Jiri Bohac wrote:
> On Wed, Sep 01, 2010 at 09:00:37PM +0200, Jarek Poplawski wrote:
> > On Wed, Sep 01, 2010 at 05:37:30PM +0200, Jarek Poplawski wrote:
> > > On Wed, Sep 01, 2010 at 05:18:56PM +0200, Jarek Poplawski wrote:
> > > > On Wed, Sep 01, 2010 at 03:30:56PM +0200, Jiri Bohac wrote:
> > > > > On Wed, Sep 01, 2010 at 12:23:56PM +0000, Jarek Poplawski wrote:
> > > > > > On 2010-08-31 22:54, Jay Vosburgh wrote:
> > > > > > > 	What prevents this from deadlocking such that cpu A is in
> > > > > > > bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
> > > > > > > in the above function, trying to acquire RTNL?
> > > > > > 
> > > > > > I guess this one isn't cancelled in bond_close, so it should be safe.
> > > > > 
> > > > > Nah, Jay was correct. Although this work item is not explicitly
> > > > > cancelled with cancel_delayed_work_sync(), it is on the same
> > > > > workqueue as work items that are being cancelled with
> > > > > cancel_delayed_work_sync(), so this can still cause a deadlock.
> > > > > Fixed in the new version of the patch by putting these on a
> > > > > separate workqueue.
> > > > > 
> > > > 
> > > > Maybe I miss something, but the same workqueue shouldn't matter here.
> > > 
> > > Hmm... I missed your point completely and Jay was correct!
> > 
> > Hmm#2... Alas, after getting back my sobriety, I've to say that Jay
> > was wrong: the same workqueue shouldn't matter here. Similar things
> > are done by other network code with the kernel-global workqueue, eg.
> > in tg3_close(), rhine_close() etc. 
> 
> But these don't do rtnl_lock() inside the work item, do they?

Exactly. Just like work items cancelled from bond_work_cancel_all()
after your patch.

Jarek P.

> That is the main issue here: dev_close() is called with rtnl held
> and so it cannot wait for completion of work items that grab rtnl
> themselves.
> 
> -- 
> Jiri Bohac <jbohac@suse.cz>
> SUSE Labs, SUSE CZ
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 19:20               ` Jarek Poplawski
@ 2010-09-01 19:38                 ` Jarek Poplawski
  2010-09-01 19:46                 ` Jay Vosburgh
  1 sibling, 0 replies; 25+ messages in thread
From: Jarek Poplawski @ 2010-09-01 19:38 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: Jay Vosburgh, bonding-devel, markine, chavey, netdev

On Wed, Sep 01, 2010 at 09:20:26PM +0200, Jarek Poplawski wrote:
> On Wed, Sep 01, 2010 at 09:11:06PM +0200, Jiri Bohac wrote:
> > On Wed, Sep 01, 2010 at 09:00:37PM +0200, Jarek Poplawski wrote:
...
> > > Hmm#2... Alas, after getting back my sobriety, I've to say that Jay
> > > was wrong: the same workqueue shouldn't matter here. Similar things
> > > are done by other network code with the kernel-global workqueue, eg.
> > > in tg3_close(), rhine_close() etc. 
> > 
> > But these don't do rtnl_lock() inside the work item, do they?
> 
> Exactly. Just like work items cancelled from bond_work_cancel_all()
> after your patch.

IOW: cancel_delayed_work_sync() cares only about the locking of the
cancelled work item - not others, even in the same workqueue. Btw,
testing it with CONFIG_PROVE_LOCKING should give the last answer...

Jarek P.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 19:20               ` Jarek Poplawski
  2010-09-01 19:38                 ` Jarek Poplawski
@ 2010-09-01 19:46                 ` Jay Vosburgh
  2010-09-01 20:06                   ` Jarek Poplawski
  1 sibling, 1 reply; 25+ messages in thread
From: Jay Vosburgh @ 2010-09-01 19:46 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Jiri Bohac, bonding-devel, markine, chavey, netdev

Jarek Poplawski <jarkao2@gmail.com> wrote:

>On Wed, Sep 01, 2010 at 09:11:06PM +0200, Jiri Bohac wrote:
>> On Wed, Sep 01, 2010 at 09:00:37PM +0200, Jarek Poplawski wrote:
>> > On Wed, Sep 01, 2010 at 05:37:30PM +0200, Jarek Poplawski wrote:
>> > > On Wed, Sep 01, 2010 at 05:18:56PM +0200, Jarek Poplawski wrote:
>> > > > On Wed, Sep 01, 2010 at 03:30:56PM +0200, Jiri Bohac wrote:
>> > > > > On Wed, Sep 01, 2010 at 12:23:56PM +0000, Jarek Poplawski wrote:
>> > > > > > On 2010-08-31 22:54, Jay Vosburgh wrote:
>> > > > > > > 	What prevents this from deadlocking such that cpu A is in
>> > > > > > > bond_close, holding RTNL and in cancel_delayed_work_sync, while cpu B is
>> > > > > > > in the above function, trying to acquire RTNL?
>> > > > > > 
>> > > > > > I guess this one isn't cancelled in bond_close, so it should be safe.
>> > > > > 
>> > > > > Nah, Jay was correct. Although this work item is not explicitly
>> > > > > cancelled with cancel_delayed_work_sync(), it is on the same
>> > > > > workqueue as work items that are being cancelled with
>> > > > > cancel_delayed_work_sync(), so this can still cause a deadlock.
>> > > > > Fixed in the new version of the patch by putting these on a
>> > > > > separate workqueue.
>> > > > > 
>> > > > 
>> > > > Maybe I miss something, but the same workqueue shouldn't matter here.
>> > > 
>> > > Hmm... I missed your point completely and Jay was correct!
>> > 
>> > Hmm#2... Alas, after getting back my sobriety, I've to say that Jay
>> > was wrong: the same workqueue shouldn't matter here. Similar things
>> > are done by other network code with the kernel-global workqueue, eg.
>> > in tg3_close(), rhine_close() etc. 
>> 
>> But these don't do rtnl_lock() inside the work item, do they?
>
>Exactly. Just like work items cancelled from bond_work_cancel_all()
>after your patch.

	I see what Jarek is getting at here: the mii_commit, etc, work
items new to the patch aren't cancelled by bond_close, so bond_close (in
cancel_delayed_work_sync) shouldn't care if they're executing or not.

	This still would leave the new work items (the "commit" ones
added in the patch) always free to run at some arbitrary time after
close, which makes me uneasy.  I don't think the extra "wq_rtnl" makes
any difference, though.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 18:31       ` Jiri Bohac
@ 2010-09-01 20:00         ` Jay Vosburgh
  2010-09-01 20:56           ` Jiri Bohac
  0 siblings, 1 reply; 25+ messages in thread
From: Jay Vosburgh @ 2010-09-01 20:00 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: bonding-devel, markine, jarkao2, chavey, netdev

Jiri Bohac <jbohac@suse.cz> wrote:

>On Wed, Sep 01, 2010 at 10:14:34AM -0700, Jay Vosburgh wrote:
>> 	I also thought a bit more, and in the current code, the mode
>> shouldn't change in the middle of one of the work functions, because a
>> mode change requires the bond to be closed, so the various work things
>> will be stopped (more or less; excepting the race under disucssion
>> here).
>>
>> 	I don't think this is true for the new wq_rtnl functions,
>> however, because its work items are not canceled until the workqueue
>> itself is freed in bond_destructor.  Does the wq_rtnl open new races,
>> because it's work items are not synchronized with other activities
>> (close in particular)?  It's possible for its work functions (which may
>> do things like set the active slave, carrier, etc) to be invoked after
>> the bond has closed, and possibly reopened, or been otherwise adjusted.
>
>I don't think this patch opens new races. The current race
>scenario is:
>
>1) schedule_delayed_work(foo)
>2) foo's timer expires and foo is queued on bond->wq
>  (possibly, foo starts to run and either gets preempted or
>  sleeps on rtnl)
>3) bond_close() sets kill_timers=1 and calls
>  cancel_delayed_work() which accomplishes nothing
>4) bond_open() sets kill_timers=0
>5) bond_open() calls schedule_delayed_work(bar)
>6) foo may run the "commit" work that should not be run
>7) foo re-arms
>8) if (foo == bar) -> BUG	/* bond->mode did not change */
>
>With this patch, it is:
>
>1) schedule_delayed_work(foo)
>2) foo's timer expires and foo is queued on bond->wq
>3) foo may have queued foo_commit on bond->wq_rtnl
>4) bond_close() cancels foo
>5) bond_open()
>6) foo_commit may run and it should not be run
>
>The patch avoids the problem of 7) and 8)

	But the "with patch" #6 is a bigger window; it doesn't require
step 5; the foo_commit, et al, can always run after bond_close (because
nothing ever cancels the foo_commit except for unregistration).  That's
the part that makes me nervous.

	The current race, as I understand it, requires a "close then
open" sequence with little delay between the two.

>I think the race in 6) remains the same. It is now easier to fix.
>This could even be done with a flag (similar to kill_timers),
>which would be set each time the "commit" work is queued on wq_rtnl and
>cleared by bond_close(). This should avoid the races completely,
>I think. The trick is that, unlike kill_timers, bond_open() would
>not touch this flag.

	I'm chewing on whether or not it's feasible to introduce some
kind of generation count into bond_open/close, so that, e.g., at
bond_close, the generation is incremented.  Each time any of the work
items is queued, the current generation is stashed somewhere private to
that work item (in struct bonding, probably).  Then, when it runs, it
compares the current generation to the stored one.  If they don't match,
then the work item does nothing.

	It's still a "kill_timers," but perhaps not subject to the
close/open issue that a boolean flag has.  It's also not all that
elegant, either, but maybe is less bad than the other alternatives.

>> 	I'm not sure this is better than one of the alternatives I
>> believe we discussed the last time around: having the rtnl-acquiring
>> work functions do a conditional acquire of rtnl, and if that fails,
>> reschedule themselves.
>
>[...]
>
>> 		if (rtnl_trylock()) {
>> 			read_lock(&bond->lock);
>> 
>> 			bond_miimon_commit(bond);
>> 
>> 			read_unlock(&bond->lock);
>> 			rtnl_unlock();	/* might sleep, hold no other locks */
>> 			read_lock(&bond->lock);
>> 		} else {
>> 			read_lock(&bond->lock);
>> 			queue_work(bond->wq, &bond->mii_work);
>> 			read_unlock(&bond->lock);
>> 			return;
>> 		}
>
>I actually tried the other variant suggested last time
>(basically:
>
>while (!rtnl_trylock()) {
>	read_lock(&bond->lock)
>	kill = bond->kill_timers;
>	read_unlock(&bond->lock)
>	if (kill)
>		return;
>})
>
>and gave that to a customer experiencing this problem (I cannot
>reproduce it myself). It was reported to lock up. I suspect some
>kind of live-lock on bond->lock caused by the active waiting, but
>I did not spend too much time debugging this.
>[BTW, this is https://bugzilla.novell.com/show_bug.cgi?id=602969
>,Novell BZ account needeed]

	My BZ account is unworthy to access that bug; can you provide
any information as to how they're hitting the problem?  Presumably
they're doing something that's doing a fast down/up cycle on the bond,
but anything else?

>FWIW  this would be the only use of rtnl_trylock() in the kernel,
>besides a few places that do:
>	if (!rtnl_trylock()) return restart_syscall();
>I think it is plain ugly -- semaphores are just not supposed to be
>spinned on.
>
>Your re-scheduling variant is more or less equivalent to active
>spinning, isn't it? Anyway, if you really think it is a better approach,
>are you going to apply it? I can supply the patch. (Although I
>kinda don't like people seeing my name next to it ;))

	Well, it could have a queue_delayed_work() with a bit of real
time in there, and it's not a simple spin loop, there's actually a
scheduling step in the middle.  I'm relucant to call it "better,"
though, as that kind of implies it's inherently "good."  "Less bad,"
perhaps.

	Anyway, maybe there are other ways to accomplish the end goal
(no executing of "stuff" after close) without resorting to either of
these strategies (since what we're really discussing here is relative
awfulness; neither is what I'd call a really good option).

	I'm wondering if there's any utility in the "generation count"
idea I mention above.  It's still a sentinel, but if that can be worked
out to reliably stop the work items after close, then maybe it's the
least bad option.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 19:46                 ` Jay Vosburgh
@ 2010-09-01 20:06                   ` Jarek Poplawski
  0 siblings, 0 replies; 25+ messages in thread
From: Jarek Poplawski @ 2010-09-01 20:06 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Jiri Bohac, bonding-devel, markine, chavey, netdev

On Wed, Sep 01, 2010 at 12:46:30PM -0700, Jay Vosburgh wrote:
> Jarek Poplawski <jarkao2@gmail.com> wrote:
> 
> >On Wed, Sep 01, 2010 at 09:11:06PM +0200, Jiri Bohac wrote:
> >> But these don't do rtnl_lock() inside the work item, do they?
> >
> >Exactly. Just like work items cancelled from bond_work_cancel_all()
> >after your patch.
> 
> 	I see what Jarek is getting at here: the mii_commit, etc, work
> items new to the patch aren't cancelled by bond_close, so bond_close (in
> cancel_delayed_work_sync) shouldn't care if they're executing or not.
> 
> 	This still would leave the new work items (the "commit" ones
> added in the patch) always free to run at some arbitrary time after
> close, which makes me uneasy.  I don't think the extra "wq_rtnl" makes
> any difference, though.

Sure, but IIRC it wasn't encouraged. After all, many net drivers do it
similarly and don't even need their separate workqueue.

Jarek P.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 20:00         ` Jay Vosburgh
@ 2010-09-01 20:56           ` Jiri Bohac
  2010-09-02  0:54             ` Jay Vosburgh
  0 siblings, 1 reply; 25+ messages in thread
From: Jiri Bohac @ 2010-09-01 20:56 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Jiri Bohac, bonding-devel, markine, jarkao2, chavey, netdev

On Wed, Sep 01, 2010 at 01:00:38PM -0700, Jay Vosburgh wrote:
> Jiri Bohac <jbohac@suse.cz> wrote:
> >I don't think this patch opens new races. The current race
> >scenario is:
> >
> >1) schedule_delayed_work(foo)
> >2) foo's timer expires and foo is queued on bond->wq
> >  (possibly, foo starts to run and either gets preempted or
> >  sleeps on rtnl)
> >3) bond_close() sets kill_timers=1 and calls
> >  cancel_delayed_work() which accomplishes nothing
> >4) bond_open() sets kill_timers=0
> >5) bond_open() calls schedule_delayed_work(bar)
> >6) foo may run the "commit" work that should not be run
> >7) foo re-arms
> >8) if (foo == bar) -> BUG	/* bond->mode did not change */
> >
> >With this patch, it is:
> >
> >1) schedule_delayed_work(foo)
> >2) foo's timer expires and foo is queued on bond->wq
> >3) foo may have queued foo_commit on bond->wq_rtnl
> >4) bond_close() cancels foo
> >5) bond_open()
> >6) foo_commit may run and it should not be run
> >
> >The patch avoids the problem of 7) and 8)
> 
> 	But the "with patch" #6 is a bigger window; it doesn't require
> step 5; the foo_commit, et al, can always run after bond_close (because
> nothing ever cancels the foo_commit except for unregistration).  That's
> the part that makes me nervous.

We can always call cancel_work(foo_commit) in bond_close. This
should make the race window the same size it is now.
I didn't do that because I was thinking we'd avoid the race
somehow completely. Perhaps we can do cancel_work() now and solve
it cleanly later.

> 	The current race, as I understand it, requires a "close then
> open" sequence with little delay between the two.

Yeah, not sure how small the delay has to be. With releasing
bond->lock, acquiring rtnl and re-acquiring bond->lock in most of
the work items it may be pretty long. Putting an extra check for
kill_timers after bond->lock is re-acquired will make the window
much smaller ...  just in case this is the way we want to "fix"
race conditions ;-)

> >I think the race in 6) remains the same. It is now easier to fix.
> >This could even be done with a flag (similar to kill_timers),
> >which would be set each time the "commit" work is queued on wq_rtnl and
> >cleared by bond_close(). This should avoid the races completely,
> >I think. The trick is that, unlike kill_timers, bond_open() would
> >not touch this flag.
> 
> 	I'm chewing on whether or not it's feasible to introduce some
> kind of generation count into bond_open/close, so that, e.g., at
> bond_close, the generation is incremented.  Each time any of the work
> items is queued, the current generation is stashed somewhere private to
> that work item (in struct bonding, probably).  Then, when it runs, it
> compares the current generation to the stored one.  If they don't match,
> then the work item does nothing.

I thought about the generation count as well before I did this
patch. I don't think you can put the counter in struct bonding --
because that would be overwritten with the new value if the work
item is re-scheduled by bond_open.

I think you would have to create a new dynamic structure on each
work schedule and pass it to the work item in the "data" pointer.
The structure would contain the counter and the bond pointer. It
would be freed by thework item. I did not like this too much.

> >[BTW, this is https://bugzilla.novell.com/show_bug.cgi?id=602969
> >,Novell BZ account needeed]
> 
> 	My BZ account is unworthy to access that bug; can you provide
> any information as to how they're hitting the problem?  Presumably
> they're doing something that's doing a fast down/up cycle on the bond,
> but anything else?

They are doing "rcnetwork restart", which will do the
close->open. Perhaps all the contention on the rtnl (lots of work
with other network interfaces) makes the race window longer. I
couldn't reproduce this.

> 	I'm wondering if there's any utility in the "generation count"
> idea I mention above.  It's still a sentinel, but if that can be worked
> out to reliably stop the work items after close, then maybe it's the
> least bad option.

Not without the dynamic allocation, I think.
How about the "kill_timers" on top of this patch (see my
previous e-mail) -- a flag that would be set when queuing the
"commit" work and cleared by bond_close()?

While this can not stop the re-arming race it is trying to stop
now, it should be able to stop the "commit" work items (where it
does not matter if you try to queue them on the workqueue for a
second time, since it is not a delayed work).

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-01 20:56           ` Jiri Bohac
@ 2010-09-02  0:54             ` Jay Vosburgh
  2010-09-02 17:08               ` Jiri Bohac
  0 siblings, 1 reply; 25+ messages in thread
From: Jay Vosburgh @ 2010-09-02  0:54 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: bonding-devel, markine, jarkao2, chavey, netdev

Jiri Bohac <jbohac@suse.cz> wrote:

>On Wed, Sep 01, 2010 at 01:00:38PM -0700, Jay Vosburgh wrote:
>> Jiri Bohac <jbohac@suse.cz> wrote:
>> >I don't think this patch opens new races. The current race
>> >scenario is:
>> >
>> >1) schedule_delayed_work(foo)
>> >2) foo's timer expires and foo is queued on bond->wq
>> >  (possibly, foo starts to run and either gets preempted or
>> >  sleeps on rtnl)
>> >3) bond_close() sets kill_timers=1 and calls
>> >  cancel_delayed_work() which accomplishes nothing
>> >4) bond_open() sets kill_timers=0
>> >5) bond_open() calls schedule_delayed_work(bar)
>> >6) foo may run the "commit" work that should not be run
>> >7) foo re-arms
>> >8) if (foo == bar) -> BUG	/* bond->mode did not change */

	In looking at bond_open a bit, I wonder if a contributing factor
to the crash (if that's what happens) is that INIT_DELAYED_WORK is being
done in bond_open on a work item that's already queued or running (left
over from the kill_timers sentinel being missed).  Calling
queue_delayed_work when the work item is already queued is ok, I
believe, so I don't think that part is an issue (or at least not a
panic-worthy one).

	I suspect that the INIT_DELAYED_WORK calls will have to move to
bond_create if any of the work items end up be able to run beyond the
end of close (which seems likely; I'm running out of ideas).

>> >With this patch, it is:
>> >
>> >1) schedule_delayed_work(foo)
>> >2) foo's timer expires and foo is queued on bond->wq
>> >3) foo may have queued foo_commit on bond->wq_rtnl
>> >4) bond_close() cancels foo
>> >5) bond_open()
>> >6) foo_commit may run and it should not be run
>> >
>> >The patch avoids the problem of 7) and 8)
>> 
>> 	But the "with patch" #6 is a bigger window; it doesn't require
>> step 5; the foo_commit, et al, can always run after bond_close (because
>> nothing ever cancels the foo_commit except for unregistration).  That's
>> the part that makes me nervous.
>
>We can always call cancel_work(foo_commit) in bond_close. This
>should make the race window the same size it is now.
>I didn't do that because I was thinking we'd avoid the race
>somehow completely. Perhaps we can do cancel_work() now and solve
>it cleanly later.
>
>> 	The current race, as I understand it, requires a "close then
>> open" sequence with little delay between the two.
>
>Yeah, not sure how small the delay has to be. With releasing
>bond->lock, acquiring rtnl and re-acquiring bond->lock in most of
>the work items it may be pretty long. Putting an extra check for
>kill_timers after bond->lock is re-acquired will make the window
>much smaller ...  just in case this is the way we want to "fix"
>race conditions ;-)
>
>> >I think the race in 6) remains the same. It is now easier to fix.
>> >This could even be done with a flag (similar to kill_timers),
>> >which would be set each time the "commit" work is queued on wq_rtnl and
>> >cleared by bond_close(). This should avoid the races completely,
>> >I think. The trick is that, unlike kill_timers, bond_open() would
>> >not touch this flag.
>> 
>> 	I'm chewing on whether or not it's feasible to introduce some
>> kind of generation count into bond_open/close, so that, e.g., at
>> bond_close, the generation is incremented.  Each time any of the work
>> items is queued, the current generation is stashed somewhere private to
>> that work item (in struct bonding, probably).  Then, when it runs, it
>> compares the current generation to the stored one.  If they don't match,
>> then the work item does nothing.
>
>I thought about the generation count as well before I did this
>patch. I don't think you can put the counter in struct bonding --
>because that would be overwritten with the new value if the work
>item is re-scheduled by bond_open.
>
>I think you would have to create a new dynamic structure on each
>work schedule and pass it to the work item in the "data" pointer.
>The structure would contain the counter and the bond pointer. It
>would be freed by thework item. I did not like this too much.
>
>> >[BTW, this is https://bugzilla.novell.com/show_bug.cgi?id=602969
>> >,Novell BZ account needeed]
>> 
>> 	My BZ account is unworthy to access that bug; can you provide
>> any information as to how they're hitting the problem?  Presumably
>> they're doing something that's doing a fast down/up cycle on the bond,
>> but anything else?
>
>They are doing "rcnetwork restart", which will do the
>close->open. Perhaps all the contention on the rtnl (lots of work
>with other network interfaces) makes the race window longer. I
>couldn't reproduce this.

	What actually happens when the failure occurs?  Is there a stack
dump?

>> 	I'm wondering if there's any utility in the "generation count"
>> idea I mention above.  It's still a sentinel, but if that can be worked
>> out to reliably stop the work items after close, then maybe it's the
>> least bad option.
>
>Not without the dynamic allocation, I think.
>How about the "kill_timers" on top of this patch (see my
>previous e-mail) -- a flag that would be set when queuing the
>"commit" work and cleared by bond_close()?
>
>While this can not stop the re-arming race it is trying to stop
>now, it should be able to stop the "commit" work items (where it
>does not matter if you try to queue them on the workqueue for a
>second time, since it is not a delayed work).
>
>-- 
>Jiri Bohac <jbohac@suse.cz>
>SUSE Labs, SUSE CZ

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-02  0:54             ` Jay Vosburgh
@ 2010-09-02 17:08               ` Jiri Bohac
  2010-09-09  0:06                 ` Jay Vosburgh
  0 siblings, 1 reply; 25+ messages in thread
From: Jiri Bohac @ 2010-09-02 17:08 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Jiri Bohac, bonding-devel, markine, jarkao2, chavey, netdev

On Wed, Sep 01, 2010 at 05:54:00PM -0700, Jay Vosburgh wrote:
> Jiri Bohac <jbohac@suse.cz> wrote:
> 
> 	In looking at bond_open a bit, I wonder if a contributing factor
> to the crash (if that's what happens) is that INIT_DELAYED_WORK is being
> done in bond_open on a work item that's already queued or running (left
> over from the kill_timers sentinel being missed).  Calling
> queue_delayed_work when the work item is already queued is ok, I
> believe, so I don't think that part is an issue (or at least not a
> panic-worthy one).

Yes, it is the INIT_DELAYED_WORK (zeroes the pending flag), followed by
queue_delayed_work(). What then happens is
queue_delayed_work()->queue_delayed_work_on()->BUG_ON(timer_pending(timer));

> 	I suspect that the INIT_DELAYED_WORK calls will have to move to
> bond_create if any of the work items end up be able to run beyond the
> end of close (which seems likely; I'm running out of ideas).

Yes, with my patch, I have INIT_WORK done in bond_init() for the
"commit" work items. The re-arming delayed work items now don't
live after bond_close(), so INIT_DELAYED_WORK can stay on
bond_open().

> >They are doing "rcnetwork restart", which will do the
> >close->open. Perhaps all the contention on the rtnl (lots of work
> >with other network interfaces) makes the race window longer. I
> >couldn't reproduce this.
> 
> 	What actually happens when the failure occurs?  Is there a stack
> dump?

yes, see above; The stack dump is

[  305.953690] kernel BUG at /usr/src/packages/BUILD/kernel-default-2.6.32.11/linux-2.6.32/kernel/workqueue.c:243!
[  305.963746] invalid opcode: 0000 [#1] SMP 
[  305.967849] last sysfs file: /sys/devices/virtual/net/bond1/bonding/slaves
[  305.974698] CPU 2 
[  305.976739] Modules linked in: iptable_filter ip_tables x_tables ext2
nls_iso8859_1 nls_cp437 vfat fat iscsi_tcp libiscsi_tcp libiscsi
scsi_transport_iscsi iscsi_trgt crc32c ipv6 bonding microcode fuse loop dm_mod
iTCO_wdt rtc_cmos usb_storage sr_mod ses serio_raw pcspkr dcdbas(X) joydev bnx2
rtc_core iTCO_vendor_support cdrom enclosure power_meter rtc_lib button sg
usbhid hid uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan
processor ide_pci_generic ide_core ata_generic ata_piix libata megaraid_sas
scsi_mod thermal thermal_sys hwmon
[  306.026413] Supported: Yes
[  306.029114] Pid: 8668, comm: bond1 Tainted: G           X 2.6.32.11-0.3-default #1 PowerEdge R910
[  306.037954] RIP: 0010:[<ffffffff810603a9>]  [<ffffffff810603a9>] queue_delayed_work_on+0xf9/0x100
[  306.046816] RSP: 0018:ffff880858cb3de0  EFLAGS: 00010286
[  306.052108] RAX: 0000000000000000 RBX: ffff88085aec8b60 RCX: 0000000000000019
[  306.059524] RDX: 0000000000000000 RSI: ffff880858f5a8c0 RDI: 00000000ffffffff
[  306.066635] RBP: ffff88085aec8b60 R08: 0000000000000018 R09: 0000000000000001
[  306.073746] R10: 0000000000000001 R11: 0000000000000006 R12: ffff880858f5a8c0
[  306.080856] R13: 00000000ffffffff R14: 0000000000000019 R15: ffff88085c7406a8
[  306.087967] FS:  0000000000000000(0000) GS:ffff88109c600000(0000) knlGS:0000000000000000
[  306.096028] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[  306.101753] CR2: 0000000000700540 CR3: 0000000001804000 CR4: 00000000000006e0
[  306.108864] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  306.115973] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  306.123083] Process bond1 (pid: 8668, threadinfo ffff880858cb2000, task ffff88085c740040)
[  306.131229] Stack:
[  306.133232]  ffff88002e41b680 ffff88085aec8b60 ffff88085aec8780 ffff88085aec87ac
[  306.140462] <0> ffff88085c740040 ffffffffa028d029 0000000000013600 ffff88002e41b680
[  306.148149] <0> ffffffffa028cf40 ffff880858cb3fd8 ffff88002e41b688 ffffffff8105f558
[  306.156092] Call Trace:
[  306.158553]  [<ffffffffa028d029>] bond_mii_monitor+0xe9/0x140 [bonding]
[  306.165167]  [<ffffffff8105f558>] run_workqueue+0xb8/0x140
[  306.170638]  [<ffffffff8105f676>] worker_thread+0x96/0x110
[  306.176110]  [<ffffffff810636f6>] kthread+0x96/0xa0
[  306.180976]  [<ffffffff81003fba>] child_rip+0xa/0x20
[  306.185926] Code: 8b 74 24 28 48 89 ef e8 76 7b ff ff e9 7a ff ff ff 44 89
ee 48 89 ef e8 06 7d ff ff ba 01 00 00 00 90 e9 2c ff ff ff 0f 0b eb fe <0f> 0b
eb fe 0f 1f 00 48 89 f0 48 8b 35 5e 75 8c 00 48 89 d1 48 
[  306.205568] RIP  [<ffffffff810603a9>] queue_delayed_work_on+0xf9/0x100
[  306.212089]  RSP <ffff880858cb3de0>

workqueue.c:243 is the BUG_ON(timer_pending(timer)) inside queue_delayed_work_on() in our kernel.



This is an updated version of my patch:

- it fixes the issues in bond_alb_promisc_disable()

- it only the original bond->wq workqueue. The second workqueue
  is indeed not needed.  Thanks to Jarek for pointing that out.
  It is really safe to call cancel_delayed_work_sync(foo) under
  rtnl, even if there are work items on the workqueue that hold
  rtnl. Unless foo grabs rtnl itself, of course.

- it now has three flags that prevent any of the work items from
  doing unexpected stuff after bond_close():
  miimon_commit_work_active, ab_arp_commit_work_active and
  alb_promisc_disable_work_active.
  
  bond_close() zeroes them with rtnl held. The work items do
  everything under rtnl and they check these flags after rtnl is
  acquired. The only way the "commit" work items queued before a
  bond_close() might do anything after the bond_close() is when
  bond_open() is called and it schedules the delayed re-arming
  work item again.  The re-arming work then tries to queue the
  "commit" work item which fails, because it is already queued.
  However, it sets the flag to 1. So the work item queued from before
  bond_close() would just take over the job the newly queued one
  was supposed to do.  
  
  (In practice, I don't think this can even happen anyway, because the
  newly scheduled re-arming work item is queued after the
  "commit" work item on the workqueue. So the "commit" work item
  will just do nothing and return, and the flag will be set only
  after that.)


Signed-off-by: Jiri Bohac <jbohac@suse.cz>

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 822f586..8015e12 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2119,10 +2119,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers) {
-		goto out;
-	}
-
 	//check if there are any slaves
 	if (bond->slave_cnt == 0) {
 		goto re_arm;
@@ -2166,7 +2162,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 
 re_arm:
 	queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c746b33..5c9d7ab 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1397,6 +1397,43 @@ out:
 	return NETDEV_TX_OK;
 }
 
+void bond_alb_promisc_disable(struct work_struct *work)
+{
+	struct bonding *bond = container_of(work, struct bonding,
+					    alb_promisc_disable_work);
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+	struct net_device *dev = NULL;
+
+	/*
+	 * dev_set_promiscuity requires rtnl and
+	 * nothing else.
+	 */
+	rtnl_lock();
+	read_lock(&bond->lock);
+	read_lock(&bond->curr_slave_lock);
+
+	if (!bond->alb_promisc_disable_work_active)
+		goto out;
+	bond->alb_promisc_disable_work_active = 0;
+
+	if (!bond_is_lb(bond))
+		goto out;
+	dev = bond->curr_active_slave;
+	if (!dev)
+		goto out;
+
+	bond_info->primary_is_promisc = 0;
+	bond_info->rlb_promisc_timeout_counter = 0;
+
+out:
+	read_unlock(&bond->curr_slave_lock);
+	read_unlock(&bond->lock);
+	rtnl_unlock();
+
+	if (dev)
+		dev_set_promiscuity(dev, -1);
+}
+
 void bond_alb_monitor(struct work_struct *work)
 {
 	struct bonding *bond = container_of(work, struct bonding,
@@ -1407,10 +1444,6 @@ void bond_alb_monitor(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers) {
-		goto out;
-	}
-
 	if (bond->slave_cnt == 0) {
 		bond_info->tx_rebalance_counter = 0;
 		bond_info->lp_counter = 0;
@@ -1462,25 +1495,12 @@ void bond_alb_monitor(struct work_struct *work)
 	if (bond_info->rlb_enabled) {
 		if (bond_info->primary_is_promisc &&
 		    (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
-
-			/*
-			 * dev_set_promiscuity requires rtnl and
-			 * nothing else.
-			 */
-			read_unlock(&bond->lock);
-			rtnl_lock();
-
-			bond_info->rlb_promisc_timeout_counter = 0;
-
 			/* If the primary was set to promiscuous mode
 			 * because a slave was disabled then
 			 * it can now leave promiscuous mode.
 			 */
-			dev_set_promiscuity(bond->curr_active_slave->dev, -1);
-			bond_info->primary_is_promisc = 0;
-
-			rtnl_unlock();
-			read_lock(&bond->lock);
+			queue_work(bond->wq, &bond->alb_promisc_disable_work);
+			bond->alb_promisc_disable_work_active = 1;
 		}
 
 		if (bond_info->rlb_rebalance) {
@@ -1505,7 +1525,6 @@ void bond_alb_monitor(struct work_struct *work)
 
 re_arm:
 	queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2cc4cfc..f72487a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2343,10 +2343,19 @@ static int bond_miimon_inspect(struct bonding *bond)
 	return commit;
 }
 
-static void bond_miimon_commit(struct bonding *bond)
+static void bond_miimon_commit(struct work_struct *work)
 {
 	struct slave *slave;
 	int i;
+	struct bonding *bond = container_of(work, struct bonding,
+					    miimon_commit_work);
+
+	rtnl_lock();
+	read_lock(&bond->lock);
+
+	if (!bond->miimon_commit_work_active)
+		goto out;
+	bond->miimon_commit_work_active = 0;
 
 	bond_for_each_slave(bond, slave, i) {
 		switch (slave->new_link) {
@@ -2421,15 +2430,18 @@ static void bond_miimon_commit(struct bonding *bond)
 		}
 
 do_failover:
-		ASSERT_RTNL();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
 	}
 
 	bond_set_carrier(bond);
+out:
+	read_unlock(&bond->lock);
+	rtnl_unlock();
 }
 
+
 /*
  * bond_mii_monitor
  *
@@ -2438,14 +2450,13 @@ do_failover:
  * an acquisition of appropriate locks followed by a commit phase to
  * implement whatever link state changes are indicated.
  */
+
 void bond_mii_monitor(struct work_struct *work)
 {
 	struct bonding *bond = container_of(work, struct bonding,
 					    mii_work.work);
 
 	read_lock(&bond->lock);
-	if (bond->kill_timers)
-		goto out;
 
 	if (bond->slave_cnt == 0)
 		goto re_arm;
@@ -2463,22 +2474,15 @@ void bond_mii_monitor(struct work_struct *work)
 	}
 
 	if (bond_miimon_inspect(bond)) {
-		read_unlock(&bond->lock);
-		rtnl_lock();
-		read_lock(&bond->lock);
-
-		bond_miimon_commit(bond);
-
-		read_unlock(&bond->lock);
-		rtnl_unlock();	/* might sleep, hold no other locks */
-		read_lock(&bond->lock);
+		queue_work(bond->wq, &bond->miimon_commit_work);
+		bond->miimon_commit_work_active = 1;
 	}
 
+
 re_arm:
 	if (bond->params.miimon)
 		queue_delayed_work(bond->wq, &bond->mii_work,
 				   msecs_to_jiffies(bond->params.miimon));
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -2778,9 +2782,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 
 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
-	if (bond->kill_timers)
-		goto out;
-
 	if (bond->slave_cnt == 0)
 		goto re_arm;
 
@@ -2867,7 +2868,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 re_arm:
 	if (bond->params.arp_interval)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -2949,13 +2949,23 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
 /*
  * Called to commit link state changes noted by inspection step of
  * active-backup mode ARP monitor.
- *
- * Called with RTNL and bond->lock for read.
  */
-static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
+static void bond_ab_arp_commit(struct work_struct *work)
 {
 	struct slave *slave;
 	int i;
+	int delta_in_ticks;
+	struct bonding *bond = container_of(work, struct bonding,
+					    ab_arp_commit_work);
+
+	rtnl_lock();
+	read_lock(&bond->lock);
+
+	if (!bond->ab_arp_commit_work_active)
+		goto out;
+	bond->ab_arp_commit_work_active = 0;
+
+	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
 	bond_for_each_slave(bond, slave, i) {
 		switch (slave->new_link) {
@@ -3014,6 +3024,9 @@ do_failover:
 	}
 
 	bond_set_carrier(bond);
+out:
+	read_unlock(&bond->lock);
+	rtnl_unlock();
 }
 
 /*
@@ -3093,9 +3106,6 @@ void bond_activebackup_arp_mon(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers)
-		goto out;
-
 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
 	if (bond->slave_cnt == 0)
@@ -3114,23 +3124,16 @@ void bond_activebackup_arp_mon(struct work_struct *work)
 	}
 
 	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
-		read_unlock(&bond->lock);
-		rtnl_lock();
-		read_lock(&bond->lock);
-
-		bond_ab_arp_commit(bond, delta_in_ticks);
-
-		read_unlock(&bond->lock);
-		rtnl_unlock();
-		read_lock(&bond->lock);
+		queue_work(bond->wq, &bond->ab_arp_commit_work);
+		bond->ab_arp_commit_work_active = 1;
 	}
 
+
 	bond_ab_arp_probe(bond);
 
 re_arm:
 	if (bond->params.arp_interval)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -3720,8 +3723,6 @@ static int bond_open(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
-	bond->kill_timers = 0;
-
 	if (bond_is_lb(bond)) {
 		/* bond_alb_initialize must be called before the timer
 		 * is started.
@@ -3781,31 +3782,31 @@ static int bond_close(struct net_device *bond_dev)
 	bond->send_grat_arp = 0;
 	bond->send_unsol_na = 0;
 
-	/* signal timers not to re-arm */
-	bond->kill_timers = 1;
-
 	write_unlock_bh(&bond->lock);
 
 	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
-		cancel_delayed_work(&bond->mii_work);
+		cancel_delayed_work_sync(&bond->mii_work);
 	}
 
 	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
-		cancel_delayed_work(&bond->arp_work);
+		cancel_delayed_work_sync(&bond->arp_work);
 	}
 
 	switch (bond->params.mode) {
 	case BOND_MODE_8023AD:
-		cancel_delayed_work(&bond->ad_work);
+		cancel_delayed_work_sync(&bond->ad_work);
 		break;
 	case BOND_MODE_TLB:
 	case BOND_MODE_ALB:
-		cancel_delayed_work(&bond->alb_work);
+		cancel_delayed_work_sync(&bond->alb_work);
 		break;
 	default:
 		break;
 	}
 
+	bond->miimon_commit_work_active = 0;
+	bond->ab_arp_commit_work_active = 0;
+	bond->alb_promisc_disable_work_active = 0;
 
 	if (bond_is_lb(bond)) {
 		/* Must be called only after all
@@ -4660,23 +4661,19 @@ static void bond_setup(struct net_device *bond_dev)
 
 static void bond_work_cancel_all(struct bonding *bond)
 {
-	write_lock_bh(&bond->lock);
-	bond->kill_timers = 1;
-	write_unlock_bh(&bond->lock);
-
 	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))
-		cancel_delayed_work(&bond->mii_work);
+		cancel_delayed_work_sync(&bond->mii_work);
 
 	if (bond->params.arp_interval && delayed_work_pending(&bond->arp_work))
-		cancel_delayed_work(&bond->arp_work);
+		cancel_delayed_work_sync(&bond->arp_work);
 
 	if (bond->params.mode == BOND_MODE_ALB &&
 	    delayed_work_pending(&bond->alb_work))
-		cancel_delayed_work(&bond->alb_work);
+		cancel_delayed_work_sync(&bond->alb_work);
 
 	if (bond->params.mode == BOND_MODE_8023AD &&
 	    delayed_work_pending(&bond->ad_work))
-		cancel_delayed_work(&bond->ad_work);
+		cancel_delayed_work_sync(&bond->ad_work);
 }
 
 /*
@@ -5094,6 +5091,9 @@ static int bond_init(struct net_device *bond_dev)
 	bond_prepare_sysfs_group(bond);
 
 	__hw_addr_init(&bond->mc_list);
+	INIT_WORK(&bond->miimon_commit_work, bond_miimon_commit);
+	INIT_WORK(&bond->ab_arp_commit_work, bond_ab_arp_commit);
+	INIT_WORK(&bond->alb_promisc_disable_work, bond_alb_promisc_disable);
 	return 0;
 }
 
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index c6fdd85..e771f5d 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -198,7 +198,6 @@ struct bonding {
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;
-	s8       kill_timers;
 	s8	 send_grat_arp;
 	s8	 send_unsol_na;
 	s8	 setup_by_slave;
@@ -223,6 +222,12 @@ struct bonding {
 	struct   delayed_work arp_work;
 	struct   delayed_work alb_work;
 	struct   delayed_work ad_work;
+	struct   work_struct miimon_commit_work;
+	int	 miimon_commit_work_active;
+	struct   work_struct ab_arp_commit_work;
+	int	 ab_arp_commit_work_active;
+	struct   work_struct alb_promisc_disable_work;
+	int	 alb_promisc_disable_work_active;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct   in6_addr master_ipv6;
 #endif
@@ -348,6 +353,7 @@ void bond_select_active_slave(struct bonding *bond);
 void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
 void bond_register_arp(struct bonding *);
 void bond_unregister_arp(struct bonding *);
+void bond_alb_promisc_disable(struct work_struct *work);
 
 struct bond_net {
 	struct net *		net;	/* Associated network namespace */

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-02 17:08               ` Jiri Bohac
@ 2010-09-09  0:06                 ` Jay Vosburgh
  2010-09-16 22:44                   ` Jay Vosburgh
  0 siblings, 1 reply; 25+ messages in thread
From: Jay Vosburgh @ 2010-09-09  0:06 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: bonding-devel, markine, jarkao2, chavey, netdev

Jiri Bohac <jbohac@suse.cz> wrote:

>On Wed, Sep 01, 2010 at 05:54:00PM -0700, Jay Vosburgh wrote:
>> Jiri Bohac <jbohac@suse.cz> wrote:
>> 
>> 	In looking at bond_open a bit, I wonder if a contributing factor
>> to the crash (if that's what happens) is that INIT_DELAYED_WORK is being
>> done in bond_open on a work item that's already queued or running (left
>> over from the kill_timers sentinel being missed).  Calling
>> queue_delayed_work when the work item is already queued is ok, I
>> believe, so I don't think that part is an issue (or at least not a
>> panic-worthy one).
>
>Yes, it is the INIT_DELAYED_WORK (zeroes the pending flag), followed by
>queue_delayed_work(). What then happens is
>queue_delayed_work()->queue_delayed_work_on()->BUG_ON(timer_pending(timer));

	Just an update; I'm setting up this patch to give it some
testing.  I've only got one comment so far (below).  

	My current thought after having looked at the code is that this
is getting a little too complicated, so my inclination is to instead
verify that it's permissible (as in, "won't break anything") for the
various commit things to run after bond_close exits and then not worry
about the various sentinels.

	I suspect that if bond_close sets all of the slave->new_link
fields to NOCHANGE, the two commit functions won't do anything.  The
promisc one is probably not a problem here, although I haven't really
looked at it much yet.  It might need to not check for bond_is_lb, so
that the promisc setting doesn't end up orphaned if the mode changes.

>> 	I suspect that the INIT_DELAYED_WORK calls will have to move to
>> bond_create if any of the work items end up be able to run beyond the
>> end of close (which seems likely; I'm running out of ideas).
>
>Yes, with my patch, I have INIT_WORK done in bond_init() for the
>"commit" work items. The re-arming delayed work items now don't
>live after bond_close(), so INIT_DELAYED_WORK can stay on
>bond_open().
>
>> >They are doing "rcnetwork restart", which will do the
>> >close->open. Perhaps all the contention on the rtnl (lots of work
>> >with other network interfaces) makes the race window longer. I
>> >couldn't reproduce this.
>> 
>> 	What actually happens when the failure occurs?  Is there a stack
>> dump?
>
>yes, see above; The stack dump is
>
>[  305.953690] kernel BUG at /usr/src/packages/BUILD/kernel-default-2.6.32.11/linux-2.6.32/kernel/workqueue.c:243!
>[  305.963746] invalid opcode: 0000 [#1] SMP 
>[  305.967849] last sysfs file: /sys/devices/virtual/net/bond1/bonding/slaves
>[  305.974698] CPU 2 
>[  305.976739] Modules linked in: iptable_filter ip_tables x_tables ext2
>nls_iso8859_1 nls_cp437 vfat fat iscsi_tcp libiscsi_tcp libiscsi
>scsi_transport_iscsi iscsi_trgt crc32c ipv6 bonding microcode fuse loop dm_mod
>iTCO_wdt rtc_cmos usb_storage sr_mod ses serio_raw pcspkr dcdbas(X) joydev bnx2
>rtc_core iTCO_vendor_support cdrom enclosure power_meter rtc_lib button sg
>usbhid hid uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan
>processor ide_pci_generic ide_core ata_generic ata_piix libata megaraid_sas
>scsi_mod thermal thermal_sys hwmon
>[  306.026413] Supported: Yes
>[  306.029114] Pid: 8668, comm: bond1 Tainted: G           X 2.6.32.11-0.3-default #1 PowerEdge R910
>[  306.037954] RIP: 0010:[<ffffffff810603a9>]  [<ffffffff810603a9>] queue_delayed_work_on+0xf9/0x100
>[  306.046816] RSP: 0018:ffff880858cb3de0  EFLAGS: 00010286
>[  306.052108] RAX: 0000000000000000 RBX: ffff88085aec8b60 RCX: 0000000000000019
>[  306.059524] RDX: 0000000000000000 RSI: ffff880858f5a8c0 RDI: 00000000ffffffff
>[  306.066635] RBP: ffff88085aec8b60 R08: 0000000000000018 R09: 0000000000000001
>[  306.073746] R10: 0000000000000001 R11: 0000000000000006 R12: ffff880858f5a8c0
>[  306.080856] R13: 00000000ffffffff R14: 0000000000000019 R15: ffff88085c7406a8
>[  306.087967] FS:  0000000000000000(0000) GS:ffff88109c600000(0000) knlGS:0000000000000000
>[  306.096028] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
>[  306.101753] CR2: 0000000000700540 CR3: 0000000001804000 CR4: 00000000000006e0
>[  306.108864] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>[  306.115973] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
>[  306.123083] Process bond1 (pid: 8668, threadinfo ffff880858cb2000, task ffff88085c740040)
>[  306.131229] Stack:
>[  306.133232]  ffff88002e41b680 ffff88085aec8b60 ffff88085aec8780 ffff88085aec87ac
>[  306.140462] <0> ffff88085c740040 ffffffffa028d029 0000000000013600 ffff88002e41b680
>[  306.148149] <0> ffffffffa028cf40 ffff880858cb3fd8 ffff88002e41b688 ffffffff8105f558
>[  306.156092] Call Trace:
>[  306.158553]  [<ffffffffa028d029>] bond_mii_monitor+0xe9/0x140 [bonding]
>[  306.165167]  [<ffffffff8105f558>] run_workqueue+0xb8/0x140
>[  306.170638]  [<ffffffff8105f676>] worker_thread+0x96/0x110
>[  306.176110]  [<ffffffff810636f6>] kthread+0x96/0xa0
>[  306.180976]  [<ffffffff81003fba>] child_rip+0xa/0x20
>[  306.185926] Code: 8b 74 24 28 48 89 ef e8 76 7b ff ff e9 7a ff ff ff 44 89
>ee 48 89 ef e8 06 7d ff ff ba 01 00 00 00 90 e9 2c ff ff ff 0f 0b eb fe <0f> 0b
>eb fe 0f 1f 00 48 89 f0 48 8b 35 5e 75 8c 00 48 89 d1 48 
>[  306.205568] RIP  [<ffffffff810603a9>] queue_delayed_work_on+0xf9/0x100
>[  306.212089]  RSP <ffff880858cb3de0>
>
>workqueue.c:243 is the BUG_ON(timer_pending(timer)) inside queue_delayed_work_on() in our kernel.
>
>
>
>This is an updated version of my patch:
>
>- it fixes the issues in bond_alb_promisc_disable()
>
>- it only the original bond->wq workqueue. The second workqueue
>  is indeed not needed.  Thanks to Jarek for pointing that out.
>  It is really safe to call cancel_delayed_work_sync(foo) under
>  rtnl, even if there are work items on the workqueue that hold
>  rtnl. Unless foo grabs rtnl itself, of course.
>
>- it now has three flags that prevent any of the work items from
>  doing unexpected stuff after bond_close():
>  miimon_commit_work_active, ab_arp_commit_work_active and
>  alb_promisc_disable_work_active.
>  
>  bond_close() zeroes them with rtnl held. The work items do
>  everything under rtnl and they check these flags after rtnl is
>  acquired. The only way the "commit" work items queued before a
>  bond_close() might do anything after the bond_close() is when
>  bond_open() is called and it schedules the delayed re-arming
>  work item again.  The re-arming work then tries to queue the
>  "commit" work item which fails, because it is already queued.
>  However, it sets the flag to 1. So the work item queued from before
>  bond_close() would just take over the job the newly queued one
>  was supposed to do.  
>  
>  (In practice, I don't think this can even happen anyway, because the
>  newly scheduled re-arming work item is queued after the
>  "commit" work item on the workqueue. So the "commit" work item
>  will just do nothing and return, and the flag will be set only
>  after that.)
>
>
>Signed-off-by: Jiri Bohac <jbohac@suse.cz>
>
>diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
>index 822f586..8015e12 100644
>--- a/drivers/net/bonding/bond_3ad.c
>+++ b/drivers/net/bonding/bond_3ad.c
>@@ -2119,10 +2119,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
>
> 	read_lock(&bond->lock);
>
>-	if (bond->kill_timers) {
>-		goto out;
>-	}
>-
> 	//check if there are any slaves
> 	if (bond->slave_cnt == 0) {
> 		goto re_arm;
>@@ -2166,7 +2162,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
>
> re_arm:
> 	queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>index c746b33..5c9d7ab 100644
>--- a/drivers/net/bonding/bond_alb.c
>+++ b/drivers/net/bonding/bond_alb.c
>@@ -1397,6 +1397,43 @@ out:
> 	return NETDEV_TX_OK;
> }
>
>+void bond_alb_promisc_disable(struct work_struct *work)
>+{
>+	struct bonding *bond = container_of(work, struct bonding,
>+					    alb_promisc_disable_work);
>+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
>+	struct net_device *dev = NULL;
>+
>+	/*
>+	 * dev_set_promiscuity requires rtnl and
>+	 * nothing else.
>+	 */
>+	rtnl_lock();
>+	read_lock(&bond->lock);
>+	read_lock(&bond->curr_slave_lock);
>+
>+	if (!bond->alb_promisc_disable_work_active)
>+		goto out;
>+	bond->alb_promisc_disable_work_active = 0;
>+
>+	if (!bond_is_lb(bond))
>+		goto out;
>+	dev = bond->curr_active_slave;
>+	if (!dev)
>+		goto out;
>+
>+	bond_info->primary_is_promisc = 0;
>+	bond_info->rlb_promisc_timeout_counter = 0;
>+
>+out:
>+	read_unlock(&bond->curr_slave_lock);
>+	read_unlock(&bond->lock);
>+	rtnl_unlock();
>+
>+	if (dev)
>+		dev_set_promiscuity(dev, -1);

	This is what I mentioned above: the dev_set_promisc needs to
happen with rtnl held.

>+
> void bond_alb_monitor(struct work_struct *work)
> {
> 	struct bonding *bond = container_of(work, struct bonding,
>@@ -1407,10 +1444,6 @@ void bond_alb_monitor(struct work_struct *work)
>
> 	read_lock(&bond->lock);
>
>-	if (bond->kill_timers) {
>-		goto out;
>-	}
>-
> 	if (bond->slave_cnt == 0) {
> 		bond_info->tx_rebalance_counter = 0;
> 		bond_info->lp_counter = 0;
>@@ -1462,25 +1495,12 @@ void bond_alb_monitor(struct work_struct *work)
> 	if (bond_info->rlb_enabled) {
> 		if (bond_info->primary_is_promisc &&
> 		    (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
>-
>-			/*
>-			 * dev_set_promiscuity requires rtnl and
>-			 * nothing else.
>-			 */
>-			read_unlock(&bond->lock);
>-			rtnl_lock();
>-
>-			bond_info->rlb_promisc_timeout_counter = 0;
>-
> 			/* If the primary was set to promiscuous mode
> 			 * because a slave was disabled then
> 			 * it can now leave promiscuous mode.
> 			 */
>-			dev_set_promiscuity(bond->curr_active_slave->dev, -1);
>-			bond_info->primary_is_promisc = 0;
>-
>-			rtnl_unlock();
>-			read_lock(&bond->lock);
>+			queue_work(bond->wq, &bond->alb_promisc_disable_work);
>+			bond->alb_promisc_disable_work_active = 1;
> 		}
>
> 		if (bond_info->rlb_rebalance) {
>@@ -1505,7 +1525,6 @@ void bond_alb_monitor(struct work_struct *work)
>
> re_arm:
> 	queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 2cc4cfc..f72487a 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -2343,10 +2343,19 @@ static int bond_miimon_inspect(struct bonding *bond)
> 	return commit;
> }
>
>-static void bond_miimon_commit(struct bonding *bond)
>+static void bond_miimon_commit(struct work_struct *work)
> {
> 	struct slave *slave;
> 	int i;
>+	struct bonding *bond = container_of(work, struct bonding,
>+					    miimon_commit_work);
>+
>+	rtnl_lock();
>+	read_lock(&bond->lock);
>+
>+	if (!bond->miimon_commit_work_active)
>+		goto out;
>+	bond->miimon_commit_work_active = 0;
>
> 	bond_for_each_slave(bond, slave, i) {
> 		switch (slave->new_link) {
>@@ -2421,15 +2430,18 @@ static void bond_miimon_commit(struct bonding *bond)
> 		}
>
> do_failover:
>-		ASSERT_RTNL();
> 		write_lock_bh(&bond->curr_slave_lock);
> 		bond_select_active_slave(bond);
> 		write_unlock_bh(&bond->curr_slave_lock);
> 	}
>
> 	bond_set_carrier(bond);
>+out:
>+	read_unlock(&bond->lock);
>+	rtnl_unlock();
> }
>
>+
> /*
>  * bond_mii_monitor
>  *
>@@ -2438,14 +2450,13 @@ do_failover:
>  * an acquisition of appropriate locks followed by a commit phase to
>  * implement whatever link state changes are indicated.
>  */
>+
> void bond_mii_monitor(struct work_struct *work)
> {
> 	struct bonding *bond = container_of(work, struct bonding,
> 					    mii_work.work);
>
> 	read_lock(&bond->lock);
>-	if (bond->kill_timers)
>-		goto out;
>
> 	if (bond->slave_cnt == 0)
> 		goto re_arm;
>@@ -2463,22 +2474,15 @@ void bond_mii_monitor(struct work_struct *work)
> 	}
>
> 	if (bond_miimon_inspect(bond)) {
>-		read_unlock(&bond->lock);
>-		rtnl_lock();
>-		read_lock(&bond->lock);
>-
>-		bond_miimon_commit(bond);
>-
>-		read_unlock(&bond->lock);
>-		rtnl_unlock();	/* might sleep, hold no other locks */
>-		read_lock(&bond->lock);
>+		queue_work(bond->wq, &bond->miimon_commit_work);
>+		bond->miimon_commit_work_active = 1;
> 	}
>
>+
> re_arm:
> 	if (bond->params.miimon)
> 		queue_delayed_work(bond->wq, &bond->mii_work,
> 				   msecs_to_jiffies(bond->params.miimon));
>-out:
> 	read_unlock(&bond->lock);
> }
>
>@@ -2778,9 +2782,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
>
> 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>
>-	if (bond->kill_timers)
>-		goto out;
>-
> 	if (bond->slave_cnt == 0)
> 		goto re_arm;
>
>@@ -2867,7 +2868,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
> re_arm:
> 	if (bond->params.arp_interval)
> 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>@@ -2949,13 +2949,23 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
> /*
>  * Called to commit link state changes noted by inspection step of
>  * active-backup mode ARP monitor.
>- *
>- * Called with RTNL and bond->lock for read.
>  */
>-static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
>+static void bond_ab_arp_commit(struct work_struct *work)
> {
> 	struct slave *slave;
> 	int i;
>+	int delta_in_ticks;
>+	struct bonding *bond = container_of(work, struct bonding,
>+					    ab_arp_commit_work);
>+
>+	rtnl_lock();
>+	read_lock(&bond->lock);
>+
>+	if (!bond->ab_arp_commit_work_active)
>+		goto out;
>+	bond->ab_arp_commit_work_active = 0;
>+
>+	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>
> 	bond_for_each_slave(bond, slave, i) {
> 		switch (slave->new_link) {
>@@ -3014,6 +3024,9 @@ do_failover:
> 	}
>
> 	bond_set_carrier(bond);
>+out:
>+	read_unlock(&bond->lock);
>+	rtnl_unlock();
> }
>
> /*
>@@ -3093,9 +3106,6 @@ void bond_activebackup_arp_mon(struct work_struct *work)
>
> 	read_lock(&bond->lock);
>
>-	if (bond->kill_timers)
>-		goto out;
>-
> 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>
> 	if (bond->slave_cnt == 0)
>@@ -3114,23 +3124,16 @@ void bond_activebackup_arp_mon(struct work_struct *work)
> 	}
>
> 	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
>-		read_unlock(&bond->lock);
>-		rtnl_lock();
>-		read_lock(&bond->lock);
>-
>-		bond_ab_arp_commit(bond, delta_in_ticks);
>-
>-		read_unlock(&bond->lock);
>-		rtnl_unlock();
>-		read_lock(&bond->lock);
>+		queue_work(bond->wq, &bond->ab_arp_commit_work);
>+		bond->ab_arp_commit_work_active = 1;
> 	}
>
>+
> 	bond_ab_arp_probe(bond);
>
> re_arm:
> 	if (bond->params.arp_interval)
> 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
>-out:
> 	read_unlock(&bond->lock);
> }
>
>@@ -3720,8 +3723,6 @@ static int bond_open(struct net_device *bond_dev)
> {
> 	struct bonding *bond = netdev_priv(bond_dev);
>
>-	bond->kill_timers = 0;
>-
> 	if (bond_is_lb(bond)) {
> 		/* bond_alb_initialize must be called before the timer
> 		 * is started.
>@@ -3781,31 +3782,31 @@ static int bond_close(struct net_device *bond_dev)
> 	bond->send_grat_arp = 0;
> 	bond->send_unsol_na = 0;
>
>-	/* signal timers not to re-arm */
>-	bond->kill_timers = 1;
>-
> 	write_unlock_bh(&bond->lock);
>
> 	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
>-		cancel_delayed_work(&bond->mii_work);
>+		cancel_delayed_work_sync(&bond->mii_work);
> 	}
>
> 	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
>-		cancel_delayed_work(&bond->arp_work);
>+		cancel_delayed_work_sync(&bond->arp_work);
> 	}
>
> 	switch (bond->params.mode) {
> 	case BOND_MODE_8023AD:
>-		cancel_delayed_work(&bond->ad_work);
>+		cancel_delayed_work_sync(&bond->ad_work);
> 		break;
> 	case BOND_MODE_TLB:
> 	case BOND_MODE_ALB:
>-		cancel_delayed_work(&bond->alb_work);
>+		cancel_delayed_work_sync(&bond->alb_work);
> 		break;
> 	default:
> 		break;
> 	}
>
>+	bond->miimon_commit_work_active = 0;
>+	bond->ab_arp_commit_work_active = 0;
>+	bond->alb_promisc_disable_work_active = 0;
>
> 	if (bond_is_lb(bond)) {
> 		/* Must be called only after all
>@@ -4660,23 +4661,19 @@ static void bond_setup(struct net_device *bond_dev)
>
> static void bond_work_cancel_all(struct bonding *bond)
> {
>-	write_lock_bh(&bond->lock);
>-	bond->kill_timers = 1;
>-	write_unlock_bh(&bond->lock);
>-
> 	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))
>-		cancel_delayed_work(&bond->mii_work);
>+		cancel_delayed_work_sync(&bond->mii_work);
>
> 	if (bond->params.arp_interval && delayed_work_pending(&bond->arp_work))
>-		cancel_delayed_work(&bond->arp_work);
>+		cancel_delayed_work_sync(&bond->arp_work);
>
> 	if (bond->params.mode == BOND_MODE_ALB &&
> 	    delayed_work_pending(&bond->alb_work))
>-		cancel_delayed_work(&bond->alb_work);
>+		cancel_delayed_work_sync(&bond->alb_work);
>
> 	if (bond->params.mode == BOND_MODE_8023AD &&
> 	    delayed_work_pending(&bond->ad_work))
>-		cancel_delayed_work(&bond->ad_work);
>+		cancel_delayed_work_sync(&bond->ad_work);
> }
>
> /*
>@@ -5094,6 +5091,9 @@ static int bond_init(struct net_device *bond_dev)
> 	bond_prepare_sysfs_group(bond);
>
> 	__hw_addr_init(&bond->mc_list);
>+	INIT_WORK(&bond->miimon_commit_work, bond_miimon_commit);
>+	INIT_WORK(&bond->ab_arp_commit_work, bond_ab_arp_commit);
>+	INIT_WORK(&bond->alb_promisc_disable_work, bond_alb_promisc_disable);
> 	return 0;
> }
>
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index c6fdd85..e771f5d 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -198,7 +198,6 @@ struct bonding {
> 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
> 	rwlock_t lock;
> 	rwlock_t curr_slave_lock;
>-	s8       kill_timers;
> 	s8	 send_grat_arp;
> 	s8	 send_unsol_na;
> 	s8	 setup_by_slave;
>@@ -223,6 +222,12 @@ struct bonding {
> 	struct   delayed_work arp_work;
> 	struct   delayed_work alb_work;
> 	struct   delayed_work ad_work;
>+	struct   work_struct miimon_commit_work;
>+	int	 miimon_commit_work_active;
>+	struct   work_struct ab_arp_commit_work;
>+	int	 ab_arp_commit_work_active;
>+	struct   work_struct alb_promisc_disable_work;
>+	int	 alb_promisc_disable_work_active;
> #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
> 	struct   in6_addr master_ipv6;
> #endif
>@@ -348,6 +353,7 @@ void bond_select_active_slave(struct bonding *bond);
> void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
> void bond_register_arp(struct bonding *);
> void bond_unregister_arp(struct bonding *);
>+void bond_alb_promisc_disable(struct work_struct *work);
>
> struct bond_net {
> 	struct net *		net;	/* Associated network namespace */
>
>-- 
>Jiri Bohac <jbohac@suse.cz>
>SUSE Labs, SUSE CZ

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-09  0:06                 ` Jay Vosburgh
@ 2010-09-16 22:44                   ` Jay Vosburgh
  2010-09-24 11:23                     ` Narendra K
  0 siblings, 1 reply; 25+ messages in thread
From: Jay Vosburgh @ 2010-09-16 22:44 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: bonding-devel, markine, jarkao2, chavey, netdev

Jay Vosburgh <fubar@us.ibm.com> wrote:

>Jiri Bohac <jbohac@suse.cz> wrote:
>
>>On Wed, Sep 01, 2010 at 05:54:00PM -0700, Jay Vosburgh wrote:
>>> Jiri Bohac <jbohac@suse.cz> wrote:
>>> 
>>> 	In looking at bond_open a bit, I wonder if a contributing factor
>>> to the crash (if that's what happens) is that INIT_DELAYED_WORK is being
>>> done in bond_open on a work item that's already queued or running (left
>>> over from the kill_timers sentinel being missed).  Calling
>>> queue_delayed_work when the work item is already queued is ok, I
>>> believe, so I don't think that part is an issue (or at least not a
>>> panic-worthy one).
>>
>>Yes, it is the INIT_DELAYED_WORK (zeroes the pending flag), followed by
>>queue_delayed_work(). What then happens is
>>queue_delayed_work()->queue_delayed_work_on()->BUG_ON(timer_pending(timer));
>
>	Just an update; I'm setting up this patch to give it some
>testing.  I've only got one comment so far (below).  
>
>	My current thought after having looked at the code is that this
>is getting a little too complicated, so my inclination is to instead
>verify that it's permissible (as in, "won't break anything") for the
>various commit things to run after bond_close exits and then not worry
>about the various sentinels.
>
>	I suspect that if bond_close sets all of the slave->new_link
>fields to NOCHANGE, the two commit functions won't do anything.  The
>promisc one is probably not a problem here, although I haven't really
>looked at it much yet.  It might need to not check for bond_is_lb, so
>that the promisc setting doesn't end up orphaned if the mode changes.
[...]

	I had some time to work on this, and I fixed a few nits in the
most recent patch, and also modified it as I describe above (the
new_link business).  This seems to do the right thing for the mii/arp
commit functions.

	The alb_promisc alb_promisc function, however, still has a race.
The curr_active_slave could change between the time the function is
scheduled and when it executes.  That window is pretty small, but does
exist.  Losing the race means that some interface stays promisc when it
shouldn't; I don't believe it will panic.  Fixing that is probably a
matter of stashing a pointer to the slave to be de-promisc-ified
somewhere, but that stash would have to be handled if the slave were to
be removed from the bond.

	I've tested this a bit, and it seems ok, but I can't reproduce
the original problem, so I'm not entirely sure this doesn't break
something very subtle.

	Also, I'll be out of the office for the next two weeks, so I
won't get back to this until I return.  If any interested parties could
test this out and provide some feedback before then, it would be
appreciated.

	-J



diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 822f586..8015e12 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2119,10 +2119,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers) {
-		goto out;
-	}
-
 	//check if there are any slaves
 	if (bond->slave_cnt == 0) {
 		goto re_arm;
@@ -2166,7 +2162,6 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
 
 re_arm:
 	queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c746b33..8242ee2 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1397,6 +1397,40 @@ out:
 	return NETDEV_TX_OK;
 }
 
+void bond_alb_promisc_disable(struct work_struct *work)
+{
+	struct bonding *bond = container_of(work, struct bonding,
+					    alb_promisc_disable_work);
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+	struct net_device *dev = NULL;
+
+	/*
+	 * dev_set_promiscuity requires rtnl and
+	 * nothing else.
+	 */
+	rtnl_lock();
+	read_lock(&bond->lock);
+	read_lock(&bond->curr_slave_lock);
+
+	if (!bond_is_lb(bond))
+		goto out;
+	if (bond->curr_active_slave)
+		dev = bond->curr_active_slave->dev;
+	if (!dev)
+		goto out;
+
+	bond_info->primary_is_promisc = 0;
+	bond_info->rlb_promisc_timeout_counter = 0;
+
+out:
+	read_unlock(&bond->curr_slave_lock);
+	read_unlock(&bond->lock);
+	if (dev)
+		dev_set_promiscuity(dev, -1);
+
+	rtnl_unlock();
+}
+
 void bond_alb_monitor(struct work_struct *work)
 {
 	struct bonding *bond = container_of(work, struct bonding,
@@ -1407,10 +1441,6 @@ void bond_alb_monitor(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers) {
-		goto out;
-	}
-
 	if (bond->slave_cnt == 0) {
 		bond_info->tx_rebalance_counter = 0;
 		bond_info->lp_counter = 0;
@@ -1462,25 +1492,11 @@ void bond_alb_monitor(struct work_struct *work)
 	if (bond_info->rlb_enabled) {
 		if (bond_info->primary_is_promisc &&
 		    (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
-
-			/*
-			 * dev_set_promiscuity requires rtnl and
-			 * nothing else.
-			 */
-			read_unlock(&bond->lock);
-			rtnl_lock();
-
-			bond_info->rlb_promisc_timeout_counter = 0;
-
 			/* If the primary was set to promiscuous mode
 			 * because a slave was disabled then
 			 * it can now leave promiscuous mode.
 			 */
-			dev_set_promiscuity(bond->curr_active_slave->dev, -1);
-			bond_info->primary_is_promisc = 0;
-
-			rtnl_unlock();
-			read_lock(&bond->lock);
+			queue_work(bond->wq, &bond->alb_promisc_disable_work);
 		}
 
 		if (bond_info->rlb_rebalance) {
@@ -1505,7 +1521,6 @@ void bond_alb_monitor(struct work_struct *work)
 
 re_arm:
 	queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2cc4cfc..0ad562b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2343,10 +2343,15 @@ static int bond_miimon_inspect(struct bonding *bond)
 	return commit;
 }
 
-static void bond_miimon_commit(struct bonding *bond)
+static void bond_miimon_commit(struct work_struct *work)
 {
 	struct slave *slave;
 	int i;
+	struct bonding *bond = container_of(work, struct bonding,
+					    miimon_commit_work);
+
+	rtnl_lock();
+	read_lock(&bond->lock);
 
 	bond_for_each_slave(bond, slave, i) {
 		switch (slave->new_link) {
@@ -2421,13 +2426,14 @@ static void bond_miimon_commit(struct bonding *bond)
 		}
 
 do_failover:
-		ASSERT_RTNL();
 		write_lock_bh(&bond->curr_slave_lock);
 		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
 	}
 
 	bond_set_carrier(bond);
+	read_unlock(&bond->lock);
+	rtnl_unlock();
 }
 
 /*
@@ -2444,8 +2450,6 @@ void bond_mii_monitor(struct work_struct *work)
 					    mii_work.work);
 
 	read_lock(&bond->lock);
-	if (bond->kill_timers)
-		goto out;
 
 	if (bond->slave_cnt == 0)
 		goto re_arm;
@@ -2462,23 +2466,14 @@ void bond_mii_monitor(struct work_struct *work)
 		read_unlock(&bond->curr_slave_lock);
 	}
 
-	if (bond_miimon_inspect(bond)) {
-		read_unlock(&bond->lock);
-		rtnl_lock();
-		read_lock(&bond->lock);
-
-		bond_miimon_commit(bond);
+	if (bond_miimon_inspect(bond))
+		queue_work(bond->wq, &bond->miimon_commit_work);
 
-		read_unlock(&bond->lock);
-		rtnl_unlock();	/* might sleep, hold no other locks */
-		read_lock(&bond->lock);
-	}
 
 re_arm:
 	if (bond->params.miimon)
 		queue_delayed_work(bond->wq, &bond->mii_work,
 				   msecs_to_jiffies(bond->params.miimon));
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -2778,9 +2773,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 
 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
-	if (bond->kill_timers)
-		goto out;
-
 	if (bond->slave_cnt == 0)
 		goto re_arm;
 
@@ -2867,7 +2859,6 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
 re_arm:
 	if (bond->params.arp_interval)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -2949,13 +2940,19 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
 /*
  * Called to commit link state changes noted by inspection step of
  * active-backup mode ARP monitor.
- *
- * Called with RTNL and bond->lock for read.
  */
-static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
+static void bond_ab_arp_commit(struct work_struct *work)
 {
 	struct slave *slave;
 	int i;
+	int delta_in_ticks;
+	struct bonding *bond = container_of(work, struct bonding,
+					    ab_arp_commit_work);
+
+	rtnl_lock();
+	read_lock(&bond->lock);
+
+	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
 	bond_for_each_slave(bond, slave, i) {
 		switch (slave->new_link) {
@@ -3014,6 +3011,8 @@ do_failover:
 	}
 
 	bond_set_carrier(bond);
+	read_unlock(&bond->lock);
+	rtnl_unlock();
 }
 
 /*
@@ -3093,9 +3092,6 @@ void bond_activebackup_arp_mon(struct work_struct *work)
 
 	read_lock(&bond->lock);
 
-	if (bond->kill_timers)
-		goto out;
-
 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
 
 	if (bond->slave_cnt == 0)
@@ -3113,24 +3109,14 @@ void bond_activebackup_arp_mon(struct work_struct *work)
 		read_unlock(&bond->curr_slave_lock);
 	}
 
-	if (bond_ab_arp_inspect(bond, delta_in_ticks)) {
-		read_unlock(&bond->lock);
-		rtnl_lock();
-		read_lock(&bond->lock);
-
-		bond_ab_arp_commit(bond, delta_in_ticks);
-
-		read_unlock(&bond->lock);
-		rtnl_unlock();
-		read_lock(&bond->lock);
-	}
+	if (bond_ab_arp_inspect(bond, delta_in_ticks))
+		queue_work(bond->wq, &bond->ab_arp_commit_work);
 
 	bond_ab_arp_probe(bond);
 
 re_arm:
 	if (bond->params.arp_interval)
 		queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
-out:
 	read_unlock(&bond->lock);
 }
 
@@ -3720,8 +3706,6 @@ static int bond_open(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
-	bond->kill_timers = 0;
-
 	if (bond_is_lb(bond)) {
 		/* bond_alb_initialize must be called before the timer
 		 * is started.
@@ -3767,6 +3751,8 @@ static int bond_open(struct net_device *bond_dev)
 static int bond_close(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
+	int i;
 
 	if (bond->params.mode == BOND_MODE_8023AD) {
 		/* Unregister the receive of LACPDUs */
@@ -3781,32 +3767,36 @@ static int bond_close(struct net_device *bond_dev)
 	bond->send_grat_arp = 0;
 	bond->send_unsol_na = 0;
 
-	/* signal timers not to re-arm */
-	bond->kill_timers = 1;
+	/* There's a race between close and the rearming timers over RNTL,
+	 * so any RTNL-needing work is done as a separate work item.
+	 * Here, we arrange for the monitor commit functions to do nothing
+	 * should they happen to run after bond_close.
+	 */
+	bond_for_each_slave(bond, slave, i)
+		slave->new_link = BOND_LINK_NOCHANGE;
 
 	write_unlock_bh(&bond->lock);
 
 	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
-		cancel_delayed_work(&bond->mii_work);
+		cancel_delayed_work_sync(&bond->mii_work);
 	}
 
 	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
-		cancel_delayed_work(&bond->arp_work);
+		cancel_delayed_work_sync(&bond->arp_work);
 	}
 
 	switch (bond->params.mode) {
 	case BOND_MODE_8023AD:
-		cancel_delayed_work(&bond->ad_work);
+		cancel_delayed_work_sync(&bond->ad_work);
 		break;
 	case BOND_MODE_TLB:
 	case BOND_MODE_ALB:
-		cancel_delayed_work(&bond->alb_work);
+		cancel_delayed_work_sync(&bond->alb_work);
 		break;
 	default:
 		break;
 	}
 
-
 	if (bond_is_lb(bond)) {
 		/* Must be called only after all
 		 * slaves have been released
@@ -4660,23 +4650,19 @@ static void bond_setup(struct net_device *bond_dev)
 
 static void bond_work_cancel_all(struct bonding *bond)
 {
-	write_lock_bh(&bond->lock);
-	bond->kill_timers = 1;
-	write_unlock_bh(&bond->lock);
-
 	if (bond->params.miimon && delayed_work_pending(&bond->mii_work))
-		cancel_delayed_work(&bond->mii_work);
+		cancel_delayed_work_sync(&bond->mii_work);
 
 	if (bond->params.arp_interval && delayed_work_pending(&bond->arp_work))
-		cancel_delayed_work(&bond->arp_work);
+		cancel_delayed_work_sync(&bond->arp_work);
 
 	if (bond->params.mode == BOND_MODE_ALB &&
 	    delayed_work_pending(&bond->alb_work))
-		cancel_delayed_work(&bond->alb_work);
+		cancel_delayed_work_sync(&bond->alb_work);
 
 	if (bond->params.mode == BOND_MODE_8023AD &&
 	    delayed_work_pending(&bond->ad_work))
-		cancel_delayed_work(&bond->ad_work);
+		cancel_delayed_work_sync(&bond->ad_work);
 }
 
 /*
@@ -5094,6 +5080,9 @@ static int bond_init(struct net_device *bond_dev)
 	bond_prepare_sysfs_group(bond);
 
 	__hw_addr_init(&bond->mc_list);
+	INIT_WORK(&bond->miimon_commit_work, bond_miimon_commit);
+	INIT_WORK(&bond->ab_arp_commit_work, bond_ab_arp_commit);
+	INIT_WORK(&bond->alb_promisc_disable_work, bond_alb_promisc_disable);
 	return 0;
 }
 
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index c6fdd85..ddf3a94 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -198,7 +198,6 @@ struct bonding {
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;
-	s8       kill_timers;
 	s8	 send_grat_arp;
 	s8	 send_unsol_na;
 	s8	 setup_by_slave;
@@ -223,6 +222,9 @@ struct bonding {
 	struct   delayed_work arp_work;
 	struct   delayed_work alb_work;
 	struct   delayed_work ad_work;
+	struct   work_struct miimon_commit_work;
+	struct   work_struct ab_arp_commit_work;
+	struct   work_struct alb_promisc_disable_work;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct   in6_addr master_ipv6;
 #endif
@@ -348,6 +350,7 @@ void bond_select_active_slave(struct bonding *bond);
 void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
 void bond_register_arp(struct bonding *);
 void bond_unregister_arp(struct bonding *);
+void bond_alb_promisc_disable(struct work_struct *work);
 
 struct bond_net {
 	struct net *		net;	/* Associated network namespace */


---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-16 22:44                   ` Jay Vosburgh
@ 2010-09-24 11:23                     ` Narendra K
  2010-10-01 18:22                       ` Jiri Bohac
  0 siblings, 1 reply; 25+ messages in thread
From: Narendra K @ 2010-09-24 11:23 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Jiri Bohac, bonding-devel, markine, jarkao2, chavey, netdev

On Fri, Sep 17, 2010 at 04:14:33AM +0530, Jay Vosburgh wrote:
> Jay Vosburgh <fubar@us.ibm.com> wrote:
> [...]
> 
> 	I had some time to work on this, and I fixed a few nits in the
> most recent patch, and also modified it as I describe above (the
> new_link business).  This seems to do the right thing for the mii/arp
> commit functions.
> 
> 	The alb_promisc alb_promisc function, however, still has a race.
> The curr_active_slave could change between the time the function is
> scheduled and when it executes.  That window is pretty small, but does
> exist.  Losing the race means that some interface stays promisc when it
> shouldn't; I don't believe it will panic.  Fixing that is probably a
> matter of stashing a pointer to the slave to be de-promisc-ified
> somewhere, but that stash would have to be handled if the slave were to
> be removed from the bond.
> 
> 	I've tested this a bit, and it seems ok, but I can't reproduce
> the original problem, so I'm not entirely sure this doesn't break
> something very subtle.
> 
> 	Also, I'll be out of the office for the next two weeks, so I
> won't get back to this until I return.  If any interested parties could
> test this out and provide some feedback before then, it would be
> appreciated.
> 
Thanks.

Original issue was seen when the system was rebooted and while the
network was shutting down. I applied the patch to linux-next (branch-
20100811) and issued service network stop/start in quick succession.

The bond interface had 4 slaves, 3 with link up and 1 with link down
configured in balance-alb mode, miimon=100, bonding driver version:3.7.0

The follwing call trace was seen -

2.6.35.with.upstream.patch-next-20100811-0.7-default+
[14602.945876] ------------[ cut here ]------------
[14602.950474] kernel BUG at kernel/workqueue.c:2844!
[14602.955242] invalid opcode: 0000 [#1] SMP 
[14602.959341] last sysfs file: /sys/class/net/bonding_masters
[14602.964888] CPU 1 
[14602.966714] Modules linked in: af_packet bonding ipv6 cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf microcode fuse loop dm_mod joydev usbhid hid bnx2 tpm_tis tpm tpm_bios rtc_cmos iTCO_wdt iTCO_vendor_support sr_mod power_meter cdrom sg serio_raw mptctl pcspkr rtc_core usb_storage dcdbas rtc_lib button uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan processor ide_pci_generic ide_core ata_generic ata_piix libata mptsas mptscsih mptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
[14603.015002] 
[14603.016524] Pid: 4006, comm: ifdown-bonding Not tainted 2.6.35.with.upstream.patch-next-20100811-0.7-default+ #2 0M233H/PowerEdge R710
[14603.028554] RIP: 0010:[<ffffffff81067b50>]  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
[14603.037144] RSP: 0018:ffff88022a379d88  EFLAGS: 00010286
[14603.042432] RAX: 000000000000003c RBX: ffff880228674240 RCX: ffff880228f0e800
[14603.049534] RDX: 0000000000001000 RSI: 0000000000000002 RDI: 000000000000001a
[14603.056638] RBP: ffff88022a379da8 R08: ffff88022a379cf8 R09: 0000000000000000
[14603.063741] R10: 00000000ffffffff R11: 0000000000000000 R12: 0000000000000002
[14603.070842] R13: ffffffff817b8560 R14: ffff8802299d1480 R15: ffff8802299d1488
[14603.077944] FS:  00007f8e6a28f700(0000) GS:ffff880001c00000(0000) knlGS:0000000000000000
[14603.085999] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[14603.091719] CR2: 00007f8e6a2c2000 CR3: 0000000127d1c000 CR4: 00000000000006e0
[14603.098822] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[14603.105924] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[14603.113026] Process ifdown-bonding (pid: 4006, threadinfo ffff88022a378000, task ffff8802299b0080)
[14603.121944] Stack:
[14603.123944]  ffff88022a379da8 ffff8802299d1000 ffff8802299d1000 000000010036b6a4
[14603.131182] <0> ffff88022a379dc8 ffffffffa030a91d ffff8802299d1000 000000010036b6a4
[14603.138857] <0> ffff88022a379e28 ffffffff812e0a08 ffff88022a379e38 ffff88022a379de8
[14603.146718] Call Trace:
[14603.149158]  [<ffffffffa030a91d>] bond_destructor+0x1d/0x30 [bonding]
[14603.155572]  [<ffffffff812e0a08>] netdev_run_todo+0x1a8/0x270
[14603.161293]  [<ffffffff812ee859>] rtnl_unlock+0x9/0x10
[14603.166411]  [<ffffffffa0317824>] bonding_store_bonds+0x1c4/0x1f0 [bonding]
[14603.173342]  [<ffffffff810f26be>] ? alloc_pages_current+0x9e/0x110
[14603.179497]  [<ffffffff81285c9e>] class_attr_store+0x1e/0x20
[14603.185132]  [<ffffffff8116e365>] sysfs_write_file+0xc5/0x140
[14603.190853]  [<ffffffff8110a68f>] vfs_write+0xcf/0x190
[14603.195967]  [<ffffffff8110a840>] sys_write+0x50/0x90
[14603.200996]  [<ffffffff81002ec2>] system_call_fastpath+0x16/0x1b
[14603.206974] Code: 00 7f 14 8b 3b eb 91 3d 00 10 00 00 89 c2 77 10 8b 3b e9 07 ff ff ff 3d 00 10 00 00 89 c2 76 f0 8b 3b e9 a9 fe ff ff 0f 0b eb fe <0f> 0b eb fe 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8b 3d 00 
[14603.226419] RIP  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
[14603.232669]  RSP <ffff88022a379d88>
[    0.000000] Initializing cgroup subsys cpuset
[    0.000000] Initializing cgroup subsys cpu

With regards,
Narendra K

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-09-24 11:23                     ` Narendra K
@ 2010-10-01 18:22                       ` Jiri Bohac
  2010-10-05 15:03                         ` Narendra_K
  0 siblings, 1 reply; 25+ messages in thread
From: Jiri Bohac @ 2010-10-01 18:22 UTC (permalink / raw)
  To: Narendra K
  Cc: Jay Vosburgh, Jiri Bohac, bonding-devel, markine, jarkao2,
	chavey, netdev

On Fri, Sep 24, 2010 at 06:23:53AM -0500, Narendra K wrote:
> On Fri, Sep 17, 2010 at 04:14:33AM +0530, Jay Vosburgh wrote:
> > Jay Vosburgh <fubar@us.ibm.com> wrote:
> The follwing call trace was seen -
> 
> 2.6.35.with.upstream.patch-next-20100811-0.7-default+
> [14602.945876] ------------[ cut here ]------------
> [14602.950474] kernel BUG at kernel/workqueue.c:2844!
> [14602.955242] invalid opcode: 0000 [#1] SMP 
> [14602.959341] last sysfs file: /sys/class/net/bonding_masters
> [14602.964888] CPU 1 
> [14602.966714] Modules linked in: af_packet bonding ipv6 cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf microcode fuse loop dm_mod joydev usbhid hid bnx2 tpm_tis tpm tpm_bios rtc_cmos iTCO_wdt iTCO_vendor_support sr_mod power_meter cdrom sg serio_raw mptctl pcspkr rtc_core usb_storage dcdbas rtc_lib button uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan processor ide_pci_generic ide_core ata_generic ata_piix libata mptsas mptscsih mptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
> [14603.015002] 
> [14603.016524] Pid: 4006, comm: ifdown-bonding Not tainted 2.6.35.with.upstream.patch-next-20100811-0.7-default+ #2 0M233H/PowerEdge R710
> [14603.028554] RIP: 0010:[<ffffffff81067b50>]  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
> [14603.037144] RSP: 0018:ffff88022a379d88  EFLAGS: 00010286
> [14603.042432] RAX: 000000000000003c RBX: ffff880228674240 RCX: ffff880228f0e800
> [14603.049534] RDX: 0000000000001000 RSI: 0000000000000002 RDI: 000000000000001a
> [14603.056638] RBP: ffff88022a379da8 R08: ffff88022a379cf8 R09: 0000000000000000
> [14603.063741] R10: 00000000ffffffff R11: 0000000000000000 R12: 0000000000000002
> [14603.070842] R13: ffffffff817b8560 R14: ffff8802299d1480 R15: ffff8802299d1488
> [14603.077944] FS:  00007f8e6a28f700(0000) GS:ffff880001c00000(0000) knlGS:0000000000000000
> [14603.085999] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [14603.091719] CR2: 00007f8e6a2c2000 CR3: 0000000127d1c000 CR4: 00000000000006e0
> [14603.098822] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [14603.105924] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [14603.113026] Process ifdown-bonding (pid: 4006, threadinfo ffff88022a378000, task ffff8802299b0080)
> [14603.121944] Stack:
> [14603.123944]  ffff88022a379da8 ffff8802299d1000 ffff8802299d1000 000000010036b6a4
> [14603.131182] <0> ffff88022a379dc8 ffffffffa030a91d ffff8802299d1000 000000010036b6a4
> [14603.138857] <0> ffff88022a379e28 ffffffff812e0a08 ffff88022a379e38 ffff88022a379de8
> [14603.146718] Call Trace:
> [14603.149158]  [<ffffffffa030a91d>] bond_destructor+0x1d/0x30 [bonding]
> [14603.155572]  [<ffffffff812e0a08>] netdev_run_todo+0x1a8/0x270
> [14603.161293]  [<ffffffff812ee859>] rtnl_unlock+0x9/0x10
> [14603.166411]  [<ffffffffa0317824>] bonding_store_bonds+0x1c4/0x1f0 [bonding]
> [14603.173342]  [<ffffffff810f26be>] ? alloc_pages_current+0x9e/0x110
> [14603.179497]  [<ffffffff81285c9e>] class_attr_store+0x1e/0x20
> [14603.185132]  [<ffffffff8116e365>] sysfs_write_file+0xc5/0x140
> [14603.190853]  [<ffffffff8110a68f>] vfs_write+0xcf/0x190
> [14603.195967]  [<ffffffff8110a840>] sys_write+0x50/0x90
> [14603.200996]  [<ffffffff81002ec2>] system_call_fastpath+0x16/0x1b
> [14603.206974] Code: 00 7f 14 8b 3b eb 91 3d 00 10 00 00 89 c2 77 10 8b 3b e9 07 ff ff ff 3d 00 10 00 00 89 c2 76 f0 8b 3b e9 a9 fe ff ff 0f 0b eb fe <0f> 0b eb fe 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8b 3d 00 
> [14603.226419] RIP  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
> [14603.232669]  RSP <ffff88022a379d88>
> [    0.000000] Initializing cgroup subsys cpuset
> [    0.000000] Initializing cgroup subsys cpu

This should be the BUG_ON(cwq->nr_active) in
destroy_workqueue()

This is really strange. bondng_store_bonds() can do two things:
create or delete a bonding device.

I checked the delete path, where I would normally expect such a
problem, but I can't find a way it could fail in this way.
bondng_store_bonds() calls unregister_netdevice(), which 
- calls rollback_registered() -> bond_close()
- puts the device on the net_todo_list.
On rtnl_unlock() netdev_run_todo() gets called and that calls 
bond_destructor().

bond_close() now makes sure the rearming work items are not
pending, thus, the only work items that may still be pending on
the workqueue are the non-rearming "commit" work items.
flush_workqueue(), called at the beginning of destroy_workqueue()
should have waited for these to finish.
If all of the above is correct, this BUG_ON should never trigger.

Maybe I am overlooking something, or it may be some kind of
failure/race condition in the create path, resulting in
bond_destructor() being called as well.

Narendra, any chance to capture the dmesg lines preceeding the
BUG message? This should show which of the above cases it is.

I will try to come up with a debug patch that will tell us which
work remains active on the work queue.

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-10-01 18:22                       ` Jiri Bohac
@ 2010-10-05 15:03                         ` Narendra_K
  2010-10-06  7:36                           ` Narendra_K
  0 siblings, 1 reply; 25+ messages in thread
From: Narendra_K @ 2010-10-05 15:03 UTC (permalink / raw)
  To: jbohac; +Cc: fubar, bonding-devel, markine, jarkao2, chavey, netdev

On Fri, Oct 01, 2010 at 11:52:32PM +0530, Jiri Bohac wrote:
>    On Fri, Sep 24, 2010 at 06:23:53AM -0500, Narendra K wrote:
>    > On Fri, Sep 17, 2010 at 04:14:33AM +0530, Jay Vosburgh wrote:
>    > > Jay Vosburgh <fubar@us.ibm.com> wrote:
>    > The follwing call trace was seen -
>    >
>    > 2.6.35.with.upstream.patch-next-20100811-0.7-default+
>    > [14602.945876] ------------[ cut here ]------------
>    > [14602.950474] kernel BUG at kernel/workqueue.c:2844!
>    > [14602.955242] invalid opcode: 0000 [#1] SMP
>    > [14602.959341] last sysfs file: /sys/class/net/bonding_masters
>    > [14602.964888] CPU 1
>    > [14602.966714] Modules linked in: af_packet bonding ipv6
>    cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq
>    mperf microcode fuse loop dm_mod joydev usbhid hid bnx2 tpm_tis tpm
>    tpm_bios rtc_cmos iTCO_wdt iTCO_vendor_support sr_mod power_meter cdrom sg
>    serio_raw mptctl pcspkr rtc_core usb_storage dcdbas rtc_lib button
>    uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan
>    processor ide_pci_generic ide_core ata_generic ata_piix libata mptsas
>    mptscsih mptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
>    > [14603.015002]
>    > [14603.016524] Pid: 4006, comm: ifdown-bonding Not tainted
>    2.6.35.with.upstream.patch-next-20100811-0.7-default+ #2 0M233H/PowerEdge
>    R710
>    > [14603.028554] RIP: 0010:[<ffffffff81067b50>]  [<ffffffff81067b50>]
>    destroy_workqueue+0x1d0/0x1e0
>    > [14603.037144] RSP: 0018:ffff88022a379d88  EFLAGS: 00010286
>    > [14603.042432] RAX: 000000000000003c RBX: ffff880228674240 RCX:
>    ffff880228f0e800
>    > [14603.049534] RDX: 0000000000001000 RSI: 0000000000000002 RDI:
>    000000000000001a
>    > [14603.056638] RBP: ffff88022a379da8 R08: ffff88022a379cf8 R09:
>    0000000000000000
>    > [14603.063741] R10: 00000000ffffffff R11: 0000000000000000 R12:
>    0000000000000002
>    > [14603.070842] R13: ffffffff817b8560 R14: ffff8802299d1480 R15:
>    ffff8802299d1488
>    > [14603.077944] FS:  00007f8e6a28f700(0000) GS:ffff880001c00000(0000)
>    knlGS:0000000000000000
>    > [14603.085999] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>    > [14603.091719] CR2: 00007f8e6a2c2000 CR3: 0000000127d1c000 CR4:
>    00000000000006e0
>    > [14603.098822] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>    0000000000000000
>    > [14603.105924] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
>    0000000000000400
>    > [14603.113026] Process ifdown-bonding (pid: 4006, threadinfo
>    ffff88022a378000, task ffff8802299b0080)
>    > [14603.121944] Stack:
>    > [14603.123944]  ffff88022a379da8 ffff8802299d1000 ffff8802299d1000
>    000000010036b6a4
>    > [14603.131182] <0> ffff88022a379dc8 ffffffffa030a91d ffff8802299d1000
>    000000010036b6a4
>    > [14603.138857] <0> ffff88022a379e28 ffffffff812e0a08 ffff88022a379e38
>    ffff88022a379de8
>    > [14603.146718] Call Trace:
>    > [14603.149158]  [<ffffffffa030a91d>] bond_destructor+0x1d/0x30 [bonding]
>    > [14603.155572]  [<ffffffff812e0a08>] netdev_run_todo+0x1a8/0x270
>    > [14603.161293]  [<ffffffff812ee859>] rtnl_unlock+0x9/0x10
>    > [14603.166411]  [<ffffffffa0317824>] bonding_store_bonds+0x1c4/0x1f0
>    [bonding]
>    > [14603.173342]  [<ffffffff810f26be>] ? alloc_pages_current+0x9e/0x110
>    > [14603.179497]  [<ffffffff81285c9e>] class_attr_store+0x1e/0x20
>    > [14603.185132]  [<ffffffff8116e365>] sysfs_write_file+0xc5/0x140
>    > [14603.190853]  [<ffffffff8110a68f>] vfs_write+0xcf/0x190
>    > [14603.195967]  [<ffffffff8110a840>] sys_write+0x50/0x90
>    > [14603.200996]  [<ffffffff81002ec2>] system_call_fastpath+0x16/0x1b
>    > [14603.206974] Code: 00 7f 14 8b 3b eb 91 3d 00 10 00 00 89 c2 77 10 8b
>    3b e9 07 ff ff ff 3d 00 10 00 00 89 c2 76 f0 8b 3b e9 a9 fe ff ff 0f 0b eb
>    fe <0f> 0b eb fe 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8b 3d 00
>    > [14603.226419] RIP  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
>    > [14603.232669]  RSP <ffff88022a379d88>
>    > [    0.000000] Initializing cgroup subsys cpuset
>    > [    0.000000] Initializing cgroup subsys cpu
> 
>    This should be the BUG_ON(cwq->nr_active) in
>    destroy_workqueue()
> 
>    This is really strange. bondng_store_bonds() can do two things:
>    create or delete a bonding device.
> 
>    I checked the delete path, where I would normally expect such a
>    problem, but I can't find a way it could fail in this way.
>    bondng_store_bonds() calls unregister_netdevice(), which
>    - calls rollback_registered() -> bond_close()
>    - puts the device on the net_todo_list.
>    On rtnl_unlock() netdev_run_todo() gets called and that calls
>    bond_destructor().
> 
>    bond_close() now makes sure the rearming work items are not
>    pending, thus, the only work items that may still be pending on
>    the workqueue are the non-rearming "commit" work items.
>    flush_workqueue(), called at the beginning of destroy_workqueue()
>    should have waited for these to finish.
>    If all of the above is correct, this BUG_ON should never trigger.
> 
>    Maybe I am overlooking something, or it may be some kind of
>    failure/race condition in the create path, resulting in
>    bond_destructor() being called as well.
> 
>    Narendra, any chance to capture the dmesg lines preceeding the
>    BUG message? This should show which of the above cases it is.

Jiri, I will try to reproduce the issue with ignore_loglevel to capture
more data on the serial console and share it shortly.

> 
>    I will try to come up with a debug patch that will tell us which
>    work remains active on the work queue.
> 
>    --
>    Jiri Bohac <jbohac@suse.cz>
>    SUSE Labs, SUSE CZ

-- 
With regards,
Narendra K

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC] bonding: fix workqueue re-arming races
  2010-10-05 15:03                         ` Narendra_K
@ 2010-10-06  7:36                           ` Narendra_K
  0 siblings, 0 replies; 25+ messages in thread
From: Narendra_K @ 2010-10-06  7:36 UTC (permalink / raw)
  To: jbohac; +Cc: fubar, bonding-devel, markine, jarkao2, chavey, netdev

On Tue, Oct 05, 2010 at 08:33:29PM +0530, K, Narendra wrote:
>    On Fri, Oct 01, 2010 at 11:52:32PM +0530, Jiri Bohac wrote:
>    >    On Fri, Sep 24, 2010 at 06:23:53AM -0500, Narendra K wrote:
>    >    > On Fri, Sep 17, 2010 at 04:14:33AM +0530, Jay Vosburgh wrote:
>    >    > > Jay Vosburgh <fubar@us.ibm.com> wrote:
>    >    > The follwing call trace was seen -
>    >
>    >    This should be the BUG_ON(cwq->nr_active) in
>    >    destroy_workqueue()
>    >
>    >    This is really strange. bondng_store_bonds() can do two things:
>    >    create or delete a bonding device.
>    >
>    >    I checked the delete path, where I would normally expect such a
>    >    problem, but I can't find a way it could fail in this way.
>    >    bondng_store_bonds() calls unregister_netdevice(), which
>    >    - calls rollback_registered() -> bond_close()
>    >    - puts the device on the net_todo_list.
>    >    On rtnl_unlock() netdev_run_todo() gets called and that calls
>    >    bond_destructor().
>    >
>    >    bond_close() now makes sure the rearming work items are not
>    >    pending, thus, the only work items that may still be pending on
>    >    the workqueue are the non-rearming "commit" work items.
>    >    flush_workqueue(), called at the beginning of destroy_workqueue()
>    >    should have waited for these to finish.
>    >    If all of the above is correct, this BUG_ON should never trigger.
>    >
>    >    Maybe I am overlooking something, or it may be some kind of
>    >    failure/race condition in the create path, resulting in
>    >    bond_destructor() being called as well.
>    >
>    >    Narendra, any chance to capture the dmesg lines preceeding the
>    >    BUG message? This should show which of the above cases it is.
>
>    Jiri, I will try to reproduce the issue with ignore_loglevel to capture
>    more data on the serial console and share it shortly.

Here is a more verbose sequence of log messages just before the issue is
hit. I have attached logs beginning two iterations before the failure.
Please let me know if you need logs further back in the sequence.

[ 6139.115628] bonding: bond0 is being created...
[ 6139.122159] bonding: bond0: setting mode to balance-alb (6).
[ 6139.128537] bonding: bond0: Setting MII monitoring interval to 100.
[ 6139.137702] ADDRCONF(NETDEV_UP): bond0: link is not ready
[ 6139.145516] bonding: bond0: Adding slave eth0.
[ 6139.173964] bnx2 0000:01:00.0: irq 79 for MSI/MSI-X
[ 6139.179111] bnx2 0000:01:00.0: irq 80 for MSI/MSI-X
[ 6139.184262] bnx2 0000:01:00.0: irq 81 for MSI/MSI-X
[ 6139.189424] bnx2 0000:01:00.0: irq 82 for MSI/MSI-X
[ 6139.194559] bnx2 0000:01:00.0: irq 83 for MSI/MSI-X
[ 6139.199701] bnx2 0000:01:00.0: irq 84 for MSI/MSI-X
[ 6139.204873] bnx2 0000:01:00.0: irq 85 for MSI/MSI-X
[ 6139.210016] bnx2 0000:01:00.0: irq 86 for MSI/MSI-X
[ 6139.215158] bnx2 0000:01:00.0: irq 87 for MSI/MSI-X
[ 6139.270975] bnx2 0000:01:00.0: eth0: using MSIX
[ 6139.278893] bonding: bond0: enslaving eth0 as an active interface with a down link.
[ 6139.291007] bonding: bond0: Adding slave eth1.
[ 6139.321113] bnx2 0000:01:00.1: irq 88 for MSI/MSI-X
[ 6139.325991] bnx2 0000:01:00.1: irq 89 for MSI/MSI-X
[ 6139.330866] bnx2 0000:01:00.1: irq 90 for MSI/MSI-X
[ 6139.335752] bnx2 0000:01:00.1: irq 91 for MSI/MSI-X
[ 6139.340626] bnx2 0000:01:00.1: irq 92 for MSI/MSI-X
[ 6139.345500] bnx2 0000:01:00.1: irq 93 for MSI/MSI-X
[ 6139.350373] bnx2 0000:01:00.1: irq 94 for MSI/MSI-X
[ 6139.355246] bnx2 0000:01:00.1: irq 95 for MSI/MSI-X
[ 6139.360119] bnx2 0000:01:00.1: irq 96 for MSI/MSI-X
[ 6139.418765] bnx2 0000:01:00.1: eth1: using MSIX
[ 6139.426671] bonding: bond0: enslaving eth1 as an active interface with a down link.
[ 6139.438664] bonding: bond0: Adding slave eth2.
[ 6139.469101] bnx2 0000:02:00.0: irq 97 for MSI/MSI-X
[ 6139.473980] bnx2 0000:02:00.0: irq 98 for MSI/MSI-X
[ 6139.478856] bnx2 0000:02:00.0: irq 99 for MSI/MSI-X
[ 6139.483743] bnx2 0000:02:00.0: irq 100 for MSI/MSI-X
[ 6139.488706] bnx2 0000:02:00.0: irq 101 for MSI/MSI-X
[ 6139.493670] bnx2 0000:02:00.0: irq 102 for MSI/MSI-X
[ 6139.498641] bnx2 0000:02:00.0: irq 103 for MSI/MSI-X
[ 6139.503604] bnx2 0000:02:00.0: irq 104 for MSI/MSI-X
[ 6139.508566] bnx2 0000:02:00.0: irq 105 for MSI/MSI-X
[ 6139.566908] bnx2 0000:02:00.0: eth2: using MSIX
[ 6139.574815] bonding: bond0: enslaving eth2 as an active interface with a down link.
[ 6139.586686] bonding: bond0: Adding slave eth3.
[ 6139.617042] bnx2 0000:02:00.1: irq 106 for MSI/MSI-X
[ 6139.622011] bnx2 0000:02:00.1: irq 107 for MSI/MSI-X
[ 6139.626974] bnx2 0000:02:00.1: irq 108 for MSI/MSI-X
[ 6139.631942] bnx2 0000:02:00.1: irq 109 for MSI/MSI-X
[ 6139.636904] bnx2 0000:02:00.1: irq 110 for MSI/MSI-X
[ 6139.641867] bnx2 0000:02:00.1: irq 111 for MSI/MSI-X
[ 6139.646835] bnx2 0000:02:00.1: irq 112 for MSI/MSI-X
[ 6139.651798] bnx2 0000:02:00.1: irq 113 for MSI/MSI-X
[ 6139.656760] bnx2 0000:02:00.1: irq 114 for MSI/MSI-X
[ 6139.714833] bnx2 0000:02:00.1: eth3: using MSIX
[ 6139.722929] bonding: bond0: enslaving eth3 as an active interface with a down link.
[ 6141.684544] bnx2 0000:01:00.0: eth0: NIC Copper Link is Up, 1000 Mbps full duplex
[ 6141.732924] bonding: bond0: link status definitely up for interface eth0.
[ 6141.739714] bonding: bond0: making interface eth0 the new active one.
[ 6141.749618] bonding: bond0: first active interface up!
[ 6141.756427] ADDRCONF(NETDEV_CHANGE): bond0: link becomes ready
[ 6141.832597] bnx2 0000:01:00.1: eth1: NIC Copper Link is Up, 1000 Mbps full duplex
[ 6141.840185] bonding: bond0: link status definitely up for interface eth1.
[ 6142.013511] bnx2 0000:02:00.0: eth2: NIC Copper Link is Up, 1000 Mbps full duplex, receive & transmit flow control ON
[ 6142.037019] bonding: bond0: link status definitely up for interface eth2.
[ 6146.252919] bonding: bond0: link status definitely down for interface eth0, disabling it
[ 6146.261009] bonding: bond0: making interface eth1 the new active one.
[ 6146.305466] bnx2 0000:01:00.0: irq 79 for MSI/MSI-X
[ 6146.310347] bnx2 0000:01:00.0: irq 80 for MSI/MSI-X
[ 6146.315223] bnx2 0000:01:00.0: irq 81 for MSI/MSI-X
[ 6146.320100] bnx2 0000:01:00.0: irq 82 for MSI/MSI-X
[ 6146.324983] bnx2 0000:01:00.0: irq 83 for MSI/MSI-X
[ 6146.329858] bnx2 0000:01:00.0: irq 84 for MSI/MSI-X
[ 6146.334735] bnx2 0000:01:00.0: irq 85 for MSI/MSI-X
[ 6146.339611] bnx2 0000:01:00.0: irq 86 for MSI/MSI-X
[ 6146.344535] bnx2 0000:01:00.0: irq 87 for MSI/MSI-X
[ 6146.402416] bnx2 0000:01:00.0: eth0: using MSIX
[ 6147.100849] bonding: bond0: link status definitely down for interface eth1, disabling it
[ 6147.108943] bonding: bond0: making interface eth2 the new active one.
[ 6147.153439] bnx2 0000:01:00.1: irq 88 for MSI/MSI-X
[ 6147.158322] bnx2 0000:01:00.1: irq 89 for MSI/MSI-X
[ 6147.163199] bnx2 0000:01:00.1: irq 90 for MSI/MSI-X
[ 6147.168081] bnx2 0000:01:00.1: irq 91 for MSI/MSI-X
[ 6147.172957] bnx2 0000:01:00.1: irq 92 for MSI/MSI-X
[ 6147.177833] bnx2 0000:01:00.1: irq 93 for MSI/MSI-X
[ 6147.182717] bnx2 0000:01:00.1: irq 94 for MSI/MSI-X
[ 6147.187597] bnx2 0000:01:00.1: irq 95 for MSI/MSI-X
[ 6147.192584] bnx2 0000:01:00.1: irq 96 for MSI/MSI-X
[ 6147.250432] bnx2 0000:01:00.1: eth1: using MSIX
[ 6147.956727] bonding: bond0: link status definitely down for interface eth2, disabling it
[ 6147.964821] bonding: bond0: now running without any active interface !
[ 6148.005316] bnx2 0000:02:00.0: irq 97 for MSI/MSI-X
[ 6148.010198] bnx2 0000:02:00.0: irq 98 for MSI/MSI-X
[ 6148.015072] bnx2 0000:02:00.0: irq 99 for MSI/MSI-X
[ 6148.019949] bnx2 0000:02:00.0: irq 100 for MSI/MSI-X
[ 6148.024911] bnx2 0000:02:00.0: irq 101 for MSI/MSI-X
[ 6148.029872] bnx2 0000:02:00.0: irq 102 for MSI/MSI-X
[ 6148.034833] bnx2 0000:02:00.0: irq 103 for MSI/MSI-X
[ 6148.039793] bnx2 0000:02:00.0: irq 104 for MSI/MSI-X
[ 6148.044756] bnx2 0000:02:00.0: irq 105 for MSI/MSI-X
[ 6148.102286] bnx2 0000:02:00.0: eth2: using MSIX
[ 6148.873352] bnx2 0000:02:00.1: irq 106 for MSI/MSI-X
[ 6148.878319] bnx2 0000:02:00.1: irq 107 for MSI/MSI-X
[ 6148.883293] bnx2 0000:02:00.1: irq 108 for MSI/MSI-X
[ 6148.888253] bnx2 0000:02:00.1: irq 109 for MSI/MSI-X
[ 6148.893213] bnx2 0000:02:00.1: irq 110 for MSI/MSI-X
[ 6148.898174] bnx2 0000:02:00.1: irq 111 for MSI/MSI-X
[ 6148.903134] bnx2 0000:02:00.1: irq 112 for MSI/MSI-X
[ 6148.908094] bnx2 0000:02:00.1: irq 113 for MSI/MSI-X
[ 6148.913060] bnx2 0000:02:00.1: irq 114 for MSI/MSI-X
[ 6148.970345] bnx2 0000:02:00.1: eth3: using MSIX
[ 6149.445381] bnx2 0000:01:00.0: eth0: NIC Copper Link is Up, 1000 Mbps full duplex
[ 6149.540563] bonding: bond0: link status definitely up for interface eth0.
[ 6149.547352] bonding: bond0: making interface eth0 the new active one.
[ 6149.557274] bonding: bond0: first active interface up!
[ 6149.941355] bonding: bond0: Removing slave eth0.
[ 6149.945983] bonding: bond0: Warning: the permanent HWaddr of eth0 - 00:22:19:cc:9a:25 - is still in use by bond0. Set the HWaddr of eth0 to a different address to avoid conflicts.
[ 6149.962010] bonding: bond0: releasing active interface eth0
[ 6150.074401] bonding: bond0: Removing slave eth1.
[ 6150.079027] bonding: bond0: releasing active interface eth1
[ 6150.202403] bonding: bond0: Removing slave eth2.
[ 6150.207028] bonding: bond0: releasing active interface eth2
[ 6150.330275] bonding: bond0: Removing slave eth3.
[ 6150.334900] bonding: bond0: releasing active interface eth3
[ 6150.552776] bonding: bond0 is being deleted...
[ 6156.145939] bonding: bond0 is being created...
[ 6156.152253] bonding: bond0: setting mode to balance-alb (6).
[ 6156.158899] bonding: bond0: Setting MII monitoring interval to 100.
[ 6156.168142] ADDRCONF(NETDEV_UP): bond0: link is not ready
[ 6156.176120] bonding: bond0: Adding slave eth0.
[ 6156.205188] bnx2 0000:01:00.0: irq 79 for MSI/MSI-X
[ 6156.210066] bnx2 0000:01:00.0: irq 80 for MSI/MSI-X
[ 6156.214942] bnx2 0000:01:00.0: irq 81 for MSI/MSI-X
[ 6156.219817] bnx2 0000:01:00.0: irq 82 for MSI/MSI-X
[ 6156.224724] bnx2 0000:01:00.0: irq 83 for MSI/MSI-X
[ 6156.229599] bnx2 0000:01:00.0: irq 84 for MSI/MSI-X
[ 6156.234473] bnx2 0000:01:00.0: irq 85 for MSI/MSI-X
[ 6156.239347] bnx2 0000:01:00.0: irq 86 for MSI/MSI-X
[ 6156.244222] bnx2 0000:01:00.0: irq 87 for MSI/MSI-X
[ 6156.301985] bnx2 0000:01:00.0: eth0: using MSIX
[ 6156.309899] bonding: bond0: enslaving eth0 as an active interface with a down link.
[ 6156.321860] bonding: bond0: Adding slave eth1.
[ 6156.348184] bnx2 0000:01:00.1: irq 88 for MSI/MSI-X
[ 6156.353066] bnx2 0000:01:00.1: irq 89 for MSI/MSI-X
[ 6156.357942] bnx2 0000:01:00.1: irq 90 for MSI/MSI-X
[ 6156.362818] bnx2 0000:01:00.1: irq 91 for MSI/MSI-X
[ 6156.367694] bnx2 0000:01:00.1: irq 92 for MSI/MSI-X
[ 6156.372570] bnx2 0000:01:00.1: irq 93 for MSI/MSI-X
[ 6156.377445] bnx2 0000:01:00.1: irq 94 for MSI/MSI-X
[ 6156.382325] bnx2 0000:01:00.1: irq 95 for MSI/MSI-X
[ 6156.387199] bnx2 0000:01:00.1: irq 96 for MSI/MSI-X
[ 6156.445966] bnx2 0000:01:00.1: eth1: using MSIX
[ 6156.453878] bonding: bond0: enslaving eth1 as an active interface with a down link.
[ 6156.465826] bonding: bond0: Adding slave eth2.
[ 6156.496230] bnx2 0000:02:00.0: irq 97 for MSI/MSI-X
[ 6156.501110] bnx2 0000:02:00.0: irq 98 for MSI/MSI-X
[ 6156.505986] bnx2 0000:02:00.0: irq 99 for MSI/MSI-X
[ 6156.510868] bnx2 0000:02:00.0: irq 100 for MSI/MSI-X
[ 6156.515832] bnx2 0000:02:00.0: irq 101 for MSI/MSI-X
[ 6156.520794] bnx2 0000:02:00.0: irq 102 for MSI/MSI-X
[ 6156.525760] bnx2 0000:02:00.0: irq 103 for MSI/MSI-X
[ 6156.530723] bnx2 0000:02:00.0: irq 104 for MSI/MSI-X
[ 6156.535685] bnx2 0000:02:00.0: irq 105 for MSI/MSI-X
[ 6156.594017] bnx2 0000:02:00.0: eth2: using MSIX
[ 6156.601932] bonding: bond0: enslaving eth2 as an active interface with a down link.
[ 6156.613845] bonding: bond0: Adding slave eth3.
[ 6156.644229] bnx2 0000:02:00.1: irq 106 for MSI/MSI-X
[ 6156.649197] bnx2 0000:02:00.1: irq 107 for MSI/MSI-X
[ 6156.654161] bnx2 0000:02:00.1: irq 108 for MSI/MSI-X
[ 6156.659130] bnx2 0000:02:00.1: irq 109 for MSI/MSI-X
[ 6156.664093] bnx2 0000:02:00.1: irq 110 for MSI/MSI-X
[ 6156.669056] bnx2 0000:02:00.1: irq 111 for MSI/MSI-X
[ 6156.674023] bnx2 0000:02:00.1: irq 112 for MSI/MSI-X
[ 6156.678985] bnx2 0000:02:00.1: irq 113 for MSI/MSI-X
[ 6156.683947] bnx2 0000:02:00.1: irq 114 for MSI/MSI-X
[ 6156.742024] bnx2 0000:02:00.1: eth3: using MSIX
[ 6156.749940] bonding: bond0: enslaving eth3 as an active interface with a down link.
[ 6158.808483] bnx2 0000:01:00.0: eth0: NIC Copper Link is Up, 1000 Mbps full duplex
[ 6158.868054] bonding: bond0: link status definitely up for interface eth0.
[ 6158.874846] bonding: bond0: making interface eth0 the new active one.
[ 6158.884753] bonding: bond0: first active interface up!
[ 6158.891563] ADDRCONF(NETDEV_CHANGE): bond0: link becomes ready
[ 6158.922727] bnx2 0000:01:00.1: eth1: NIC Copper Link is Up, 1000 Mbps full duplex
[ 6158.968157] bonding: bond0: link status definitely up for interface eth1.
[ 6159.021517] bnx2 0000:02:00.0: eth2: NIC Copper Link is Up, 1000 Mbps full duplex, receive & transmit flow control ON
[ 6159.068235] bonding: bond0: link status definitely up for interface eth2.
[ 6160.040150] bnx2 0000:01:00.1: eth1: NIC Copper Link is Down
[ 6160.069773] bonding: bond0: link status definitely down for interface eth1, disabling it
[ 6162.460199] bnx2 0000:01:00.1: eth1: NIC Copper Link is Up, 1000 Mbps full duplex
[ 6162.467865] bonding: bond0: link status definitely up for interface eth1.
[ 6163.168061] bonding: bond0: link status definitely down for interface eth0, disabling it
[ 6163.176149] bonding: bond0: making interface eth1 the new active one.
[ 6163.220460] bnx2 0000:01:00.0: irq 79 for MSI/MSI-X
[ 6163.225481] bnx2 0000:01:00.0: irq 80 for MSI/MSI-X
[ 6163.230413] bnx2 0000:01:00.0: irq 81 for MSI/MSI-X
[ 6163.235342] bnx2 0000:01:00.0: irq 82 for MSI/MSI-X
[ 6163.240282] bnx2 0000:01:00.0: irq 83 for MSI/MSI-X
[ 6163.245212] bnx2 0000:01:00.0: irq 84 for MSI/MSI-X
[ 6163.250142] bnx2 0000:01:00.0: irq 85 for MSI/MSI-X
[ 6163.255080] bnx2 0000:01:00.0: irq 86 for MSI/MSI-X
[ 6163.260009] bnx2 0000:01:00.0: irq 87 for MSI/MSI-X
[ 6163.317682] bnx2 0000:01:00.0: eth0: using MSIX
[ 6164.032023] bonding: bond0: link status definitely down for interface eth1, disabling it
[ 6164.040115] bonding: bond0: making interface eth2 the new active one.
[ 6164.084607] bnx2 0000:01:00.1: irq 88 for MSI/MSI-X
[ 6164.089488] bnx2 0000:01:00.1: irq 89 for MSI/MSI-X
[ 6164.094361] bnx2 0000:01:00.1: irq 90 for MSI/MSI-X
[ 6164.099236] bnx2 0000:01:00.1: irq 91 for MSI/MSI-X
[ 6164.104111] bnx2 0000:01:00.1: irq 92 for MSI/MSI-X
[ 6164.108986] bnx2 0000:01:00.1: irq 93 for MSI/MSI-X
[ 6164.113860] bnx2 0000:01:00.1: irq 94 for MSI/MSI-X
[ 6164.118733] bnx2 0000:01:00.1: irq 95 for MSI/MSI-X
[ 6164.123608] bnx2 0000:01:00.1: irq 96 for MSI/MSI-X
[ 6164.181533] bnx2 0000:01:00.1: eth1: using MSIX
[ 6164.880097] bonding: bond0: link status definitely down for interface eth2, disabling it
[ 6164.888191] bonding: bond0: now running without any active interface !
[ 6164.932440] bnx2 0000:02:00.0: irq 97 for MSI/MSI-X
[ 6164.937322] bnx2 0000:02:00.0: irq 98 for MSI/MSI-X
[ 6164.942253] bnx2 0000:02:00.0: irq 99 for MSI/MSI-X
[ 6164.947192] bnx2 0000:02:00.0: irq 100 for MSI/MSI-X
[ 6164.952208] bnx2 0000:02:00.0: irq 101 for MSI/MSI-X
[ 6164.957234] bnx2 0000:02:00.0: irq 102 for MSI/MSI-X
[ 6164.962247] bnx2 0000:02:00.0: irq 103 for MSI/MSI-X
[ 6164.967274] bnx2 0000:02:00.0: irq 104 for MSI/MSI-X
[ 6164.972290] bnx2 0000:02:00.0: irq 105 for MSI/MSI-X
[ 6165.029585] bnx2 0000:02:00.0: eth2: using MSIX
[ 6165.776479] bnx2 0000:02:00.1: irq 106 for MSI/MSI-X
[ 6165.781449] bnx2 0000:02:00.1: irq 107 for MSI/MSI-X
[ 6165.786422] bnx2 0000:02:00.1: irq 108 for MSI/MSI-X
[ 6165.791385] bnx2 0000:02:00.1: irq 109 for MSI/MSI-X
[ 6165.796345] bnx2 0000:02:00.1: irq 110 for MSI/MSI-X
[ 6165.801312] bnx2 0000:02:00.1: irq 111 for MSI/MSI-X
[ 6165.806273] bnx2 0000:02:00.1: irq 112 for MSI/MSI-X
[ 6165.811233] bnx2 0000:02:00.1: irq 113 for MSI/MSI-X
[ 6165.816199] bnx2 0000:02:00.1: irq 114 for MSI/MSI-X
[ 6165.873445] bnx2 0000:02:00.1: eth3: using MSIX
[ 6165.951287] bnx2 0000:01:00.0: eth0: NIC Copper Link is Up, 1000 Mbps full duplex
[ 6165.967840] bonding: bond0: link status definitely up for interface eth0.
[ 6165.974631] bonding: bond0: making interface eth0 the new active one.
[ 6165.984534] bonding: bond0: first active interface up!
[ 6166.836175] bnx2 0000:01:00.1: eth1: NIC Copper Link is Up, 1000 Mbps full duplex
[ 6166.839757] bonding: bond0: Removing slave eth0.
[ 6166.839768] bonding: bond0: Warning: the permanent HWaddr of eth0 - 00:22:19:cc:9a:25 - is still in use by bond0. Set the HWaddr of eth0 to a different address to avoid conflicts.
[ 6166.839772] bonding: bond0: releasing active interface eth0
[ 6166.870004]
[ 6166.999414] bonding: bond0: Removing slave eth1.
[ 6167.004041] bonding: bond0: releasing active interface eth1
[ 6167.125571] bonding: bond0: Removing slave eth2.
[ 6167.130196] bonding: bond0: releasing active interface eth2
[ 6167.253539] bonding: bond0: Removing slave eth3.
[ 6167.258162] bonding: bond0: releasing active interface eth3
[ 6167.443911] bonding: bond0 is being deleted...
[ 6167.508557] ------------[ cut here ]------------
[ 6167.513167] kernel BUG at kernel/workqueue.c:2844!
[ 6167.517948] invalid opcode: 0000 [#1] SMP
[ 6167.522058] last sysfs file: /sys/class/net/bonding_masters
[ 6167.527619] CPU 0
[ 6167.529452] Modules linked in: af_packet bonding ipv6 mperf microcode fuse loop dm_mod joydev usbhid hid tpm_tis tpm usb_storage iTCO_wdt tpm_bios iTCO_vendor_support rtc_cmos rtc_core rtc_lib sg mptctl sr_mod cdrom dcdbas power_meter bnx2 serio_raw button pcspkr uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan processor ide_pci_generic ide_core ata_generic ata_piix libata mptsas mptscsih mptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
[ 6167.571721]
[ 6167.573209] Pid: 13848, comm: ifdown-bonding Not tainted 2.6.35.with.upstream.patch-next-20100811-0.7-default+ #1 0M233H/PowerEdge R710
[ 6167.585362] RIP: 0010:[<ffffffff81067b50>]  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
[ 6167.593977] RSP: 0018:ffff880229981d88  EFLAGS: 00010286
[ 6167.599278] RAX: 000000000000003c RBX: ffff880127646800 RCX: ffff880128324700
[ 6167.606401] RDX: 0000000000001000 RSI: 0000000000000002 RDI: 000000000000001a
[ 6167.613523] RBP: ffff880229981da8 R08: ffff880229981cf8 R09: 0000000000000000
[ 6167.620646] R10: 00000000ffffffff R11: 0000000000000000 R12: 0000000000000002
[ 6167.627769] R13: ffffffff817b8560 R14: ffff88012a028480 R15: ffff88012a028488
[ 6167.634892] FS:  00007fb251db3700(0000) GS:ffff880133a00000(0000) knlGS:0000000000000000
[ 6167.642969] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 6167.648705] CR2: 00007fb251de6000 CR3: 00000001283ca000 CR4: 00000000000006f0
[ 6167.655827] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6167.662950] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 6167.670073] Process ifdown-bonding (pid: 13848, threadinfo ffff880229980000, task ffff88021db9a600)
[ 6167.679103] Stack:
[ 6167.681110]  ffff880229981da8 ffff88012a028000 ffff88012a028000 000000010016619c
[ 6167.688354] <0> ffff880229981dc8 ffffffffa03bf91d ffff88012a028000 000000010016619c
[ 6167.696053] <0> ffff880229981e28 ffffffff812e0a08 ffff880229981e38 ffff880229981de8
[ 6167.703936] Call Trace:
[ 6167.706382]  [<ffffffffa03bf91d>] bond_destructor+0x1d/0x30 [bonding]
[ 6167.712816]  [<ffffffff812e0a08>] netdev_run_todo+0x1a8/0x270
[ 6167.718555]  [<ffffffff812ee859>] rtnl_unlock+0x9/0x10
[ 6167.723752]  [<ffffffffa03cc824>] bonding_store_bonds+0x1c4/0x1f0 [bonding]
[ 6167.730705]  [<ffffffff810f26be>] ? alloc_pages_current+0x9e/0x110
[ 6167.736876]  [<ffffffff81285c9e>] class_attr_store+0x1e/0x20
[ 6167.742528]  [<ffffffff8116e365>] sysfs_write_file+0xc5/0x140
[ 6167.748267]  [<ffffffff8110a68f>] vfs_write+0xcf/0x190
[ 6167.753397]  [<ffffffff8110a840>] sys_write+0x50/0x90
[ 6167.758444]  [<ffffffff81002ec2>] system_call_fastpath+0x16/0x1b
[ 6167.764439] Code: 00 7f 14 8b 3b eb 91 3d 00 10 00 00 89 c2 77 10 8b 3b e9 07 ff ff ff 3d 00 10 00 00 89 c2 76 f0 8b 3b e9 a9 fe ff ff 0f 0b eb fe <0f> 0b eb fe 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8b 3d 00
[ 6167.783930] RIP  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
[ 6167.790200]  RSP <ffff880229981d88>
[    0.000000] Initializing cgroup subsys cpuset
[    0.000000] Initializing cgroup subsys cpu


--
With regards,
Narendra K

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2010-10-06  7:41 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-31 17:07 [RFC] bonding: fix workqueue re-arming races Jiri Bohac
2010-08-31 20:54 ` Jay Vosburgh
2010-09-01 12:23   ` Jarek Poplawski
2010-09-01 13:30     ` Jiri Bohac
2010-09-01 15:18       ` Jarek Poplawski
2010-09-01 15:37         ` Jarek Poplawski
2010-09-01 19:00           ` Jarek Poplawski
2010-09-01 19:11             ` Jiri Bohac
2010-09-01 19:20               ` Jarek Poplawski
2010-09-01 19:38                 ` Jarek Poplawski
2010-09-01 19:46                 ` Jay Vosburgh
2010-09-01 20:06                   ` Jarek Poplawski
2010-09-01 13:16   ` Jiri Bohac
2010-09-01 17:14     ` Jay Vosburgh
2010-09-01 18:31       ` Jiri Bohac
2010-09-01 20:00         ` Jay Vosburgh
2010-09-01 20:56           ` Jiri Bohac
2010-09-02  0:54             ` Jay Vosburgh
2010-09-02 17:08               ` Jiri Bohac
2010-09-09  0:06                 ` Jay Vosburgh
2010-09-16 22:44                   ` Jay Vosburgh
2010-09-24 11:23                     ` Narendra K
2010-10-01 18:22                       ` Jiri Bohac
2010-10-05 15:03                         ` Narendra_K
2010-10-06  7:36                           ` Narendra_K

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.