All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Move inetdev/ifa over to RCU
@ 2004-08-12 23:59 David S. Miller
  2004-08-13  2:20 ` James Morris
                   ` (2 more replies)
  0 siblings, 3 replies; 46+ messages in thread
From: David S. Miller @ 2004-08-12 23:59 UTC (permalink / raw)
  To: netdev; +Cc: robert.olsson, hadi, kuznet


[ Robert, Jamal, Alexey, the previous version I sent you guys
  privately early today had a minor bug, in inet_free_ifa()
  we now need to use in_dev_put() instead of __in_dev_put() ]

The main motivation for this was to make fib_validate_source()
cheaper, as currently it needs a global lock in order to
access the inet device interface lists.

This makes is all use RCU.

I kept the non-RCU lock usage in multicast address list
handling in net/ipv4/igmp.c, but that could use RCU as
well if we wanted to.

While doing this I noticed that devinet.c had these two
counters (inet_ifa_count and inet_dev_count) which were
updated but nobody ever read, so these got killed.

Someone poke holes in this patch please :-)

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/08/12 16:42:07-07:00 davem@nuts.davemloft.net 
#   [IPV4]: Move inetdev/ifa locking over to RCU.
#   
#   Multicast ipv4 address handling still uses rwlock
#   and spinlock synchronization.
#   
#   Signed-off-by: David S. Miller <davem@redhat.com>
# 
# net/sctp/protocol.c
#   2004/08/12 16:41:13-07:00 davem@nuts.davemloft.net +3 -5
#   [IPV4]: Move inetdev/ifa locking over to RCU.
# 
# net/irda/irlan/irlan_eth.c
#   2004/08/12 16:41:13-07:00 davem@nuts.davemloft.net +2 -2
#   [IPV4]: Move inetdev/ifa locking over to RCU.
# 
# net/ipv4/route.c
#   2004/08/12 16:41:13-07:00 davem@nuts.davemloft.net +3 -3
#   [IPV4]: Move inetdev/ifa locking over to RCU.
# 
# net/ipv4/igmp.c
#   2004/08/12 16:41:13-07:00 davem@nuts.davemloft.net +47 -45
#   [IPV4]: Move inetdev/ifa locking over to RCU.
# 
# net/ipv4/icmp.c
#   2004/08/12 16:41:13-07:00 davem@nuts.davemloft.net +2 -2
#   [IPV4]: Move inetdev/ifa locking over to RCU.
# 
# net/ipv4/fib_frontend.c
#   2004/08/12 16:41:13-07:00 davem@nuts.davemloft.net +2 -2
#   [IPV4]: Move inetdev/ifa locking over to RCU.
# 
# net/ipv4/devinet.c
#   2004/08/12 16:41:13-07:00 davem@nuts.davemloft.net +49 -63
#   [IPV4]: Move inetdev/ifa locking over to RCU.
# 
# include/linux/inetdevice.h
#   2004/08/12 16:41:13-07:00 davem@nuts.davemloft.net +17 -11
#   [IPV4]: Move inetdev/ifa locking over to RCU.
# 
diff -Nru a/include/linux/inetdevice.h b/include/linux/inetdevice.h
--- a/include/linux/inetdevice.h	2004-08-12 16:42:31 -07:00
+++ b/include/linux/inetdevice.h	2004-08-12 16:42:31 -07:00
@@ -3,6 +3,8 @@
 
 #ifdef __KERNEL__
 
+#include <linux/rcupdate.h>
+
 struct ipv4_devconf
 {
 	int	accept_redirects;
@@ -31,13 +33,13 @@
 
 struct in_device
 {
-	struct net_device		*dev;
+	struct net_device	*dev;
 	atomic_t		refcnt;
-	rwlock_t		lock;
 	int			dead;
 	struct in_ifaddr	*ifa_list;	/* IP ifaddr chain		*/
+	rwlock_t		mc_list_lock;
 	struct ip_mc_list	*mc_list;	/* IP multicast filter chain    */
-	rwlock_t		mc_lock;	/* for mc_tomb */
+	spinlock_t		mc_tomb_lock;
 	struct ip_mc_list	*mc_tomb;
 	unsigned long		mr_v1_seen;
 	unsigned long		mr_v2_seen;
@@ -50,6 +52,7 @@
 
 	struct neigh_parms	*arp_parms;
 	struct ipv4_devconf	cnf;
+	struct rcu_head		rcu_head;
 };
 
 #define IN_DEV_FORWARD(in_dev)		((in_dev)->cnf.forwarding)
@@ -80,6 +83,7 @@
 {
 	struct in_ifaddr	*ifa_next;
 	struct in_device	*ifa_dev;
+	struct rcu_head		rcu_head;
 	u32			ifa_local;
 	u32			ifa_address;
 	u32			ifa_mask;
@@ -133,19 +137,16 @@
 
 #define endfor_ifa(in_dev) }
 
-extern rwlock_t inetdev_lock;
-
-
 static __inline__ struct in_device *
 in_dev_get(const struct net_device *dev)
 {
 	struct in_device *in_dev;
 
-	read_lock(&inetdev_lock);
+	rcu_read_lock();
 	in_dev = dev->ip_ptr;
 	if (in_dev)
 		atomic_inc(&in_dev->refcnt);
-	read_unlock(&inetdev_lock);
+	rcu_read_unlock();
 	return in_dev;
 }
 
@@ -157,11 +158,16 @@
 
 extern void in_dev_finish_destroy(struct in_device *idev);
 
-static __inline__ void
-in_dev_put(struct in_device *idev)
+static inline void in_dev_rcu_destroy(struct rcu_head *head)
+{
+	struct in_device *idev = container_of(head, struct in_device, rcu_head);
+	in_dev_finish_destroy(idev);
+}
+
+static inline void in_dev_put(struct in_device *idev)
 {
 	if (atomic_dec_and_test(&idev->refcnt))
-		in_dev_finish_destroy(idev);
+		call_rcu(&idev->rcu_head, in_dev_rcu_destroy);
 }
 
 #define __in_dev_put(idev)  atomic_dec(&(idev)->refcnt)
diff -Nru a/net/ipv4/devinet.c b/net/ipv4/devinet.c
--- a/net/ipv4/devinet.c	2004-08-12 16:42:31 -07:00
+++ b/net/ipv4/devinet.c	2004-08-12 16:42:31 -07:00
@@ -88,12 +88,9 @@
 static void devinet_sysctl_unregister(struct ipv4_devconf *p);
 #endif
 
-int inet_ifa_count;
-int inet_dev_count;
-
 /* Locks all the inet devices. */
 
-rwlock_t inetdev_lock = RW_LOCK_UNLOCKED;
+static spinlock_t inetdev_lock = SPIN_LOCK_UNLOCKED;
 
 static struct in_ifaddr *inet_alloc_ifa(void)
 {
@@ -101,18 +98,24 @@
 
 	if (ifa) {
 		memset(ifa, 0, sizeof(*ifa));
-		inet_ifa_count++;
+		INIT_RCU_HEAD(&ifa->rcu_head);
 	}
 
 	return ifa;
 }
 
-static __inline__ void inet_free_ifa(struct in_ifaddr *ifa)
+static inline void inet_free_ifa(struct in_ifaddr *ifa)
 {
 	if (ifa->ifa_dev)
-		__in_dev_put(ifa->ifa_dev);
+		in_dev_put(ifa->ifa_dev);
 	kfree(ifa);
-	inet_ifa_count--;
+}
+
+static void inet_rcu_free_ifa(struct rcu_head *head)
+{
+	struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
+
+	inet_free_ifa(ifa);
 }
 
 void in_dev_finish_destroy(struct in_device *idev)
@@ -129,7 +132,6 @@
 	if (!idev->dead)
 		printk("Freeing alive in_device %p\n", idev);
 	else {
-		inet_dev_count--;
 		kfree(idev);
 	}
 }
@@ -144,24 +146,23 @@
 	if (!in_dev)
 		goto out;
 	memset(in_dev, 0, sizeof(*in_dev));
-	in_dev->lock = RW_LOCK_UNLOCKED;
+	INIT_RCU_HEAD(&in_dev->rcu_head);
 	memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
 	in_dev->cnf.sysctl = NULL;
 	in_dev->dev = dev;
 	if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
 		goto out_kfree;
-	inet_dev_count++;
 	/* Reference in_dev->dev */
 	dev_hold(dev);
 #ifdef CONFIG_SYSCTL
 	neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
 			      NET_IPV4_NEIGH, "ipv4", NULL);
 #endif
-	write_lock_bh(&inetdev_lock);
+	spin_lock_bh(&inetdev_lock);
 	dev->ip_ptr = in_dev;
 	/* Account for reference dev->ip_ptr */
 	in_dev_hold(in_dev);
-	write_unlock_bh(&inetdev_lock);
+	spin_unlock_bh(&inetdev_lock);
 #ifdef CONFIG_SYSCTL
 	devinet_sysctl_register(in_dev, &in_dev->cnf);
 #endif
@@ -188,16 +189,16 @@
 
 	while ((ifa = in_dev->ifa_list) != NULL) {
 		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
-		inet_free_ifa(ifa);
+		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 	}
 
 #ifdef CONFIG_SYSCTL
 	devinet_sysctl_unregister(&in_dev->cnf);
 #endif
-	write_lock_bh(&inetdev_lock);
+	spin_lock_bh(&inetdev_lock);
 	in_dev->dev->ip_ptr = NULL;
 	/* in_dev_put following below will kill the in_device */
-	write_unlock_bh(&inetdev_lock);
+	spin_unlock_bh(&inetdev_lock);
 
 #ifdef CONFIG_SYSCTL
 	neigh_sysctl_unregister(in_dev->arp_parms);
@@ -208,16 +209,16 @@
 
 int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
 {
-	read_lock(&in_dev->lock);
+	rcu_read_lock();
 	for_primary_ifa(in_dev) {
 		if (inet_ifa_match(a, ifa)) {
 			if (!b || inet_ifa_match(b, ifa)) {
-				read_unlock(&in_dev->lock);
+				rcu_read_unlock();
 				return 1;
 			}
 		}
 	} endfor_ifa(in_dev);
-	read_unlock(&in_dev->lock);
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -241,21 +242,21 @@
 				ifap1 = &ifa->ifa_next;
 				continue;
 			}
-			write_lock_bh(&in_dev->lock);
+			spin_lock_bh(&inetdev_lock);
 			*ifap1 = ifa->ifa_next;
-			write_unlock_bh(&in_dev->lock);
+			spin_unlock_bh(&inetdev_lock);
 
 			rtmsg_ifa(RTM_DELADDR, ifa);
 			notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
-			inet_free_ifa(ifa);
+			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 		}
 	}
 
 	/* 2. Unlink it */
 
-	write_lock_bh(&in_dev->lock);
+	spin_lock_bh(&inetdev_lock);
 	*ifap = ifa1->ifa_next;
-	write_unlock_bh(&in_dev->lock);
+	spin_unlock_bh(&inetdev_lock);
 
 	/* 3. Announce address deletion */
 
@@ -270,7 +271,7 @@
 	rtmsg_ifa(RTM_DELADDR, ifa1);
 	notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 	if (destroy) {
-		inet_free_ifa(ifa1);
+		call_rcu(&ifa1->rcu_head, inet_rcu_free_ifa);
 
 		if (!in_dev->ifa_list)
 			inetdev_destroy(in_dev);
@@ -285,7 +286,7 @@
 	ASSERT_RTNL();
 
 	if (!ifa->ifa_local) {
-		inet_free_ifa(ifa);
+		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 		return 0;
 	}
 
@@ -300,11 +301,11 @@
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa)) {
 			if (ifa1->ifa_local == ifa->ifa_local) {
-				inet_free_ifa(ifa);
+				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 				return -EEXIST;
 			}
 			if (ifa1->ifa_scope != ifa->ifa_scope) {
-				inet_free_ifa(ifa);
+				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 				return -EINVAL;
 			}
 			ifa->ifa_flags |= IFA_F_SECONDARY;
@@ -317,9 +318,9 @@
 	}
 
 	ifa->ifa_next = *ifap;
-	write_lock_bh(&in_dev->lock);
+	spin_lock_bh(&inetdev_lock);
 	*ifap = ifa;
-	write_unlock_bh(&in_dev->lock);
+	spin_unlock_bh(&inetdev_lock);
 
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
@@ -339,7 +340,7 @@
 	if (!in_dev) {
 		in_dev = inetdev_init(dev);
 		if (!in_dev) {
-			inet_free_ifa(ifa);
+			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 			return -ENOBUFS;
 		}
 	}
@@ -771,12 +772,11 @@
 	u32 addr = 0;
 	struct in_device *in_dev;
 
-	read_lock(&inetdev_lock);
+	rcu_read_lock();
 	in_dev = __in_dev_get(dev);
 	if (!in_dev)
 		goto out_unlock_inetdev;
 
-	read_lock(&in_dev->lock);
 	for_primary_ifa(in_dev) {
 		if (ifa->ifa_scope > scope)
 			continue;
@@ -787,8 +787,7 @@
 		if (!addr)
 			addr = ifa->ifa_local;
 	} endfor_ifa(in_dev);
-	read_unlock(&in_dev->lock);
-	read_unlock(&inetdev_lock);
+	rcu_read_unlock();
 
 	if (addr)
 		goto out;
@@ -798,30 +797,25 @@
 	   in dev_base list.
 	 */
 	read_lock(&dev_base_lock);
-	read_lock(&inetdev_lock);
+	rcu_read_lock();
 	for (dev = dev_base; dev; dev = dev->next) {
 		if ((in_dev = __in_dev_get(dev)) == NULL)
 			continue;
 
-		read_lock(&in_dev->lock);
 		for_primary_ifa(in_dev) {
 			if (ifa->ifa_scope != RT_SCOPE_LINK &&
 			    ifa->ifa_scope <= scope) {
-				read_unlock(&in_dev->lock);
 				addr = ifa->ifa_local;
 				goto out_unlock_both;
 			}
 		} endfor_ifa(in_dev);
-		read_unlock(&in_dev->lock);
 	}
 out_unlock_both:
-	read_unlock(&inetdev_lock);
 	read_unlock(&dev_base_lock);
+out_unlock_inetdev:
+	rcu_read_unlock();
 out:
 	return addr;
-out_unlock_inetdev:
-	read_unlock(&inetdev_lock);
-	goto out;
 }
 
 static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst,
@@ -874,29 +868,24 @@
 	struct in_device *in_dev;
 
 	if (dev) {
-		read_lock(&inetdev_lock);
-		if ((in_dev = __in_dev_get(dev))) {
-			read_lock(&in_dev->lock);
+		rcu_read_lock();
+		if ((in_dev = __in_dev_get(dev)))
 			addr = confirm_addr_indev(in_dev, dst, local, scope);
-			read_unlock(&in_dev->lock);
-		}
-		read_unlock(&inetdev_lock);
+		rcu_read_unlock();
 
 		return addr;
 	}
 
 	read_lock(&dev_base_lock);
-	read_lock(&inetdev_lock);
+	rcu_read_lock();
 	for (dev = dev_base; dev; dev = dev->next) {
 		if ((in_dev = __in_dev_get(dev))) {
-			read_lock(&in_dev->lock);
 			addr = confirm_addr_indev(in_dev, dst, local, scope);
-			read_unlock(&in_dev->lock);
 			if (addr)
 				break;
 		}
 	}
-	read_unlock(&inetdev_lock);
+	rcu_read_unlock();
 	read_unlock(&dev_base_lock);
 
 	return addr;
@@ -1065,12 +1054,12 @@
 			continue;
 		if (idx > s_idx)
 			s_ip_idx = 0;
-		read_lock(&inetdev_lock);
+		rcu_read_lock();
 		if ((in_dev = __in_dev_get(dev)) == NULL) {
-			read_unlock(&inetdev_lock);
+			rcu_read_unlock();
 			continue;
 		}
-		read_lock(&in_dev->lock);
+
 		for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
 		     ifa = ifa->ifa_next, ip_idx++) {
 			if (ip_idx < s_ip_idx)
@@ -1078,13 +1067,11 @@
 			if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
 					     cb->nlh->nlmsg_seq,
 					     RTM_NEWADDR) <= 0) {
-				read_unlock(&in_dev->lock);
-				read_unlock(&inetdev_lock);
+				rcu_read_unlock();
 				goto done;
 			}
 		}
-		read_unlock(&in_dev->lock);
-		read_unlock(&inetdev_lock);
+		rcu_read_unlock();
 	}
 
 done:
@@ -1138,11 +1125,11 @@
 	read_lock(&dev_base_lock);
 	for (dev = dev_base; dev; dev = dev->next) {
 		struct in_device *in_dev;
-		read_lock(&inetdev_lock);
+		rcu_read_lock();
 		in_dev = __in_dev_get(dev);
 		if (in_dev)
 			in_dev->cnf.forwarding = on;
-		read_unlock(&inetdev_lock);
+		rcu_read_unlock();
 	}
 	read_unlock(&dev_base_lock);
 
@@ -1508,6 +1495,5 @@
 EXPORT_SYMBOL(in_dev_finish_destroy);
 EXPORT_SYMBOL(inet_select_addr);
 EXPORT_SYMBOL(inetdev_by_index);
-EXPORT_SYMBOL(inetdev_lock);
 EXPORT_SYMBOL(register_inetaddr_notifier);
 EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff -Nru a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
--- a/net/ipv4/fib_frontend.c	2004-08-12 16:42:31 -07:00
+++ b/net/ipv4/fib_frontend.c	2004-08-12 16:42:31 -07:00
@@ -172,13 +172,13 @@
 	int ret;
 
 	no_addr = rpf = 0;
-	read_lock(&inetdev_lock);
+	rcu_read_lock();
 	in_dev = __in_dev_get(dev);
 	if (in_dev) {
 		no_addr = in_dev->ifa_list == NULL;
 		rpf = IN_DEV_RPFILTER(in_dev);
 	}
-	read_unlock(&inetdev_lock);
+	rcu_read_unlock();
 
 	if (in_dev == NULL)
 		goto e_inval;
diff -Nru a/net/ipv4/icmp.c b/net/ipv4/icmp.c
--- a/net/ipv4/icmp.c	2004-08-12 16:42:31 -07:00
+++ b/net/ipv4/icmp.c	2004-08-12 16:42:31 -07:00
@@ -878,7 +878,7 @@
 	in_dev = in_dev_get(dev);
 	if (!in_dev)
 		goto out;
-	read_lock(&in_dev->lock);
+	rcu_read_lock();
 	if (in_dev->ifa_list &&
 	    IN_DEV_LOG_MARTIANS(in_dev) &&
 	    IN_DEV_FORWARD(in_dev)) {
@@ -895,7 +895,7 @@
 			       NIPQUAD(mask), dev->name, NIPQUAD(rt->rt_src));
 		}
 	}
-	read_unlock(&in_dev->lock);
+	rcu_read_unlock();
 	in_dev_put(in_dev);
 out:;
 }
diff -Nru a/net/ipv4/igmp.c b/net/ipv4/igmp.c
--- a/net/ipv4/igmp.c	2004-08-12 16:42:31 -07:00
+++ b/net/ipv4/igmp.c	2004-08-12 16:42:31 -07:00
@@ -487,7 +487,7 @@
 	int type;
 
 	if (!pmc) {
-		read_lock(&in_dev->lock);
+		read_lock(&in_dev->mc_list_lock);
 		for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
 			if (pmc->multiaddr == IGMP_ALL_HOSTS)
 				continue;
@@ -499,7 +499,7 @@
 			skb = add_grec(skb, pmc, type, 0, 0);
 			spin_unlock_bh(&pmc->lock);
 		}
-		read_unlock(&in_dev->lock);
+		read_unlock(&in_dev->mc_list_lock);
 	} else {
 		spin_lock_bh(&pmc->lock);
 		if (pmc->sfcount[MCAST_EXCLUDE])
@@ -541,8 +541,8 @@
 	struct sk_buff *skb = NULL;
 	int type, dtype;
 
-	read_lock(&in_dev->lock);
-	write_lock_bh(&in_dev->mc_lock);
+	read_lock(&in_dev->mc_list_lock);
+	spin_lock_bh(&in_dev->mc_tomb_lock);
 
 	/* deleted MCA's */
 	pmc_prev = NULL;
@@ -575,7 +575,7 @@
 		} else
 			pmc_prev = pmc;
 	}
-	write_unlock_bh(&in_dev->mc_lock);
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
 
 	/* change recs */
 	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
@@ -601,7 +601,8 @@
 		}
 		spin_unlock_bh(&pmc->lock);
 	}
-	read_unlock(&in_dev->lock);
+	read_unlock(&in_dev->mc_list_lock);
+
 	if (!skb)
 		return;
 	(void) igmpv3_sendpack(skb);
@@ -759,14 +760,14 @@
 	if (group == IGMP_ALL_HOSTS)
 		return;
 
-	read_lock(&in_dev->lock);
+	read_lock(&in_dev->mc_list_lock);
 	for (im=in_dev->mc_list; im!=NULL; im=im->next) {
 		if (im->multiaddr == group) {
 			igmp_stop_timer(im);
 			break;
 		}
 	}
-	read_unlock(&in_dev->lock);
+	read_unlock(&in_dev->mc_list_lock);
 }
 
 static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
@@ -840,7 +841,7 @@
 	 * - Use the igmp->igmp_code field as the maximum
 	 *   delay possible
 	 */
-	read_lock(&in_dev->lock);
+	read_lock(&in_dev->mc_list_lock);
 	for (im=in_dev->mc_list; im!=NULL; im=im->next) {
 		if (group && group != im->multiaddr)
 			continue;
@@ -856,7 +857,7 @@
 		spin_unlock_bh(&im->lock);
 		igmp_mod_timer(im, max_delay);
 	}
-	read_unlock(&in_dev->lock);
+	read_unlock(&in_dev->mc_list_lock);
 }
 
 int igmp_rcv(struct sk_buff *skb)
@@ -982,10 +983,10 @@
 	}
 	spin_unlock_bh(&im->lock);
 
-	write_lock_bh(&in_dev->mc_lock);
+	spin_lock_bh(&in_dev->mc_tomb_lock);
 	pmc->next = in_dev->mc_tomb;
 	in_dev->mc_tomb = pmc;
-	write_unlock_bh(&in_dev->mc_lock);
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
 }
 
 static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr)
@@ -993,7 +994,7 @@
 	struct ip_mc_list *pmc, *pmc_prev;
 	struct ip_sf_list *psf, *psf_next;
 
-	write_lock_bh(&in_dev->mc_lock);
+	spin_lock_bh(&in_dev->mc_tomb_lock);
 	pmc_prev = NULL;
 	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
 		if (pmc->multiaddr == multiaddr)
@@ -1006,7 +1007,7 @@
 		else
 			in_dev->mc_tomb = pmc->next;
 	}
-	write_unlock_bh(&in_dev->mc_lock);
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
 	if (pmc) {
 		for (psf=pmc->tomb; psf; psf=psf_next) {
 			psf_next = psf->sf_next;
@@ -1021,10 +1022,10 @@
 {
 	struct ip_mc_list *pmc, *nextpmc;
 
-	write_lock_bh(&in_dev->mc_lock);
+	spin_lock_bh(&in_dev->mc_tomb_lock);
 	pmc = in_dev->mc_tomb;
 	in_dev->mc_tomb = NULL;
-	write_unlock_bh(&in_dev->mc_lock);
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
 
 	for (; pmc; pmc = nextpmc) {
 		nextpmc = pmc->next;
@@ -1033,7 +1034,7 @@
 		kfree(pmc);
 	}
 	/* clear dead sources, too */
-	read_lock(&in_dev->lock);
+	read_lock(&in_dev->mc_list_lock);
 	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
 		struct ip_sf_list *psf, *psf_next;
 
@@ -1046,7 +1047,7 @@
 			kfree(psf);
 		}
 	}
-	read_unlock(&in_dev->lock);
+	read_unlock(&in_dev->mc_list_lock);
 }
 #endif
 
@@ -1167,10 +1168,10 @@
 	im->gsquery = 0;
 #endif
 	im->loaded = 0;
-	write_lock_bh(&in_dev->lock);
+	write_lock_bh(&in_dev->mc_list_lock);
 	im->next=in_dev->mc_list;
 	in_dev->mc_list=im;
-	write_unlock_bh(&in_dev->lock);
+	write_unlock_bh(&in_dev->mc_list_lock);
 #ifdef CONFIG_IP_MULTICAST
 	igmpv3_del_delrec(in_dev, im->multiaddr);
 #endif
@@ -1194,9 +1195,9 @@
 	for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
 		if (i->multiaddr==addr) {
 			if (--i->users == 0) {
-				write_lock_bh(&in_dev->lock);
+				write_lock_bh(&in_dev->mc_list_lock);
 				*ip = i->next;
-				write_unlock_bh(&in_dev->lock);
+				write_unlock_bh(&in_dev->mc_list_lock);
 				igmp_group_dropped(i);
 
 				if (!in_dev->dead)
@@ -1251,7 +1252,8 @@
 	in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
 #endif
 
-	in_dev->mc_lock = RW_LOCK_UNLOCKED;
+	in_dev->mc_list_lock = RW_LOCK_UNLOCKED;
+	in_dev->mc_tomb_lock = SPIN_LOCK_UNLOCKED;
 }
 
 /* Device going up */
@@ -1281,17 +1283,17 @@
 	/* Deactivate timers */
 	ip_mc_down(in_dev);
 
-	write_lock_bh(&in_dev->lock);
+	write_lock_bh(&in_dev->mc_list_lock);
 	while ((i = in_dev->mc_list) != NULL) {
 		in_dev->mc_list = i->next;
-		write_unlock_bh(&in_dev->lock);
+		write_unlock_bh(&in_dev->mc_list_lock);
 
 		igmp_group_dropped(i);
 		ip_ma_put(i);
 
-		write_lock_bh(&in_dev->lock);
+		write_lock_bh(&in_dev->mc_list_lock);
 	}
-	write_unlock_bh(&in_dev->lock);
+	write_unlock_bh(&in_dev->mc_list_lock);
 }
 
 static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
@@ -1391,18 +1393,18 @@
 
 	if (!in_dev)
 		return -ENODEV;
-	read_lock(&in_dev->lock);
+	read_lock(&in_dev->mc_list_lock);
 	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
 		if (*pmca == pmc->multiaddr)
 			break;
 	}
 	if (!pmc) {
 		/* MCA not found?? bug */
-		read_unlock(&in_dev->lock);
+		read_unlock(&in_dev->mc_list_lock);
 		return -ESRCH;
 	}
 	spin_lock_bh(&pmc->lock);
-	read_unlock(&in_dev->lock);
+	read_unlock(&in_dev->mc_list_lock);
 #ifdef CONFIG_IP_MULTICAST
 	sf_markstate(pmc);
 #endif
@@ -1527,18 +1529,18 @@
 
 	if (!in_dev)
 		return -ENODEV;
-	read_lock(&in_dev->lock);
+	read_lock(&in_dev->mc_list_lock);
 	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
 		if (*pmca == pmc->multiaddr)
 			break;
 	}
 	if (!pmc) {
 		/* MCA not found?? bug */
-		read_unlock(&in_dev->lock);
+		read_unlock(&in_dev->mc_list_lock);
 		return -ESRCH;
 	}
 	spin_lock_bh(&pmc->lock);
-	read_unlock(&in_dev->lock);
+	read_unlock(&in_dev->mc_list_lock);
 
 #ifdef CONFIG_IP_MULTICAST
 	sf_markstate(pmc);
@@ -2095,7 +2097,7 @@
 	struct ip_sf_list *psf;
 	int rv = 0;
 
-	read_lock(&in_dev->lock);
+	read_lock(&in_dev->mc_list_lock);
 	for (im=in_dev->mc_list; im; im=im->next) {
 		if (im->multiaddr == mc_addr)
 			break;
@@ -2117,7 +2119,7 @@
 		} else
 			rv = 1; /* unspecified source; tentatively allow */
 	}
-	read_unlock(&in_dev->lock);
+	read_unlock(&in_dev->mc_list_lock);
 	return rv;
 }
 
@@ -2141,13 +2143,13 @@
 		in_dev = in_dev_get(state->dev);
 		if (!in_dev)
 			continue;
-		read_lock(&in_dev->lock);
+		read_lock(&in_dev->mc_list_lock);
 		im = in_dev->mc_list;
 		if (im) {
 			state->in_dev = in_dev;
 			break;
 		}
-		read_unlock(&in_dev->lock);
+		read_unlock(&in_dev->mc_list_lock);
 		in_dev_put(in_dev);
 	}
 	return im;
@@ -2159,7 +2161,7 @@
 	im = im->next;
 	while (!im) {
 		if (likely(state->in_dev != NULL)) {
-			read_unlock(&state->in_dev->lock);
+			read_unlock(&state->in_dev->mc_list_lock);
 			in_dev_put(state->in_dev);
 		}
 		state->dev = state->dev->next;
@@ -2170,7 +2172,7 @@
 		state->in_dev = in_dev_get(state->dev);
 		if (!state->in_dev)
 			continue;
-		read_lock(&state->in_dev->lock);
+		read_lock(&state->in_dev->mc_list_lock);
 		im = state->in_dev->mc_list;
 	}
 	return im;
@@ -2206,7 +2208,7 @@
 {
 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
 	if (likely(state->in_dev != NULL)) {
-		read_unlock(&state->in_dev->lock);
+		read_unlock(&state->in_dev->mc_list_lock);
 		in_dev_put(state->in_dev);
 		state->in_dev = NULL;
 	}
@@ -2304,7 +2306,7 @@
 		idev = in_dev_get(state->dev);
 		if (unlikely(idev == NULL))
 			continue;
-		read_lock_bh(&idev->lock);
+		read_lock(&idev->mc_list_lock);
 		im = idev->mc_list;
 		if (likely(im != NULL)) {
 			spin_lock_bh(&im->lock);
@@ -2316,7 +2318,7 @@
 			}
 			spin_unlock_bh(&im->lock);
 		}
-		read_unlock_bh(&idev->lock);
+		read_unlock(&idev->mc_list_lock);
 		in_dev_put(idev);
 	}
 	return psf;
@@ -2332,7 +2334,7 @@
 		state->im = state->im->next;
 		while (!state->im) {
 			if (likely(state->idev != NULL)) {
-				read_unlock_bh(&state->idev->lock);
+				read_unlock(&state->idev->mc_list_lock);
 				in_dev_put(state->idev);
 			}
 			state->dev = state->dev->next;
@@ -2343,7 +2345,7 @@
 			state->idev = in_dev_get(state->dev);
 			if (!state->idev)
 				continue;
-			read_lock_bh(&state->idev->lock);
+			read_lock(&state->idev->mc_list_lock);
 			state->im = state->idev->mc_list;
 		}
 		if (!state->im)
@@ -2389,7 +2391,7 @@
 		state->im = NULL;
 	}
 	if (likely(state->idev != NULL)) {
-		read_unlock_bh(&state->idev->lock);
+		read_unlock(&state->idev->mc_list_lock);
 		in_dev_put(state->idev);
 		state->idev = NULL;
 	}
diff -Nru a/net/ipv4/route.c b/net/ipv4/route.c
--- a/net/ipv4/route.c	2004-08-12 16:42:31 -07:00
+++ b/net/ipv4/route.c	2004-08-12 16:42:31 -07:00
@@ -1855,7 +1855,7 @@
 	if (MULTICAST(daddr)) {
 		struct in_device *in_dev;
 
-		read_lock(&inetdev_lock);
+		rcu_read_lock();
 		if ((in_dev = __in_dev_get(dev)) != NULL) {
 			int our = ip_check_mc(in_dev, daddr, saddr,
 				skb->nh.iph->protocol);
@@ -1864,12 +1864,12 @@
 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
 #endif
 			    ) {
-				read_unlock(&inetdev_lock);
+				rcu_read_unlock();
 				return ip_route_input_mc(skb, daddr, saddr,
 							 tos, dev, our);
 			}
 		}
-		read_unlock(&inetdev_lock);
+		rcu_read_unlock();
 		return -EINVAL;
 	}
 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
diff -Nru a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
--- a/net/irda/irlan/irlan_eth.c	2004-08-12 16:42:31 -07:00
+++ b/net/irda/irlan/irlan_eth.c	2004-08-12 16:42:31 -07:00
@@ -306,7 +306,7 @@
 	in_dev = in_dev_get(dev);
 	if (in_dev == NULL)
 		return;
-	read_lock(&in_dev->lock);
+	rcu_read_lock();
 	if (in_dev->ifa_list)
 		
 	arp_send(ARPOP_REQUEST, ETH_P_ARP, 
@@ -314,7 +314,7 @@
 		 dev, 
 		 in_dev->ifa_list->ifa_address,
 		 NULL, dev->dev_addr, NULL);
-	read_unlock(&in_dev->lock);
+	rcu_read_unlock();
 	in_dev_put(in_dev);
 #endif /* CONFIG_INET */
 }
diff -Nru a/net/sctp/protocol.c b/net/sctp/protocol.c
--- a/net/sctp/protocol.c	2004-08-12 16:42:31 -07:00
+++ b/net/sctp/protocol.c	2004-08-12 16:42:31 -07:00
@@ -148,13 +148,12 @@
 	struct in_ifaddr *ifa;
 	struct sctp_sockaddr_entry *addr;
 
-	read_lock(&inetdev_lock);
+	rcu_read_lock();
 	if ((in_dev = __in_dev_get(dev)) == NULL) {
-		read_unlock(&inetdev_lock);
+		rcu_read_unlock();
 		return;
 	}
 
-	read_lock(&in_dev->lock);
 	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
 		/* Add the address to the local list.  */
 		addr = t_new(struct sctp_sockaddr_entry, GFP_ATOMIC);
@@ -166,8 +165,7 @@
 		}
 	}
 
-	read_unlock(&in_dev->lock);
-	read_unlock(&inetdev_lock);
+	rcu_read_unlock();
 }
 
 /* Extract our IP addresses from the system and stash them in the

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-12 23:59 [PATCH] Move inetdev/ifa over to RCU David S. Miller
@ 2004-08-13  2:20 ` James Morris
  2004-08-13 10:02 ` Herbert Xu
  2004-08-13 16:03 ` Stephen Hemminger
  2 siblings, 0 replies; 46+ messages in thread
From: James Morris @ 2004-08-13  2:20 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, robert.olsson, hadi, kuznet

On Thu, 12 Aug 2004, David S. Miller wrote:

> Someone poke holes in this patch please :-)

Looks ok to me.


- James
-- 
James Morris
<jmorris@redhat.com>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-12 23:59 [PATCH] Move inetdev/ifa over to RCU David S. Miller
  2004-08-13  2:20 ` James Morris
@ 2004-08-13 10:02 ` Herbert Xu
  2004-08-13 16:03 ` Stephen Hemminger
  2 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2004-08-13 10:02 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, robert.olsson, hadi, kuznet

David S. Miller <davem@redhat.com> wrote:
>
> static __inline__ struct in_device *
> in_dev_get(const struct net_device *dev)
> {
>        struct in_device *in_dev;
> 
> -       read_lock(&inetdev_lock);
> +       rcu_read_lock();
>        in_dev = dev->ip_ptr;
>        if (in_dev)
>                atomic_inc(&in_dev->refcnt);
> -       read_unlock(&inetdev_lock);
> +       rcu_read_unlock();
>        return in_dev;
> }
> 
> @@ -157,11 +158,16 @@
> 
> extern void in_dev_finish_destroy(struct in_device *idev);
> 
> -static __inline__ void
> -in_dev_put(struct in_device *idev)
> +static inline void in_dev_rcu_destroy(struct rcu_head *head)
> +{
> +       struct in_device *idev = container_of(head, struct in_device, rcu_head);
> +       in_dev_finish_destroy(idev);
> +}
> +
> +static inline void in_dev_put(struct in_device *idev)
> {
>        if (atomic_dec_and_test(&idev->refcnt))
> -               in_dev_finish_destroy(idev);
> +               call_rcu(&idev->rcu_head, in_dev_rcu_destroy);
> }

This doesn't look right.  The only thing in_dev_get() does in
the critical section is incrementing the refcnt.

But here we're only delaying the destruction of the idev rather
than the dropping of the refcnt.  So the following race can occur
with preemption:

CPU0				CPU1
				in_dev_get
					rcu_read_lock
					in_dev = dev->ip_ptr
inetdev_destroy
	dev->ip_ptr = NULL
	in_dev_put
		--refcnt == 0
		call_rcu
					refcnt++
					rcu_read_unlock
				PREEMPT

			sched point

in_dev_rcu_destroy
				use in_dev => BUG

The obvious thing to do is to move the call_rcu into inetdev_destroy
and get it to call in_dev_put.

But we still need to make sure that nobody increments the count again
without checking in_dev->dev->ip_ptr != NULL.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-12 23:59 [PATCH] Move inetdev/ifa over to RCU David S. Miller
  2004-08-13  2:20 ` James Morris
  2004-08-13 10:02 ` Herbert Xu
@ 2004-08-13 16:03 ` Stephen Hemminger
  2004-08-13 16:38   ` David S. Miller
  2 siblings, 1 reply; 46+ messages in thread
From: Stephen Hemminger @ 2004-08-13 16:03 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, robert.olsson, hadi, kuznet


>  static __inline__ struct in_device *
>  in_dev_get(const struct net_device *dev)
>  {
>  	struct in_device *in_dev;
>  
> -	read_lock(&inetdev_lock);
> +	rcu_read_lock();
>  	in_dev = dev->ip_ptr;

Don't you need a smp_read_barrier_depends() here?

>  	if (in_dev)
>  		atomic_inc(&in_dev->refcnt);
> -	read_unlock(&inetdev_lock);
> +	rcu_read_unlock();
>  	return in_dev;
>  }
>  
> @@ -157,11 +158,16 @@
>  
>  extern void in_dev_finish_destroy(struct in_device *idev);
>  
> -static __inline__ void
> -in_dev_put(struct in_device *idev)
> +static inline void in_dev_rcu_destroy(struct rcu_head *head)
> +{
> +	struct in_device *idev = container_of(head, struct in_device, rcu_head);
> +	in_dev_finish_destroy(idev);
> +}
> +
> +static inline void in_dev_put(struct in_device *idev)
>  {
>  	if (atomic_dec_and_test(&idev->refcnt))
> -		in_dev_finish_destroy(idev);
> +		call_rcu(&idev->rcu_head, in_dev_rcu_destroy);
>  }
>  
>  #define __in_dev_put(idev)  atomic_dec(&(idev)->refcnt)
> diff -Nru a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> --- a/net/ipv4/devinet.c	2004-08-12 16:42:31 -07:00
> +++ b/net/ipv4/devinet.c	2004-08-12 16:42:31 -07:00
> @@ -88,12 +88,9 @@
>  static void devinet_sysctl_unregister(struct ipv4_devconf *p);
>  #endif
>  
> -int inet_ifa_count;
> -int inet_dev_count;
> -
>  /* Locks all the inet devices. */
>  
> -rwlock_t inetdev_lock = RW_LOCK_UNLOCKED;
> +static spinlock_t inetdev_lock = SPIN_LOCK_UNLOCKED;
>  
>  static struct in_ifaddr *inet_alloc_ifa(void)
>  {
> @@ -101,18 +98,24 @@
>  
>  	if (ifa) {
>  		memset(ifa, 0, sizeof(*ifa));
> -		inet_ifa_count++;
> +		INIT_RCU_HEAD(&ifa->rcu_head);
>  	}
>  
>  	return ifa;
>  }
>  
> -static __inline__ void inet_free_ifa(struct in_ifaddr *ifa)
> +static inline void inet_free_ifa(struct in_ifaddr *ifa)
>  {
>  	if (ifa->ifa_dev)
> -		__in_dev_put(ifa->ifa_dev);
> +		in_dev_put(ifa->ifa_dev);
>  	kfree(ifa);
> -	inet_ifa_count--;
> +}
> +
> +static void inet_rcu_free_ifa(struct rcu_head *head)
> +{
> +	struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
> +
> +	inet_free_ifa(ifa);
>  }
>  
>  void in_dev_finish_destroy(struct in_device *idev)
> @@ -129,7 +132,6 @@
>  	if (!idev->dead)
>  		printk("Freeing alive in_device %p\n", idev);
>  	else {
> -		inet_dev_count--;
>  		kfree(idev);
>  	}
>  }
> @@ -144,24 +146,23 @@
>  	if (!in_dev)
>  		goto out;
>  	memset(in_dev, 0, sizeof(*in_dev));
> -	in_dev->lock = RW_LOCK_UNLOCKED;
> +	INIT_RCU_HEAD(&in_dev->rcu_head);
>  	memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
>  	in_dev->cnf.sysctl = NULL;
>  	in_dev->dev = dev;
>  	if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
>  		goto out_kfree;
> -	inet_dev_count++;
>  	/* Reference in_dev->dev */
>  	dev_hold(dev);
>  #ifdef CONFIG_SYSCTL
>  	neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
>  			      NET_IPV4_NEIGH, "ipv4", NULL);
>  #endif
> -	write_lock_bh(&inetdev_lock);
> +	spin_lock_bh(&inetdev_lock);
>  	dev->ip_ptr = in_dev;
>  	/* Account for reference dev->ip_ptr */
>  	in_dev_hold(in_dev);
> -	write_unlock_bh(&inetdev_lock);
> +	spin_unlock_bh(&inetdev_lock);
>  #ifdef CONFIG_SYSCTL
>  	devinet_sysctl_register(in_dev, &in_dev->cnf);
>  #endif
> @@ -188,16 +189,16 @@
>  
>  	while ((ifa = in_dev->ifa_list) != NULL) {
>  		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
> -		inet_free_ifa(ifa);
> +		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
>  	}
>  
>  #ifdef CONFIG_SYSCTL
>  	devinet_sysctl_unregister(&in_dev->cnf);
>  #endif
> -	write_lock_bh(&inetdev_lock);
> +	spin_lock_bh(&inetdev_lock);
>  	in_dev->dev->ip_ptr = NULL;
>  	/* in_dev_put following below will kill the in_device */
> -	write_unlock_bh(&inetdev_lock);
> +	spin_unlock_bh(&inetdev_lock);
>  
>  #ifdef CONFIG_SYSCTL
>  	neigh_sysctl_unregister(in_dev->arp_parms);
> @@ -208,16 +209,16 @@
>  
>  int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
>  {
> -	read_lock(&in_dev->lock);
> +	rcu_read_lock();
>  	for_primary_ifa(in_dev) {

And probably need smp_read_barrier_depends here

>  		if (inet_ifa_match(a, ifa)) {
>  			if (!b || inet_ifa_match(b, ifa)) {
> -				read_unlock(&in_dev->lock);
> +				rcu_read_unlock();
>  				return 1;
>  			}
>  		}
>  	} endfor_ifa(in_dev);
> -	read_unlock(&in_dev->lock);
> +	rcu_read_unlock();
>  	return 0;
>  }
>  
> @@ -241,21 +242,21 @@
>  				ifap1 = &ifa->ifa_next;
>  				continue;
>  			}
> -			write_lock_bh(&in_dev->lock);
> +			spin_lock_bh(&inetdev_lock);
>  			*ifap1 = ifa->ifa_next;
> -			write_unlock_bh(&in_dev->lock);
> +			spin_unlock_bh(&inetdev_lock);
>  
>  			rtmsg_ifa(RTM_DELADDR, ifa);
>  			notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
> -			inet_free_ifa(ifa);
> +			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
>  		}
>  	}
>  
>  	/* 2. Unlink it */
>  
> -	write_lock_bh(&in_dev->lock);
> +	spin_lock_bh(&inetdev_lock);
>  	*ifap = ifa1->ifa_next;
> -	write_unlock_bh(&in_dev->lock);
> +	spin_unlock_bh(&inetdev_lock);
>  
>  	/* 3. Announce address deletion */
>  
> @@ -270,7 +271,7 @@
>  	rtmsg_ifa(RTM_DELADDR, ifa1);
>  	notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
>  	if (destroy) {
> -		inet_free_ifa(ifa1);
> +		call_rcu(&ifa1->rcu_head, inet_rcu_free_ifa);
>  
>  		if (!in_dev->ifa_list)
>  			inetdev_destroy(in_dev);
> @@ -285,7 +286,7 @@
>  	ASSERT_RTNL();
>  
>  	if (!ifa->ifa_local) {
> -		inet_free_ifa(ifa);
> +		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
>  		return 0;
>  	}
>  
> @@ -300,11 +301,11 @@
>  		if (ifa1->ifa_mask == ifa->ifa_mask &&
>  		    inet_ifa_match(ifa1->ifa_address, ifa)) {
>  			if (ifa1->ifa_local == ifa->ifa_local) {
> -				inet_free_ifa(ifa);
> +				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
>  				return -EEXIST;
>  			}
>  			if (ifa1->ifa_scope != ifa->ifa_scope) {
> -				inet_free_ifa(ifa);
> +				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
>  				return -EINVAL;
>  			}
>  			ifa->ifa_flags |= IFA_F_SECONDARY;
> @@ -317,9 +318,9 @@
>  	}
>  
>  	ifa->ifa_next = *ifap;
> -	write_lock_bh(&in_dev->lock);
> +	spin_lock_bh(&inetdev_lock);
>  	*ifap = ifa;
> -	write_unlock_bh(&in_dev->lock);
> +	spin_unlock_bh(&inetdev_lock);
>  
>  	/* Send message first, then call notifier.
>  	   Notifier will trigger FIB update, so that
> @@ -339,7 +340,7 @@
>  	if (!in_dev) {
>  		in_dev = inetdev_init(dev);
>  		if (!in_dev) {
> -			inet_free_ifa(ifa);
> +			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
>  			return -ENOBUFS;
>  		}
>  	}
> @@ -771,12 +772,11 @@
>  	u32 addr = 0;
>  	struct in_device *in_dev;
>  
> -	read_lock(&inetdev_lock);
> +	rcu_read_lock();
>  	in_dev = __in_dev_get(dev);
>  	if (!in_dev)
>  		goto out_unlock_inetdev;
>  
> -	read_lock(&in_dev->lock);
>  	for_primary_ifa(in_dev) {
>  		if (ifa->ifa_scope > scope)
>  			continue;
> @@ -787,8 +787,7 @@
>  		if (!addr)
>  			addr = ifa->ifa_local;
>  	} endfor_ifa(in_dev);
> -	read_unlock(&in_dev->lock);
> -	read_unlock(&inetdev_lock);
> +	rcu_read_unlock();
>  
>  	if (addr)
>  		goto out;
> @@ -798,30 +797,25 @@
>  	   in dev_base list.
>  	 */
>  	read_lock(&dev_base_lock);
> -	read_lock(&inetdev_lock);
> +	rcu_read_lock();
>  	for (dev = dev_base; dev; dev = dev->next) {
>  		if ((in_dev = __in_dev_get(dev)) == NULL)
>  			continue;
>  
> -		read_lock(&in_dev->lock);
>  		for_primary_ifa(in_dev) {
>  			if (ifa->ifa_scope != RT_SCOPE_LINK &&
>  			    ifa->ifa_scope <= scope) {
> -				read_unlock(&in_dev->lock);
>  				addr = ifa->ifa_local;
>  				goto out_unlock_both;
>  			}
>  		} endfor_ifa(in_dev);
> -		read_unlock(&in_dev->lock);
>  	}
>  out_unlock_both:
> -	read_unlock(&inetdev_lock);
>  	read_unlock(&dev_base_lock);
> +out_unlock_inetdev:
> +	rcu_read_unlock();
>  out:
>  	return addr;
> -out_unlock_inetdev:
> -	read_unlock(&inetdev_lock);
> -	goto out;
>  }
>  
>  static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst,
> @@ -874,29 +868,24 @@
>  	struct in_device *in_dev;
>  
>  	if (dev) {
> -		read_lock(&inetdev_lock);
> -		if ((in_dev = __in_dev_get(dev))) {
> -			read_lock(&in_dev->lock);
> +		rcu_read_lock();
> +		if ((in_dev = __in_dev_get(dev)))
>  			addr = confirm_addr_indev(in_dev, dst, local, scope);
> -			read_unlock(&in_dev->lock);
> -		}
> -		read_unlock(&inetdev_lock);
> +		rcu_read_unlock();
>  
>  		return addr;
>  	}
>  
>  	read_lock(&dev_base_lock);
> -	read_lock(&inetdev_lock);
> +	rcu_read_lock();
>  	for (dev = dev_base; dev; dev = dev->next) {
>  		if ((in_dev = __in_dev_get(dev))) {
> -			read_lock(&in_dev->lock);
>  			addr = confirm_addr_indev(in_dev, dst, local, scope);
> -			read_unlock(&in_dev->lock);
>  			if (addr)
>  				break;
>  		}
>  	}
> -	read_unlock(&inetdev_lock);
> +	rcu_read_unlock();
>  	read_unlock(&dev_base_lock);
>  
>  	return addr;
> @@ -1065,12 +1054,12 @@
>  			continue;
>  		if (idx > s_idx)
>  			s_ip_idx = 0;
> -		read_lock(&inetdev_lock);
> +		rcu_read_lock();
>  		if ((in_dev = __in_dev_get(dev)) == NULL) {
> -			read_unlock(&inetdev_lock);
> +			rcu_read_unlock();
>  			continue;
>  		}
> -		read_lock(&in_dev->lock);
> +
>  		for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
>  		     ifa = ifa->ifa_next, ip_idx++) {
>  			if (ip_idx < s_ip_idx)
> @@ -1078,13 +1067,11 @@
>  			if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
>  					     cb->nlh->nlmsg_seq,
>  					     RTM_NEWADDR) <= 0) {
> -				read_unlock(&in_dev->lock);
> -				read_unlock(&inetdev_lock);
> +				rcu_read_unlock();
>  				goto done;
>  			}
>  		}
> -		read_unlock(&in_dev->lock);
> -		read_unlock(&inetdev_lock);
> +		rcu_read_unlock();
>  	}
>  
>  done:
> @@ -1138,11 +1125,11 @@
>  	read_lock(&dev_base_lock);
>  	for (dev = dev_base; dev; dev = dev->next) {
>  		struct in_device *in_dev;
> -		read_lock(&inetdev_lock);
> +		rcu_read_lock();
>  		in_dev = __in_dev_get(dev);
>  		if (in_dev)
>  			in_dev->cnf.forwarding = on;
> -		read_unlock(&inetdev_lock);
> +		rcu_read_unlock();
>  	}
>  	read_unlock(&dev_base_lock);
>  
> @@ -1508,6 +1495,5 @@
>  EXPORT_SYMBOL(in_dev_finish_destroy);
>  EXPORT_SYMBOL(inet_select_addr);
>  EXPORT_SYMBOL(inetdev_by_index);
> -EXPORT_SYMBOL(inetdev_lock);
>  EXPORT_SYMBOL(register_inetaddr_notifier);
>  EXPORT_SYMBOL(unregister_inetaddr_notifier);
> diff -Nru a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> --- a/net/ipv4/fib_frontend.c	2004-08-12 16:42:31 -07:00
> +++ b/net/ipv4/fib_frontend.c	2004-08-12 16:42:31 -07:00
> @@ -172,13 +172,13 @@
>  	int ret;
>  
>  	no_addr = rpf = 0;
> -	read_lock(&inetdev_lock);
> +	rcu_read_lock();
>  	in_dev = __in_dev_get(dev);
>  	if (in_dev) {
>  		no_addr = in_dev->ifa_list == NULL;
>  		rpf = IN_DEV_RPFILTER(in_dev);
>  	}
> -	read_unlock(&inetdev_lock);
> +	rcu_read_unlock();
>  
>  	if (in_dev == NULL)
>  		goto e_inval;
> diff -Nru a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> --- a/net/ipv4/icmp.c	2004-08-12 16:42:31 -07:00
> +++ b/net/ipv4/icmp.c	2004-08-12 16:42:31 -07:00
> @@ -878,7 +878,7 @@
>  	in_dev = in_dev_get(dev);
>  	if (!in_dev)
>  		goto out;
> -	read_lock(&in_dev->lock);
> +	rcu_read_lock();
>  	if (in_dev->ifa_list &&
>  	    IN_DEV_LOG_MARTIANS(in_dev) &&
>  	    IN_DEV_FORWARD(in_dev)) {
> @@ -895,7 +895,7 @@
>  			       NIPQUAD(mask), dev->name, NIPQUAD(rt->rt_src));
>  		}
>  	}
> -	read_unlock(&in_dev->lock);
> +	rcu_read_unlock();
>  	in_dev_put(in_dev);
>  out:;
>  }
> diff -Nru a/net/ipv4/igmp.c b/net/ipv4/igmp.c
> --- a/net/ipv4/igmp.c	2004-08-12 16:42:31 -07:00
> +++ b/net/ipv4/igmp.c	2004-08-12 16:42:31 -07:00
> @@ -487,7 +487,7 @@
>  	int type;
>  
>  	if (!pmc) {
> -		read_lock(&in_dev->lock);
> +		read_lock(&in_dev->mc_list_lock);
>  		for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
>  			if (pmc->multiaddr == IGMP_ALL_HOSTS)
>  				continue;
> @@ -499,7 +499,7 @@
>  			skb = add_grec(skb, pmc, type, 0, 0);
>  			spin_unlock_bh(&pmc->lock);
>  		}
> -		read_unlock(&in_dev->lock);
> +		read_unlock(&in_dev->mc_list_lock);
>  	} else {
>  		spin_lock_bh(&pmc->lock);
>  		if (pmc->sfcount[MCAST_EXCLUDE])
> @@ -541,8 +541,8 @@
>  	struct sk_buff *skb = NULL;
>  	int type, dtype;
>  
> -	read_lock(&in_dev->lock);
> -	write_lock_bh(&in_dev->mc_lock);
> +	read_lock(&in_dev->mc_list_lock);
> +	spin_lock_bh(&in_dev->mc_tomb_lock);
>  
>  	/* deleted MCA's */
>  	pmc_prev = NULL;
> @@ -575,7 +575,7 @@
>  		} else
>  			pmc_prev = pmc;
>  	}
> -	write_unlock_bh(&in_dev->mc_lock);
> +	spin_unlock_bh(&in_dev->mc_tomb_lock);
>  
>  	/* change recs */
>  	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
> @@ -601,7 +601,8 @@
>  		}
>  		spin_unlock_bh(&pmc->lock);
>  	}
> -	read_unlock(&in_dev->lock);
> +	read_unlock(&in_dev->mc_list_lock);
> +
>  	if (!skb)
>  		return;
>  	(void) igmpv3_sendpack(skb);
> @@ -759,14 +760,14 @@
>  	if (group == IGMP_ALL_HOSTS)
>  		return;
>  
> -	read_lock(&in_dev->lock);
> +	read_lock(&in_dev->mc_list_lock);
>  	for (im=in_dev->mc_list; im!=NULL; im=im->next) {
>  		if (im->multiaddr == group) {
>  			igmp_stop_timer(im);
>  			break;
>  		}
>  	}
> -	read_unlock(&in_dev->lock);
> +	read_unlock(&in_dev->mc_list_lock);
>  }
>  
>  static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
> @@ -840,7 +841,7 @@
>  	 * - Use the igmp->igmp_code field as the maximum
>  	 *   delay possible
>  	 */
> -	read_lock(&in_dev->lock);
> +	read_lock(&in_dev->mc_list_lock);
>  	for (im=in_dev->mc_list; im!=NULL; im=im->next) {
>  		if (group && group != im->multiaddr)
>  			continue;
> @@ -856,7 +857,7 @@
>  		spin_unlock_bh(&im->lock);
>  		igmp_mod_timer(im, max_delay);
>  	}
> -	read_unlock(&in_dev->lock);
> +	read_unlock(&in_dev->mc_list_lock);
>  }
>  
>  int igmp_rcv(struct sk_buff *skb)
> @@ -982,10 +983,10 @@
>  	}
>  	spin_unlock_bh(&im->lock);
>  
> -	write_lock_bh(&in_dev->mc_lock);
> +	spin_lock_bh(&in_dev->mc_tomb_lock);
>  	pmc->next = in_dev->mc_tomb;
>  	in_dev->mc_tomb = pmc;
> -	write_unlock_bh(&in_dev->mc_lock);
> +	spin_unlock_bh(&in_dev->mc_tomb_lock);
>  }
>  
>  static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr)
> @@ -993,7 +994,7 @@
>  	struct ip_mc_list *pmc, *pmc_prev;
>  	struct ip_sf_list *psf, *psf_next;
>  
> -	write_lock_bh(&in_dev->mc_lock);
> +	spin_lock_bh(&in_dev->mc_tomb_lock);
>  	pmc_prev = NULL;
>  	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
>  		if (pmc->multiaddr == multiaddr)
> @@ -1006,7 +1007,7 @@
>  		else
>  			in_dev->mc_tomb = pmc->next;
>  	}
> -	write_unlock_bh(&in_dev->mc_lock);
> +	spin_unlock_bh(&in_dev->mc_tomb_lock);
>  	if (pmc) {
>  		for (psf=pmc->tomb; psf; psf=psf_next) {
>  			psf_next = psf->sf_next;
> @@ -1021,10 +1022,10 @@
>  {
>  	struct ip_mc_list *pmc, *nextpmc;
>  
> -	write_lock_bh(&in_dev->mc_lock);
> +	spin_lock_bh(&in_dev->mc_tomb_lock);
>  	pmc = in_dev->mc_tomb;
>  	in_dev->mc_tomb = NULL;
> -	write_unlock_bh(&in_dev->mc_lock);
> +	spin_unlock_bh(&in_dev->mc_tomb_lock);
>  
>  	for (; pmc; pmc = nextpmc) {
>  		nextpmc = pmc->next;
> @@ -1033,7 +1034,7 @@
>  		kfree(pmc);
>  	}
>  	/* clear dead sources, too */
> -	read_lock(&in_dev->lock);
> +	read_lock(&in_dev->mc_list_lock);
>  	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
>  		struct ip_sf_list *psf, *psf_next;
>  
> @@ -1046,7 +1047,7 @@
>  			kfree(psf);
>  		}
>  	}
> -	read_unlock(&in_dev->lock);
> +	read_unlock(&in_dev->mc_list_lock);
>  }
>  #endif
>  
> @@ -1167,10 +1168,10 @@
>  	im->gsquery = 0;
>  #endif
>  	im->loaded = 0;
> -	write_lock_bh(&in_dev->lock);
> +	write_lock_bh(&in_dev->mc_list_lock);
>  	im->next=in_dev->mc_list;
>  	in_dev->mc_list=im;
> -	write_unlock_bh(&in_dev->lock);
> +	write_unlock_bh(&in_dev->mc_list_lock);
>  #ifdef CONFIG_IP_MULTICAST
>  	igmpv3_del_delrec(in_dev, im->multiaddr);
>  #endif
> @@ -1194,9 +1195,9 @@
>  	for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
>  		if (i->multiaddr==addr) {
>  			if (--i->users == 0) {
> -				write_lock_bh(&in_dev->lock);
> +				write_lock_bh(&in_dev->mc_list_lock);
>  				*ip = i->next;
> -				write_unlock_bh(&in_dev->lock);
> +				write_unlock_bh(&in_dev->mc_list_lock);
>  				igmp_group_dropped(i);
>  
>  				if (!in_dev->dead)
> @@ -1251,7 +1252,8 @@
>  	in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
>  #endif
>  
> -	in_dev->mc_lock = RW_LOCK_UNLOCKED;
> +	in_dev->mc_list_lock = RW_LOCK_UNLOCKED;
> +	in_dev->mc_tomb_lock = SPIN_LOCK_UNLOCKED;
>  }
>  
>  /* Device going up */
> @@ -1281,17 +1283,17 @@
>  	/* Deactivate timers */
>  	ip_mc_down(in_dev);
>  
> -	write_lock_bh(&in_dev->lock);
> +	write_lock_bh(&in_dev->mc_list_lock);
>  	while ((i = in_dev->mc_list) != NULL) {
>  		in_dev->mc_list = i->next;
> -		write_unlock_bh(&in_dev->lock);
> +		write_unlock_bh(&in_dev->mc_list_lock);
>  
>  		igmp_group_dropped(i);
>  		ip_ma_put(i);
>  
> -		write_lock_bh(&in_dev->lock);
> +		write_lock_bh(&in_dev->mc_list_lock);
>  	}
> -	write_unlock_bh(&in_dev->lock);
> +	write_unlock_bh(&in_dev->mc_list_lock);
>  }
>  
>  static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
> @@ -1391,18 +1393,18 @@
>  
>  	if (!in_dev)
>  		return -ENODEV;
> -	read_lock(&in_dev->lock);
> +	read_lock(&in_dev->mc_list_lock);
>  	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
>  		if (*pmca == pmc->multiaddr)
>  			break;
>  	}
>  	if (!pmc) {
>  		/* MCA not found?? bug */
> -		read_unlock(&in_dev->lock);
> +		read_unlock(&in_dev->mc_list_lock);
>  		return -ESRCH;
>  	}
>  	spin_lock_bh(&pmc->lock);
> -	read_unlock(&in_dev->lock);
> +	read_unlock(&in_dev->mc_list_lock);
>  #ifdef CONFIG_IP_MULTICAST
>  	sf_markstate(pmc);
>  #endif
> @@ -1527,18 +1529,18 @@
>  
>  	if (!in_dev)
>  		return -ENODEV;
> -	read_lock(&in_dev->lock);
> +	read_lock(&in_dev->mc_list_lock);
>  	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
>  		if (*pmca == pmc->multiaddr)
>  			break;
>  	}
>  	if (!pmc) {
>  		/* MCA not found?? bug */
> -		read_unlock(&in_dev->lock);
> +		read_unlock(&in_dev->mc_list_lock);
>  		return -ESRCH;
>  	}
>  	spin_lock_bh(&pmc->lock);
> -	read_unlock(&in_dev->lock);
> +	read_unlock(&in_dev->mc_list_lock);
>  
>  #ifdef CONFIG_IP_MULTICAST
>  	sf_markstate(pmc);
> @@ -2095,7 +2097,7 @@
>  	struct ip_sf_list *psf;
>  	int rv = 0;
>  
> -	read_lock(&in_dev->lock);
> +	read_lock(&in_dev->mc_list_lock);
>  	for (im=in_dev->mc_list; im; im=im->next) {
>  		if (im->multiaddr == mc_addr)
>  			break;
> @@ -2117,7 +2119,7 @@
>  		} else
>  			rv = 1; /* unspecified source; tentatively allow */
>  	}
> -	read_unlock(&in_dev->lock);
> +	read_unlock(&in_dev->mc_list_lock);
>  	return rv;
>  }
>  
> @@ -2141,13 +2143,13 @@
>  		in_dev = in_dev_get(state->dev);
>  		if (!in_dev)
>  			continue;
> -		read_lock(&in_dev->lock);
> +		read_lock(&in_dev->mc_list_lock);
>  		im = in_dev->mc_list;
>  		if (im) {
>  			state->in_dev = in_dev;
>  			break;
>  		}
> -		read_unlock(&in_dev->lock);
> +		read_unlock(&in_dev->mc_list_lock);
>  		in_dev_put(in_dev);
>  	}
>  	return im;
> @@ -2159,7 +2161,7 @@
>  	im = im->next;
>  	while (!im) {
>  		if (likely(state->in_dev != NULL)) {
> -			read_unlock(&state->in_dev->lock);
> +			read_unlock(&state->in_dev->mc_list_lock);
>  			in_dev_put(state->in_dev);
>  		}
>  		state->dev = state->dev->next;
> @@ -2170,7 +2172,7 @@
>  		state->in_dev = in_dev_get(state->dev);
>  		if (!state->in_dev)
>  			continue;
> -		read_lock(&state->in_dev->lock);
> +		read_lock(&state->in_dev->mc_list_lock);
>  		im = state->in_dev->mc_list;
>  	}
>  	return im;
> @@ -2206,7 +2208,7 @@
>  {
>  	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
>  	if (likely(state->in_dev != NULL)) {
> -		read_unlock(&state->in_dev->lock);
> +		read_unlock(&state->in_dev->mc_list_lock);
>  		in_dev_put(state->in_dev);
>  		state->in_dev = NULL;
>  	}
> @@ -2304,7 +2306,7 @@
>  		idev = in_dev_get(state->dev);
>  		if (unlikely(idev == NULL))
>  			continue;
> -		read_lock_bh(&idev->lock);
> +		read_lock(&idev->mc_list_lock);
>  		im = idev->mc_list;
>  		if (likely(im != NULL)) {
>  			spin_lock_bh(&im->lock);
> @@ -2316,7 +2318,7 @@
>  			}
>  			spin_unlock_bh(&im->lock);
>  		}
> -		read_unlock_bh(&idev->lock);
> +		read_unlock(&idev->mc_list_lock);
>  		in_dev_put(idev);
>  	}
>  	return psf;
> @@ -2332,7 +2334,7 @@
>  		state->im = state->im->next;
>  		while (!state->im) {
>  			if (likely(state->idev != NULL)) {
> -				read_unlock_bh(&state->idev->lock);
> +				read_unlock(&state->idev->mc_list_lock);
>  				in_dev_put(state->idev);
>  			}
>  			state->dev = state->dev->next;
> @@ -2343,7 +2345,7 @@
>  			state->idev = in_dev_get(state->dev);
>  			if (!state->idev)
>  				continue;
> -			read_lock_bh(&state->idev->lock);
> +			read_lock(&state->idev->mc_list_lock);
>  			state->im = state->idev->mc_list;
>  		}
>  		if (!state->im)
> @@ -2389,7 +2391,7 @@
>  		state->im = NULL;
>  	}
>  	if (likely(state->idev != NULL)) {
> -		read_unlock_bh(&state->idev->lock);
> +		read_unlock(&state->idev->mc_list_lock);
>  		in_dev_put(state->idev);
>  		state->idev = NULL;
>  	}
> diff -Nru a/net/ipv4/route.c b/net/ipv4/route.c
> --- a/net/ipv4/route.c	2004-08-12 16:42:31 -07:00
> +++ b/net/ipv4/route.c	2004-08-12 16:42:31 -07:00
> @@ -1855,7 +1855,7 @@
>  	if (MULTICAST(daddr)) {
>  		struct in_device *in_dev;
>  
> -		read_lock(&inetdev_lock);
> +		rcu_read_lock();
>  		if ((in_dev = __in_dev_get(dev)) != NULL) {
>  			int our = ip_check_mc(in_dev, daddr, saddr,
>  				skb->nh.iph->protocol);
> @@ -1864,12 +1864,12 @@
>  			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
>  #endif
>  			    ) {
> -				read_unlock(&inetdev_lock);
> +				rcu_read_unlock();
>  				return ip_route_input_mc(skb, daddr, saddr,
>  							 tos, dev, our);
>  			}
>  		}
> -		read_unlock(&inetdev_lock);
> +		rcu_read_unlock();
>  		return -EINVAL;
>  	}
>  	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
> diff -Nru a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
> --- a/net/irda/irlan/irlan_eth.c	2004-08-12 16:42:31 -07:00
> +++ b/net/irda/irlan/irlan_eth.c	2004-08-12 16:42:31 -07:00
> @@ -306,7 +306,7 @@
>  	in_dev = in_dev_get(dev);
>  	if (in_dev == NULL)
>  		return;
> -	read_lock(&in_dev->lock);
> +	rcu_read_lock();
>  	if (in_dev->ifa_list)
>  		
>  	arp_send(ARPOP_REQUEST, ETH_P_ARP, 
> @@ -314,7 +314,7 @@
>  		 dev, 
>  		 in_dev->ifa_list->ifa_address,
>  		 NULL, dev->dev_addr, NULL);
> -	read_unlock(&in_dev->lock);
> +	rcu_read_unlock();
>  	in_dev_put(in_dev);
>  #endif /* CONFIG_INET */
>  }
> diff -Nru a/net/sctp/protocol.c b/net/sctp/protocol.c
> --- a/net/sctp/protocol.c	2004-08-12 16:42:31 -07:00
> +++ b/net/sctp/protocol.c	2004-08-12 16:42:31 -07:00
> @@ -148,13 +148,12 @@
>  	struct in_ifaddr *ifa;
>  	struct sctp_sockaddr_entry *addr;
>  
> -	read_lock(&inetdev_lock);
> +	rcu_read_lock();
>  	if ((in_dev = __in_dev_get(dev)) == NULL) {
> -		read_unlock(&inetdev_lock);
> +		rcu_read_unlock();
>  		return;
>  	}
>  
> -	read_lock(&in_dev->lock);
>  	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
>  		/* Add the address to the local list.  */
>  		addr = t_new(struct sctp_sockaddr_entry, GFP_ATOMIC);
> @@ -166,8 +165,7 @@
>  		}
>  	}
>  
> -	read_unlock(&in_dev->lock);
> -	read_unlock(&inetdev_lock);
> +	rcu_read_unlock();
>  }
>  
>  /* Extract our IP addresses from the system and stash them in the

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-13 16:03 ` Stephen Hemminger
@ 2004-08-13 16:38   ` David S. Miller
  2004-08-13 21:56     ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-13 16:38 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, herbert


Thanks guys.  This patch should fix both problems.

Herbert, if we check inetdev->dead when trying to grab
a reference it fixes the RCU destroy race you mentioned.
I really don't want to put that ref drop into the
RCU callback as that would kill performance.

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/08/13 09:17:07-07:00 davem@nuts.davemloft.net 
#   [IPV4]: Fix two bugs in inetdev RCU handling.
#   
#   - Two missing smp_read_barrier_depends() noticed by
#     Stephen Hemminger.
#   - Fix RCU inetdev destroy race spotted by Herbert Xu.
#     Check in_dev->dead when trying to grab a reference.
#   
#   Signed-off-by: David S. Miller <davem@redhat.com>
# 
# net/ipv4/devinet.c
#   2004/08/13 09:15:41-07:00 davem@nuts.davemloft.net +1 -0
#   [IPV4]: Fix two bugs in inetdev RCU handling.
# 
# include/linux/inetdevice.h
#   2004/08/13 09:15:41-07:00 davem@nuts.davemloft.net +7 -2
#   [IPV4]: Fix two bugs in inetdev RCU handling.
# 
diff -Nru a/include/linux/inetdevice.h b/include/linux/inetdevice.h
--- a/include/linux/inetdevice.h	2004-08-13 09:20:01 -07:00
+++ b/include/linux/inetdevice.h	2004-08-13 09:20:01 -07:00
@@ -143,9 +143,14 @@
 	struct in_device *in_dev;
 
 	rcu_read_lock();
+	smp_read_barrier_depends();
 	in_dev = dev->ip_ptr;
-	if (in_dev)
-		atomic_inc(&in_dev->refcnt);
+	if (in_dev) {
+		if (in_dev->dead)
+			in_dev = NULL;
+		else
+			atomic_inc(&in_dev->refcnt);
+	}
 	rcu_read_unlock();
 	return in_dev;
 }
diff -Nru a/net/ipv4/devinet.c b/net/ipv4/devinet.c
--- a/net/ipv4/devinet.c	2004-08-13 09:20:01 -07:00
+++ b/net/ipv4/devinet.c	2004-08-13 09:20:01 -07:00
@@ -210,6 +210,7 @@
 int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
 {
 	rcu_read_lock();
+	smp_read_barrier_depends();
 	for_primary_ifa(in_dev) {
 		if (inet_ifa_match(a, ifa)) {
 			if (!b || inet_ifa_match(b, ifa)) {

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-13 16:38   ` David S. Miller
@ 2004-08-13 21:56     ` Herbert Xu
  2004-08-13 22:19       ` David S. Miller
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-13 21:56 UTC (permalink / raw)
  To: David S. Miller; +Cc: Stephen Hemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 2097 bytes --]

On Fri, Aug 13, 2004 at 09:38:38AM -0700, David S. Miller wrote:
> 
> Herbert, if we check inetdev->dead when trying to grab
> a reference it fixes the RCU destroy race you mentioned.

Sorry Dave but that just makes the window smaller.  A new
inetdev_destroy() can still start after the in_dev->dead
check and before the inc of the refcnt.

> I really don't want to put that ref drop into the
> RCU callback as that would kill performance.

Maybe I wasn't clear enough about the solution.  I've attached
an untested patch below.  Hopefully that should make it clearer.

> diff -Nru a/include/linux/inetdevice.h b/include/linux/inetdevice.h
> --- a/include/linux/inetdevice.h	2004-08-13 09:20:01 -07:00
> +++ b/include/linux/inetdevice.h	2004-08-13 09:20:01 -07:00
> @@ -143,9 +143,14 @@
>  	struct in_device *in_dev;
>  
>  	rcu_read_lock();
> +	smp_read_barrier_depends();

This isn't quite right either.  It rarely makes sense to have
smp_read_barrier_depends() immediately after an rcu_read_lock()
since it can't be reordered.

Now had you put this barrier between the reading of ip_ptr and
the checking of in_dev->dead, then the race would've been closed.

But I still prefer to move the work into inetdev_destroy.

>  	in_dev = dev->ip_ptr;
> -	if (in_dev)
> -		atomic_inc(&in_dev->refcnt);
> +	if (in_dev) {
> +		if (in_dev->dead)
> +			in_dev = NULL;
> +		else
> +			atomic_inc(&in_dev->refcnt);
> +	}
>  	rcu_read_unlock();
>  	return in_dev;
>  }

> diff -Nru a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> --- a/net/ipv4/devinet.c	2004-08-13 09:20:01 -07:00
> +++ b/net/ipv4/devinet.c	2004-08-13 09:20:01 -07:00
> @@ -210,6 +210,7 @@
>  int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
>  {
>  	rcu_read_lock();
> +	smp_read_barrier_depends();
>  	for_primary_ifa(in_dev) {
>  		if (inet_ifa_match(a, ifa)) {
>  			if (!b || inet_ifa_match(b, ifa)) {

Ditto.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 1381 bytes --]

===== include/linux/inetdevice.h 1.14 vs edited =====
--- 1.14/include/linux/inetdevice.h	2004-08-14 07:43:21 +10:00
+++ edited/include/linux/inetdevice.h	2004-08-14 07:48:59 +10:00
@@ -163,16 +163,10 @@
 
 extern void in_dev_finish_destroy(struct in_device *idev);
 
-static inline void in_dev_rcu_destroy(struct rcu_head *head)
-{
-	struct in_device *idev = container_of(head, struct in_device, rcu_head);
-	in_dev_finish_destroy(idev);
-}
-
 static inline void in_dev_put(struct in_device *idev)
 {
 	if (atomic_dec_and_test(&idev->refcnt))
-		call_rcu(&idev->rcu_head, in_dev_rcu_destroy);
+		in_dev_finish_destroy(idev);
 }
 
 #define __in_dev_put(idev)  atomic_dec(&(idev)->refcnt)
===== net/ipv4/devinet.c 1.33 vs edited =====
--- 1.33/net/ipv4/devinet.c	2004-08-14 07:43:21 +10:00
+++ edited/net/ipv4/devinet.c	2004-08-14 07:49:51 +10:00
@@ -177,6 +177,12 @@
 	goto out;
 }
 
+static void in_dev_rcu_put(struct rcu_head *head)
+{
+	struct in_device *idev = container_of(head, struct in_device, rcu_head);
+	in_dev_put(idev);
+}
+
 static void inetdev_destroy(struct in_device *in_dev)
 {
 	struct in_ifaddr *ifa;
@@ -204,7 +210,7 @@
 	neigh_sysctl_unregister(in_dev->arp_parms);
 #endif
 	neigh_parms_release(&arp_tbl, in_dev->arp_parms);
-	in_dev_put(in_dev);
+	call_rcu(&idev->rcu_head, in_dev_rcu_put);
 }
 
 int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-13 21:56     ` Herbert Xu
@ 2004-08-13 22:19       ` David S. Miller
  2004-08-14  0:34         ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-13 22:19 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Sat, 14 Aug 2004 07:56:02 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Now had you put this barrier between the reading of ip_ptr and
> the checking of in_dev->dead, then the race would've been closed.
> 
> But I still prefer to move the work into inetdev_destroy.

Ok I see now.  I've added your patch, it seems correct.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-13 22:19       ` David S. Miller
@ 2004-08-14  0:34         ` Herbert Xu
  2004-08-14  0:39           ` David S. Miller
  2004-08-16  2:58           ` David S. Miller
  0 siblings, 2 replies; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  0:34 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

On Fri, Aug 13, 2004 at 03:19:23PM -0700, David S. Miller wrote:
> On Sat, 14 Aug 2004 07:56:02 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > Now had you put this barrier between the reading of ip_ptr and
> > the checking of in_dev->dead, then the race would've been closed.

Actually even that won't close the race so the barrier is completely
unnecessary.

> > But I still prefer to move the work into inetdev_destroy.
> 
> Ok I see now.  I've added your patch, it seems correct.

Thanks.  I presume you've reversed the previous patch as well?

BTW, it looks like you can remove inetdev_lock altogether.  All its
users already assume that they don't race against each other by
taking the rtnl lock.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  0:34         ` Herbert Xu
@ 2004-08-14  0:39           ` David S. Miller
  2004-08-14  0:54             ` Herbert Xu
  2004-08-16  2:58           ` David S. Miller
  1 sibling, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-14  0:39 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Sat, 14 Aug 2004 10:34:28 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> > Ok I see now.  I've added your patch, it seems correct.
> 
> Thanks.  I presume you've reversed the previous patch as well?

Yes and I cured the one typo in it two, idev-->in_dev :-)

> BTW, it looks like you can remove inetdev_lock altogether.  All its
> users already assume that they don't race against each other by
> taking the rtnl lock.

Possibly, I'll have a look.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  0:39           ` David S. Miller
@ 2004-08-14  0:54             ` Herbert Xu
  2004-08-14  1:25               ` Herbert Xu
  2004-08-16  2:59               ` David S. Miller
  0 siblings, 2 replies; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  0:54 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 514 bytes --]

On Fri, Aug 13, 2004 at 05:39:24PM -0700, David S. Miller wrote:
> 
> Yes and I cured the one typo in it two, idev-->in_dev :-)

Two birds with one stone :)

Here is a trivial optimisation for irda to since it doesn't hold
onto the idev.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 670 bytes --]

===== net/irda/irlan/irlan_eth.c 1.12 vs edited =====
--- 1.12/net/irda/irlan/irlan_eth.c	2004-08-14 07:40:24 +10:00
+++ edited/net/irda/irlan/irlan_eth.c	2004-08-14 10:50:29 +10:00
@@ -303,10 +303,10 @@
 	 */
 #ifdef CONFIG_INET
 	IRDA_DEBUG(4, "IrLAN: Sending gratuitous ARP\n");
-	in_dev = in_dev_get(dev);
-	if (in_dev == NULL)
-		return;
 	rcu_read_lock();
+	in_dev = __in_dev_get(dev);
+	if (in_dev == NULL)
+		goto out;
 	if (in_dev->ifa_list)
 		
 	arp_send(ARPOP_REQUEST, ETH_P_ARP, 
@@ -314,8 +314,8 @@
 		 dev, 
 		 in_dev->ifa_list->ifa_address,
 		 NULL, dev->dev_addr, NULL);
+out:
 	rcu_read_unlock();
-	in_dev_put(in_dev);
 #endif /* CONFIG_INET */
 }
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  0:54             ` Herbert Xu
@ 2004-08-14  1:25               ` Herbert Xu
  2004-08-14  1:30                 ` Herbert Xu
                                   ` (2 more replies)
  2004-08-16  2:59               ` David S. Miller
  1 sibling, 3 replies; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  1:25 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 454 bytes --]

Hi Dave:

I gave your patch to the compiler and it doesn't like it :)
call_rcu needs three arguments, not two.

Watch out, this will probably conflict with your in_dev typo fix.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: r --]
[-- Type: text/plain, Size: 2608 bytes --]

===== net/ipv4/devinet.c 1.34 vs edited =====
--- 1.34/net/ipv4/devinet.c	2004-08-14 08:36:53 +10:00
+++ edited/net/ipv4/devinet.c	2004-08-14 11:20:19 +10:00
@@ -111,10 +111,8 @@
 	kfree(ifa);
 }
 
-static void inet_rcu_free_ifa(struct rcu_head *head)
+static void inet_rcu_free_ifa(void *ifa)
 {
-	struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
-
 	inet_free_ifa(ifa);
 }
 
@@ -177,9 +175,8 @@
 	goto out;
 }
 
-static void in_dev_rcu_put(struct rcu_head *head)
+static void in_dev_rcu_put(void *idev)
 {
-	struct in_device *idev = container_of(head, struct in_device, rcu_head);
 	in_dev_put(idev);
 }
 
@@ -195,7 +192,7 @@
 
 	while ((ifa = in_dev->ifa_list) != NULL) {
 		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
-		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa, ifa);
 	}
 
 #ifdef CONFIG_SYSCTL
@@ -210,7 +207,7 @@
 	neigh_sysctl_unregister(in_dev->arp_parms);
 #endif
 	neigh_parms_release(&arp_tbl, in_dev->arp_parms);
-	call_rcu(&idev->rcu_head, in_dev_rcu_put);
+	call_rcu(&in_dev->rcu_head, in_dev_rcu_put, in_dev);
 }
 
 int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
@@ -255,7 +252,7 @@
 
 			rtmsg_ifa(RTM_DELADDR, ifa);
 			notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
-			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa, ifa);
 		}
 	}
 
@@ -278,7 +275,7 @@
 	rtmsg_ifa(RTM_DELADDR, ifa1);
 	notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 	if (destroy) {
-		call_rcu(&ifa1->rcu_head, inet_rcu_free_ifa);
+		call_rcu(&ifa1->rcu_head, inet_rcu_free_ifa, ifa1);
 
 		if (!in_dev->ifa_list)
 			inetdev_destroy(in_dev);
@@ -293,7 +290,7 @@
 	ASSERT_RTNL();
 
 	if (!ifa->ifa_local) {
-		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa, ifa);
 		return 0;
 	}
 
@@ -308,11 +305,13 @@
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa)) {
 			if (ifa1->ifa_local == ifa->ifa_local) {
-				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa,
+					 ifa);
 				return -EEXIST;
 			}
 			if (ifa1->ifa_scope != ifa->ifa_scope) {
-				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa,
+					 ifa);
 				return -EINVAL;
 			}
 			ifa->ifa_flags |= IFA_F_SECONDARY;
@@ -347,7 +346,7 @@
 	if (!in_dev) {
 		in_dev = inetdev_init(dev);
 		if (!in_dev) {
-			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa, ifa);
 			return -ENOBUFS;
 		}
 	}

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  1:25               ` Herbert Xu
@ 2004-08-14  1:30                 ` Herbert Xu
  2004-08-14  5:08                   ` Herbert Xu
                                     ` (2 more replies)
  2004-08-14  1:40                 ` Herbert Xu
  2004-08-14  4:30                 ` Stephen Hemminger
  2 siblings, 3 replies; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  1:30 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 339 bytes --]

Hi Dave:

Here are some more ifa_list fixes outside net/ipv4.
 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: q --]
[-- Type: text/plain, Size: 5101 bytes --]

===== drivers/net/wan/syncppp.c 1.19 vs edited =====
--- 1.19/drivers/net/wan/syncppp.c	2004-07-13 23:29:35 +10:00
+++ edited/drivers/net/wan/syncppp.c	2004-08-14 10:57:25 +10:00
@@ -50,6 +50,7 @@
 #include <linux/random.h>
 #include <linux/pkt_sched.h>
 #include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 
 #include <net/syncppp.h>
 
@@ -767,9 +768,9 @@
 		struct in_ifaddr *ifa;
 		u32 addr = 0, mask = ~0; /* FIXME: is the mask correct? */
 #ifdef CONFIG_INET
-		if ((in_dev=in_dev_get(dev)) != NULL)
+		rcu_read_lock();
+		if ((in_dev = __in_dev_get(dev)) != NULL)
 		{
-			read_lock(&in_dev->lock);
 			for (ifa=in_dev->ifa_list; ifa != NULL;
 				ifa=ifa->ifa_next) {
 				if (strcmp(dev->name, ifa->ifa_label) == 0) 
@@ -779,9 +780,8 @@
 					break;
 				}
 			}
-			read_unlock(&in_dev->lock);
-			in_dev_put(in_dev);
 		}
+		rcu_read_unlock();
 #endif		
 		/* I hope both addr and mask are in the net order */
 		sppp_cisco_send (sp, CISCO_ADDR_REPLY, addr, mask);
===== drivers/net/wireless/strip.c 1.24 vs edited =====
--- 1.24/drivers/net/wireless/strip.c	2004-07-13 23:29:35 +10:00
+++ edited/drivers/net/wireless/strip.c	2004-08-14 11:05:22 +10:00
@@ -106,6 +106,7 @@
 #include <linux/seq_file.h>
 #include <linux/serial.h>
 #include <linux/serialP.h>
+#include <linux/rcupdate.h>
 #include <net/arp.h>
 
 #include <linux/ip.h>
@@ -1348,14 +1349,17 @@
 	 */
 	if (haddr.c[0] == 0xFF) {
 		u32 brd = 0;
-		struct in_device *in_dev = in_dev_get(strip_info->dev);
-		if (in_dev == NULL)
+		struct in_device *in_dev;
+
+		rcu_read_lock();
+		in_dev = __in_dev_get(strip_info->dev);
+		if (in_dev == NULL) {
+			rcu_read_unlock();
 			return NULL;
-		read_lock(&in_dev->lock);
+		}
 		if (in_dev->ifa_list)
 			brd = in_dev->ifa_list->ifa_broadcast;
-		read_unlock(&in_dev->lock);
-		in_dev_put(in_dev);
+		rcu_read_unlock();
 
 		/* arp_query returns 1 if it succeeds in looking up the address, 0 if it fails */
 		if (!arp_query(haddr.c, brd, strip_info->dev)) {
@@ -1500,17 +1504,18 @@
 	}
 
 	if (1) {
-		struct in_device *in_dev = in_dev_get(strip_info->dev);
+		struct in_device *in_dev;
+
 		brd = addr = 0;
+		rcu_read_lock();
+		in_dev = __in_dev_get(strip_info->dev);
 		if (in_dev) {
-			read_lock(&in_dev->lock);
 			if (in_dev->ifa_list) {
 				brd = in_dev->ifa_list->ifa_broadcast;
 				addr = in_dev->ifa_list->ifa_local;
 			}
-			read_unlock(&in_dev->lock);
-			in_dev_put(in_dev);
 		}
+		rcu_read_unlock();
 	}
 
 
===== net/core/netpoll.c 1.12 vs edited =====
--- 1.12/net/core/netpoll.c	2004-07-03 06:38:38 +10:00
+++ edited/net/core/netpoll.c	2004-08-14 10:58:29 +10:00
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/netpoll.h>
 #include <linux/sched.h>
+#include <linux/rcupdate.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 
@@ -572,16 +573,18 @@
 		memcpy(np->local_mac, ndev->dev_addr, 6);
 
 	if (!np->local_ip) {
-		in_dev = in_dev_get(ndev);
+		rcu_read_lock();
+		in_dev = __in_dev_get(ndev);
 
 		if (!in_dev) {
+			rcu_read_unlock();
 			printk(KERN_ERR "%s: no IP address for %s, aborting\n",
 			       np->name, np->dev_name);
 			goto release;
 		}
 
 		np->local_ip = ntohl(in_dev->ifa_list->ifa_local);
-		in_dev_put(in_dev);
+		rcu_read_unlock();
 		printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
 		       np->name, HIPQUAD(np->local_ip));
 	}
===== net/core/pktgen.c 1.12 vs edited =====
--- 1.12/net/core/pktgen.c	2004-07-29 12:06:49 +10:00
+++ edited/net/core/pktgen.c	2004-08-14 11:01:55 +10:00
@@ -70,6 +70,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/inet.h>
+#include <linux/rcupdate.h>
 #include <asm/byteorder.h>
 #include <asm/bitops.h>
 #include <asm/io.h>
@@ -263,14 +264,17 @@
 	info->saddr_min = 0;
 	info->saddr_max = 0;
 	if (strlen(info->src_min) == 0) {
-		struct in_device *in_dev = in_dev_get(odev);
+		struct in_device *in_dev;
+
+		rcu_read_lock();
+		in_dev = __in_dev_get(odev);
 		if (in_dev) {
 			if (in_dev->ifa_list) {
 				info->saddr_min = in_dev->ifa_list->ifa_address;
 				info->saddr_max = info->saddr_min;
 			}
-			in_dev_put(in_dev);
 		}
+		rcu_read_unlock();
 	}
 	else {
 		info->saddr_min = in_aton(info->src_min);
===== net/econet/af_econet.c 1.42 vs edited =====
--- 1.42/net/econet/af_econet.c	2004-07-17 00:13:45 +10:00
+++ edited/net/econet/af_econet.c	2004-08-14 11:09:32 +10:00
@@ -39,6 +39,7 @@
 #include <net/udp.h>
 #include <net/ip.h>
 #include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -401,16 +402,17 @@
 	   y.x maps to IP a.b.c.x.  This should be replaced with something
 	   more flexible and more aware of subnet masks.  */
 	{
-		struct in_device *idev = in_dev_get(dev);
+		struct in_device *idev;
 		unsigned long network = 0;
+
+		rcu_read_lock();
+		idev = __in_dev_get(dev);
 		if (idev) {
-			read_lock(&idev->lock);
 			if (idev->ifa_list)
 				network = ntohl(idev->ifa_list->ifa_address) & 
 					0xffffff00;		/* !!! */
-			read_unlock(&idev->lock);
-			in_dev_put(idev);
 		}
+		rcu_read_unlock();
 		udpdest.sin_addr.s_addr = htonl(network | addr.station);
 	}
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  1:25               ` Herbert Xu
  2004-08-14  1:30                 ` Herbert Xu
@ 2004-08-14  1:40                 ` Herbert Xu
  2004-08-16  3:03                   ` David S. Miller
  2004-08-14  4:30                 ` Stephen Hemminger
  2 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  1:40 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 382 bytes --]

Hi Dave:

Since nobody calls inet_free_ifa directly anymore, we can
perform the following simplification.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: s --]
[-- Type: text/plain, Size: 2142 bytes --]

===== net/ipv4/devinet.c 1.36 vs edited =====
--- 1.36/net/ipv4/devinet.c	2004-08-14 11:23:39 +10:00
+++ edited/net/ipv4/devinet.c	2004-08-14 11:38:22 +10:00
@@ -104,16 +104,17 @@
 	return ifa;
 }
 
-static inline void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_rcu_free_ifa(void *arg)
 {
+	struct in_ifaddr *ifa = arg;
 	if (ifa->ifa_dev)
 		in_dev_put(ifa->ifa_dev);
 	kfree(ifa);
 }
 
-static void inet_rcu_free_ifa(void *ifa)
+static inline void inet_free_ifa(struct in_ifaddr *ifa)
 {
-	inet_free_ifa(ifa);
+	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa, ifa);
 }
 
 void in_dev_finish_destroy(struct in_device *idev)
@@ -192,7 +193,7 @@
 
 	while ((ifa = in_dev->ifa_list) != NULL) {
 		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
-		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa, ifa);
+		inet_free_ifa(ifa);
 	}
 
 #ifdef CONFIG_SYSCTL
@@ -251,7 +252,7 @@
 
 			rtmsg_ifa(RTM_DELADDR, ifa);
 			notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
-			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa, ifa);
+			inet_free_ifa(ifa);
 		}
 	}
 
@@ -274,7 +275,7 @@
 	rtmsg_ifa(RTM_DELADDR, ifa1);
 	notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 	if (destroy) {
-		call_rcu(&ifa1->rcu_head, inet_rcu_free_ifa, ifa1);
+		inet_free_ifa(ifa1);
 
 		if (!in_dev->ifa_list)
 			inetdev_destroy(in_dev);
@@ -289,7 +290,7 @@
 	ASSERT_RTNL();
 
 	if (!ifa->ifa_local) {
-		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa, ifa);
+		inet_free_ifa(ifa);
 		return 0;
 	}
 
@@ -304,13 +305,11 @@
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa)) {
 			if (ifa1->ifa_local == ifa->ifa_local) {
-				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa,
-					 ifa);
+				inet_free_ifa(ifa);
 				return -EEXIST;
 			}
 			if (ifa1->ifa_scope != ifa->ifa_scope) {
-				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa,
-					 ifa);
+				inet_free_ifa(ifa);
 				return -EINVAL;
 			}
 			ifa->ifa_flags |= IFA_F_SECONDARY;
@@ -345,7 +344,7 @@
 	if (!in_dev) {
 		in_dev = inetdev_init(dev);
 		if (!in_dev) {
-			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa, ifa);
+			inet_free_ifa(ifa);
 			return -ENOBUFS;
 		}
 	}

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  1:25               ` Herbert Xu
  2004-08-14  1:30                 ` Herbert Xu
  2004-08-14  1:40                 ` Herbert Xu
@ 2004-08-14  4:30                 ` Stephen Hemminger
  2004-08-14  4:36                   ` Herbert Xu
  2 siblings, 1 reply; 46+ messages in thread
From: Stephen Hemminger @ 2004-08-14  4:30 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, netdev

RCU now takes two args, part of the simplification from IBM in
later 2.6.8 series.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  4:30                 ` Stephen Hemminger
@ 2004-08-14  4:36                   ` Herbert Xu
  0 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  4:36 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Fri, Aug 13, 2004 at 09:30:50PM -0700, Stephen Hemminger wrote:
> RCU now takes two args, part of the simplification from IBM in
> later 2.6.8 series.

Sorry, please disregard the call_rcu patch.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  1:30                 ` Herbert Xu
@ 2004-08-14  5:08                   ` Herbert Xu
  2004-08-14  6:27                     ` neigh_create/inetdev_destroy race? Herbert Xu
  2004-08-16  2:08                     ` [PATCH] Move inetdev/ifa over to RCU David S. Miller
  2004-08-14  6:31                   ` Herbert Xu
  2004-08-16  3:01                   ` David S. Miller
  2 siblings, 2 replies; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  5:08 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

Hi:

I'm just going through all the __in_dev_get() callers and the one
in ip_route_output_slow() looks fishy.  It appears to be checking
whether the subsequent inet_select_addr() calls will succeed or not.

But this is not reliable since the addresses can always disappear
between the check and the actual call.

Do we really care about the zero return value of inet_select_addr()
here? What about the other calls to inet_select_addr()?
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* neigh_create/inetdev_destroy race?
  2004-08-14  5:08                   ` Herbert Xu
@ 2004-08-14  6:27                     ` Herbert Xu
  2004-08-16  2:14                       ` David S. Miller
  2004-08-16  2:08                     ` [PATCH] Move inetdev/ifa over to RCU David S. Miller
  1 sibling, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  6:27 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

Hi:

I'm going through in_dev_get() callers.  The call in arp_constructor()
looks racy.  It takes in_dev->arp_parms and stores it in neigh->parms.

Is there any thing that prevents the following scenario from occuring?

CPU0					CPU1
neigh_create
					inet_del_ifa
						notifier_call_chain
							neigh_ifdown
						inetdev_destroy
	arp_constructor
		neigh->parms =
			in_dev->arp_parms
							in_dev->dead = 1
							in_dev->dev->ip_ptr =
								NULL
							neigh_parms_release
	n->parms->neigh_setup => BUG

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  1:30                 ` Herbert Xu
  2004-08-14  5:08                   ` Herbert Xu
@ 2004-08-14  6:31                   ` Herbert Xu
  2004-08-14  6:32                     ` Herbert Xu
  2004-08-16  3:01                   ` David S. Miller
  2 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  6:31 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 456 bytes --]

Hi Dave:

On Sat, Aug 14, 2004 at 11:30:30AM +1000, herbert wrote:
> 
> Here are some more ifa_list fixes outside net/ipv4.

Missed one in sdlamain.c because it was marked BROKEN.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p1 --]
[-- Type: text/plain, Size: 1176 bytes --]

===== drivers/net/wan/sdlamain.c 1.23 vs edited =====
--- 1.23/drivers/net/wan/sdlamain.c	2003-10-06 02:23:10 +10:00
+++ edited/drivers/net/wan/sdlamain.c	2004-08-14 15:16:40 +10:00
@@ -68,6 +68,7 @@
 #include <linux/inetdevice.h>
 
 #include <linux/ip.h>
+#include <linux/rcupdate.h>
 #include <net/route.h>
  
 #define KMEM_SAFETYZONE 8
@@ -1268,37 +1269,39 @@
 	
 	struct in_ifaddr *ifaddr;
 	struct in_device *in_dev;
+	unsigned long addr = 0;
 
+	rcu_read_lock();
 	if ((in_dev = __in_dev_get(dev)) == NULL){
-		return 0;
+		goto out;
 	}
 
 	if ((ifaddr = in_dev->ifa_list)== NULL ){
-		return 0;
+		goto out;
 	}
 	
 	switch (option){
 
 	case WAN_LOCAL_IP:
-		return ifaddr->ifa_local;
+		addr = ifaddr->ifa_local;
 		break;
 	
 	case WAN_POINTOPOINT_IP:
-		return ifaddr->ifa_address;
+		addr = ifaddr->ifa_address;
 		break;	
 
 	case WAN_NETMASK_IP:
-		return ifaddr->ifa_mask;
+		addr = ifaddr->ifa_mask;
 		break;
 
 	case WAN_BROADCAST_IP:
-		return ifaddr->ifa_broadcast;
+		addr = ifaddr->ifa_broadcast;
 		break;
-	default:
-		return 0;
 	}
 
-	return 0;
+out:
+	rcu_read_unlock();
+	return addr;
 }	
 
 void add_gateway(sdla_t *card, struct net_device *dev)

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  6:31                   ` Herbert Xu
@ 2004-08-14  6:32                     ` Herbert Xu
  0 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2004-08-14  6:32 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 446 bytes --]

Hi Dave:

On Sat, Aug 14, 2004 at 04:31:19PM +1000, herbert wrote:
> 
> Missed one in sdlamain.c because it was marked BROKEN.

And a couple of them which are S390 only.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p2 --]
[-- Type: text/plain, Size: 2155 bytes --]

===== drivers/s390/net/lcs.c 1.31 vs edited =====
--- 1.31/drivers/s390/net/lcs.c	2004-08-08 04:05:36 +10:00
+++ edited/drivers/s390/net/lcs.c	2004-08-14 15:34:58 +10:00
@@ -1002,7 +1002,7 @@
 	in4_dev = in_dev_get(card->dev);
 	if (in4_dev == NULL)
 		return 0;
-	read_lock(&in4_dev->lock);
+	read_lock(&in4_dev->mc_list_lock);
 	spin_lock(&card->ipm_lock);
 	/* Check for multicast addresses to be removed. */
 	list_for_each(l, &card->ipm_list) {
@@ -1046,7 +1046,7 @@
 		list_add(&ipm->list, &card->ipm_list);
 	}
 	spin_unlock(&card->ipm_lock);
-	read_unlock(&in4_dev->lock);
+	read_unlock(&in4_dev->mc_list_lock);
 	in_dev_put(in4_dev);
 	lcs_fix_multicast_list(card);
 	return 0;
===== drivers/s390/net/qeth_main.c 1.11 vs edited =====
--- 1.11/drivers/s390/net/qeth_main.c	2004-08-08 04:05:36 +10:00
+++ edited/drivers/s390/net/qeth_main.c	2004-08-14 16:27:11 +10:00
@@ -73,6 +73,7 @@
 #include <linux/reboot.h>
 #include <asm/qeth.h>
 #include <linux/mii.h>
+#include <linux/rcupdate.h>
 
 #include "qeth.h"
 #include "qeth_mpc.h"
@@ -4733,9 +4734,10 @@
 	QETH_DBF_TEXT(trace, 4, "frvaddr4");
 	if (!card->vlangrp)
 		return;
-	in_dev = in_dev_get(card->vlangrp->vlan_devices[vid]);
+	rcu_read_lock();
+	in_dev = __in_dev_get(card->vlangrp->vlan_devices[vid]);
 	if (!in_dev)
-		return;
+		goto out;
 	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next){
 		addr = qeth_get_addr_buffer(QETH_PROT_IPV4);
 		if (addr){
@@ -4746,7 +4748,8 @@
 				kfree(addr);
 		}
 	}
-	in_dev_put(in_dev);
+out:
+	rcu_read_unlock();
 }
 
 static void
@@ -4918,9 +4921,9 @@
 		in_dev = in_dev_get(vg->vlan_devices[i]);
 		if (!in_dev)
 			continue;
-		read_lock(&in_dev->lock);
+		read_lock(&in_dev->mc_list_lock);
 		qeth_add_mc(card,in_dev);
-		read_unlock(&in_dev->lock);
+		read_unlock(&in_dev->mc_list_lock);
 		in_dev_put(in_dev);
 	}
 #endif
@@ -4935,10 +4938,10 @@
 	in4_dev = in_dev_get(card->dev);
 	if (in4_dev == NULL)
 		return;
-	read_lock(&in4_dev->lock);
+	read_lock(&in4_dev->mc_list_lock);
 	qeth_add_mc(card, in4_dev);
 	qeth_add_vlan_mc(card);
-	read_unlock(&in4_dev->lock);
+	read_unlock(&in4_dev->mc_list_lock);
 	in_dev_put(in4_dev);
 }
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  5:08                   ` Herbert Xu
  2004-08-14  6:27                     ` neigh_create/inetdev_destroy race? Herbert Xu
@ 2004-08-16  2:08                     ` David S. Miller
  2004-08-16  2:43                       ` Herbert Xu
  1 sibling, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-16  2:08 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Sat, 14 Aug 2004 15:08:48 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> I'm just going through all the __in_dev_get() callers and the one
> in ip_route_output_slow() looks fishy.  It appears to be checking
> whether the subsequent inet_select_addr() calls will succeed or not.
> 
> But this is not reliable since the addresses can always disappear
> between the check and the actual call.
> 
> Do we really care about the zero return value of inet_select_addr()
> here? What about the other calls to inet_select_addr()?

It won't return zero, typically it will return loopback's IP
(with preference to any non-loopback addresses assigned to
 the loopback device).  This is being used for source address
selection.

Also, when device ipv4 addresses are deleted, NETDEV_DOWN messages
are broadcast to all the subsystems.  One of the subsystems is FIB,
which will disable IP on that interface if this is the last ipv4
address and it will also flush the routing cache immediately.
However you are right that we may need to synchronize this more
tightly.

Hmmm....

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-08-14  6:27                     ` neigh_create/inetdev_destroy race? Herbert Xu
@ 2004-08-16  2:14                       ` David S. Miller
  2004-08-16 10:51                         ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-16  2:14 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Sat, 14 Aug 2004 16:27:03 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Is there any thing that prevents the following scenario from occuring?
> 
> CPU0					CPU1
> neigh_create
> 					inet_del_ifa
> 						notifier_call_chain
> 							neigh_ifdown
> 						inetdev_destroy
> 	arp_constructor
> 		neigh->parms =
> 			in_dev->arp_parms
> 							in_dev->dead = 1
> 							in_dev->dev->ip_ptr =
> 								NULL
> 							neigh_parms_release
> 	n->parms->neigh_setup => BUG

Is there anything other than hostess_sv11.c, sealevel.c, and shaper.c
which are using n->parms->neigh_setup at all?

This seems to be a very obscure special case hack, which perhaps we
can removee entirely.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-16  2:08                     ` [PATCH] Move inetdev/ifa over to RCU David S. Miller
@ 2004-08-16  2:43                       ` Herbert Xu
  2004-08-16  3:08                         ` David S. Miller
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-16  2:43 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

On Sun, Aug 15, 2004 at 07:08:23PM -0700, David S. Miller wrote:
>
> > Do we really care about the zero return value of inet_select_addr()
> > here? What about the other calls to inet_select_addr()?
> 
> It won't return zero, typically it will return loopback's IP
> (with preference to any non-loopback addresses assigned to
>  the loopback device).  This is being used for source address
> selection.

It will return zero if there is no in_dev at all.  Perhaps what
we should do is get inet_select_addr() to get the address from
other devices in that case as well?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  0:34         ` Herbert Xu
  2004-08-14  0:39           ` David S. Miller
@ 2004-08-16  2:58           ` David S. Miller
  2004-08-16  3:08             ` Herbert Xu
  1 sibling, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-16  2:58 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Sat, 14 Aug 2004 10:34:28 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> BTW, it looks like you can remove inetdev_lock altogether.  All its
> users already assume that they don't race against each other by
> taking the rtnl lock.

I think it's needed at least for the:

	dev->ip_ptr = idev;
	in_dev_get(idev);

thing, isn't it?

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  0:54             ` Herbert Xu
  2004-08-14  1:25               ` Herbert Xu
@ 2004-08-16  2:59               ` David S. Miller
  1 sibling, 0 replies; 46+ messages in thread
From: David S. Miller @ 2004-08-16  2:59 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Sat, 14 Aug 2004 10:54:11 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Fri, Aug 13, 2004 at 05:39:24PM -0700, David S. Miller wrote:
> > 
> > Yes and I cured the one typo in it two, idev-->in_dev :-)
> 
> Two birds with one stone :)
> 
> Here is a trivial optimisation for irda to since it doesn't hold
> onto the idev.

Applied.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  1:30                 ` Herbert Xu
  2004-08-14  5:08                   ` Herbert Xu
  2004-08-14  6:31                   ` Herbert Xu
@ 2004-08-16  3:01                   ` David S. Miller
  2 siblings, 0 replies; 46+ messages in thread
From: David S. Miller @ 2004-08-16  3:01 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Sat, 14 Aug 2004 11:30:30 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Here are some more ifa_list fixes outside net/ipv4.

Applied.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-14  1:40                 ` Herbert Xu
@ 2004-08-16  3:03                   ` David S. Miller
  2004-08-16  3:23                     ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-16  3:03 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Sat, 14 Aug 2004 11:40:48 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Since nobody calls inet_free_ifa directly anymore, we can
> perform the following simplification.

You need to redo this patch because the 3-arg call_rcu() bits
are in there.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-16  2:58           ` David S. Miller
@ 2004-08-16  3:08             ` Herbert Xu
  2004-08-16  6:21               ` David S. Miller
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-16  3:08 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

On Sun, Aug 15, 2004 at 07:58:20PM -0700, David S. Miller wrote:
> On Sat, 14 Aug 2004 10:34:28 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > BTW, it looks like you can remove inetdev_lock altogether.  All its
> > users already assume that they don't race against each other by
> > taking the rtnl lock.
> 
> I think it's needed at least for the:
> 
> 	dev->ip_ptr = idev;
> 	in_dev_get(idev);

You mean in_dev_hold?

> thing, isn't it?

I don't think so.  But there is a real bug here :)

The lock doesn't really do much since everyone who's holding
it (they're all in devinet.c) already hold the RTNL lock
(the rotten old lock perhaps :)

But the RCU change has created a real bug here.  Imagine this:

CPU0					CPU1
inetdev_init
	dev->ip_ptr = idev
					in_dev_get
						atomic_inc on refcnt

					in_dev_put => frees idev
	in_dev_hold(idev) => BUG

So we should reverse the two statements and add an smp_wmb().

Unfortunately the lock doesn't really help you at all.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-16  2:43                       ` Herbert Xu
@ 2004-08-16  3:08                         ` David S. Miller
  2004-08-16  3:14                           ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-16  3:08 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Mon, 16 Aug 2004 12:43:30 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> It will return zero if there is no in_dev at all.  Perhaps what
> we should do is get inet_select_addr() to get the address from
> other devices in that case as well?

It does that now, look at how it iterates over dev_base's
list.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-16  3:08                         ` David S. Miller
@ 2004-08-16  3:14                           ` Herbert Xu
  2004-08-16  6:23                             ` David S. Miller
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-16  3:14 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 768 bytes --]

On Sun, Aug 15, 2004 at 08:08:38PM -0700, David S. Miller wrote:
> On Mon, 16 Aug 2004 12:43:30 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > It will return zero if there is no in_dev at all.  Perhaps what
> > we should do is get inet_select_addr() to get the address from
> > other devices in that case as well?
> 
> It does that now, look at how it iterates over dev_base's
> list.

But not if __in_dev_get() == NULL.  The following patch should
express that more clearly.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 618 bytes --]

===== net/ipv4/devinet.c 1.38 vs edited =====
--- 1.38/net/ipv4/devinet.c	2004-08-14 14:50:31 +10:00
+++ edited/net/ipv4/devinet.c	2004-08-16 13:13:44 +10:00
@@ -780,7 +780,7 @@
 	rcu_read_lock();
 	in_dev = __in_dev_get(dev);
 	if (!in_dev)
-		goto out_unlock_inetdev;
+		goto no_in_dev;
 
 	for_primary_ifa(in_dev) {
 		if (ifa->ifa_scope > scope)
@@ -792,6 +792,7 @@
 		if (!addr)
 			addr = ifa->ifa_local;
 	} endfor_ifa(in_dev);
+no_in_dev:
 	rcu_read_unlock();
 
 	if (addr)
@@ -817,7 +818,6 @@
 	}
 out_unlock_both:
 	read_unlock(&dev_base_lock);
-out_unlock_inetdev:
 	rcu_read_unlock();
 out:
 	return addr;

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-16  3:03                   ` David S. Miller
@ 2004-08-16  3:23                     ` Herbert Xu
  2004-08-16  6:24                       ` David S. Miller
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-16  3:23 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 645 bytes --]

On Sun, Aug 15, 2004 at 08:03:57PM -0700, David S. Miller wrote:
> On Sat, 14 Aug 2004 11:40:48 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > Since nobody calls inet_free_ifa directly anymore, we can
> > perform the following simplification.
> 
> You need to redo this patch because the 3-arg call_rcu() bits
> are in there.

Sorry.  Here it is again.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: q --]
[-- Type: text/plain, Size: 2226 bytes --]

===== net/ipv4/devinet.c 1.34 vs 1.38 =====
--- 1.34/net/ipv4/devinet.c	2004-08-14 08:36:53 +10:00
+++ 1.38/net/ipv4/devinet.c	2004-08-14 14:50:31 +10:00
@@ -104,18 +104,17 @@
 	return ifa;
 }
 
-static inline void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_rcu_free_ifa(struct rcu_head *head)
 {
+	struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
 	if (ifa->ifa_dev)
 		in_dev_put(ifa->ifa_dev);
 	kfree(ifa);
 }
 
-static void inet_rcu_free_ifa(struct rcu_head *head)
+static inline void inet_free_ifa(struct in_ifaddr *ifa)
 {
-	struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
-
-	inet_free_ifa(ifa);
+	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 }
 
 void in_dev_finish_destroy(struct in_device *idev)
@@ -195,7 +194,7 @@
 
 	while ((ifa = in_dev->ifa_list) != NULL) {
 		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
-		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+		inet_free_ifa(ifa);
 	}
 
 #ifdef CONFIG_SYSCTL
@@ -255,7 +253,7 @@
 
 			rtmsg_ifa(RTM_DELADDR, ifa);
 			notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
-			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+			inet_free_ifa(ifa);
 		}
 	}
 
@@ -278,7 +276,7 @@
 	rtmsg_ifa(RTM_DELADDR, ifa1);
 	notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 	if (destroy) {
-		call_rcu(&ifa1->rcu_head, inet_rcu_free_ifa);
+		inet_free_ifa(ifa1);
 
 		if (!in_dev->ifa_list)
 			inetdev_destroy(in_dev);
@@ -293,7 +291,7 @@
 	ASSERT_RTNL();
 
 	if (!ifa->ifa_local) {
-		call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+		inet_free_ifa(ifa);
 		return 0;
 	}
 
@@ -308,11 +306,11 @@
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa)) {
 			if (ifa1->ifa_local == ifa->ifa_local) {
-				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+				inet_free_ifa(ifa);
 				return -EEXIST;
 			}
 			if (ifa1->ifa_scope != ifa->ifa_scope) {
-				call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+				inet_free_ifa(ifa);
 				return -EINVAL;
 			}
 			ifa->ifa_flags |= IFA_F_SECONDARY;
@@ -347,7 +345,7 @@
 	if (!in_dev) {
 		in_dev = inetdev_init(dev);
 		if (!in_dev) {
-			call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+			inet_free_ifa(ifa);
 			return -ENOBUFS;
 		}
 	}

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-16  3:08             ` Herbert Xu
@ 2004-08-16  6:21               ` David S. Miller
  2004-08-16  8:13                 ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-16  6:21 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Mon, 16 Aug 2004 13:08:26 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> So we should reverse the two statements and add an smp_wmb().

I totally agree, and let's kill the lock too since it is
in fact useless now.  I've done this as follows:

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/08/15 23:08:21-07:00 davem@nuts.davemloft.net 
#   [IPV4]: Kill inetdev_lock, no longer needed.
#   
#   It no longer protects anything, all users held RTNL
#   semaphore to boot.  Also, fix a potential race in the
#   new RCU inetdev code, grab the reference on the idev
#   before attaching it via dev->ip_ptr.
#   
#   Based upon discussions with Herbert Xu.
#   
#   Signed-off-by: David S. Miller <davem@redhat.com>
# 
# net/ipv4/devinet.c
#   2004/08/15 23:07:17-07:00 davem@nuts.davemloft.net +6 -14
#   [IPV4]: Kill inetdev_lock, no longer needed.
# 
diff -Nru a/net/ipv4/devinet.c b/net/ipv4/devinet.c
--- a/net/ipv4/devinet.c	2004-08-15 23:08:59 -07:00
+++ b/net/ipv4/devinet.c	2004-08-15 23:08:59 -07:00
@@ -90,8 +90,6 @@
 
 /* Locks all the inet devices. */
 
-static spinlock_t inetdev_lock = SPIN_LOCK_UNLOCKED;
-
 static struct in_ifaddr *inet_alloc_ifa(void)
 {
 	struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL);
@@ -158,11 +156,12 @@
 	neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
 			      NET_IPV4_NEIGH, "ipv4", NULL);
 #endif
-	spin_lock_bh(&inetdev_lock);
-	dev->ip_ptr = in_dev;
+
 	/* Account for reference dev->ip_ptr */
 	in_dev_hold(in_dev);
-	spin_unlock_bh(&inetdev_lock);
+	smp_wmb();
+	dev->ip_ptr = in_dev;
+
 #ifdef CONFIG_SYSCTL
 	devinet_sysctl_register(in_dev, &in_dev->cnf);
 #endif
@@ -201,10 +200,8 @@
 #ifdef CONFIG_SYSCTL
 	devinet_sysctl_unregister(&in_dev->cnf);
 #endif
-	spin_lock_bh(&inetdev_lock);
+
 	in_dev->dev->ip_ptr = NULL;
-	/* in_dev_put following below will kill the in_device */
-	spin_unlock_bh(&inetdev_lock);
 
 #ifdef CONFIG_SYSCTL
 	neigh_sysctl_unregister(in_dev->arp_parms);
@@ -248,9 +245,8 @@
 				ifap1 = &ifa->ifa_next;
 				continue;
 			}
-			spin_lock_bh(&inetdev_lock);
+
 			*ifap1 = ifa->ifa_next;
-			spin_unlock_bh(&inetdev_lock);
 
 			rtmsg_ifa(RTM_DELADDR, ifa);
 			notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
@@ -260,9 +256,7 @@
 
 	/* 2. Unlink it */
 
-	spin_lock_bh(&inetdev_lock);
 	*ifap = ifa1->ifa_next;
-	spin_unlock_bh(&inetdev_lock);
 
 	/* 3. Announce address deletion */
 
@@ -324,9 +318,7 @@
 	}
 
 	ifa->ifa_next = *ifap;
-	spin_lock_bh(&inetdev_lock);
 	*ifap = ifa;
-	spin_unlock_bh(&inetdev_lock);
 
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-16  3:14                           ` Herbert Xu
@ 2004-08-16  6:23                             ` David S. Miller
  0 siblings, 0 replies; 46+ messages in thread
From: David S. Miller @ 2004-08-16  6:23 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Mon, 16 Aug 2004 13:14:56 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> But not if __in_dev_get() == NULL.  The following patch should
> express that more clearly.

Agreed, and applied.  Thanks Herbert.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-16  3:23                     ` Herbert Xu
@ 2004-08-16  6:24                       ` David S. Miller
  0 siblings, 0 replies; 46+ messages in thread
From: David S. Miller @ 2004-08-16  6:24 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Mon, 16 Aug 2004 13:23:44 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Sun, Aug 15, 2004 at 08:03:57PM -0700, David S. Miller wrote:
> > On Sat, 14 Aug 2004 11:40:48 +1000
> > Herbert Xu <herbert@gondor.apana.org.au> wrote:
> > 
> > > Since nobody calls inet_free_ifa directly anymore, we can
> > > perform the following simplification.
> > 
> > You need to redo this patch because the 3-arg call_rcu() bits
> > are in there.
> 
> Sorry.  Here it is again.

Thanks for fixing that up, applied.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] Move inetdev/ifa over to RCU
  2004-08-16  6:21               ` David S. Miller
@ 2004-08-16  8:13                 ` Herbert Xu
  0 siblings, 0 replies; 46+ messages in thread
From: Herbert Xu @ 2004-08-16  8:13 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

On Sun, Aug 15, 2004 at 11:21:53PM -0700, David S. Miller wrote:
> 
> I totally agree, and let's kill the lock too since it is
> in fact useless now.  I've done this as follows:

That looks great.  Thanks Dave.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-08-16  2:14                       ` David S. Miller
@ 2004-08-16 10:51                         ` Herbert Xu
  2004-08-29  6:42                           ` David S. Miller
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-16 10:51 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

On Sun, Aug 15, 2004 at 07:14:50PM -0700, David S. Miller wrote:
> On Sat, 14 Aug 2004 16:27:03 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > Is there any thing that prevents the following scenario from occuring?
> > 
> > CPU0					CPU1
> > neigh_create
> > 					inet_del_ifa
> > 						notifier_call_chain
> > 							neigh_ifdown
> > 						inetdev_destroy
> > 	arp_constructor
> > 		neigh->parms =
> > 			in_dev->arp_parms
> > 							in_dev->dead = 1
> > 							in_dev->dev->ip_ptr =
> > 								NULL
> > 							neigh_parms_release
> > 	n->parms->neigh_setup => BUG
> 
> Is there anything other than hostess_sv11.c, sealevel.c, and shaper.c
> which are using n->parms->neigh_setup at all?
> 
> This seems to be a very obscure special case hack, which perhaps we
> can removee entirely.

That maybe the case, but the race has nothing to do with neigh_setup.

Even if you remove neigh_setup altogether, the very next line in
neigh_create will dereference n->parms by looking up base_reachable_time.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-08-16 10:51                         ` Herbert Xu
@ 2004-08-29  6:42                           ` David S. Miller
  2004-08-29  6:50                             ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-29  6:42 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Mon, 16 Aug 2004 20:51:31 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> > > CPU0					CPU1
> > > neigh_create
> > > 					inet_del_ifa
> > > 						notifier_call_chain
> > > 							neigh_ifdown
> > > 						inetdev_destroy
> > > 	arp_constructor
> > > 		neigh->parms =
> > > 			in_dev->arp_parms
> > > 							in_dev->dead = 1
> > > 							in_dev->dev->ip_ptr =
> > > 								NULL
> > > 							neigh_parms_release
> > > 	n->parms->neigh_setup => BUG
> > 
> > Is there anything other than hostess_sv11.c, sealevel.c, and shaper.c
> > which are using n->parms->neigh_setup at all?
> > 
> > This seems to be a very obscure special case hack, which perhaps we
> > can removee entirely.
> 
> That maybe the case, but the race has nothing to do with neigh_setup.
> 
> Even if you remove neigh_setup altogether, the very next line in
> neigh_create will dereference n->parms by looking up base_reachable_time.

Wait a second, how can neigh_ifdown() even find this thing?
Firstly, neigh_create() takes a reference to the device, which
in turn holds onto the inetdev preventing inetdev_destroy().

Secondly, until neigh_create() takes the tbl lock, it is not in
the hash tables and therefore neigh_ifdown() could not see it.

Thirdly, arp_constructor() does in_dev_get() and checks the
return value.  If it fails, by racing with inetdev_destroy(),
neigh_create() will return an error and not do bogus derefing.

I think that covers all the cases, right?
(please prove me wrong, this looks too easy :-)

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-08-29  6:42                           ` David S. Miller
@ 2004-08-29  6:50                             ` Herbert Xu
  2004-08-31  6:08                               ` David S. Miller
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-29  6:50 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

On Sat, Aug 28, 2004 at 11:42:01PM -0700, David S. Miller wrote:
> On Mon, 16 Aug 2004 20:51:31 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > > > CPU0					CPU1
> > > > neigh_create
> > > > 					inet_del_ifa
> > > > 						notifier_call_chain
> > > > 							neigh_ifdown
> > > > 						inetdev_destroy
> > > > 	arp_constructor
> > > > 		neigh->parms =
> > > > 			in_dev->arp_parms
> > > > 							in_dev->dead = 1
> > > > 							in_dev->dev->ip_ptr =
> > > > 								NULL
> > > > 							neigh_parms_release
> > > > 	n->parms->neigh_setup => BUG
>
> > That maybe the case, but the race has nothing to do with neigh_setup.
> > 
> > Even if you remove neigh_setup altogether, the very next line in
> > neigh_create will dereference n->parms by looking up base_reachable_time.
> 
> Wait a second, how can neigh_ifdown() even find this thing?
> Firstly, neigh_create() takes a reference to the device, which
> in turn holds onto the inetdev preventing inetdev_destroy().

I'm not sure about this statement.  I can't see how holding a reference
on dev prevents you from deleting the primary address on dev which will
lead to the call chain on the right.

> Secondly, until neigh_create() takes the tbl lock, it is not in
> the hash tables and therefore neigh_ifdown() could not see it.

That part I agree with :) That's in fact what this race is about:
neigh_ifdown does not guarantee that the hash table will be without
references to idev.

> Thirdly, arp_constructor() does in_dev_get() and checks the
> return value.  If it fails, by racing with inetdev_destroy(),
> neigh_create() will return an error and not do bogus derefing.

In the above scenario, when arp_constructor() runs the in_dev is
still alive and well.  It only gets destroyed afterwards.

> I think that covers all the cases, right?
> (please prove me wrong, this looks too easy :-)

Not quite :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-08-29  6:50                             ` Herbert Xu
@ 2004-08-31  6:08                               ` David S. Miller
  2004-08-31 10:41                                 ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-08-31  6:08 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Sun, 29 Aug 2004 16:50:31 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> > Secondly, until neigh_create() takes the tbl lock, it is not in
> > the hash tables and therefore neigh_ifdown() could not see it.
> 
> That part I agree with :) That's in fact what this race is about:
> neigh_ifdown does not guarantee that the hash table will be without
> references to idev.

I think we can clear this by putting neigh_parms_release() into an
RCU handler.  It can't be in in_dev_rcu_put.

I also now believe that these sorts of races you are mentioning percolate
out into ip_mc_destroy_dev().

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-08-31  6:08                               ` David S. Miller
@ 2004-08-31 10:41                                 ` Herbert Xu
  2004-09-02  5:21                                   ` David S. Miller
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-08-31 10:41 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

On Mon, Aug 30, 2004 at 11:08:20PM -0700, David S. Miller wrote:
> On Sun, 29 Aug 2004 16:50:31 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > > Secondly, until neigh_create() takes the tbl lock, it is not in
> > > the hash tables and therefore neigh_ifdown() could not see it.
> > 
> > That part I agree with :) That's in fact what this race is about:
> > neigh_ifdown does not guarantee that the hash table will be without
> > references to idev.
> 
> I think we can clear this by putting neigh_parms_release() into an
> RCU handler.  It can't be in in_dev_rcu_put.

Yes that should go a long way in resolving this problem.  However it
is still tricky because the neighbour table doesn't refer to idev
directly.  So we'll need to go through contortions to make sure that
the corresponding idev is still alive when we add a neighbour with
its neigh_parms to the hash table.  Or better yet we should ref count
the neigh_parms directly.

> I also now believe that these sorts of races you are mentioning percolate
> out into ip_mc_destroy_dev().

Well last time I looked at it I concluded that it was safe.  It seems
that the references are either held by timers who are all taken down
at the very start of the destruction, or they're taken at places which
are under spin locks.

Did I miss something else?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-08-31 10:41                                 ` Herbert Xu
@ 2004-09-02  5:21                                   ` David S. Miller
  2004-09-02 13:06                                     ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: David S. Miller @ 2004-09-02  5:21 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Tue, 31 Aug 2004 20:41:39 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> > I think we can clear this by putting neigh_parms_release() into an
> > RCU handler.  It can't be in in_dev_rcu_put.
> 
> Yes that should go a long way in resolving this problem.

So here's the first step.  No rcu_read_lock()'s are needed
since the tbl->lock needs to be held as a write when
traversing these things anyways for other reasons.

Can you work on the next bit you mentioned, making
sure the corresponding idev is still alive when we add
a neighbour with its neigh_parms to the hash table?

Thanks.

===== include/net/neighbour.h 1.8 vs edited =====
--- 1.8/include/net/neighbour.h	2004-08-16 14:10:51 -07:00
+++ edited/include/net/neighbour.h	2004-09-01 21:57:37 -07:00
@@ -46,6 +46,7 @@
 #include <asm/atomic.h>
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
+#include <linux/rcupdate.h>
 
 #include <linux/err.h>
 #include <linux/sysctl.h>
@@ -65,6 +66,8 @@
 	void	*priv;
 
 	void	*sysctl_table;
+
+	struct rcu_head rcu_head;
 
 	int	base_reachable_time;
 	int	retrans_time;
===== net/core/neighbour.c 1.28 vs edited =====
--- 1.28/net/core/neighbour.c	2004-04-29 16:26:35 -07:00
+++ edited/net/core/neighbour.c	2004-09-01 22:00:59 -07:00
@@ -1120,6 +1120,7 @@
 	if (p) {
 		memcpy(p, &tbl->parms, sizeof(*p));
 		p->tbl		  = tbl;
+		INIT_RCU_HEAD(&p->rcu_head);
 		p->reachable_time =
 				neigh_rand_reach_time(p->base_reachable_time);
 		if (dev && dev->neigh_setup && dev->neigh_setup(dev, p)) {
@@ -1135,6 +1136,14 @@
 	return p;
 }
 
+static void neigh_rcu_free_parms(struct rcu_head *head)
+{
+	struct neigh_parms *parms =
+		container_of(head, struct neigh_parms, rcu_head);
+
+	kfree(parms);
+}
+
 void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
 {
 	struct neigh_parms **p;
@@ -1146,7 +1155,7 @@
 		if (*p == parms) {
 			*p = parms->next;
 			write_unlock_bh(&tbl->lock);
-			kfree(parms);
+			call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
 			return;
 		}
 	}
@@ -1159,6 +1168,7 @@
 {
 	unsigned long now = jiffies;
 
+	INIT_RCU_HEAD(&tbl->parms.rcu_head);
 	tbl->parms.reachable_time =
 			  neigh_rand_reach_time(tbl->parms.base_reachable_time);
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-09-02  5:21                                   ` David S. Miller
@ 2004-09-02 13:06                                     ` Herbert Xu
  2004-09-03 13:36                                       ` Herbert Xu
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-09-02 13:06 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

On Wed, Sep 01, 2004 at 10:21:18PM -0700, David S. Miller wrote:
> 
> So here's the first step.  No rcu_read_lock()'s are needed
> since the tbl->lock needs to be held as a write when
> traversing these things anyways for other reasons.

Thanks.

> Can you work on the next bit you mentioned, making
> sure the corresponding idev is still alive when we add
> a neighbour with its neigh_parms to the hash table?

Sure.  Actually I prefer to do it by ref counting neigh_parms directly.
I'll send you a patch soon.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-09-02 13:06                                     ` Herbert Xu
@ 2004-09-03 13:36                                       ` Herbert Xu
  2004-09-03 16:00                                         ` Stephen Hemminger
  2004-09-03 16:18                                         ` David S. Miller
  0 siblings, 2 replies; 46+ messages in thread
From: Herbert Xu @ 2004-09-03 13:36 UTC (permalink / raw)
  To: David S. Miller; +Cc: shemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 1956 bytes --]

On Thu, Sep 02, 2004 at 11:06:05PM +1000, herbert wrote:
> 
> > Can you work on the next bit you mentioned, making
> > sure the corresponding idev is still alive when we add
> > a neighbour with its neigh_parms to the hash table?
> 
> Sure.  Actually I prefer to do it by ref counting neigh_parms directly.
> I'll send you a patch soon.

Here is the patch.

I've added a refcnt on neigh_parms as well as a dead flag.  The latter
is checked under the tbl_lock before adding a neigh entry to the hash
table.

The non-trivial bit of the patch is the first chunk of net/core/neighbour.c.
I removed that line because not doing so would mean that I have to drop
the reference to the parms right there.  That would've lead to race
conditions since many places dereference neigh->parms without holding
locks.  It's also unnecessary to reset n->parms since we're no longer
in a hurry to see it go due to the new ref counting.

You'll also notice that I've put all dereferences of dev->*_ptr under
the rcu_read_lock().  Without this we may get a neigh_parms that's
already been released.

Incidentally a lot of these places were racy even before the RCU change.
For example, in the IPv6 case neigh->parms may be set to a value that's
just been released.

Finally in order to make sure that all stale entries are purged as
quickly as possible I've added neigh_ifdown/arp_ifdown calls after
every neigh_parms_release call.  In many cases we now have multiple
calls to neigh_ifdown in the shutdown path.  I didn't remove the
earlier calls because there may be hidden dependencies for them to
be there.  Once the respective maintainers have looked at them we
can probably remove most of them.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 10681 bytes --]

===== drivers/s390/net/qeth_main.c 1.13 vs edited =====
--- 1.13/drivers/s390/net/qeth_main.c	2004-08-27 17:02:36 +10:00
+++ edited/drivers/s390/net/qeth_main.c	2004-09-03 23:16:30 +10:00
@@ -6710,19 +6710,28 @@
 qeth_arp_constructor(struct neighbour *neigh)
 {
 	struct net_device *dev = neigh->dev;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev;
+	struct neigh_parms *parms;
 
-	if (in_dev == NULL)
-		return -EINVAL;
 	if (!qeth_verify_dev(dev)) {
-		in_dev_put(in_dev);
 		return qeth_old_arp_constructor(neigh);
 	}
 
+	rcu_read_lock();
+	in_dev = __in_dev_get(dev);
+	if (in_dev == NULL) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	parms = in_dev->arp_parms;
+	if (parms) {
+		__neigh_parms_put(neigh->parms);
+		neigh->parms = neigh_parms_clone(parms);
+	}
+	rcu_read_unlock();
+
 	neigh->type = inet_addr_type(*(u32 *) neigh->primary_key);
-	if (in_dev->arp_parms)
-		neigh->parms = in_dev->arp_parms;
-	in_dev_put(in_dev);
 	neigh->nud_state = NUD_NOARP;
 	neigh->ops = arp_direct_ops;
 	neigh->output = neigh->ops->queue_xmit;
===== include/net/neighbour.h 1.9 vs edited =====
--- 1.9/include/net/neighbour.h	2004-09-02 15:03:13 +10:00
+++ edited/include/net/neighbour.h	2004-09-03 23:15:51 +10:00
@@ -67,6 +67,8 @@
 
 	void	*sysctl_table;
 
+	int dead;
+	atomic_t refcnt;
 	struct rcu_head rcu_head;
 
 	int	base_reachable_time;
@@ -199,6 +201,7 @@
 
 extern struct neigh_parms	*neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl);
 extern void			neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms);
+extern void			neigh_parms_destroy(struct neigh_parms *parms);
 extern unsigned long		neigh_rand_reach_time(unsigned long base);
 
 extern void			pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
@@ -219,6 +222,23 @@
 						      char *p_name,
 						      proc_handler *proc_handler);
 extern void			neigh_sysctl_unregister(struct neigh_parms *p);
+
+static inline void __neigh_parms_put(struct neigh_parms *parms)
+{
+	atomic_dec(&parms->refcnt);
+}
+
+static inline void neigh_parms_put(struct neigh_parms *parms)
+{
+	if (atomic_dec_and_test(&parms->refcnt))
+		neigh_parms_destroy(parms);
+}
+
+static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms)
+{
+	atomic_inc(&parms->refcnt);
+	return parms;
+}
 
 /*
  *	Neighbour references
===== net/atm/clip.c 1.36 vs edited =====
--- 1.36/net/atm/clip.c	2004-08-16 12:05:46 +10:00
+++ edited/net/atm/clip.c	2004-09-03 23:16:55 +10:00
@@ -26,6 +26,7 @@
 #include <linux/bitops.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/rcupdate.h>
 #include <net/route.h> /* for struct rtable and routing */
 #include <net/icmp.h> /* icmp_send */
 #include <asm/param.h> /* for HZ */
@@ -311,13 +312,27 @@
 {
 	struct atmarp_entry *entry = NEIGH2ENTRY(neigh);
 	struct net_device *dev = neigh->dev;
-	struct in_device *in_dev = dev->ip_ptr;
+	struct in_device *in_dev;
+	struct neigh_parms *parms;
 
 	DPRINTK("clip_constructor (neigh %p, entry %p)\n",neigh,entry);
-	if (!in_dev) return -EINVAL;
 	neigh->type = inet_addr_type(entry->ip);
 	if (neigh->type != RTN_UNICAST) return -EINVAL;
-	if (in_dev->arp_parms) neigh->parms = in_dev->arp_parms;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get(dev);
+	if (!in_dev) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	parms = in_dev->arp_parms;
+	if (parms) {
+		__neigh_parms_put(neigh->parms);
+		neigh->parms = neigh_parms_clone(parms);
+	}
+	rcu_read_unlock();
+
 	neigh->ops = &clip_neigh_ops;
 	neigh->output = neigh->nud_state & NUD_VALID ?
 	    neigh->ops->connected_output : neigh->ops->output;
===== net/core/neighbour.c 1.29 vs edited =====
--- 1.29/net/core/neighbour.c	2004-09-02 15:03:13 +10:00
+++ edited/net/core/neighbour.c	2004-09-03 23:17:17 +10:00
@@ -227,7 +227,6 @@
 				   we must kill timers etc. and move
 				   it to safe state.
 				 */
-				n->parms = &tbl->parms;
 				skb_queue_purge(&n->arp_queue);
 				n->output = neigh_blackhole;
 				if (n->nud_state & NUD_VALID)
@@ -273,7 +272,7 @@
 	n->updated	  = n->used = now;
 	n->nud_state	  = NUD_NONE;
 	n->output	  = neigh_blackhole;
-	n->parms	  = &tbl->parms;
+	n->parms	  = neigh_parms_clone(&tbl->parms);
 	init_timer(&n->timer);
 	n->timer.function = neigh_timer_handler;
 	n->timer.data	  = (unsigned long)n;
@@ -340,12 +339,16 @@
 	hash_val = tbl->hash(pkey, dev);
 
 	write_lock_bh(&tbl->lock);
+	if (n->parms->dead) {
+		rc = ERR_PTR(-EINVAL);
+		goto out_tbl_unlock;
+	}
+
 	for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
 		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
 			neigh_hold(n1);
-			write_unlock_bh(&tbl->lock);
 			rc = n1;
-			goto out_neigh_release;
+			goto out_tbl_unlock;
 		}
 	}
 
@@ -358,6 +361,8 @@
 	rc = n;
 out:
 	return rc;
+out_tbl_unlock:
+	write_unlock_bh(&tbl->lock);
 out_neigh_release:
 	neigh_release(n);
 	goto out;
@@ -494,6 +499,7 @@
 	skb_queue_purge(&neigh->arp_queue);
 
 	dev_put(neigh->dev);
+	neigh_parms_put(neigh->parms);
 
 	NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
 
@@ -1120,6 +1126,7 @@
 	if (p) {
 		memcpy(p, &tbl->parms, sizeof(*p));
 		p->tbl		  = tbl;
+		atomic_set(&p->refcnt, 1);
 		INIT_RCU_HEAD(&p->rcu_head);
 		p->reachable_time =
 				neigh_rand_reach_time(p->base_reachable_time);
@@ -1141,7 +1148,7 @@
 	struct neigh_parms *parms =
 		container_of(head, struct neigh_parms, rcu_head);
 
-	kfree(parms);
+	neigh_parms_put(parms);
 }
 
 void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
@@ -1154,6 +1161,7 @@
 	for (p = &tbl->parms.next; *p; p = &(*p)->next) {
 		if (*p == parms) {
 			*p = parms->next;
+			parms->dead = 1;
 			write_unlock_bh(&tbl->lock);
 			call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
 			return;
@@ -1163,11 +1171,17 @@
 	NEIGH_PRINTK1("neigh_parms_release: not found\n");
 }
 
+void neigh_parms_destroy(struct neigh_parms *parms)
+{
+	kfree(parms);
+}
+
 
 void neigh_table_init(struct neigh_table *tbl)
 {
 	unsigned long now = jiffies;
 
+	atomic_set(&tbl->parms.refcnt, 1);
 	INIT_RCU_HEAD(&tbl->parms.rcu_head);
 	tbl->parms.reachable_time =
 			  neigh_rand_reach_time(tbl->parms.base_reachable_time);
===== net/decnet/dn_dev.c 1.24 vs edited =====
--- 1.24/net/decnet/dn_dev.c	2004-08-19 07:36:05 +10:00
+++ edited/net/decnet/dn_dev.c	2004-09-03 22:19:04 +10:00
@@ -1215,6 +1215,7 @@
 	dev->dn_ptr = NULL;
 
 	neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms);
+	neigh_ifdown(&dn_neigh_table, dev);
 
 	if (dn_db->router)
 		neigh_release(dn_db->router);
===== net/decnet/dn_neigh.c 1.10 vs edited =====
--- 1.10/net/decnet/dn_neigh.c	2004-01-26 16:13:52 +11:00
+++ edited/net/decnet/dn_neigh.c	2004-09-03 23:17:26 +10:00
@@ -35,6 +35,7 @@
 #include <linux/netfilter_decnet.h>
 #include <linux/spinlock.h>
 #include <linux/seq_file.h>
+#include <linux/rcupdate.h>
 #include <asm/atomic.h>
 #include <net/neighbour.h>
 #include <net/dst.h>
@@ -134,13 +135,22 @@
 {
 	struct net_device *dev = neigh->dev;
 	struct dn_neigh *dn = (struct dn_neigh *)neigh;
-	struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr;
+	struct dn_dev *dn_db;
+	struct neigh_parms *parms;
 
-	if (dn_db == NULL)
+	rcu_read_lock();
+	dn_db = dev->dn_ptr;
+	if (dn_db == NULL) {
+		rcu_read_unlock();
 		return -EINVAL;
+	}
 
-	if (dn_db->neigh_parms)
-		neigh->parms = dn_db->neigh_parms;
+	parms = dn_db->neigh_parms;
+	if (parms) {
+		__neigh_parms_put(neigh->parms);
+		neigh->parms = neigh_parms_clone(parms);
+	}
+	rcu_read_unlock();
 
 	if (dn_db->use_long)
 		neigh->ops = &dn_long_ops;
===== net/ipv4/arp.c 1.43 vs edited =====
--- 1.43/net/ipv4/arp.c	2004-09-02 11:12:37 +10:00
+++ edited/net/ipv4/arp.c	2004-09-03 23:17:42 +10:00
@@ -96,6 +96,7 @@
 #include <linux/stat.h>
 #include <linux/init.h>
 #include <linux/net.h>
+#include <linux/rcupdate.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -237,16 +238,24 @@
 {
 	u32 addr = *(u32*)neigh->primary_key;
 	struct net_device *dev = neigh->dev;
-	struct in_device *in_dev = in_dev_get(dev);
-
-	if (in_dev == NULL)
-		return -EINVAL;
+	struct in_device *in_dev;
+	struct neigh_parms *parms;
 
 	neigh->type = inet_addr_type(addr);
-	if (in_dev->arp_parms)
-		neigh->parms = in_dev->arp_parms;
 
-	in_dev_put(in_dev);
+	rcu_read_lock();
+	in_dev = __in_dev_get(dev);
+	if (in_dev == NULL) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	parms = in_dev->arp_parms;
+	if (parms) {
+		__neigh_parms_put(neigh->parms);
+		neigh->parms = neigh_parms_clone(parms);
+	}
+	rcu_read_unlock();
 
 	if (dev->hard_header == NULL) {
 		neigh->nud_state = NUD_NOARP;
===== net/ipv4/devinet.c 1.36 vs edited =====
--- 1.36/net/ipv4/devinet.c	2004-08-16 16:11:59 +10:00
+++ edited/net/ipv4/devinet.c	2004-09-03 22:19:04 +10:00
@@ -184,6 +184,7 @@
 static void inetdev_destroy(struct in_device *in_dev)
 {
 	struct in_ifaddr *ifa;
+	struct net_device *dev;
 
 	ASSERT_RTNL();
 
@@ -200,12 +201,15 @@
 	devinet_sysctl_unregister(&in_dev->cnf);
 #endif
 
-	in_dev->dev->ip_ptr = NULL;
+	dev = in_dev->dev;
+	dev->ip_ptr = NULL;
 
 #ifdef CONFIG_SYSCTL
 	neigh_sysctl_unregister(in_dev->arp_parms);
 #endif
 	neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+	arp_ifdown(dev);
+
 	call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
 }
 
===== net/ipv6/addrconf.c 1.106 vs edited =====
--- 1.106/net/ipv6/addrconf.c	2004-08-17 12:25:06 +10:00
+++ edited/net/ipv6/addrconf.c	2004-09-03 23:04:03 +10:00
@@ -2072,6 +2072,7 @@
 		neigh_sysctl_unregister(idev->nd_parms);
 #endif
 		neigh_parms_release(&nd_tbl, idev->nd_parms);
+		neigh_ifdown(&nd_tbl, dev);
 		in6_dev_put(idev);
 	}
 	return 0;
===== net/ipv6/ndisc.c 1.87 vs edited =====
--- 1.87/net/ipv6/ndisc.c	2004-08-08 16:43:41 +10:00
+++ edited/net/ipv6/ndisc.c	2004-09-03 23:17:56 +10:00
@@ -58,6 +58,7 @@
 #include <linux/in6.h>
 #include <linux/route.h>
 #include <linux/init.h>
+#include <linux/rcupdate.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -284,14 +285,23 @@
 {
 	struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key;
 	struct net_device *dev = neigh->dev;
-	struct inet6_dev *in6_dev = in6_dev_get(dev);
+	struct inet6_dev *in6_dev;
+	struct neigh_parms *parms;
 	int is_multicast = ipv6_addr_is_multicast(addr);
 
-	if (in6_dev == NULL)
+	rcu_read_lock();
+	in6_dev = in6_dev_get(dev);
+	if (in6_dev == NULL) {
+		rcu_read_unlock();
 		return -EINVAL;
+	}
 
-	if (in6_dev->nd_parms)
-		neigh->parms = in6_dev->nd_parms;
+	parms = in6_dev->nd_parms;
+	if (parms) {
+		__neigh_parms_put(neigh->parms);
+		neigh->parms = neigh_parms_clone(parms);
+	}
+	rcu_read_unlock();
 
 	neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST;
 	if (dev->hard_header == NULL) {

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-09-03 13:36                                       ` Herbert Xu
@ 2004-09-03 16:00                                         ` Stephen Hemminger
  2004-09-03 23:49                                           ` Herbert Xu
  2004-09-03 16:18                                         ` David S. Miller
  1 sibling, 1 reply; 46+ messages in thread
From: Stephen Hemminger @ 2004-09-03 16:00 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, netdev

On Fri, 3 Sep 2004 23:36:23 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Thu, Sep 02, 2004 at 11:06:05PM +1000, herbert wrote:
> > 
> > > Can you work on the next bit you mentioned, making
> > > sure the corresponding idev is still alive when we add
> > > a neighbour with its neigh_parms to the hash table?
> > 
> > Sure.  Actually I prefer to do it by ref counting neigh_parms directly.
> > I'll send you a patch soon.
> 
> Here is the patch.
> 
> I've added a refcnt on neigh_parms as well as a dead flag.  The latter
> is checked under the tbl_lock before adding a neigh entry to the hash
> table.
> 
> The non-trivial bit of the patch is the first chunk of net/core/neighbour.c.
> I removed that line because not doing so would mean that I have to drop
> the reference to the parms right there.  That would've lead to race
> conditions since many places dereference neigh->parms without holding
> locks.  It's also unnecessary to reset n->parms since we're no longer
> in a hurry to see it go due to the new ref counting.
> 
> You'll also notice that I've put all dereferences of dev->*_ptr under
> the rcu_read_lock().  Without this we may get a neigh_parms that's
> already been released.

I haven't looked at the exact code in detail, but don't you need
use rcu_dereference() as well to make sure and get the smp_read_barrier_depends
on Alpha.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-09-03 13:36                                       ` Herbert Xu
  2004-09-03 16:00                                         ` Stephen Hemminger
@ 2004-09-03 16:18                                         ` David S. Miller
  1 sibling, 0 replies; 46+ messages in thread
From: David S. Miller @ 2004-09-03 16:18 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev

On Fri, 3 Sep 2004 23:36:23 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Here is the patch.

Looks great.  Yes, I see how the existing cases were racey
pre-RCU, it is similar to the sysctl stuff and that area was
truly horrible before Stephen and myself redid how generic
device destruction works.

Patch applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-09-03 16:00                                         ` Stephen Hemminger
@ 2004-09-03 23:49                                           ` Herbert Xu
  2004-09-07 20:50                                             ` David S. Miller
  0 siblings, 1 reply; 46+ messages in thread
From: Herbert Xu @ 2004-09-03 23:49 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 2389 bytes --]

On Fri, Sep 03, 2004 at 09:00:53AM -0700, Stephen Hemminger wrote:
>
> > You'll also notice that I've put all dereferences of dev->*_ptr under
> > the rcu_read_lock().  Without this we may get a neigh_parms that's
> > already been released.
> 
> I haven't looked at the exact code in detail, but don't you need
> use rcu_dereference() as well to make sure and get the smp_read_barrier_depends
> on Alpha.

Not really because we're not depending on *dev->neigh_parms to be set to
NULLon shutdown.  In fact *dev->neigh_parms never gets set to NULL at all.
If it did we'd have trouble cleaning up dead entries from the hash table.

So there is no data-dependent read here whose order must be preserved
when *dev is destroyed.

But hang on a second, I had forgotten about the creation path.  Indeed
that is buggy without a barrier for every path except IPv6.  Without
the barrier, we may be reading NULL pointers from parms which may
result in stale neigh entries lingering around.  Or worse we may read
complete garbage that was there before the memset on *dev was done.
Fortunately the last bit probably can only be triggered if you're
stepping through gdb :)

So here is a patch to make sure that there is a barrier between the
reading of dev->*_ptr and *dev->neigh_parms.

With these barriers in place, it's clear that *dev->neigh_parms can no
longer be NULL since once the parms are allocated, that pointer is never
reset to NULL again.  Therefore I've also removed the parms check in
these paths.

They were bogus to begin with since if they ever triggered then we'll
have dead neigh entries stuck in the hash table.

Unfortunately I couldn't arrange for this to happen with DECnet due
to the dn_db->parms.up() call that's sandwiched between the assignment
of dev->dn_ptr and dn_db->neigh_parms.  So I've kept the parms check
there but it will now fail instead of continuing.  I've also added an
smp_wmb() there so that at least we won't be reading garbage from
dn_db->neigh_parms.

DECnet is also buggy since there is no locking at all in the destruction
path.  It either needs locking or RCU like IPv4.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Thanks a lot,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 3674 bytes --]

===== drivers/s390/net/qeth_main.c 1.14 vs edited =====
--- 1.14/drivers/s390/net/qeth_main.c	2004-09-04 08:22:06 +10:00
+++ edited/drivers/s390/net/qeth_main.c	2004-09-04 09:17:17 +10:00
@@ -6718,17 +6718,15 @@
 	}
 
 	rcu_read_lock();
-	in_dev = __in_dev_get(dev);
+	in_dev = rcu_dereference(__in_dev_get(dev));
 	if (in_dev == NULL) {
 		rcu_read_unlock();
 		return -EINVAL;
 	}
 
 	parms = in_dev->arp_parms;
-	if (parms) {
-		__neigh_parms_put(neigh->parms);
-		neigh->parms = neigh_parms_clone(parms);
-	}
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
 	rcu_read_unlock();
 
 	neigh->type = inet_addr_type(*(u32 *) neigh->primary_key);
===== net/atm/clip.c 1.37 vs edited =====
--- 1.37/net/atm/clip.c	2004-09-04 08:22:07 +10:00
+++ edited/net/atm/clip.c	2004-09-04 09:18:22 +10:00
@@ -320,17 +320,15 @@
 	if (neigh->type != RTN_UNICAST) return -EINVAL;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get(dev);
+	in_dev = rcu_dereference(__in_dev_get(dev));
 	if (!in_dev) {
 		rcu_read_unlock();
 		return -EINVAL;
 	}
 
 	parms = in_dev->arp_parms;
-	if (parms) {
-		__neigh_parms_put(neigh->parms);
-		neigh->parms = neigh_parms_clone(parms);
-	}
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
 	rcu_read_unlock();
 
 	neigh->ops = &clip_neigh_ops;
===== net/decnet/dn_dev.c 1.25 vs edited =====
--- 1.25/net/decnet/dn_dev.c	2004-09-04 08:22:07 +10:00
+++ edited/net/decnet/dn_dev.c	2004-09-04 09:36:51 +10:00
@@ -41,6 +41,7 @@
 #include <linux/sysctl.h>
 #include <linux/notifier.h>
 #include <asm/uaccess.h>
+#include <asm/system.h>
 #include <net/neighbour.h>
 #include <net/dst.h>
 #include <net/flow.h>
@@ -1108,6 +1109,7 @@
 
 	memset(dn_db, 0, sizeof(struct dn_dev));
 	memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms));
+	smp_wmb();
 	dev->dn_ptr = dn_db;
 	dn_db->dev = dev;
 	init_timer(&dn_db->timer);
===== net/decnet/dn_neigh.c 1.11 vs edited =====
--- 1.11/net/decnet/dn_neigh.c	2004-09-04 08:22:07 +10:00
+++ edited/net/decnet/dn_neigh.c	2004-09-04 09:33:12 +10:00
@@ -139,17 +139,20 @@
 	struct neigh_parms *parms;
 
 	rcu_read_lock();
-	dn_db = dev->dn_ptr;
+	dn_db = rcu_dereference(dev->dn_ptr);
 	if (dn_db == NULL) {
 		rcu_read_unlock();
 		return -EINVAL;
 	}
 
 	parms = dn_db->neigh_parms;
-	if (parms) {
-		__neigh_parms_put(neigh->parms);
-		neigh->parms = neigh_parms_clone(parms);
+	if (!parms) {
+		rcu_read_unlock();
+		return -EINVAL;
 	}
+
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
 	rcu_read_unlock();
 
 	if (dn_db->use_long)
===== net/ipv4/arp.c 1.45 vs edited =====
--- 1.45/net/ipv4/arp.c	2004-09-04 08:22:07 +10:00
+++ edited/net/ipv4/arp.c	2004-09-04 09:17:46 +10:00
@@ -244,17 +244,15 @@
 	neigh->type = inet_addr_type(addr);
 
 	rcu_read_lock();
-	in_dev = __in_dev_get(dev);
+	in_dev = rcu_dereference(__in_dev_get(dev));
 	if (in_dev == NULL) {
 		rcu_read_unlock();
 		return -EINVAL;
 	}
 
 	parms = in_dev->arp_parms;
-	if (parms) {
-		__neigh_parms_put(neigh->parms);
-		neigh->parms = neigh_parms_clone(parms);
-	}
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
 	rcu_read_unlock();
 
 	if (dev->hard_header == NULL) {
===== net/ipv6/ndisc.c 1.88 vs edited =====
--- 1.88/net/ipv6/ndisc.c	2004-09-04 08:22:07 +10:00
+++ edited/net/ipv6/ndisc.c	2004-09-04 09:28:12 +10:00
@@ -297,10 +297,8 @@
 	}
 
 	parms = in6_dev->nd_parms;
-	if (parms) {
-		__neigh_parms_put(neigh->parms);
-		neigh->parms = neigh_parms_clone(parms);
-	}
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
 	rcu_read_unlock();
 
 	neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST;

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: neigh_create/inetdev_destroy race?
  2004-09-03 23:49                                           ` Herbert Xu
@ 2004-09-07 20:50                                             ` David S. Miller
  0 siblings, 0 replies; 46+ messages in thread
From: David S. Miller @ 2004-09-07 20:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: shemminger, netdev


Looks great Herbert, applied.

Thanks.

^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2004-09-07 20:50 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-08-12 23:59 [PATCH] Move inetdev/ifa over to RCU David S. Miller
2004-08-13  2:20 ` James Morris
2004-08-13 10:02 ` Herbert Xu
2004-08-13 16:03 ` Stephen Hemminger
2004-08-13 16:38   ` David S. Miller
2004-08-13 21:56     ` Herbert Xu
2004-08-13 22:19       ` David S. Miller
2004-08-14  0:34         ` Herbert Xu
2004-08-14  0:39           ` David S. Miller
2004-08-14  0:54             ` Herbert Xu
2004-08-14  1:25               ` Herbert Xu
2004-08-14  1:30                 ` Herbert Xu
2004-08-14  5:08                   ` Herbert Xu
2004-08-14  6:27                     ` neigh_create/inetdev_destroy race? Herbert Xu
2004-08-16  2:14                       ` David S. Miller
2004-08-16 10:51                         ` Herbert Xu
2004-08-29  6:42                           ` David S. Miller
2004-08-29  6:50                             ` Herbert Xu
2004-08-31  6:08                               ` David S. Miller
2004-08-31 10:41                                 ` Herbert Xu
2004-09-02  5:21                                   ` David S. Miller
2004-09-02 13:06                                     ` Herbert Xu
2004-09-03 13:36                                       ` Herbert Xu
2004-09-03 16:00                                         ` Stephen Hemminger
2004-09-03 23:49                                           ` Herbert Xu
2004-09-07 20:50                                             ` David S. Miller
2004-09-03 16:18                                         ` David S. Miller
2004-08-16  2:08                     ` [PATCH] Move inetdev/ifa over to RCU David S. Miller
2004-08-16  2:43                       ` Herbert Xu
2004-08-16  3:08                         ` David S. Miller
2004-08-16  3:14                           ` Herbert Xu
2004-08-16  6:23                             ` David S. Miller
2004-08-14  6:31                   ` Herbert Xu
2004-08-14  6:32                     ` Herbert Xu
2004-08-16  3:01                   ` David S. Miller
2004-08-14  1:40                 ` Herbert Xu
2004-08-16  3:03                   ` David S. Miller
2004-08-16  3:23                     ` Herbert Xu
2004-08-16  6:24                       ` David S. Miller
2004-08-14  4:30                 ` Stephen Hemminger
2004-08-14  4:36                   ` Herbert Xu
2004-08-16  2:59               ` David S. Miller
2004-08-16  2:58           ` David S. Miller
2004-08-16  3:08             ` Herbert Xu
2004-08-16  6:21               ` David S. Miller
2004-08-16  8:13                 ` Herbert Xu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.