All of lore.kernel.org
 help / color / mirror / Atom feed
* [Patch net-next] vxlan: do real refcnt for vn_sock
@ 2013-05-28 11:07 Cong Wang
  2013-05-28 15:22 ` Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-28 11:07 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, David S. Miller, Cong Wang

From: Cong Wang <amwang@redhat.com>

In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
we use kfree_rcu() to free ->vn_sock, but a) there is no use
of RCU API to access this filed, b) RCU is not enough to do refcnt
here, because in vxlan_leave_group() we drop RTNL lock before
locking the socket, it could be possible that this field is
freed during this period.

So, instead making things complex, just do basic refcnt for
the ->vn_sock, like we do for others.

Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>

---
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 5ed64d4..73a674e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -83,9 +83,8 @@ static unsigned int vxlan_net_id;
 /* per UDP socket information */
 struct vxlan_sock {
 	struct hlist_node hlist;
-	struct rcu_head	  rcu;
 	struct work_struct del_work;
-	unsigned int	  refcnt;
+	atomic_t	  refcnt;
 	struct socket	  *sock;
 	struct hlist_head vni_list[VNI_HASH_SIZE];
 };
@@ -164,14 +163,30 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port)
 	return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
 }
 
+static inline void vxlan_sock_get(struct vxlan_sock *vs)
+{
+	atomic_inc(&vs->refcnt);
+}
+
+static inline void vxlan_sock_put(struct vxlan_sock *vs)
+{
+	if (atomic_dec_and_test(&vs->refcnt)) {
+		hlist_del_rcu(&vs->hlist);
+		schedule_work(&vs->del_work);
+	}
+}
+
+
 /* Find VXLAN socket based on network namespace and UDP port */
 static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port)
 {
 	struct vxlan_sock *vs;
 
 	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
+		vxlan_sock_get(vs);
 		if (inet_sk(vs->sock->sk)->inet_sport == port)
 			return vs;
+		vxlan_sock_put(vs);
 	}
 	return NULL;
 }
@@ -187,10 +202,13 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
 		return NULL;
 
 	hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
-		if (vxlan->default_dst.remote_vni == id)
+		if (vxlan->default_dst.remote_vni == id) {
+			vxlan_sock_put(vs);
 			return vxlan;
+		}
 	}
 
+	vxlan_sock_put(vs);
 	return NULL;
 }
 
@@ -926,7 +944,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 	return false;
 }
 
-static void vxlan_sock_put(struct sk_buff *skb)
+static void vxlan_sock_destructor(struct sk_buff *skb)
 {
 	sock_put(skb->sk);
 }
@@ -940,7 +958,7 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
 	skb_orphan(skb);
 	sock_hold(sk);
 	skb->sk = sk;
-	skb->destructor = vxlan_sock_put;
+	skb->destructor = vxlan_sock_destructor;
 }
 
 /* Compute source port for outgoing packet
@@ -1255,10 +1273,14 @@ static int vxlan_open(struct net_device *dev)
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	int err;
 
+	vxlan_sock_get(vxlan->vn_sock);
+
 	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
 		err = vxlan_join_group(dev);
-		if (err)
+		if (err) {
+			vxlan_sock_put(vxlan->vn_sock);
 			return err;
+		}
 	}
 
 	if (vxlan->age_interval)
@@ -1294,6 +1316,8 @@ static int vxlan_stop(struct net_device *dev)
 
 	del_timer_sync(&vxlan->age_timer);
 
+	vxlan_sock_put(vxlan->vn_sock);
+
 	vxlan_flush(vxlan);
 
 	return 0;
@@ -1447,7 +1471,7 @@ static void vxlan_del_work(struct work_struct *work)
 	struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
 
 	sk_release_kernel(vs->sock->sk);
-	kfree_rcu(vs, rcu);
+	kfree(vs);
 }
 
 /* Create new listen socket if needed */
@@ -1503,7 +1527,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
 	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
 	udp_encap_enable();
 
-	vs->refcnt = 1;
+	atomic_set(&vs->refcnt, 1);
 	return vs;
 }
 
@@ -1592,9 +1616,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 	}
 
 	vs = vxlan_find_port(net, vxlan->dst_port);
-	if (vs)
-		++vs->refcnt;
-	else {
+	if (!vs) {
 		/* Drop lock because socket create acquires RTNL lock */
 		rtnl_unlock();
 		vs = vxlan_socket_create(net, vxlan->dst_port);
@@ -1610,12 +1632,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 
 	err = register_netdevice(dev);
 	if (err) {
-		if (--vs->refcnt == 0) {
-			rtnl_unlock();
-			sk_release_kernel(vs->sock->sk);
-			kfree(vs);
-			rtnl_lock();
-		}
+		vxlan_sock_put(vs);
 		return err;
 	}
 
@@ -1634,10 +1651,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head)
 	list_del(&vxlan->next);
 	unregister_netdevice_queue(dev, head);
 
-	if (--vs->refcnt == 0) {
-		hlist_del_rcu(&vs->hlist);
-		schedule_work(&vs->del_work);
-	}
+	vxlan_sock_put(vs);
 }
 
 static size_t vxlan_get_size(const struct net_device *dev)

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang
@ 2013-05-28 15:22 ` Stephen Hemminger
  2013-05-29  2:08   ` Cong Wang
  2013-05-29  4:01 ` Cong Wang
  2013-05-29  4:41 ` Stephen Hemminger
  2 siblings, 1 reply; 12+ messages in thread
From: Stephen Hemminger @ 2013-05-28 15:22 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, David S. Miller

On Tue, 28 May 2013 19:07:22 +0800
Cong Wang <amwang@redhat.com> wrote:

> From: Cong Wang <amwang@redhat.com>
> 
> In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> we use kfree_rcu() to free ->vn_sock, but a) there is no use
> of RCU API to access this filed, b) RCU is not enough to do refcnt
> here, because in vxlan_leave_group() we drop RTNL lock before
> locking the socket, it could be possible that this field is
> freed during this period.
> 
> So, instead making things complex, just do basic refcnt for
> the ->vn_sock, like we do for others.
> 
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Cc: David S. Miller <davem@davemloft.net>
> Signed-off-by: Cong Wang <amwang@redhat.com>
> 
> ---
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 5ed64d4..73a674e 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -83,9 +83,8 @@ static unsigned int vxlan_net_id;
>  /* per UDP socket information */
>  struct vxlan_sock {
>  	struct hlist_node hlist;
> -	struct rcu_head	  rcu;
>  	struct work_struct del_work;
> -	unsigned int	  refcnt;
> +	atomic_t	  refcnt;
>  	struct socket	  *sock;
>  	struct hlist_head vni_list[VNI_HASH_SIZE];
>  };
> @@ -164,14 +163,30 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port)
>  	return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
>  }
>  
> +static inline void vxlan_sock_get(struct vxlan_sock *vs)
> +{
> +	atomic_inc(&vs->refcnt);
> +}
> +
> +static inline void vxlan_sock_put(struct vxlan_sock *vs)
> +{
> +	if (atomic_dec_and_test(&vs->refcnt)) {
> +		hlist_del_rcu(&vs->hlist);
> +		schedule_work(&vs->del_work);
> +	}
> +}
> +
> +
>  /* Find VXLAN socket based on network namespace and UDP port */
>  static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port)
>  {
>  	struct vxlan_sock *vs;
>  
>  	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
> +		vxlan_sock_get(vs);
>  		if (inet_sk(vs->sock->sk)->inet_sport == port)
>  			return vs;
> +		vxlan_sock_put(vs);
>  	}
>  	return NULL;
>  }
> @@ -187,10 +202,13 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
>  		return NULL;
>  
>  	hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
> -		if (vxlan->default_dst.remote_vni == id)
> +		if (vxlan->default_dst.remote_vni == id) {
> +			vxlan_sock_put(vs);
>  			return vxlan;
> +		}
>  	}
>  
> +	vxlan_sock_put(vs);
>  	return NULL;
>  }
>  
> @@ -926,7 +944,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
>  	return false;
>  }
>  
> -static void vxlan_sock_put(struct sk_buff *skb)
> +static void vxlan_sock_destructor(struct sk_buff *skb)
>  {
>  	sock_put(skb->sk);
>  }
> @@ -940,7 +958,7 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
>  	skb_orphan(skb);
>  	sock_hold(sk);
>  	skb->sk = sk;
> -	skb->destructor = vxlan_sock_put;
> +	skb->destructor = vxlan_sock_destructor;
>  }
>  
>  /* Compute source port for outgoing packet
> @@ -1255,10 +1273,14 @@ static int vxlan_open(struct net_device *dev)
>  	struct vxlan_dev *vxlan = netdev_priv(dev);
>  	int err;
>  
> +	vxlan_sock_get(vxlan->vn_sock);
> +
>  	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
>  		err = vxlan_join_group(dev);
> -		if (err)
> +		if (err) {
> +			vxlan_sock_put(vxlan->vn_sock);
>  			return err;
> +		}
>  	}
>  
>  	if (vxlan->age_interval)
> @@ -1294,6 +1316,8 @@ static int vxlan_stop(struct net_device *dev)
>  
>  	del_timer_sync(&vxlan->age_timer);
>  
> +	vxlan_sock_put(vxlan->vn_sock);
> +
>  	vxlan_flush(vxlan);
>  
>  	return 0;
> @@ -1447,7 +1471,7 @@ static void vxlan_del_work(struct work_struct *work)
>  	struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
>  
>  	sk_release_kernel(vs->sock->sk);
> -	kfree_rcu(vs, rcu);
> +	kfree(vs);
>  }
>  
>  /* Create new listen socket if needed */
> @@ -1503,7 +1527,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
>  	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
>  	udp_encap_enable();
>  
> -	vs->refcnt = 1;
> +	atomic_set(&vs->refcnt, 1);
>  	return vs;
>  }
>  
> @@ -1592,9 +1616,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
>  	}
>  
>  	vs = vxlan_find_port(net, vxlan->dst_port);
> -	if (vs)
> -		++vs->refcnt;
> -	else {
> +	if (!vs) {
>  		/* Drop lock because socket create acquires RTNL lock */
>  		rtnl_unlock();
>  		vs = vxlan_socket_create(net, vxlan->dst_port);
> @@ -1610,12 +1632,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
>  
>  	err = register_netdevice(dev);
>  	if (err) {
> -		if (--vs->refcnt == 0) {
> -			rtnl_unlock();
> -			sk_release_kernel(vs->sock->sk);
> -			kfree(vs);
> -			rtnl_lock();
> -		}
> +		vxlan_sock_put(vs);
>  		return err;
>  	}
>  
> @@ -1634,10 +1651,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head)
>  	list_del(&vxlan->next);
>  	unregister_netdevice_queue(dev, head);
>  
> -	if (--vs->refcnt == 0) {
> -		hlist_del_rcu(&vs->hlist);
> -		schedule_work(&vs->del_work);
> -	}
> +	vxlan_sock_put(vs);
>  }
>  
>  static size_t vxlan_get_size(const struct net_device *dev)

Not needed all access is under RTNL

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-28 15:22 ` Stephen Hemminger
@ 2013-05-29  2:08   ` Cong Wang
  2013-05-29  4:22     ` Stephen Hemminger
  0 siblings, 1 reply; 12+ messages in thread
From: Cong Wang @ 2013-05-29  2:08 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David S. Miller

On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote:
> On Tue, 28 May 2013 19:07:22 +0800
> Cong Wang <amwang@redhat.com> wrote:
> 
> > From: Cong Wang <amwang@redhat.com>
> > 
> > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> > we use kfree_rcu() to free ->vn_sock, but a) there is no use
> > of RCU API to access this filed, b) RCU is not enough to do refcnt
> > here, because in vxlan_leave_group() we drop RTNL lock before
> > locking the socket, it could be possible that this field is
> > freed during this period.
> > 
> > So, instead making things complex, just do basic refcnt for
> > the ->vn_sock, like we do for others.
> > 
...
> 
> Not needed all access is under RTNL

I know, this is why I had a patch (not posted) which adds the missing
rtnl_dereference(), but even if we had these, it is still not correct.

As I explained in the changelog, vxlan_leave_group() has a problem,
because it releases rtnl lock before locking the socket, _and_ it is
called after vxlan_dellink() which schedules a work to cleanup the
struct. Therefore the ->vn_sock could be freed right after rtnl lock is
released.

Am I miss anything?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang
  2013-05-28 15:22 ` Stephen Hemminger
@ 2013-05-29  4:01 ` Cong Wang
  2013-05-29  4:41 ` Stephen Hemminger
  2 siblings, 0 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-29  4:01 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, David S. Miller

On Tue, 2013-05-28 at 19:07 +0800, Cong Wang wrote:
> From: Cong Wang <amwang@redhat.com>
> 
> In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> we use kfree_rcu() to free ->vn_sock, but a) there is no use
> of RCU API to access this filed, b) RCU is not enough to do refcnt
> here, because in vxlan_leave_group() we drop RTNL lock before
> locking the socket, it could be possible that this field is
> freed during this period.
> 
> So, instead making things complex, just do basic refcnt for
> the ->vn_sock, like we do for others.

BTW, this patch fixes a real crash when I test vxlan over IPv6, although
I can't reproduce it with IPv4.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-29  2:08   ` Cong Wang
@ 2013-05-29  4:22     ` Stephen Hemminger
  2013-05-29  4:34       ` Cong Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Stephen Hemminger @ 2013-05-29  4:22 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, David S. Miller

On Wed, 29 May 2013 10:08:53 +0800
Cong Wang <amwang@redhat.com> wrote:

> On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote:
> > On Tue, 28 May 2013 19:07:22 +0800
> > Cong Wang <amwang@redhat.com> wrote:
> > 
> > > From: Cong Wang <amwang@redhat.com>
> > > 
> > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> > > we use kfree_rcu() to free ->vn_sock, but a) there is no use
> > > of RCU API to access this filed, b) RCU is not enough to do refcnt
> > > here, because in vxlan_leave_group() we drop RTNL lock before
> > > locking the socket, it could be possible that this field is
> > > freed during this period.
> > > 
> > > So, instead making things complex, just do basic refcnt for
> > > the ->vn_sock, like we do for others.
> > > 
> ...
> > 
> > Not needed all access is under RTNL
> 
> I know, this is why I had a patch (not posted) which adds the missing
> rtnl_dereference(), but even if we had these, it is still not correct.
> 
> As I explained in the changelog, vxlan_leave_group() has a problem,
> because it releases rtnl lock before locking the socket, _and_ it is
> called after vxlan_dellink() which schedules a work to cleanup the
> struct. Therefore the ->vn_sock could be freed right after rtnl lock is
> released.
> 
> Am I miss anything?

Ignoring your IPv6 code for now...

With IPV4:
   refcnt is incremented when socket is incremented in newlink (RTNL held).
   refcnt is decremented in by dellink (RTNL held) and socket is deleted from list
   leave_group doesn't happen until work queue is fired.

rtnl_dereference is fine, but hardly necessary when the call hierarchy is so obvious.

The problem you describe won't be fixed by just converting it to atomic,
I think you need add a dev_hold()/dev_put to vxlan_stop to prevent
device from being deleted when rtnl_lock is dropped.




   
   

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-29  4:22     ` Stephen Hemminger
@ 2013-05-29  4:34       ` Cong Wang
  0 siblings, 0 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-29  4:34 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David S. Miller

On Tue, 2013-05-28 at 21:22 -0700, Stephen Hemminger wrote:
> On Wed, 29 May 2013 10:08:53 +0800
> Cong Wang <amwang@redhat.com> wrote:
> 
> > On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote:
> > > On Tue, 28 May 2013 19:07:22 +0800
> > > Cong Wang <amwang@redhat.com> wrote:
> > > 
> > > > From: Cong Wang <amwang@redhat.com>
> > > > 
> > > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> > > > we use kfree_rcu() to free ->vn_sock, but a) there is no use
> > > > of RCU API to access this filed, b) RCU is not enough to do refcnt
> > > > here, because in vxlan_leave_group() we drop RTNL lock before
> > > > locking the socket, it could be possible that this field is
> > > > freed during this period.
> > > > 
> > > > So, instead making things complex, just do basic refcnt for
> > > > the ->vn_sock, like we do for others.
> > > > 
> > ...
> > > 
> > > Not needed all access is under RTNL
> > 
> > I know, this is why I had a patch (not posted) which adds the missing
> > rtnl_dereference(), but even if we had these, it is still not correct.
> > 
> > As I explained in the changelog, vxlan_leave_group() has a problem,
> > because it releases rtnl lock before locking the socket, _and_ it is
> > called after vxlan_dellink() which schedules a work to cleanup the
> > struct. Therefore the ->vn_sock could be freed right after rtnl lock is
> > released.
> > 
> > Am I miss anything?
> 
> Ignoring your IPv6 code for now...
> 
> With IPV4:
>    refcnt is incremented when socket is incremented in newlink (RTNL held).
>    refcnt is decremented in by dellink (RTNL held) and socket is deleted from list
>    leave_group doesn't happen until work queue is fired.
> 
> rtnl_dereference is fine, but hardly necessary when the call hierarchy is so obvious.
> 
> The problem you describe won't be fixed by just converting it to atomic,

My patch is _not_ just converting it to atomic_t, but it takes a ref for
every usage of ->vn_sock, which current implementation misses.

> I think you need add a dev_hold()/dev_put to vxlan_stop to prevent
> device from being deleted when rtnl_lock is dropped.
> 

The crash is that lock_sock() got a NULL-def bug, which is not the
related with dev_hold() at all. I think it is due to the whole ->vn_sock
is freed before calling lock_sock(), thus vxlan->vn_sock->sock->sk
points to a freed memory area.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang
  2013-05-28 15:22 ` Stephen Hemminger
  2013-05-29  4:01 ` Cong Wang
@ 2013-05-29  4:41 ` Stephen Hemminger
  2013-05-29  5:14   ` Cong Wang
  2013-05-29  8:39   ` Cong Wang
  2 siblings, 2 replies; 12+ messages in thread
From: Stephen Hemminger @ 2013-05-29  4:41 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, David S. Miller

Why not just fix the requirement to drop rtnl when calling igmp.
The code comes out cleaner and safer as well.

The follow COMPILE TESTED ONLY is a starting point...

--- a/drivers/net/vxlan.c	2013-05-20 10:24:19.624427346 -0700
+++ b/drivers/net/vxlan.c	2013-05-28 21:39:57.502351889 -0700
@@ -657,20 +657,12 @@ static int vxlan_join_group(struct net_d
 		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
 		.imr_ifindex		= vxlan->default_dst.remote_ifindex,
 	};
-	int err;
 
 	/* Already a member of group */
 	if (vxlan_group_used(vn, vxlan))
 		return 0;
 
-	/* Need to drop RTNL to call multicast join */
-	rtnl_unlock();
-	lock_sock(sk);
-	err = ip_mc_join_group(sk, &mreq);
-	release_sock(sk);
-	rtnl_lock();
-
-	return err;
+	return _ip_mc_join_group(sk, &mreq);
 }
 
 
@@ -679,7 +671,6 @@ static int vxlan_leave_group(struct net_
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
-	int err = 0;
 	struct sock *sk = vxlan->vn_sock->sock->sk;
 	struct ip_mreqn mreq = {
 		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
@@ -690,14 +681,7 @@ static int vxlan_leave_group(struct net_
 	if (vxlan_group_used(vn, vxlan))
 		return 0;
 
-	/* Need to drop RTNL to call multicast leave */
-	rtnl_unlock();
-	lock_sock(sk);
-	err = ip_mc_leave_group(sk, &mreq);
-	release_sock(sk);
-	rtnl_lock();
-
-	return err;
+	return _ip_mc_leave_group(sk, &mreq);
 }
 
 /* Callback from net/ipv4/udp.c to receive packets */
@@ -1244,6 +1228,7 @@ static int vxlan_open(struct net_device
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	int err;
 
+	dev_hold(dev);
 	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
 		err = vxlan_join_group(dev);
 		if (err)
@@ -1252,6 +1237,7 @@ static int vxlan_open(struct net_device
 
 	if (vxlan->age_interval)
 		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+	dev_put(dev);
 
 	return 0;
 }
--- a/include/linux/igmp.h	2013-05-11 15:58:53.692325370 -0700
+++ b/include/linux/igmp.h	2013-05-28 21:37:40.560328674 -0700
@@ -109,7 +109,9 @@ struct ip_mc_list {
 
 extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u16 proto);
 extern int igmp_rcv(struct sk_buff *);
+extern int _ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr);
 extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr);
+extern int _ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr);
 extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr);
 extern void ip_mc_drop_socket(struct sock *sk);
 extern int ip_mc_source(int add, int omode, struct sock *sk,
--- a/net/ipv4/igmp.c	2013-05-11 15:58:53.992318551 -0700
+++ b/net/ipv4/igmp.c	2013-05-28 21:39:43.302554701 -0700
@@ -1781,48 +1781,36 @@ static void ip_mc_clear_src(struct ip_mc
 	pmc->sfcount[MCAST_EXCLUDE] = 1;
 }
 
-
-/*
- * Join a multicast group
- */
-int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
+int _ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
 {
-	int err;
 	__be32 addr = imr->imr_multiaddr.s_addr;
-	struct ip_mc_socklist *iml = NULL, *i;
+	struct ip_mc_socklist *iml, *i;
 	struct in_device *in_dev;
 	struct inet_sock *inet = inet_sk(sk);
 	struct net *net = sock_net(sk);
 	int ifindex;
 	int count = 0;
 
-	if (!ipv4_is_multicast(addr))
-		return -EINVAL;
-
-	rtnl_lock();
+	ASSERT_RTNL();
 
 	in_dev = ip_mc_find_dev(net, imr);
+	if (!in_dev)
+		return -ENODEV;
 
-	if (!in_dev) {
-		iml = NULL;
-		err = -ENODEV;
-		goto done;
-	}
-
-	err = -EADDRINUSE;
 	ifindex = imr->imr_ifindex;
 	for_each_pmc_rtnl(inet, i) {
 		if (i->multi.imr_multiaddr.s_addr == addr &&
 		    i->multi.imr_ifindex == ifindex)
-			goto done;
+			return -EADDRINUSE;
 		count++;
 	}
-	err = -ENOBUFS;
+
 	if (count >= sysctl_igmp_max_memberships)
-		goto done;
+		return -ENOBUFS;
+
 	iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
 	if (iml == NULL)
-		goto done;
+		return -ENOMEM;
 
 	memcpy(&iml->multi, imr, sizeof(*imr));
 	iml->next_rcu = inet->mc_list;
@@ -1830,8 +1818,24 @@ int ip_mc_join_group(struct sock *sk , s
 	iml->sfmode = MCAST_EXCLUDE;
 	rcu_assign_pointer(inet->mc_list, iml);
 	ip_mc_inc_group(in_dev, addr);
-	err = 0;
-done:
+
+	return 0;
+}
+EXPORT_SYMBOL(_ip_mc_join_group);
+
+/*
+ * Join a multicast group
+ */
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+	int err;
+	__be32 addr = imr->imr_multiaddr.s_addr;
+
+	if (!ipv4_is_multicast(addr))
+		return -EINVAL;
+
+	rtnl_lock();
+	err = _ip_mc_join_group(sk, imr);
 	rtnl_unlock();
 	return err;
 }
@@ -1857,11 +1861,7 @@ static int ip_mc_leave_src(struct sock *
 	return err;
 }
 
-/*
- *	Ask a socket to leave a group.
- */
-
-int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+int _ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct ip_mc_socklist *iml;
@@ -1870,9 +1870,9 @@ int ip_mc_leave_group(struct sock *sk, s
 	struct net *net = sock_net(sk);
 	__be32 group = imr->imr_multiaddr.s_addr;
 	u32 ifindex;
-	int ret = -EADDRNOTAVAIL;
 
-	rtnl_lock();
+	ASSERT_RTNL();
+
 	in_dev = ip_mc_find_dev(net, imr);
 	ifindex = imr->imr_ifindex;
 	for (imlp = &inet->mc_list;
@@ -1880,6 +1880,7 @@ int ip_mc_leave_group(struct sock *sk, s
 	     imlp = &iml->next_rcu) {
 		if (iml->multi.imr_multiaddr.s_addr != group)
 			continue;
+
 		if (ifindex) {
 			if (iml->multi.imr_ifindex != ifindex)
 				continue;
@@ -1887,20 +1888,32 @@ int ip_mc_leave_group(struct sock *sk, s
 				iml->multi.imr_address.s_addr)
 			continue;
 
-		(void) ip_mc_leave_src(sk, iml, in_dev);
+		ip_mc_leave_src(sk, iml, in_dev);
 
 		*imlp = iml->next_rcu;
 
 		if (in_dev)
 			ip_mc_dec_group(in_dev, group);
-		rtnl_unlock();
+
 		/* decrease mem now to avoid the memleak warning */
 		atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
 		kfree_rcu(iml, rcu);
 		return 0;
 	}
-	if (!in_dev)
-		ret = -ENODEV;
+
+	return !in_dev ? -ENODEV : -EADDRNOTAVAIL;
+}
+EXPORT_SYMBOL(_ip_mc_leave_group);
+
+/*
+ *	Ask a socket to leave a group.
+ */
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{
+	int ret;
+
+	rtnl_lock();
+	ret = _ip_mc_leave_group(sk, imr);
 	rtnl_unlock();
 	return ret;
 }

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-29  4:41 ` Stephen Hemminger
@ 2013-05-29  5:14   ` Cong Wang
  2013-05-29  8:39   ` Cong Wang
  1 sibling, 0 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-29  5:14 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David S. Miller

On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> Why not just fix the requirement to drop rtnl when calling igmp.
> The code comes out cleaner and safer as well.
> 

This could probably work too. Please submit it as a normal patch?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-29  4:41 ` Stephen Hemminger
  2013-05-29  5:14   ` Cong Wang
@ 2013-05-29  8:39   ` Cong Wang
  2013-05-31  2:55     ` Cong Wang
  1 sibling, 1 reply; 12+ messages in thread
From: Cong Wang @ 2013-05-29  8:39 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David S. Miller

On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> Why not just fix the requirement to drop rtnl when calling igmp.
> The code comes out cleaner and safer as well.

Besides you forget to lock the socket before calling _ip_mc_join_group()
(and also the order is very important too), your patch doesn't fix the
problem I met. The full backtrace is below:

[  114.134123] BUG: unable to handle kernel NULL pointer dereference at
0000000000000068
[  114.136065] IP: [<ffffffff810a1061>] __lock_acquire+0x9c/0x45d
[  114.136065] PGD 71721067 PUD 70e11067 PMD 0 
[  114.136065] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
[  114.136065] CPU: 0 PID: 707 Comm: ip Not tainted 3.10.0-rc2+ #1075
[  114.136065] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[  114.136065] task: ffff88006fce2480 ti: ffff88006d9ec000 task.ti:
ffff88006d9ec000
[  114.136065] RIP: 0010:[<ffffffff810a1061>]  [<ffffffff810a1061>]
__lock_acquire+0x9c/0x45d
[  114.136065] RSP: 0018:ffff88006d9ed6a8  EFLAGS: 00010046
[  114.136065] RAX: 0000000000000068 RBX: 0000000000000000 RCX:
0000000000000000
[  114.136065] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000068
[  114.136065] RBP: ffff88006d9ed708 R08: 0000000000000002 R09:
0000000000000000
[  114.136065] R10: ffffffff8104f0bb R11: ffffffff8107632b R12:
ffff88006fce2480
[  114.136065] R13: 0000000000000000 R14: 0000000000000002 R15:
0000000000000000
[  114.136065] FS:  00007fc30cd89740(0000) GS:ffff88007f600000(0000)
knlGS:0000000000000000
[  114.136065] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  114.136065] CR2: 0000000000000068 CR3: 000000006f3a2000 CR4:
00000000000006f0
[  114.136065] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[  114.136065] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[  114.136065] Stack:
[  114.136065]  ffff88006d9ed6b8 ffffffff00000000 ffff880000000000
000000006fce2480
[  114.136065]  ffffffff00000000 0000000000000068 ffffffff825063d2
0000000000000000
[  114.136065]  ffff88006fce2480 ffffffff81757759 0000000000000002
0000000000000000
[  114.136065] Call Trace:
[  114.136065]  [<ffffffff81757759>] ? lock_sock_nested+0x3c/0x97
[  114.136065]  [<ffffffff810a2162>] lock_acquire+0xfa/0x140
[  114.136065]  [<ffffffff81757759>] ? lock_sock_nested+0x3c/0x97
[  114.136065]  [<ffffffff810a06d4>] ? trace_softirqs_off+0x3b/0xf3
[  114.136065]  [<ffffffff81976a1f>] _raw_spin_lock_bh+0x4a/0x7d
[  114.136065]  [<ffffffff81757759>] ? lock_sock_nested+0x3c/0x97
[  114.136065]  [<ffffffff810798db>] ? should_resched+0x9/0x28
[  114.136065]  [<ffffffff81757759>] lock_sock_nested+0x3c/0x97
[  114.136065]  [<ffffffff815e13a1>] vxlan_leave_group+0xc4/0x111
[  114.136065]  [<ffffffff81054dfc>] ? local_bh_enable_ip+0xe/0x10
[  114.136065]  [<ffffffff815e142d>] vxlan_stop+0x3f/0xaa
[  114.136065]  [<ffffffff817659ff>] __dev_close_many+0x9c/0xc4
[  114.136065]  [<ffffffff81765af3>] dev_close_many+0x70/0xd8
[  114.136065]  [<ffffffff817697f6>] rollback_registered_many+0xa2/0x1a7
[  114.136065]  [<ffffffff81769a4e>] unregister_netdevice_many+0x19/0x51
[  114.136065]  [<ffffffff817738b5>] rtnl_dellink+0xd0/0xfb
[  114.136065]  [<ffffffff81058754>] ? ns_capable+0x4d/0x66
[  114.136065]  [<ffffffff81774aaf>] rtnetlink_rcv_msg+0x19c/0x1ab
[  114.136065]  [<ffffffff81774913>] ? __rtnl_unlock+0x17/0x17
[  114.136065]  [<ffffffff817bc7d8>] netlink_rcv_skb+0x42/0x8d
[  114.136065]  [<ffffffff817748f5>] rtnetlink_rcv+0x26/0x2d
[  114.136065]  [<ffffffff817bb001>] netlink_unicast+0xb7/0x138
[  114.136065]  [<ffffffff817bba6b>] netlink_sendmsg+0x2b8/0x2f2
[  114.136065]  [<ffffffff81752e3e>] sock_sendmsg+0x7f/0xa0
[  114.136065]  [<ffffffff8111ee4d>] ? might_fault+0xa5/0xac
[  114.136065]  [<ffffffff8111ee04>] ? might_fault+0x5c/0xac
[  114.136065]  [<ffffffff817527ba>] ? move_addr_to_kernel+0x41/0x5a
[  114.136065]  [<ffffffff8175db77>] ? verify_iovec+0x5b/0xac
[  114.136065]  [<ffffffff81753064>] __sys_sendmsg+0x205/0x2a1
[  114.136065]  [<ffffffff8102c172>] ? __do_page_fault+0x2ee/0x38b
[  114.136065]  [<ffffffff8107507a>] ? up_read+0x29/0x2e
[  114.136065]  [<ffffffff8116794f>] ? fcheck_files+0xa3/0xe1
[  114.136065]  [<ffffffff81168915>] ? fget_light+0x3a/0xa4
[  114.136065]  [<ffffffff81753f91>] SyS_sendmsg+0x42/0x60
[  114.136065]  [<ffffffff819781c2>] system_call_fastpath+0x16/0x1b
[  114.136065] Code: 00 00 83 3d c1 46 d7 01 00 0f 85 cb 03 00 00 48 c7
c1 1c ef d5 81 48 c7 c2 67 5f d5 81 be fb 0b 00 00 e9 1b 02 00 00 48 8b
45 c8 <48> 81 38 a0 f5 52 82 b8 01 00 00 00 44 0f 44 f0 83 fe 01 77 10 
[  114.136065] RIP  [<ffffffff810a1061>] __lock_acquire+0x9c/0x45d
[  114.136065]  RSP <ffff88006d9ed6a8>
[  114.136065] CR2: 0000000000000068
[  114.136065] ---[ end trace 92078b41edbc404d ]---
[  114.136065] Kernel panic - not syncing: Fatal exception in interrupt

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-29  8:39   ` Cong Wang
@ 2013-05-31  2:55     ` Cong Wang
  2013-05-31  3:56       ` Stephen Hemminger
  0 siblings, 1 reply; 12+ messages in thread
From: Cong Wang @ 2013-05-31  2:55 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David S. Miller

On Wed, 2013-05-29 at 16:39 +0800, Cong Wang wrote:
> On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> > Why not just fix the requirement to drop rtnl when calling igmp.
> > The code comes out cleaner and safer as well.
> 
> Besides you forget to lock the socket before calling _ip_mc_join_group()
> (and also the order is very important too), your patch doesn't fix the
> problem I met. The full backtrace is below:
> 

Hi, Stephen,

Do you think my patch is an easier solution? It at least fixes the
crash, while your patch doesn't. :)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-31  2:55     ` Cong Wang
@ 2013-05-31  3:56       ` Stephen Hemminger
  2013-05-31  4:12         ` Cong Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Stephen Hemminger @ 2013-05-31  3:56 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, David S. Miller

On Fri, 31 May 2013 10:55:45 +0800
Cong Wang <amwang@redhat.com> wrote:

> On Wed, 2013-05-29 at 16:39 +0800, Cong Wang wrote:
> > On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> > > Why not just fix the requirement to drop rtnl when calling igmp.
> > > The code comes out cleaner and safer as well.
> > 
> > Besides you forget to lock the socket before calling _ip_mc_join_group()
> > (and also the order is very important too), your patch doesn't fix the
> > problem I met. The full backtrace is below:
> > 
> 
> Hi, Stephen,
> 
> Do you think my patch is an easier solution? It at least fixes the
> crash, while your patch doesn't. :)
> 
> 

No. your patch doesn't fix the real problem.
The real fix is more complex because of how socket locking interacts with RTNL.


I have been busy with health stuff, so the going is slow.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
  2013-05-31  3:56       ` Stephen Hemminger
@ 2013-05-31  4:12         ` Cong Wang
  0 siblings, 0 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-31  4:12 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David S. Miller

On Thu, 2013-05-30 at 20:56 -0700, Stephen Hemminger wrote:
> On Fri, 31 May 2013 10:55:45 +0800
> Cong Wang <amwang@redhat.com> wrote:
> 
> > On Wed, 2013-05-29 at 16:39 +0800, Cong Wang wrote:
> > > On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> > > > Why not just fix the requirement to drop rtnl when calling igmp.
> > > > The code comes out cleaner and safer as well.
> > > 
> > > Besides you forget to lock the socket before calling _ip_mc_join_group()
> > > (and also the order is very important too), your patch doesn't fix the
> > > problem I met. The full backtrace is below:
> > > 
> > 
> > Hi, Stephen,
> > 
> > Do you think my patch is an easier solution? It at least fixes the
> > crash, while your patch doesn't. :)
> > 
> > 
> 
> No. your patch doesn't fix the real problem.
> The real fix is more complex because of how socket locking interacts with RTNL.

Yeah, I am thinking if we could replace the RTNL lock with a spinlock in
IPv4 multicast, as IPv6 multicast uses its own spin lock too. I don't
see any reason why inet->mc_list has to be protected by RTNL.

> 
> 
> I have been busy with health stuff, so the going is slow.

No problem, take care!

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2013-05-31  4:12 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang
2013-05-28 15:22 ` Stephen Hemminger
2013-05-29  2:08   ` Cong Wang
2013-05-29  4:22     ` Stephen Hemminger
2013-05-29  4:34       ` Cong Wang
2013-05-29  4:01 ` Cong Wang
2013-05-29  4:41 ` Stephen Hemminger
2013-05-29  5:14   ` Cong Wang
2013-05-29  8:39   ` Cong Wang
2013-05-31  2:55     ` Cong Wang
2013-05-31  3:56       ` Stephen Hemminger
2013-05-31  4:12         ` Cong Wang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.