* [Patch net-next] vxlan: do real refcnt for vn_sock @ 2013-05-28 11:07 Cong Wang 2013-05-28 15:22 ` Stephen Hemminger ` (2 more replies) 0 siblings, 3 replies; 12+ messages in thread From: Cong Wang @ 2013-05-28 11:07 UTC (permalink / raw) To: netdev; +Cc: Stephen Hemminger, David S. Miller, Cong Wang From: Cong Wang <amwang@redhat.com> In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), we use kfree_rcu() to free ->vn_sock, but a) there is no use of RCU API to access this filed, b) RCU is not enough to do refcnt here, because in vxlan_leave_group() we drop RTNL lock before locking the socket, it could be possible that this field is freed during this period. So, instead making things complex, just do basic refcnt for the ->vn_sock, like we do for others. Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: David S. Miller <davem@davemloft.net> Signed-off-by: Cong Wang <amwang@redhat.com> --- diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 5ed64d4..73a674e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -83,9 +83,8 @@ static unsigned int vxlan_net_id; /* per UDP socket information */ struct vxlan_sock { struct hlist_node hlist; - struct rcu_head rcu; struct work_struct del_work; - unsigned int refcnt; + atomic_t refcnt; struct socket *sock; struct hlist_head vni_list[VNI_HASH_SIZE]; }; @@ -164,14 +163,30 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; } +static inline void vxlan_sock_get(struct vxlan_sock *vs) +{ + atomic_inc(&vs->refcnt); +} + +static inline void vxlan_sock_put(struct vxlan_sock *vs) +{ + if (atomic_dec_and_test(&vs->refcnt)) { + hlist_del_rcu(&vs->hlist); + schedule_work(&vs->del_work); + } +} + + /* Find VXLAN socket based on network namespace and UDP port */ static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) { struct vxlan_sock *vs; hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { + vxlan_sock_get(vs); if (inet_sk(vs->sock->sk)->inet_sport == port) return vs; + vxlan_sock_put(vs); } return NULL; } @@ -187,10 +202,13 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) return NULL; hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { - if (vxlan->default_dst.remote_vni == id) + if (vxlan->default_dst.remote_vni == id) { + vxlan_sock_put(vs); return vxlan; + } } + vxlan_sock_put(vs); return NULL; } @@ -926,7 +944,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) return false; } -static void vxlan_sock_put(struct sk_buff *skb) +static void vxlan_sock_destructor(struct sk_buff *skb) { sock_put(skb->sk); } @@ -940,7 +958,7 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); sock_hold(sk); skb->sk = sk; - skb->destructor = vxlan_sock_put; + skb->destructor = vxlan_sock_destructor; } /* Compute source port for outgoing packet @@ -1255,10 +1273,14 @@ static int vxlan_open(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; + vxlan_sock_get(vxlan->vn_sock); + if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { err = vxlan_join_group(dev); - if (err) + if (err) { + vxlan_sock_put(vxlan->vn_sock); return err; + } } if (vxlan->age_interval) @@ -1294,6 +1316,8 @@ static int vxlan_stop(struct net_device *dev) del_timer_sync(&vxlan->age_timer); + vxlan_sock_put(vxlan->vn_sock); + vxlan_flush(vxlan); return 0; @@ -1447,7 +1471,7 @@ static void vxlan_del_work(struct work_struct *work) struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work); sk_release_kernel(vs->sock->sk); - kfree_rcu(vs, rcu); + kfree(vs); } /* Create new listen socket if needed */ @@ -1503,7 +1527,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; udp_encap_enable(); - vs->refcnt = 1; + atomic_set(&vs->refcnt, 1); return vs; } @@ -1592,9 +1616,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, } vs = vxlan_find_port(net, vxlan->dst_port); - if (vs) - ++vs->refcnt; - else { + if (!vs) { /* Drop lock because socket create acquires RTNL lock */ rtnl_unlock(); vs = vxlan_socket_create(net, vxlan->dst_port); @@ -1610,12 +1632,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, err = register_netdevice(dev); if (err) { - if (--vs->refcnt == 0) { - rtnl_unlock(); - sk_release_kernel(vs->sock->sk); - kfree(vs); - rtnl_lock(); - } + vxlan_sock_put(vs); return err; } @@ -1634,10 +1651,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) list_del(&vxlan->next); unregister_netdevice_queue(dev, head); - if (--vs->refcnt == 0) { - hlist_del_rcu(&vs->hlist); - schedule_work(&vs->del_work); - } + vxlan_sock_put(vs); } static size_t vxlan_get_size(const struct net_device *dev) ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang @ 2013-05-28 15:22 ` Stephen Hemminger 2013-05-29 2:08 ` Cong Wang 2013-05-29 4:01 ` Cong Wang 2013-05-29 4:41 ` Stephen Hemminger 2 siblings, 1 reply; 12+ messages in thread From: Stephen Hemminger @ 2013-05-28 15:22 UTC (permalink / raw) To: Cong Wang; +Cc: netdev, David S. Miller On Tue, 28 May 2013 19:07:22 +0800 Cong Wang <amwang@redhat.com> wrote: > From: Cong Wang <amwang@redhat.com> > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > we use kfree_rcu() to free ->vn_sock, but a) there is no use > of RCU API to access this filed, b) RCU is not enough to do refcnt > here, because in vxlan_leave_group() we drop RTNL lock before > locking the socket, it could be possible that this field is > freed during this period. > > So, instead making things complex, just do basic refcnt for > the ->vn_sock, like we do for others. > > Cc: Stephen Hemminger <stephen@networkplumber.org> > Cc: David S. Miller <davem@davemloft.net> > Signed-off-by: Cong Wang <amwang@redhat.com> > > --- > diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c > index 5ed64d4..73a674e 100644 > --- a/drivers/net/vxlan.c > +++ b/drivers/net/vxlan.c > @@ -83,9 +83,8 @@ static unsigned int vxlan_net_id; > /* per UDP socket information */ > struct vxlan_sock { > struct hlist_node hlist; > - struct rcu_head rcu; > struct work_struct del_work; > - unsigned int refcnt; > + atomic_t refcnt; > struct socket *sock; > struct hlist_head vni_list[VNI_HASH_SIZE]; > }; > @@ -164,14 +163,30 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) > return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; > } > > +static inline void vxlan_sock_get(struct vxlan_sock *vs) > +{ > + atomic_inc(&vs->refcnt); > +} > + > +static inline void vxlan_sock_put(struct vxlan_sock *vs) > +{ > + if (atomic_dec_and_test(&vs->refcnt)) { > + hlist_del_rcu(&vs->hlist); > + schedule_work(&vs->del_work); > + } > +} > + > + > /* Find VXLAN socket based on network namespace and UDP port */ > static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) > { > struct vxlan_sock *vs; > > hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { > + vxlan_sock_get(vs); > if (inet_sk(vs->sock->sk)->inet_sport == port) > return vs; > + vxlan_sock_put(vs); > } > return NULL; > } > @@ -187,10 +202,13 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) > return NULL; > > hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { > - if (vxlan->default_dst.remote_vni == id) > + if (vxlan->default_dst.remote_vni == id) { > + vxlan_sock_put(vs); > return vxlan; > + } > } > > + vxlan_sock_put(vs); > return NULL; > } > > @@ -926,7 +944,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) > return false; > } > > -static void vxlan_sock_put(struct sk_buff *skb) > +static void vxlan_sock_destructor(struct sk_buff *skb) > { > sock_put(skb->sk); > } > @@ -940,7 +958,7 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) > skb_orphan(skb); > sock_hold(sk); > skb->sk = sk; > - skb->destructor = vxlan_sock_put; > + skb->destructor = vxlan_sock_destructor; > } > > /* Compute source port for outgoing packet > @@ -1255,10 +1273,14 @@ static int vxlan_open(struct net_device *dev) > struct vxlan_dev *vxlan = netdev_priv(dev); > int err; > > + vxlan_sock_get(vxlan->vn_sock); > + > if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { > err = vxlan_join_group(dev); > - if (err) > + if (err) { > + vxlan_sock_put(vxlan->vn_sock); > return err; > + } > } > > if (vxlan->age_interval) > @@ -1294,6 +1316,8 @@ static int vxlan_stop(struct net_device *dev) > > del_timer_sync(&vxlan->age_timer); > > + vxlan_sock_put(vxlan->vn_sock); > + > vxlan_flush(vxlan); > > return 0; > @@ -1447,7 +1471,7 @@ static void vxlan_del_work(struct work_struct *work) > struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work); > > sk_release_kernel(vs->sock->sk); > - kfree_rcu(vs, rcu); > + kfree(vs); > } > > /* Create new listen socket if needed */ > @@ -1503,7 +1527,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) > udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; > udp_encap_enable(); > > - vs->refcnt = 1; > + atomic_set(&vs->refcnt, 1); > return vs; > } > > @@ -1592,9 +1616,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, > } > > vs = vxlan_find_port(net, vxlan->dst_port); > - if (vs) > - ++vs->refcnt; > - else { > + if (!vs) { > /* Drop lock because socket create acquires RTNL lock */ > rtnl_unlock(); > vs = vxlan_socket_create(net, vxlan->dst_port); > @@ -1610,12 +1632,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, > > err = register_netdevice(dev); > if (err) { > - if (--vs->refcnt == 0) { > - rtnl_unlock(); > - sk_release_kernel(vs->sock->sk); > - kfree(vs); > - rtnl_lock(); > - } > + vxlan_sock_put(vs); > return err; > } > > @@ -1634,10 +1651,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) > list_del(&vxlan->next); > unregister_netdevice_queue(dev, head); > > - if (--vs->refcnt == 0) { > - hlist_del_rcu(&vs->hlist); > - schedule_work(&vs->del_work); > - } > + vxlan_sock_put(vs); > } > > static size_t vxlan_get_size(const struct net_device *dev) Not needed all access is under RTNL ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-28 15:22 ` Stephen Hemminger @ 2013-05-29 2:08 ` Cong Wang 2013-05-29 4:22 ` Stephen Hemminger 0 siblings, 1 reply; 12+ messages in thread From: Cong Wang @ 2013-05-29 2:08 UTC (permalink / raw) To: Stephen Hemminger; +Cc: netdev, David S. Miller On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote: > On Tue, 28 May 2013 19:07:22 +0800 > Cong Wang <amwang@redhat.com> wrote: > > > From: Cong Wang <amwang@redhat.com> > > > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > > we use kfree_rcu() to free ->vn_sock, but a) there is no use > > of RCU API to access this filed, b) RCU is not enough to do refcnt > > here, because in vxlan_leave_group() we drop RTNL lock before > > locking the socket, it could be possible that this field is > > freed during this period. > > > > So, instead making things complex, just do basic refcnt for > > the ->vn_sock, like we do for others. > > ... > > Not needed all access is under RTNL I know, this is why I had a patch (not posted) which adds the missing rtnl_dereference(), but even if we had these, it is still not correct. As I explained in the changelog, vxlan_leave_group() has a problem, because it releases rtnl lock before locking the socket, _and_ it is called after vxlan_dellink() which schedules a work to cleanup the struct. Therefore the ->vn_sock could be freed right after rtnl lock is released. Am I miss anything? ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-29 2:08 ` Cong Wang @ 2013-05-29 4:22 ` Stephen Hemminger 2013-05-29 4:34 ` Cong Wang 0 siblings, 1 reply; 12+ messages in thread From: Stephen Hemminger @ 2013-05-29 4:22 UTC (permalink / raw) To: Cong Wang; +Cc: netdev, David S. Miller On Wed, 29 May 2013 10:08:53 +0800 Cong Wang <amwang@redhat.com> wrote: > On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote: > > On Tue, 28 May 2013 19:07:22 +0800 > > Cong Wang <amwang@redhat.com> wrote: > > > > > From: Cong Wang <amwang@redhat.com> > > > > > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > > > we use kfree_rcu() to free ->vn_sock, but a) there is no use > > > of RCU API to access this filed, b) RCU is not enough to do refcnt > > > here, because in vxlan_leave_group() we drop RTNL lock before > > > locking the socket, it could be possible that this field is > > > freed during this period. > > > > > > So, instead making things complex, just do basic refcnt for > > > the ->vn_sock, like we do for others. > > > > ... > > > > Not needed all access is under RTNL > > I know, this is why I had a patch (not posted) which adds the missing > rtnl_dereference(), but even if we had these, it is still not correct. > > As I explained in the changelog, vxlan_leave_group() has a problem, > because it releases rtnl lock before locking the socket, _and_ it is > called after vxlan_dellink() which schedules a work to cleanup the > struct. Therefore the ->vn_sock could be freed right after rtnl lock is > released. > > Am I miss anything? Ignoring your IPv6 code for now... With IPV4: refcnt is incremented when socket is incremented in newlink (RTNL held). refcnt is decremented in by dellink (RTNL held) and socket is deleted from list leave_group doesn't happen until work queue is fired. rtnl_dereference is fine, but hardly necessary when the call hierarchy is so obvious. The problem you describe won't be fixed by just converting it to atomic, I think you need add a dev_hold()/dev_put to vxlan_stop to prevent device from being deleted when rtnl_lock is dropped. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-29 4:22 ` Stephen Hemminger @ 2013-05-29 4:34 ` Cong Wang 0 siblings, 0 replies; 12+ messages in thread From: Cong Wang @ 2013-05-29 4:34 UTC (permalink / raw) To: Stephen Hemminger; +Cc: netdev, David S. Miller On Tue, 2013-05-28 at 21:22 -0700, Stephen Hemminger wrote: > On Wed, 29 May 2013 10:08:53 +0800 > Cong Wang <amwang@redhat.com> wrote: > > > On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote: > > > On Tue, 28 May 2013 19:07:22 +0800 > > > Cong Wang <amwang@redhat.com> wrote: > > > > > > > From: Cong Wang <amwang@redhat.com> > > > > > > > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > > > > we use kfree_rcu() to free ->vn_sock, but a) there is no use > > > > of RCU API to access this filed, b) RCU is not enough to do refcnt > > > > here, because in vxlan_leave_group() we drop RTNL lock before > > > > locking the socket, it could be possible that this field is > > > > freed during this period. > > > > > > > > So, instead making things complex, just do basic refcnt for > > > > the ->vn_sock, like we do for others. > > > > > > ... > > > > > > Not needed all access is under RTNL > > > > I know, this is why I had a patch (not posted) which adds the missing > > rtnl_dereference(), but even if we had these, it is still not correct. > > > > As I explained in the changelog, vxlan_leave_group() has a problem, > > because it releases rtnl lock before locking the socket, _and_ it is > > called after vxlan_dellink() which schedules a work to cleanup the > > struct. Therefore the ->vn_sock could be freed right after rtnl lock is > > released. > > > > Am I miss anything? > > Ignoring your IPv6 code for now... > > With IPV4: > refcnt is incremented when socket is incremented in newlink (RTNL held). > refcnt is decremented in by dellink (RTNL held) and socket is deleted from list > leave_group doesn't happen until work queue is fired. > > rtnl_dereference is fine, but hardly necessary when the call hierarchy is so obvious. > > The problem you describe won't be fixed by just converting it to atomic, My patch is _not_ just converting it to atomic_t, but it takes a ref for every usage of ->vn_sock, which current implementation misses. > I think you need add a dev_hold()/dev_put to vxlan_stop to prevent > device from being deleted when rtnl_lock is dropped. > The crash is that lock_sock() got a NULL-def bug, which is not the related with dev_hold() at all. I think it is due to the whole ->vn_sock is freed before calling lock_sock(), thus vxlan->vn_sock->sock->sk points to a freed memory area. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang 2013-05-28 15:22 ` Stephen Hemminger @ 2013-05-29 4:01 ` Cong Wang 2013-05-29 4:41 ` Stephen Hemminger 2 siblings, 0 replies; 12+ messages in thread From: Cong Wang @ 2013-05-29 4:01 UTC (permalink / raw) To: netdev; +Cc: Stephen Hemminger, David S. Miller On Tue, 2013-05-28 at 19:07 +0800, Cong Wang wrote: > From: Cong Wang <amwang@redhat.com> > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > we use kfree_rcu() to free ->vn_sock, but a) there is no use > of RCU API to access this filed, b) RCU is not enough to do refcnt > here, because in vxlan_leave_group() we drop RTNL lock before > locking the socket, it could be possible that this field is > freed during this period. > > So, instead making things complex, just do basic refcnt for > the ->vn_sock, like we do for others. BTW, this patch fixes a real crash when I test vxlan over IPv6, although I can't reproduce it with IPv4. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang 2013-05-28 15:22 ` Stephen Hemminger 2013-05-29 4:01 ` Cong Wang @ 2013-05-29 4:41 ` Stephen Hemminger 2013-05-29 5:14 ` Cong Wang 2013-05-29 8:39 ` Cong Wang 2 siblings, 2 replies; 12+ messages in thread From: Stephen Hemminger @ 2013-05-29 4:41 UTC (permalink / raw) To: Cong Wang; +Cc: netdev, David S. Miller Why not just fix the requirement to drop rtnl when calling igmp. The code comes out cleaner and safer as well. The follow COMPILE TESTED ONLY is a starting point... --- a/drivers/net/vxlan.c 2013-05-20 10:24:19.624427346 -0700 +++ b/drivers/net/vxlan.c 2013-05-28 21:39:57.502351889 -0700 @@ -657,20 +657,12 @@ static int vxlan_join_group(struct net_d .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, .imr_ifindex = vxlan->default_dst.remote_ifindex, }; - int err; /* Already a member of group */ if (vxlan_group_used(vn, vxlan)) return 0; - /* Need to drop RTNL to call multicast join */ - rtnl_unlock(); - lock_sock(sk); - err = ip_mc_join_group(sk, &mreq); - release_sock(sk); - rtnl_lock(); - - return err; + return _ip_mc_join_group(sk, &mreq); } @@ -679,7 +671,6 @@ static int vxlan_leave_group(struct net_ { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - int err = 0; struct sock *sk = vxlan->vn_sock->sock->sk; struct ip_mreqn mreq = { .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, @@ -690,14 +681,7 @@ static int vxlan_leave_group(struct net_ if (vxlan_group_used(vn, vxlan)) return 0; - /* Need to drop RTNL to call multicast leave */ - rtnl_unlock(); - lock_sock(sk); - err = ip_mc_leave_group(sk, &mreq); - release_sock(sk); - rtnl_lock(); - - return err; + return _ip_mc_leave_group(sk, &mreq); } /* Callback from net/ipv4/udp.c to receive packets */ @@ -1244,6 +1228,7 @@ static int vxlan_open(struct net_device struct vxlan_dev *vxlan = netdev_priv(dev); int err; + dev_hold(dev); if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { err = vxlan_join_group(dev); if (err) @@ -1252,6 +1237,7 @@ static int vxlan_open(struct net_device if (vxlan->age_interval) mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); + dev_put(dev); return 0; } --- a/include/linux/igmp.h 2013-05-11 15:58:53.692325370 -0700 +++ b/include/linux/igmp.h 2013-05-28 21:37:40.560328674 -0700 @@ -109,7 +109,9 @@ struct ip_mc_list { extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u16 proto); extern int igmp_rcv(struct sk_buff *); +extern int _ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); +extern int _ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern int ip_mc_source(int add, int omode, struct sock *sk, --- a/net/ipv4/igmp.c 2013-05-11 15:58:53.992318551 -0700 +++ b/net/ipv4/igmp.c 2013-05-28 21:39:43.302554701 -0700 @@ -1781,48 +1781,36 @@ static void ip_mc_clear_src(struct ip_mc pmc->sfcount[MCAST_EXCLUDE] = 1; } - -/* - * Join a multicast group - */ -int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) +int _ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) { - int err; __be32 addr = imr->imr_multiaddr.s_addr; - struct ip_mc_socklist *iml = NULL, *i; + struct ip_mc_socklist *iml, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int ifindex; int count = 0; - if (!ipv4_is_multicast(addr)) - return -EINVAL; - - rtnl_lock(); + ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); + if (!in_dev) + return -ENODEV; - if (!in_dev) { - iml = NULL; - err = -ENODEV; - goto done; - } - - err = -EADDRINUSE; ifindex = imr->imr_ifindex; for_each_pmc_rtnl(inet, i) { if (i->multi.imr_multiaddr.s_addr == addr && i->multi.imr_ifindex == ifindex) - goto done; + return -EADDRINUSE; count++; } - err = -ENOBUFS; + if (count >= sysctl_igmp_max_memberships) - goto done; + return -ENOBUFS; + iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (iml == NULL) - goto done; + return -ENOMEM; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; @@ -1830,8 +1818,24 @@ int ip_mc_join_group(struct sock *sk , s iml->sfmode = MCAST_EXCLUDE; rcu_assign_pointer(inet->mc_list, iml); ip_mc_inc_group(in_dev, addr); - err = 0; -done: + + return 0; +} +EXPORT_SYMBOL(_ip_mc_join_group); + +/* + * Join a multicast group + */ +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +{ + int err; + __be32 addr = imr->imr_multiaddr.s_addr; + + if (!ipv4_is_multicast(addr)) + return -EINVAL; + + rtnl_lock(); + err = _ip_mc_join_group(sk, imr); rtnl_unlock(); return err; } @@ -1857,11 +1861,7 @@ static int ip_mc_leave_src(struct sock * return err; } -/* - * Ask a socket to leave a group. - */ - -int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) +int _ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; @@ -1870,9 +1870,9 @@ int ip_mc_leave_group(struct sock *sk, s struct net *net = sock_net(sk); __be32 group = imr->imr_multiaddr.s_addr; u32 ifindex; - int ret = -EADDRNOTAVAIL; - rtnl_lock(); + ASSERT_RTNL(); + in_dev = ip_mc_find_dev(net, imr); ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; @@ -1880,6 +1880,7 @@ int ip_mc_leave_group(struct sock *sk, s imlp = &iml->next_rcu) { if (iml->multi.imr_multiaddr.s_addr != group) continue; + if (ifindex) { if (iml->multi.imr_ifindex != ifindex) continue; @@ -1887,20 +1888,32 @@ int ip_mc_leave_group(struct sock *sk, s iml->multi.imr_address.s_addr) continue; - (void) ip_mc_leave_src(sk, iml, in_dev); + ip_mc_leave_src(sk, iml, in_dev); *imlp = iml->next_rcu; if (in_dev) ip_mc_dec_group(in_dev, group); - rtnl_unlock(); + /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } - if (!in_dev) - ret = -ENODEV; + + return !in_dev ? -ENODEV : -EADDRNOTAVAIL; +} +EXPORT_SYMBOL(_ip_mc_leave_group); + +/* + * Ask a socket to leave a group. + */ +int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) +{ + int ret; + + rtnl_lock(); + ret = _ip_mc_leave_group(sk, imr); rtnl_unlock(); return ret; } ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-29 4:41 ` Stephen Hemminger @ 2013-05-29 5:14 ` Cong Wang 2013-05-29 8:39 ` Cong Wang 1 sibling, 0 replies; 12+ messages in thread From: Cong Wang @ 2013-05-29 5:14 UTC (permalink / raw) To: Stephen Hemminger; +Cc: netdev, David S. Miller On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote: > Why not just fix the requirement to drop rtnl when calling igmp. > The code comes out cleaner and safer as well. > This could probably work too. Please submit it as a normal patch? ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-29 4:41 ` Stephen Hemminger 2013-05-29 5:14 ` Cong Wang @ 2013-05-29 8:39 ` Cong Wang 2013-05-31 2:55 ` Cong Wang 1 sibling, 1 reply; 12+ messages in thread From: Cong Wang @ 2013-05-29 8:39 UTC (permalink / raw) To: Stephen Hemminger; +Cc: netdev, David S. Miller On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote: > Why not just fix the requirement to drop rtnl when calling igmp. > The code comes out cleaner and safer as well. Besides you forget to lock the socket before calling _ip_mc_join_group() (and also the order is very important too), your patch doesn't fix the problem I met. The full backtrace is below: [ 114.134123] BUG: unable to handle kernel NULL pointer dereference at 0000000000000068 [ 114.136065] IP: [<ffffffff810a1061>] __lock_acquire+0x9c/0x45d [ 114.136065] PGD 71721067 PUD 70e11067 PMD 0 [ 114.136065] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 114.136065] CPU: 0 PID: 707 Comm: ip Not tainted 3.10.0-rc2+ #1075 [ 114.136065] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 114.136065] task: ffff88006fce2480 ti: ffff88006d9ec000 task.ti: ffff88006d9ec000 [ 114.136065] RIP: 0010:[<ffffffff810a1061>] [<ffffffff810a1061>] __lock_acquire+0x9c/0x45d [ 114.136065] RSP: 0018:ffff88006d9ed6a8 EFLAGS: 00010046 [ 114.136065] RAX: 0000000000000068 RBX: 0000000000000000 RCX: 0000000000000000 [ 114.136065] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000068 [ 114.136065] RBP: ffff88006d9ed708 R08: 0000000000000002 R09: 0000000000000000 [ 114.136065] R10: ffffffff8104f0bb R11: ffffffff8107632b R12: ffff88006fce2480 [ 114.136065] R13: 0000000000000000 R14: 0000000000000002 R15: 0000000000000000 [ 114.136065] FS: 00007fc30cd89740(0000) GS:ffff88007f600000(0000) knlGS:0000000000000000 [ 114.136065] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 114.136065] CR2: 0000000000000068 CR3: 000000006f3a2000 CR4: 00000000000006f0 [ 114.136065] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 114.136065] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 114.136065] Stack: [ 114.136065] ffff88006d9ed6b8 ffffffff00000000 ffff880000000000 000000006fce2480 [ 114.136065] ffffffff00000000 0000000000000068 ffffffff825063d2 0000000000000000 [ 114.136065] ffff88006fce2480 ffffffff81757759 0000000000000002 0000000000000000 [ 114.136065] Call Trace: [ 114.136065] [<ffffffff81757759>] ? lock_sock_nested+0x3c/0x97 [ 114.136065] [<ffffffff810a2162>] lock_acquire+0xfa/0x140 [ 114.136065] [<ffffffff81757759>] ? lock_sock_nested+0x3c/0x97 [ 114.136065] [<ffffffff810a06d4>] ? trace_softirqs_off+0x3b/0xf3 [ 114.136065] [<ffffffff81976a1f>] _raw_spin_lock_bh+0x4a/0x7d [ 114.136065] [<ffffffff81757759>] ? lock_sock_nested+0x3c/0x97 [ 114.136065] [<ffffffff810798db>] ? should_resched+0x9/0x28 [ 114.136065] [<ffffffff81757759>] lock_sock_nested+0x3c/0x97 [ 114.136065] [<ffffffff815e13a1>] vxlan_leave_group+0xc4/0x111 [ 114.136065] [<ffffffff81054dfc>] ? local_bh_enable_ip+0xe/0x10 [ 114.136065] [<ffffffff815e142d>] vxlan_stop+0x3f/0xaa [ 114.136065] [<ffffffff817659ff>] __dev_close_many+0x9c/0xc4 [ 114.136065] [<ffffffff81765af3>] dev_close_many+0x70/0xd8 [ 114.136065] [<ffffffff817697f6>] rollback_registered_many+0xa2/0x1a7 [ 114.136065] [<ffffffff81769a4e>] unregister_netdevice_many+0x19/0x51 [ 114.136065] [<ffffffff817738b5>] rtnl_dellink+0xd0/0xfb [ 114.136065] [<ffffffff81058754>] ? ns_capable+0x4d/0x66 [ 114.136065] [<ffffffff81774aaf>] rtnetlink_rcv_msg+0x19c/0x1ab [ 114.136065] [<ffffffff81774913>] ? __rtnl_unlock+0x17/0x17 [ 114.136065] [<ffffffff817bc7d8>] netlink_rcv_skb+0x42/0x8d [ 114.136065] [<ffffffff817748f5>] rtnetlink_rcv+0x26/0x2d [ 114.136065] [<ffffffff817bb001>] netlink_unicast+0xb7/0x138 [ 114.136065] [<ffffffff817bba6b>] netlink_sendmsg+0x2b8/0x2f2 [ 114.136065] [<ffffffff81752e3e>] sock_sendmsg+0x7f/0xa0 [ 114.136065] [<ffffffff8111ee4d>] ? might_fault+0xa5/0xac [ 114.136065] [<ffffffff8111ee04>] ? might_fault+0x5c/0xac [ 114.136065] [<ffffffff817527ba>] ? move_addr_to_kernel+0x41/0x5a [ 114.136065] [<ffffffff8175db77>] ? verify_iovec+0x5b/0xac [ 114.136065] [<ffffffff81753064>] __sys_sendmsg+0x205/0x2a1 [ 114.136065] [<ffffffff8102c172>] ? __do_page_fault+0x2ee/0x38b [ 114.136065] [<ffffffff8107507a>] ? up_read+0x29/0x2e [ 114.136065] [<ffffffff8116794f>] ? fcheck_files+0xa3/0xe1 [ 114.136065] [<ffffffff81168915>] ? fget_light+0x3a/0xa4 [ 114.136065] [<ffffffff81753f91>] SyS_sendmsg+0x42/0x60 [ 114.136065] [<ffffffff819781c2>] system_call_fastpath+0x16/0x1b [ 114.136065] Code: 00 00 83 3d c1 46 d7 01 00 0f 85 cb 03 00 00 48 c7 c1 1c ef d5 81 48 c7 c2 67 5f d5 81 be fb 0b 00 00 e9 1b 02 00 00 48 8b 45 c8 <48> 81 38 a0 f5 52 82 b8 01 00 00 00 44 0f 44 f0 83 fe 01 77 10 [ 114.136065] RIP [<ffffffff810a1061>] __lock_acquire+0x9c/0x45d [ 114.136065] RSP <ffff88006d9ed6a8> [ 114.136065] CR2: 0000000000000068 [ 114.136065] ---[ end trace 92078b41edbc404d ]--- [ 114.136065] Kernel panic - not syncing: Fatal exception in interrupt ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-29 8:39 ` Cong Wang @ 2013-05-31 2:55 ` Cong Wang 2013-05-31 3:56 ` Stephen Hemminger 0 siblings, 1 reply; 12+ messages in thread From: Cong Wang @ 2013-05-31 2:55 UTC (permalink / raw) To: Stephen Hemminger; +Cc: netdev, David S. Miller On Wed, 2013-05-29 at 16:39 +0800, Cong Wang wrote: > On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote: > > Why not just fix the requirement to drop rtnl when calling igmp. > > The code comes out cleaner and safer as well. > > Besides you forget to lock the socket before calling _ip_mc_join_group() > (and also the order is very important too), your patch doesn't fix the > problem I met. The full backtrace is below: > Hi, Stephen, Do you think my patch is an easier solution? It at least fixes the crash, while your patch doesn't. :) ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-31 2:55 ` Cong Wang @ 2013-05-31 3:56 ` Stephen Hemminger 2013-05-31 4:12 ` Cong Wang 0 siblings, 1 reply; 12+ messages in thread From: Stephen Hemminger @ 2013-05-31 3:56 UTC (permalink / raw) To: Cong Wang; +Cc: netdev, David S. Miller On Fri, 31 May 2013 10:55:45 +0800 Cong Wang <amwang@redhat.com> wrote: > On Wed, 2013-05-29 at 16:39 +0800, Cong Wang wrote: > > On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote: > > > Why not just fix the requirement to drop rtnl when calling igmp. > > > The code comes out cleaner and safer as well. > > > > Besides you forget to lock the socket before calling _ip_mc_join_group() > > (and also the order is very important too), your patch doesn't fix the > > problem I met. The full backtrace is below: > > > > Hi, Stephen, > > Do you think my patch is an easier solution? It at least fixes the > crash, while your patch doesn't. :) > > No. your patch doesn't fix the real problem. The real fix is more complex because of how socket locking interacts with RTNL. I have been busy with health stuff, so the going is slow. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock 2013-05-31 3:56 ` Stephen Hemminger @ 2013-05-31 4:12 ` Cong Wang 0 siblings, 0 replies; 12+ messages in thread From: Cong Wang @ 2013-05-31 4:12 UTC (permalink / raw) To: Stephen Hemminger; +Cc: netdev, David S. Miller On Thu, 2013-05-30 at 20:56 -0700, Stephen Hemminger wrote: > On Fri, 31 May 2013 10:55:45 +0800 > Cong Wang <amwang@redhat.com> wrote: > > > On Wed, 2013-05-29 at 16:39 +0800, Cong Wang wrote: > > > On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote: > > > > Why not just fix the requirement to drop rtnl when calling igmp. > > > > The code comes out cleaner and safer as well. > > > > > > Besides you forget to lock the socket before calling _ip_mc_join_group() > > > (and also the order is very important too), your patch doesn't fix the > > > problem I met. The full backtrace is below: > > > > > > > Hi, Stephen, > > > > Do you think my patch is an easier solution? It at least fixes the > > crash, while your patch doesn't. :) > > > > > > No. your patch doesn't fix the real problem. > The real fix is more complex because of how socket locking interacts with RTNL. Yeah, I am thinking if we could replace the RTNL lock with a spinlock in IPv4 multicast, as IPv6 multicast uses its own spin lock too. I don't see any reason why inet->mc_list has to be protected by RTNL. > > > I have been busy with health stuff, so the going is slow. No problem, take care! ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2013-05-31 4:12 UTC | newest] Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang 2013-05-28 15:22 ` Stephen Hemminger 2013-05-29 2:08 ` Cong Wang 2013-05-29 4:22 ` Stephen Hemminger 2013-05-29 4:34 ` Cong Wang 2013-05-29 4:01 ` Cong Wang 2013-05-29 4:41 ` Stephen Hemminger 2013-05-29 5:14 ` Cong Wang 2013-05-29 8:39 ` Cong Wang 2013-05-31 2:55 ` Cong Wang 2013-05-31 3:56 ` Stephen Hemminger 2013-05-31 4:12 ` Cong Wang
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.