* [Patch net-next] vxlan: do real refcnt for vn_sock
@ 2013-05-28 11:07 Cong Wang
2013-05-28 15:22 ` Stephen Hemminger
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-28 11:07 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, David S. Miller, Cong Wang
From: Cong Wang <amwang@redhat.com>
In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
we use kfree_rcu() to free ->vn_sock, but a) there is no use
of RCU API to access this filed, b) RCU is not enough to do refcnt
here, because in vxlan_leave_group() we drop RTNL lock before
locking the socket, it could be possible that this field is
freed during this period.
So, instead making things complex, just do basic refcnt for
the ->vn_sock, like we do for others.
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 5ed64d4..73a674e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -83,9 +83,8 @@ static unsigned int vxlan_net_id;
/* per UDP socket information */
struct vxlan_sock {
struct hlist_node hlist;
- struct rcu_head rcu;
struct work_struct del_work;
- unsigned int refcnt;
+ atomic_t refcnt;
struct socket *sock;
struct hlist_head vni_list[VNI_HASH_SIZE];
};
@@ -164,14 +163,30 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port)
return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}
+static inline void vxlan_sock_get(struct vxlan_sock *vs)
+{
+ atomic_inc(&vs->refcnt);
+}
+
+static inline void vxlan_sock_put(struct vxlan_sock *vs)
+{
+ if (atomic_dec_and_test(&vs->refcnt)) {
+ hlist_del_rcu(&vs->hlist);
+ schedule_work(&vs->del_work);
+ }
+}
+
+
/* Find VXLAN socket based on network namespace and UDP port */
static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port)
{
struct vxlan_sock *vs;
hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
+ vxlan_sock_get(vs);
if (inet_sk(vs->sock->sk)->inet_sport == port)
return vs;
+ vxlan_sock_put(vs);
}
return NULL;
}
@@ -187,10 +202,13 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
return NULL;
hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
- if (vxlan->default_dst.remote_vni == id)
+ if (vxlan->default_dst.remote_vni == id) {
+ vxlan_sock_put(vs);
return vxlan;
+ }
}
+ vxlan_sock_put(vs);
return NULL;
}
@@ -926,7 +944,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
return false;
}
-static void vxlan_sock_put(struct sk_buff *skb)
+static void vxlan_sock_destructor(struct sk_buff *skb)
{
sock_put(skb->sk);
}
@@ -940,7 +958,7 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
skb_orphan(skb);
sock_hold(sk);
skb->sk = sk;
- skb->destructor = vxlan_sock_put;
+ skb->destructor = vxlan_sock_destructor;
}
/* Compute source port for outgoing packet
@@ -1255,10 +1273,14 @@ static int vxlan_open(struct net_device *dev)
struct vxlan_dev *vxlan = netdev_priv(dev);
int err;
+ vxlan_sock_get(vxlan->vn_sock);
+
if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
err = vxlan_join_group(dev);
- if (err)
+ if (err) {
+ vxlan_sock_put(vxlan->vn_sock);
return err;
+ }
}
if (vxlan->age_interval)
@@ -1294,6 +1316,8 @@ static int vxlan_stop(struct net_device *dev)
del_timer_sync(&vxlan->age_timer);
+ vxlan_sock_put(vxlan->vn_sock);
+
vxlan_flush(vxlan);
return 0;
@@ -1447,7 +1471,7 @@ static void vxlan_del_work(struct work_struct *work)
struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
sk_release_kernel(vs->sock->sk);
- kfree_rcu(vs, rcu);
+ kfree(vs);
}
/* Create new listen socket if needed */
@@ -1503,7 +1527,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
udp_encap_enable();
- vs->refcnt = 1;
+ atomic_set(&vs->refcnt, 1);
return vs;
}
@@ -1592,9 +1616,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
}
vs = vxlan_find_port(net, vxlan->dst_port);
- if (vs)
- ++vs->refcnt;
- else {
+ if (!vs) {
/* Drop lock because socket create acquires RTNL lock */
rtnl_unlock();
vs = vxlan_socket_create(net, vxlan->dst_port);
@@ -1610,12 +1632,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
err = register_netdevice(dev);
if (err) {
- if (--vs->refcnt == 0) {
- rtnl_unlock();
- sk_release_kernel(vs->sock->sk);
- kfree(vs);
- rtnl_lock();
- }
+ vxlan_sock_put(vs);
return err;
}
@@ -1634,10 +1651,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head)
list_del(&vxlan->next);
unregister_netdevice_queue(dev, head);
- if (--vs->refcnt == 0) {
- hlist_del_rcu(&vs->hlist);
- schedule_work(&vs->del_work);
- }
+ vxlan_sock_put(vs);
}
static size_t vxlan_get_size(const struct net_device *dev)
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang
@ 2013-05-28 15:22 ` Stephen Hemminger
2013-05-29 2:08 ` Cong Wang
2013-05-29 4:01 ` Cong Wang
2013-05-29 4:41 ` Stephen Hemminger
2 siblings, 1 reply; 12+ messages in thread
From: Stephen Hemminger @ 2013-05-28 15:22 UTC (permalink / raw)
To: Cong Wang; +Cc: netdev, David S. Miller
On Tue, 28 May 2013 19:07:22 +0800
Cong Wang <amwang@redhat.com> wrote:
> From: Cong Wang <amwang@redhat.com>
>
> In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> we use kfree_rcu() to free ->vn_sock, but a) there is no use
> of RCU API to access this filed, b) RCU is not enough to do refcnt
> here, because in vxlan_leave_group() we drop RTNL lock before
> locking the socket, it could be possible that this field is
> freed during this period.
>
> So, instead making things complex, just do basic refcnt for
> the ->vn_sock, like we do for others.
>
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Cc: David S. Miller <davem@davemloft.net>
> Signed-off-by: Cong Wang <amwang@redhat.com>
>
> ---
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 5ed64d4..73a674e 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -83,9 +83,8 @@ static unsigned int vxlan_net_id;
> /* per UDP socket information */
> struct vxlan_sock {
> struct hlist_node hlist;
> - struct rcu_head rcu;
> struct work_struct del_work;
> - unsigned int refcnt;
> + atomic_t refcnt;
> struct socket *sock;
> struct hlist_head vni_list[VNI_HASH_SIZE];
> };
> @@ -164,14 +163,30 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port)
> return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
> }
>
> +static inline void vxlan_sock_get(struct vxlan_sock *vs)
> +{
> + atomic_inc(&vs->refcnt);
> +}
> +
> +static inline void vxlan_sock_put(struct vxlan_sock *vs)
> +{
> + if (atomic_dec_and_test(&vs->refcnt)) {
> + hlist_del_rcu(&vs->hlist);
> + schedule_work(&vs->del_work);
> + }
> +}
> +
> +
> /* Find VXLAN socket based on network namespace and UDP port */
> static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port)
> {
> struct vxlan_sock *vs;
>
> hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
> + vxlan_sock_get(vs);
> if (inet_sk(vs->sock->sk)->inet_sport == port)
> return vs;
> + vxlan_sock_put(vs);
> }
> return NULL;
> }
> @@ -187,10 +202,13 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
> return NULL;
>
> hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
> - if (vxlan->default_dst.remote_vni == id)
> + if (vxlan->default_dst.remote_vni == id) {
> + vxlan_sock_put(vs);
> return vxlan;
> + }
> }
>
> + vxlan_sock_put(vs);
> return NULL;
> }
>
> @@ -926,7 +944,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
> return false;
> }
>
> -static void vxlan_sock_put(struct sk_buff *skb)
> +static void vxlan_sock_destructor(struct sk_buff *skb)
> {
> sock_put(skb->sk);
> }
> @@ -940,7 +958,7 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
> skb_orphan(skb);
> sock_hold(sk);
> skb->sk = sk;
> - skb->destructor = vxlan_sock_put;
> + skb->destructor = vxlan_sock_destructor;
> }
>
> /* Compute source port for outgoing packet
> @@ -1255,10 +1273,14 @@ static int vxlan_open(struct net_device *dev)
> struct vxlan_dev *vxlan = netdev_priv(dev);
> int err;
>
> + vxlan_sock_get(vxlan->vn_sock);
> +
> if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
> err = vxlan_join_group(dev);
> - if (err)
> + if (err) {
> + vxlan_sock_put(vxlan->vn_sock);
> return err;
> + }
> }
>
> if (vxlan->age_interval)
> @@ -1294,6 +1316,8 @@ static int vxlan_stop(struct net_device *dev)
>
> del_timer_sync(&vxlan->age_timer);
>
> + vxlan_sock_put(vxlan->vn_sock);
> +
> vxlan_flush(vxlan);
>
> return 0;
> @@ -1447,7 +1471,7 @@ static void vxlan_del_work(struct work_struct *work)
> struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
>
> sk_release_kernel(vs->sock->sk);
> - kfree_rcu(vs, rcu);
> + kfree(vs);
> }
>
> /* Create new listen socket if needed */
> @@ -1503,7 +1527,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
> udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
> udp_encap_enable();
>
> - vs->refcnt = 1;
> + atomic_set(&vs->refcnt, 1);
> return vs;
> }
>
> @@ -1592,9 +1616,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
> }
>
> vs = vxlan_find_port(net, vxlan->dst_port);
> - if (vs)
> - ++vs->refcnt;
> - else {
> + if (!vs) {
> /* Drop lock because socket create acquires RTNL lock */
> rtnl_unlock();
> vs = vxlan_socket_create(net, vxlan->dst_port);
> @@ -1610,12 +1632,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
>
> err = register_netdevice(dev);
> if (err) {
> - if (--vs->refcnt == 0) {
> - rtnl_unlock();
> - sk_release_kernel(vs->sock->sk);
> - kfree(vs);
> - rtnl_lock();
> - }
> + vxlan_sock_put(vs);
> return err;
> }
>
> @@ -1634,10 +1651,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head)
> list_del(&vxlan->next);
> unregister_netdevice_queue(dev, head);
>
> - if (--vs->refcnt == 0) {
> - hlist_del_rcu(&vs->hlist);
> - schedule_work(&vs->del_work);
> - }
> + vxlan_sock_put(vs);
> }
>
> static size_t vxlan_get_size(const struct net_device *dev)
Not needed all access is under RTNL
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-28 15:22 ` Stephen Hemminger
@ 2013-05-29 2:08 ` Cong Wang
2013-05-29 4:22 ` Stephen Hemminger
0 siblings, 1 reply; 12+ messages in thread
From: Cong Wang @ 2013-05-29 2:08 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev, David S. Miller
On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote:
> On Tue, 28 May 2013 19:07:22 +0800
> Cong Wang <amwang@redhat.com> wrote:
>
> > From: Cong Wang <amwang@redhat.com>
> >
> > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> > we use kfree_rcu() to free ->vn_sock, but a) there is no use
> > of RCU API to access this filed, b) RCU is not enough to do refcnt
> > here, because in vxlan_leave_group() we drop RTNL lock before
> > locking the socket, it could be possible that this field is
> > freed during this period.
> >
> > So, instead making things complex, just do basic refcnt for
> > the ->vn_sock, like we do for others.
> >
...
>
> Not needed all access is under RTNL
I know, this is why I had a patch (not posted) which adds the missing
rtnl_dereference(), but even if we had these, it is still not correct.
As I explained in the changelog, vxlan_leave_group() has a problem,
because it releases rtnl lock before locking the socket, _and_ it is
called after vxlan_dellink() which schedules a work to cleanup the
struct. Therefore the ->vn_sock could be freed right after rtnl lock is
released.
Am I miss anything?
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang
2013-05-28 15:22 ` Stephen Hemminger
@ 2013-05-29 4:01 ` Cong Wang
2013-05-29 4:41 ` Stephen Hemminger
2 siblings, 0 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-29 4:01 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, David S. Miller
On Tue, 2013-05-28 at 19:07 +0800, Cong Wang wrote:
> From: Cong Wang <amwang@redhat.com>
>
> In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> we use kfree_rcu() to free ->vn_sock, but a) there is no use
> of RCU API to access this filed, b) RCU is not enough to do refcnt
> here, because in vxlan_leave_group() we drop RTNL lock before
> locking the socket, it could be possible that this field is
> freed during this period.
>
> So, instead making things complex, just do basic refcnt for
> the ->vn_sock, like we do for others.
BTW, this patch fixes a real crash when I test vxlan over IPv6, although
I can't reproduce it with IPv4.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-29 2:08 ` Cong Wang
@ 2013-05-29 4:22 ` Stephen Hemminger
2013-05-29 4:34 ` Cong Wang
0 siblings, 1 reply; 12+ messages in thread
From: Stephen Hemminger @ 2013-05-29 4:22 UTC (permalink / raw)
To: Cong Wang; +Cc: netdev, David S. Miller
On Wed, 29 May 2013 10:08:53 +0800
Cong Wang <amwang@redhat.com> wrote:
> On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote:
> > On Tue, 28 May 2013 19:07:22 +0800
> > Cong Wang <amwang@redhat.com> wrote:
> >
> > > From: Cong Wang <amwang@redhat.com>
> > >
> > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> > > we use kfree_rcu() to free ->vn_sock, but a) there is no use
> > > of RCU API to access this filed, b) RCU is not enough to do refcnt
> > > here, because in vxlan_leave_group() we drop RTNL lock before
> > > locking the socket, it could be possible that this field is
> > > freed during this period.
> > >
> > > So, instead making things complex, just do basic refcnt for
> > > the ->vn_sock, like we do for others.
> > >
> ...
> >
> > Not needed all access is under RTNL
>
> I know, this is why I had a patch (not posted) which adds the missing
> rtnl_dereference(), but even if we had these, it is still not correct.
>
> As I explained in the changelog, vxlan_leave_group() has a problem,
> because it releases rtnl lock before locking the socket, _and_ it is
> called after vxlan_dellink() which schedules a work to cleanup the
> struct. Therefore the ->vn_sock could be freed right after rtnl lock is
> released.
>
> Am I miss anything?
Ignoring your IPv6 code for now...
With IPV4:
refcnt is incremented when socket is incremented in newlink (RTNL held).
refcnt is decremented in by dellink (RTNL held) and socket is deleted from list
leave_group doesn't happen until work queue is fired.
rtnl_dereference is fine, but hardly necessary when the call hierarchy is so obvious.
The problem you describe won't be fixed by just converting it to atomic,
I think you need add a dev_hold()/dev_put to vxlan_stop to prevent
device from being deleted when rtnl_lock is dropped.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-29 4:22 ` Stephen Hemminger
@ 2013-05-29 4:34 ` Cong Wang
0 siblings, 0 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-29 4:34 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev, David S. Miller
On Tue, 2013-05-28 at 21:22 -0700, Stephen Hemminger wrote:
> On Wed, 29 May 2013 10:08:53 +0800
> Cong Wang <amwang@redhat.com> wrote:
>
> > On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote:
> > > On Tue, 28 May 2013 19:07:22 +0800
> > > Cong Wang <amwang@redhat.com> wrote:
> > >
> > > > From: Cong Wang <amwang@redhat.com>
> > > >
> > > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports),
> > > > we use kfree_rcu() to free ->vn_sock, but a) there is no use
> > > > of RCU API to access this filed, b) RCU is not enough to do refcnt
> > > > here, because in vxlan_leave_group() we drop RTNL lock before
> > > > locking the socket, it could be possible that this field is
> > > > freed during this period.
> > > >
> > > > So, instead making things complex, just do basic refcnt for
> > > > the ->vn_sock, like we do for others.
> > > >
> > ...
> > >
> > > Not needed all access is under RTNL
> >
> > I know, this is why I had a patch (not posted) which adds the missing
> > rtnl_dereference(), but even if we had these, it is still not correct.
> >
> > As I explained in the changelog, vxlan_leave_group() has a problem,
> > because it releases rtnl lock before locking the socket, _and_ it is
> > called after vxlan_dellink() which schedules a work to cleanup the
> > struct. Therefore the ->vn_sock could be freed right after rtnl lock is
> > released.
> >
> > Am I miss anything?
>
> Ignoring your IPv6 code for now...
>
> With IPV4:
> refcnt is incremented when socket is incremented in newlink (RTNL held).
> refcnt is decremented in by dellink (RTNL held) and socket is deleted from list
> leave_group doesn't happen until work queue is fired.
>
> rtnl_dereference is fine, but hardly necessary when the call hierarchy is so obvious.
>
> The problem you describe won't be fixed by just converting it to atomic,
My patch is _not_ just converting it to atomic_t, but it takes a ref for
every usage of ->vn_sock, which current implementation misses.
> I think you need add a dev_hold()/dev_put to vxlan_stop to prevent
> device from being deleted when rtnl_lock is dropped.
>
The crash is that lock_sock() got a NULL-def bug, which is not the
related with dev_hold() at all. I think it is due to the whole ->vn_sock
is freed before calling lock_sock(), thus vxlan->vn_sock->sock->sk
points to a freed memory area.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang
2013-05-28 15:22 ` Stephen Hemminger
2013-05-29 4:01 ` Cong Wang
@ 2013-05-29 4:41 ` Stephen Hemminger
2013-05-29 5:14 ` Cong Wang
2013-05-29 8:39 ` Cong Wang
2 siblings, 2 replies; 12+ messages in thread
From: Stephen Hemminger @ 2013-05-29 4:41 UTC (permalink / raw)
To: Cong Wang; +Cc: netdev, David S. Miller
Why not just fix the requirement to drop rtnl when calling igmp.
The code comes out cleaner and safer as well.
The follow COMPILE TESTED ONLY is a starting point...
--- a/drivers/net/vxlan.c 2013-05-20 10:24:19.624427346 -0700
+++ b/drivers/net/vxlan.c 2013-05-28 21:39:57.502351889 -0700
@@ -657,20 +657,12 @@ static int vxlan_join_group(struct net_d
.imr_multiaddr.s_addr = vxlan->default_dst.remote_ip,
.imr_ifindex = vxlan->default_dst.remote_ifindex,
};
- int err;
/* Already a member of group */
if (vxlan_group_used(vn, vxlan))
return 0;
- /* Need to drop RTNL to call multicast join */
- rtnl_unlock();
- lock_sock(sk);
- err = ip_mc_join_group(sk, &mreq);
- release_sock(sk);
- rtnl_lock();
-
- return err;
+ return _ip_mc_join_group(sk, &mreq);
}
@@ -679,7 +671,6 @@ static int vxlan_leave_group(struct net_
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
- int err = 0;
struct sock *sk = vxlan->vn_sock->sock->sk;
struct ip_mreqn mreq = {
.imr_multiaddr.s_addr = vxlan->default_dst.remote_ip,
@@ -690,14 +681,7 @@ static int vxlan_leave_group(struct net_
if (vxlan_group_used(vn, vxlan))
return 0;
- /* Need to drop RTNL to call multicast leave */
- rtnl_unlock();
- lock_sock(sk);
- err = ip_mc_leave_group(sk, &mreq);
- release_sock(sk);
- rtnl_lock();
-
- return err;
+ return _ip_mc_leave_group(sk, &mreq);
}
/* Callback from net/ipv4/udp.c to receive packets */
@@ -1244,6 +1228,7 @@ static int vxlan_open(struct net_device
struct vxlan_dev *vxlan = netdev_priv(dev);
int err;
+ dev_hold(dev);
if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
err = vxlan_join_group(dev);
if (err)
@@ -1252,6 +1237,7 @@ static int vxlan_open(struct net_device
if (vxlan->age_interval)
mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+ dev_put(dev);
return 0;
}
--- a/include/linux/igmp.h 2013-05-11 15:58:53.692325370 -0700
+++ b/include/linux/igmp.h 2013-05-28 21:37:40.560328674 -0700
@@ -109,7 +109,9 @@ struct ip_mc_list {
extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u16 proto);
extern int igmp_rcv(struct sk_buff *);
+extern int _ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr);
extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr);
+extern int _ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr);
extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr);
extern void ip_mc_drop_socket(struct sock *sk);
extern int ip_mc_source(int add, int omode, struct sock *sk,
--- a/net/ipv4/igmp.c 2013-05-11 15:58:53.992318551 -0700
+++ b/net/ipv4/igmp.c 2013-05-28 21:39:43.302554701 -0700
@@ -1781,48 +1781,36 @@ static void ip_mc_clear_src(struct ip_mc
pmc->sfcount[MCAST_EXCLUDE] = 1;
}
-
-/*
- * Join a multicast group
- */
-int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
+int _ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
{
- int err;
__be32 addr = imr->imr_multiaddr.s_addr;
- struct ip_mc_socklist *iml = NULL, *i;
+ struct ip_mc_socklist *iml, *i;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
int ifindex;
int count = 0;
- if (!ipv4_is_multicast(addr))
- return -EINVAL;
-
- rtnl_lock();
+ ASSERT_RTNL();
in_dev = ip_mc_find_dev(net, imr);
+ if (!in_dev)
+ return -ENODEV;
- if (!in_dev) {
- iml = NULL;
- err = -ENODEV;
- goto done;
- }
-
- err = -EADDRINUSE;
ifindex = imr->imr_ifindex;
for_each_pmc_rtnl(inet, i) {
if (i->multi.imr_multiaddr.s_addr == addr &&
i->multi.imr_ifindex == ifindex)
- goto done;
+ return -EADDRINUSE;
count++;
}
- err = -ENOBUFS;
+
if (count >= sysctl_igmp_max_memberships)
- goto done;
+ return -ENOBUFS;
+
iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
if (iml == NULL)
- goto done;
+ return -ENOMEM;
memcpy(&iml->multi, imr, sizeof(*imr));
iml->next_rcu = inet->mc_list;
@@ -1830,8 +1818,24 @@ int ip_mc_join_group(struct sock *sk , s
iml->sfmode = MCAST_EXCLUDE;
rcu_assign_pointer(inet->mc_list, iml);
ip_mc_inc_group(in_dev, addr);
- err = 0;
-done:
+
+ return 0;
+}
+EXPORT_SYMBOL(_ip_mc_join_group);
+
+/*
+ * Join a multicast group
+ */
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ int err;
+ __be32 addr = imr->imr_multiaddr.s_addr;
+
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+
+ rtnl_lock();
+ err = _ip_mc_join_group(sk, imr);
rtnl_unlock();
return err;
}
@@ -1857,11 +1861,7 @@ static int ip_mc_leave_src(struct sock *
return err;
}
-/*
- * Ask a socket to leave a group.
- */
-
-int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+int _ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *iml;
@@ -1870,9 +1870,9 @@ int ip_mc_leave_group(struct sock *sk, s
struct net *net = sock_net(sk);
__be32 group = imr->imr_multiaddr.s_addr;
u32 ifindex;
- int ret = -EADDRNOTAVAIL;
- rtnl_lock();
+ ASSERT_RTNL();
+
in_dev = ip_mc_find_dev(net, imr);
ifindex = imr->imr_ifindex;
for (imlp = &inet->mc_list;
@@ -1880,6 +1880,7 @@ int ip_mc_leave_group(struct sock *sk, s
imlp = &iml->next_rcu) {
if (iml->multi.imr_multiaddr.s_addr != group)
continue;
+
if (ifindex) {
if (iml->multi.imr_ifindex != ifindex)
continue;
@@ -1887,20 +1888,32 @@ int ip_mc_leave_group(struct sock *sk, s
iml->multi.imr_address.s_addr)
continue;
- (void) ip_mc_leave_src(sk, iml, in_dev);
+ ip_mc_leave_src(sk, iml, in_dev);
*imlp = iml->next_rcu;
if (in_dev)
ip_mc_dec_group(in_dev, group);
- rtnl_unlock();
+
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
kfree_rcu(iml, rcu);
return 0;
}
- if (!in_dev)
- ret = -ENODEV;
+
+ return !in_dev ? -ENODEV : -EADDRNOTAVAIL;
+}
+EXPORT_SYMBOL(_ip_mc_leave_group);
+
+/*
+ * Ask a socket to leave a group.
+ */
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ int ret;
+
+ rtnl_lock();
+ ret = _ip_mc_leave_group(sk, imr);
rtnl_unlock();
return ret;
}
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-29 4:41 ` Stephen Hemminger
@ 2013-05-29 5:14 ` Cong Wang
2013-05-29 8:39 ` Cong Wang
1 sibling, 0 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-29 5:14 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev, David S. Miller
On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> Why not just fix the requirement to drop rtnl when calling igmp.
> The code comes out cleaner and safer as well.
>
This could probably work too. Please submit it as a normal patch?
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-29 4:41 ` Stephen Hemminger
2013-05-29 5:14 ` Cong Wang
@ 2013-05-29 8:39 ` Cong Wang
2013-05-31 2:55 ` Cong Wang
1 sibling, 1 reply; 12+ messages in thread
From: Cong Wang @ 2013-05-29 8:39 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev, David S. Miller
On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> Why not just fix the requirement to drop rtnl when calling igmp.
> The code comes out cleaner and safer as well.
Besides you forget to lock the socket before calling _ip_mc_join_group()
(and also the order is very important too), your patch doesn't fix the
problem I met. The full backtrace is below:
[ 114.134123] BUG: unable to handle kernel NULL pointer dereference at
0000000000000068
[ 114.136065] IP: [<ffffffff810a1061>] __lock_acquire+0x9c/0x45d
[ 114.136065] PGD 71721067 PUD 70e11067 PMD 0
[ 114.136065] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
[ 114.136065] CPU: 0 PID: 707 Comm: ip Not tainted 3.10.0-rc2+ #1075
[ 114.136065] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 114.136065] task: ffff88006fce2480 ti: ffff88006d9ec000 task.ti:
ffff88006d9ec000
[ 114.136065] RIP: 0010:[<ffffffff810a1061>] [<ffffffff810a1061>]
__lock_acquire+0x9c/0x45d
[ 114.136065] RSP: 0018:ffff88006d9ed6a8 EFLAGS: 00010046
[ 114.136065] RAX: 0000000000000068 RBX: 0000000000000000 RCX:
0000000000000000
[ 114.136065] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000068
[ 114.136065] RBP: ffff88006d9ed708 R08: 0000000000000002 R09:
0000000000000000
[ 114.136065] R10: ffffffff8104f0bb R11: ffffffff8107632b R12:
ffff88006fce2480
[ 114.136065] R13: 0000000000000000 R14: 0000000000000002 R15:
0000000000000000
[ 114.136065] FS: 00007fc30cd89740(0000) GS:ffff88007f600000(0000)
knlGS:0000000000000000
[ 114.136065] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 114.136065] CR2: 0000000000000068 CR3: 000000006f3a2000 CR4:
00000000000006f0
[ 114.136065] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[ 114.136065] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[ 114.136065] Stack:
[ 114.136065] ffff88006d9ed6b8 ffffffff00000000 ffff880000000000
000000006fce2480
[ 114.136065] ffffffff00000000 0000000000000068 ffffffff825063d2
0000000000000000
[ 114.136065] ffff88006fce2480 ffffffff81757759 0000000000000002
0000000000000000
[ 114.136065] Call Trace:
[ 114.136065] [<ffffffff81757759>] ? lock_sock_nested+0x3c/0x97
[ 114.136065] [<ffffffff810a2162>] lock_acquire+0xfa/0x140
[ 114.136065] [<ffffffff81757759>] ? lock_sock_nested+0x3c/0x97
[ 114.136065] [<ffffffff810a06d4>] ? trace_softirqs_off+0x3b/0xf3
[ 114.136065] [<ffffffff81976a1f>] _raw_spin_lock_bh+0x4a/0x7d
[ 114.136065] [<ffffffff81757759>] ? lock_sock_nested+0x3c/0x97
[ 114.136065] [<ffffffff810798db>] ? should_resched+0x9/0x28
[ 114.136065] [<ffffffff81757759>] lock_sock_nested+0x3c/0x97
[ 114.136065] [<ffffffff815e13a1>] vxlan_leave_group+0xc4/0x111
[ 114.136065] [<ffffffff81054dfc>] ? local_bh_enable_ip+0xe/0x10
[ 114.136065] [<ffffffff815e142d>] vxlan_stop+0x3f/0xaa
[ 114.136065] [<ffffffff817659ff>] __dev_close_many+0x9c/0xc4
[ 114.136065] [<ffffffff81765af3>] dev_close_many+0x70/0xd8
[ 114.136065] [<ffffffff817697f6>] rollback_registered_many+0xa2/0x1a7
[ 114.136065] [<ffffffff81769a4e>] unregister_netdevice_many+0x19/0x51
[ 114.136065] [<ffffffff817738b5>] rtnl_dellink+0xd0/0xfb
[ 114.136065] [<ffffffff81058754>] ? ns_capable+0x4d/0x66
[ 114.136065] [<ffffffff81774aaf>] rtnetlink_rcv_msg+0x19c/0x1ab
[ 114.136065] [<ffffffff81774913>] ? __rtnl_unlock+0x17/0x17
[ 114.136065] [<ffffffff817bc7d8>] netlink_rcv_skb+0x42/0x8d
[ 114.136065] [<ffffffff817748f5>] rtnetlink_rcv+0x26/0x2d
[ 114.136065] [<ffffffff817bb001>] netlink_unicast+0xb7/0x138
[ 114.136065] [<ffffffff817bba6b>] netlink_sendmsg+0x2b8/0x2f2
[ 114.136065] [<ffffffff81752e3e>] sock_sendmsg+0x7f/0xa0
[ 114.136065] [<ffffffff8111ee4d>] ? might_fault+0xa5/0xac
[ 114.136065] [<ffffffff8111ee04>] ? might_fault+0x5c/0xac
[ 114.136065] [<ffffffff817527ba>] ? move_addr_to_kernel+0x41/0x5a
[ 114.136065] [<ffffffff8175db77>] ? verify_iovec+0x5b/0xac
[ 114.136065] [<ffffffff81753064>] __sys_sendmsg+0x205/0x2a1
[ 114.136065] [<ffffffff8102c172>] ? __do_page_fault+0x2ee/0x38b
[ 114.136065] [<ffffffff8107507a>] ? up_read+0x29/0x2e
[ 114.136065] [<ffffffff8116794f>] ? fcheck_files+0xa3/0xe1
[ 114.136065] [<ffffffff81168915>] ? fget_light+0x3a/0xa4
[ 114.136065] [<ffffffff81753f91>] SyS_sendmsg+0x42/0x60
[ 114.136065] [<ffffffff819781c2>] system_call_fastpath+0x16/0x1b
[ 114.136065] Code: 00 00 83 3d c1 46 d7 01 00 0f 85 cb 03 00 00 48 c7
c1 1c ef d5 81 48 c7 c2 67 5f d5 81 be fb 0b 00 00 e9 1b 02 00 00 48 8b
45 c8 <48> 81 38 a0 f5 52 82 b8 01 00 00 00 44 0f 44 f0 83 fe 01 77 10
[ 114.136065] RIP [<ffffffff810a1061>] __lock_acquire+0x9c/0x45d
[ 114.136065] RSP <ffff88006d9ed6a8>
[ 114.136065] CR2: 0000000000000068
[ 114.136065] ---[ end trace 92078b41edbc404d ]---
[ 114.136065] Kernel panic - not syncing: Fatal exception in interrupt
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-29 8:39 ` Cong Wang
@ 2013-05-31 2:55 ` Cong Wang
2013-05-31 3:56 ` Stephen Hemminger
0 siblings, 1 reply; 12+ messages in thread
From: Cong Wang @ 2013-05-31 2:55 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev, David S. Miller
On Wed, 2013-05-29 at 16:39 +0800, Cong Wang wrote:
> On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> > Why not just fix the requirement to drop rtnl when calling igmp.
> > The code comes out cleaner and safer as well.
>
> Besides you forget to lock the socket before calling _ip_mc_join_group()
> (and also the order is very important too), your patch doesn't fix the
> problem I met. The full backtrace is below:
>
Hi, Stephen,
Do you think my patch is an easier solution? It at least fixes the
crash, while your patch doesn't. :)
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-31 2:55 ` Cong Wang
@ 2013-05-31 3:56 ` Stephen Hemminger
2013-05-31 4:12 ` Cong Wang
0 siblings, 1 reply; 12+ messages in thread
From: Stephen Hemminger @ 2013-05-31 3:56 UTC (permalink / raw)
To: Cong Wang; +Cc: netdev, David S. Miller
On Fri, 31 May 2013 10:55:45 +0800
Cong Wang <amwang@redhat.com> wrote:
> On Wed, 2013-05-29 at 16:39 +0800, Cong Wang wrote:
> > On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> > > Why not just fix the requirement to drop rtnl when calling igmp.
> > > The code comes out cleaner and safer as well.
> >
> > Besides you forget to lock the socket before calling _ip_mc_join_group()
> > (and also the order is very important too), your patch doesn't fix the
> > problem I met. The full backtrace is below:
> >
>
> Hi, Stephen,
>
> Do you think my patch is an easier solution? It at least fixes the
> crash, while your patch doesn't. :)
>
>
No. your patch doesn't fix the real problem.
The real fix is more complex because of how socket locking interacts with RTNL.
I have been busy with health stuff, so the going is slow.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Patch net-next] vxlan: do real refcnt for vn_sock
2013-05-31 3:56 ` Stephen Hemminger
@ 2013-05-31 4:12 ` Cong Wang
0 siblings, 0 replies; 12+ messages in thread
From: Cong Wang @ 2013-05-31 4:12 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev, David S. Miller
On Thu, 2013-05-30 at 20:56 -0700, Stephen Hemminger wrote:
> On Fri, 31 May 2013 10:55:45 +0800
> Cong Wang <amwang@redhat.com> wrote:
>
> > On Wed, 2013-05-29 at 16:39 +0800, Cong Wang wrote:
> > > On Tue, 2013-05-28 at 21:41 -0700, Stephen Hemminger wrote:
> > > > Why not just fix the requirement to drop rtnl when calling igmp.
> > > > The code comes out cleaner and safer as well.
> > >
> > > Besides you forget to lock the socket before calling _ip_mc_join_group()
> > > (and also the order is very important too), your patch doesn't fix the
> > > problem I met. The full backtrace is below:
> > >
> >
> > Hi, Stephen,
> >
> > Do you think my patch is an easier solution? It at least fixes the
> > crash, while your patch doesn't. :)
> >
> >
>
> No. your patch doesn't fix the real problem.
> The real fix is more complex because of how socket locking interacts with RTNL.
Yeah, I am thinking if we could replace the RTNL lock with a spinlock in
IPv4 multicast, as IPv6 multicast uses its own spin lock too. I don't
see any reason why inet->mc_list has to be protected by RTNL.
>
>
> I have been busy with health stuff, so the going is slow.
No problem, take care!
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2013-05-31 4:12 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-28 11:07 [Patch net-next] vxlan: do real refcnt for vn_sock Cong Wang
2013-05-28 15:22 ` Stephen Hemminger
2013-05-29 2:08 ` Cong Wang
2013-05-29 4:22 ` Stephen Hemminger
2013-05-29 4:34 ` Cong Wang
2013-05-29 4:01 ` Cong Wang
2013-05-29 4:41 ` Stephen Hemminger
2013-05-29 5:14 ` Cong Wang
2013-05-29 8:39 ` Cong Wang
2013-05-31 2:55 ` Cong Wang
2013-05-31 3:56 ` Stephen Hemminger
2013-05-31 4:12 ` Cong Wang
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.