All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
@ 2015-04-01 15:07 Hannes Frederic Sowa
  2015-04-01 18:40 ` David Miller
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Hannes Frederic Sowa @ 2015-04-01 15:07 UTC (permalink / raw)
  To: netdev; +Cc: hannes, Jiri Pirko

From: "hannes@stressinduktion.org" <hannes@stressinduktion.org>

We should not consult skb->sk for output decisions in xmit recursion
levels > 0 in the stack. Otherwise local socket settings could influence
the result of e.g. tunnel encapsulation process.

ipv6 does not conform with this in three places:

1) ip6_fragment: we do consult ipv6_npinfo for frag_size

2) sk_mc_loop in ipv6 uses skb->sk and checks if we should
   loop the packet back to the local socket

3) ip6_skb_dst_mtu could query the settings from the user socket and
   force a wrong MTU

Furthermore:
In sk_mc_loop we could potentially land in WARN_ON(1) if we use a
PF_PACKET socket ontop of an IPv6-backed vxlan device.

Reuse xmit_recursion as we are currently only interested in protecting
tunnel devices.

Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---

Jiri tried to fix this bug by passing down the tunnel sock pointer.
Currently there should be no difference in behaviour with this patch.

http://patchwork.ozlabs.org/patch/380404/
http://patchwork.ozlabs.org/patch/380406/
http://patchwork.ozlabs.org/patch/380405/

In case we do need more specific fragmentation setup semantics we would
need to go with Jiri's approach. Currently we don't care about sk_mc_loop
for kernel sockets, so it is easy to just shut them up. Other options
are safe as well.

Please review carefully!

 include/linux/netdevice.h |  6 ++++++
 include/net/ip.h          | 16 ----------------
 include/net/ip6_route.h   |  3 ++-
 include/net/sock.h        |  2 ++
 net/core/dev.c            |  4 +++-
 net/core/sock.c           | 19 +++++++++++++++++++
 net/ipv6/ip6_output.c     |  3 ++-
 7 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 967bb4c..1064075 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2176,6 +2176,12 @@ void netdev_freemem(struct net_device *dev);
 void synchronize_net(void);
 int init_dummy_netdev(struct net_device *dev);
 
+DECLARE_PER_CPU(int, xmit_recursion);
+static inline int dev_recursion_level(void)
+{
+	return this_cpu_read(xmit_recursion);
+}
+
 struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
diff --git a/include/net/ip.h b/include/net/ip.h
index d0808a3..69cd9cb 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -455,22 +455,6 @@ static __inline__ void inet_reset_saddr(struct sock *sk)
 
 #endif
 
-static inline int sk_mc_loop(struct sock *sk)
-{
-	if (!sk)
-		return 1;
-	switch (sk->sk_family) {
-	case AF_INET:
-		return inet_sk(sk)->mc_loop;
-#if IS_ENABLED(CONFIG_IPV6)
-	case AF_INET6:
-		return inet6_sk(sk)->mc_loop;
-#endif
-	}
-	WARN_ON(1);
-	return 1;
-}
-
 bool ip_call_ra_chain(struct sk_buff *skb);
 
 /*
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 1d09b46..eda131d 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -174,7 +174,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 {
-	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+				inet6_sk(skb->sk) : NULL;
 
 	return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ?
 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
diff --git a/include/net/sock.h b/include/net/sock.h
index 3f9b8ce..bd6f523 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1762,6 +1762,8 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
 
 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
+bool sk_mc_loop(struct sock *sk);
+
 static inline bool sk_can_gso(const struct sock *sk)
 {
 	return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
diff --git a/net/core/dev.c b/net/core/dev.c
index 65492b0..08c0df9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2849,7 +2849,9 @@ static void skb_update_prio(struct sk_buff *skb)
 #define skb_update_prio(skb)
 #endif
 
-static DEFINE_PER_CPU(int, xmit_recursion);
+DEFINE_PER_CPU(int, xmit_recursion);
+EXPORT_SYMBOL(xmit_recursion);
+
 #define RECURSION_LIMIT 10
 
 /**
diff --git a/net/core/sock.c b/net/core/sock.c
index 119ae46..654e38a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -653,6 +653,25 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 		sock_reset_flag(sk, bit);
 }
 
+bool sk_mc_loop(struct sock *sk)
+{
+	if (dev_recursion_level())
+		return false;
+	if (!sk)
+		return true;
+	switch (sk->sk_family) {
+	case AF_INET:
+		return inet_sk(sk)->mc_loop;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return inet6_sk(sk)->mc_loop;
+#endif
+	}
+	WARN_ON(1);
+	return true;
+}
+EXPORT_SYMBOL(sk_mc_loop);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 84c58da..654f245 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -542,7 +542,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
 	struct sk_buff *frag;
 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+				inet6_sk(skb->sk) : NULL;
 	struct ipv6hdr *tmp_hdr;
 	struct frag_hdr *fh;
 	unsigned int mtu, hlen, left, len;
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-01 15:07 [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack Hannes Frederic Sowa
@ 2015-04-01 18:40 ` David Miller
  2015-04-01 18:57   ` Hannes Frederic Sowa
  2015-04-02  0:06 ` Cong Wang
  2015-04-06 20:14 ` David Miller
  2 siblings, 1 reply; 15+ messages in thread
From: David Miller @ 2015-04-01 18:40 UTC (permalink / raw)
  To: hannes; +Cc: netdev, jiri

From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed,  1 Apr 2015 17:07:44 +0200

> From: "hannes@stressinduktion.org" <hannes@stressinduktion.org>
> 
> We should not consult skb->sk for output decisions in xmit recursion
> levels > 0 in the stack. Otherwise local socket settings could influence
> the result of e.g. tunnel encapsulation process.
> 
> ipv6 does not conform with this in three places:
> 
> 1) ip6_fragment: we do consult ipv6_npinfo for frag_size
> 
> 2) sk_mc_loop in ipv6 uses skb->sk and checks if we should
>    loop the packet back to the local socket
> 
> 3) ip6_skb_dst_mtu could query the settings from the user socket and
>    force a wrong MTU
> 
> Furthermore:
> In sk_mc_loop we could potentially land in WARN_ON(1) if we use a
> PF_PACKET socket ontop of an IPv6-backed vxlan device.
> 
> Reuse xmit_recursion as we are currently only interested in protecting
> tunnel devices.
> 
> Cc: Jiri Pirko <jiri@resnulli.us>
> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> ---
> 
> Jiri tried to fix this bug by passing down the tunnel sock pointer.
> Currently there should be no difference in behaviour with this patch.
> 
> http://patchwork.ozlabs.org/patch/380404/
> http://patchwork.ozlabs.org/patch/380406/
> http://patchwork.ozlabs.org/patch/380405/
> 
> In case we do need more specific fragmentation setup semantics we would
> need to go with Jiri's approach. Currently we don't care about sk_mc_loop
> for kernel sockets, so it is easy to just shut them up. Other options
> are safe as well.
> 
> Please review carefully!

As a short term solution I guess this is fine.

I'll let this sit for a day or two so others can review the change.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-01 18:40 ` David Miller
@ 2015-04-01 18:57   ` Hannes Frederic Sowa
  2015-04-01 19:26     ` David Miller
  0 siblings, 1 reply; 15+ messages in thread
From: Hannes Frederic Sowa @ 2015-04-01 18:57 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, jiri



On Wed, Apr 1, 2015, at 20:40, David Miller wrote:
> From: Hannes Frederic Sowa <hannes@stressinduktion.org>
> > In case we do need more specific fragmentation setup semantics we would
> > need to go with Jiri's approach. Currently we don't care about sk_mc_loop
> > for kernel sockets, so it is easy to just shut them up. Other options
> > are safe as well.
> > 
> > Please review carefully!
> 
> As a short term solution I guess this is fine.
> 
> I'll let this sit for a day or two so others can review the change.

Ok, thanks!

We seem to have the same problem with skb->ignore_df which we
conditionally set by user request but multiple layer (e.g. tunnels) do
evaluate this boolean during stack traversal. IPv4 seems to be impacted
here as well, but I have to do more research on that. Maybe the
semantics seem to be wanted?

Bye,
Hannes

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-01 18:57   ` Hannes Frederic Sowa
@ 2015-04-01 19:26     ` David Miller
  0 siblings, 0 replies; 15+ messages in thread
From: David Miller @ 2015-04-01 19:26 UTC (permalink / raw)
  To: hannes; +Cc: netdev, jiri

From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 01 Apr 2015 20:57:52 +0200

> We seem to have the same problem with skb->ignore_df which we
> conditionally set by user request but multiple layer (e.g. tunnels) do
> evaluate this boolean during stack traversal. IPv4 seems to be impacted
> here as well, but I have to do more research on that. Maybe the
> semantics seem to be wanted?

It's arguably a macro-level condition on the top-level packet when set.

So perhaps it's enough to just clear it when we encapsulate.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-01 15:07 [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack Hannes Frederic Sowa
  2015-04-01 18:40 ` David Miller
@ 2015-04-02  0:06 ` Cong Wang
  2015-04-02  0:27   ` Eric Dumazet
  2015-04-06 20:14 ` David Miller
  2 siblings, 1 reply; 15+ messages in thread
From: Cong Wang @ 2015-04-02  0:06 UTC (permalink / raw)
  To: Hannes Frederic Sowa; +Cc: netdev, Jiri Pirko

On Wed, Apr 1, 2015 at 8:07 AM, Hannes Frederic Sowa
<hannes@stressinduktion.org> wrote:
> From: "hannes@stressinduktion.org" <hannes@stressinduktion.org>
>
> We should not consult skb->sk for output decisions in xmit recursion
> levels > 0 in the stack. Otherwise local socket settings could influence
> the result of e.g. tunnel encapsulation process.
>
> ipv6 does not conform with this in three places:
>
> 1) ip6_fragment: we do consult ipv6_npinfo for frag_size
>
> 2) sk_mc_loop in ipv6 uses skb->sk and checks if we should
>    loop the packet back to the local socket
>
> 3) ip6_skb_dst_mtu could query the settings from the user socket and
>    force a wrong MTU
>
> Furthermore:
> In sk_mc_loop we could potentially land in WARN_ON(1) if we use a
> PF_PACKET socket ontop of an IPv6-backed vxlan device.
>
> Reuse xmit_recursion as we are currently only interested in protecting
> tunnel devices.
>

Shouldn't these skb's be orphaned for tunnel cases? Or we still have
to keep skb->sk for other valid use?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-02  0:06 ` Cong Wang
@ 2015-04-02  0:27   ` Eric Dumazet
  2015-04-02  0:35     ` Hannes Frederic Sowa
  2015-04-02  2:55     ` Cong Wang
  0 siblings, 2 replies; 15+ messages in thread
From: Eric Dumazet @ 2015-04-02  0:27 UTC (permalink / raw)
  To: Cong Wang; +Cc: Hannes Frederic Sowa, netdev, Jiri Pirko

On Wed, 2015-04-01 at 17:06 -0700, Cong Wang wrote:

> Shouldn't these skb's be orphaned for tunnel cases? Or we still have
> to keep skb->sk for other valid use?

skb should not be orphaned, until the very last stage.

Many layers depend on this, really.

Simply ask the question to yourself :

What if I do not associate skb to a socket at first. What possibly
breaks ?

orphaning skb just because they traverse a tunnel would be quite
horrible.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-02  0:27   ` Eric Dumazet
@ 2015-04-02  0:35     ` Hannes Frederic Sowa
  2015-04-02  0:48       ` Eric Dumazet
  2015-04-02  2:55     ` Cong Wang
  1 sibling, 1 reply; 15+ messages in thread
From: Hannes Frederic Sowa @ 2015-04-02  0:35 UTC (permalink / raw)
  To: Eric Dumazet, Cong Wang; +Cc: netdev, Jiri Pirko



On Thu, Apr 2, 2015, at 02:27, Eric Dumazet wrote:
> On Wed, 2015-04-01 at 17:06 -0700, Cong Wang wrote:
> 
> > Shouldn't these skb's be orphaned for tunnel cases? Or we still have
> > to keep skb->sk for other valid use?
> 
> skb should not be orphaned, until the very last stage.
> 
> Many layers depend on this, really.
> 
> Simply ask the question to yourself :
> 
> What if I do not associate skb to a socket at first. What possibly
> breaks ?
> 
> orphaning skb just because they traverse a tunnel would be quite
> horrible.

Agreed, but we have some bits in the skb->sk pointer left for signaling
we are only keeping it around for destructor and upper layer
notifications. Destructors should be the only ones having to deal with
skb->sk and they can mask the bit. That would touch a lot of NULL
checks, though.

Bye,
Hannes

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-02  0:35     ` Hannes Frederic Sowa
@ 2015-04-02  0:48       ` Eric Dumazet
  2015-04-02  0:56         ` Hannes Frederic Sowa
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Dumazet @ 2015-04-02  0:48 UTC (permalink / raw)
  To: Hannes Frederic Sowa; +Cc: Cong Wang, netdev, Jiri Pirko

On Thu, 2015-04-02 at 02:35 +0200, Hannes Frederic Sowa wrote:
> 
> On Thu, Apr 2, 2015, at 02:27, Eric Dumazet wrote:
> > On Wed, 2015-04-01 at 17:06 -0700, Cong Wang wrote:
> > 
> > > Shouldn't these skb's be orphaned for tunnel cases? Or we still have
> > > to keep skb->sk for other valid use?
> > 
> > skb should not be orphaned, until the very last stage.
> > 
> > Many layers depend on this, really.
> > 
> > Simply ask the question to yourself :
> > 
> > What if I do not associate skb to a socket at first. What possibly
> > breaks ?
> > 
> > orphaning skb just because they traverse a tunnel would be quite
> > horrible.
> 
> Agreed, but we have some bits in the skb->sk pointer left for signaling
> we are only keeping it around for destructor and upper layer
> notifications. Destructors should be the only ones having to deal with
> skb->sk and they can mask the bit. That would touch a lot of NULL
> checks, though.

Have you checked net/sched/sch_fq.c per chance ?

skb->sk is not an opaque value.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-02  0:48       ` Eric Dumazet
@ 2015-04-02  0:56         ` Hannes Frederic Sowa
  2015-04-02  1:53           ` Eric Dumazet
  0 siblings, 1 reply; 15+ messages in thread
From: Hannes Frederic Sowa @ 2015-04-02  0:56 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Cong Wang, netdev, Jiri Pirko



On Thu, Apr 2, 2015, at 02:48, Eric Dumazet wrote:
> On Thu, 2015-04-02 at 02:35 +0200, Hannes Frederic Sowa wrote:
> > On Thu, Apr 2, 2015, at 02:27, Eric Dumazet wrote: 
> > > orphaning skb just because they traverse a tunnel would be quite
> > > horrible.
> > 
> > Agreed, but we have some bits in the skb->sk pointer left for signaling
> > we are only keeping it around for destructor and upper layer
> > notifications. Destructors should be the only ones having to deal with
> > skb->sk and they can mask the bit. That would touch a lot of NULL
> > checks, though.
> 
> Have you checked net/sched/sch_fq.c per chance ?
> 
> skb->sk is not an opaque value.

Do you think that skb->sk access through skb->__sk & ~0x1ULL would slow
down the code too much?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-02  0:56         ` Hannes Frederic Sowa
@ 2015-04-02  1:53           ` Eric Dumazet
  2015-04-02 11:34             ` Hannes Frederic Sowa
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Dumazet @ 2015-04-02  1:53 UTC (permalink / raw)
  To: Hannes Frederic Sowa; +Cc: Cong Wang, netdev, Jiri Pirko

On Thu, 2015-04-02 at 02:56 +0200, Hannes Frederic Sowa wrote:
> 
> On Thu, Apr 2, 2015, at 02:48, Eric Dumazet wrote:
> > On Thu, 2015-04-02 at 02:35 +0200, Hannes Frederic Sowa wrote:
> > > On Thu, Apr 2, 2015, at 02:27, Eric Dumazet wrote: 
> > > > orphaning skb just because they traverse a tunnel would be quite
> > > > horrible.
> > > 
> > > Agreed, but we have some bits in the skb->sk pointer left for signaling
> > > we are only keeping it around for destructor and upper layer
> > > notifications. Destructors should be the only ones having to deal with
> > > skb->sk and they can mask the bit. That would touch a lot of NULL
> > > checks, though.
> > 
> > Have you checked net/sched/sch_fq.c per chance ?
> > 
> > skb->sk is not an opaque value.
> 
> Do you think that skb->sk access through skb->__sk & ~0x1ULL would slow
> down the code too much?

I was only replying to your claim "Destructors should be the only ones
having to deal with skb->sk", which looked wrong to me.

Adding bit masking would slow down the code, but we did this already for
skb->dst

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-02  0:27   ` Eric Dumazet
  2015-04-02  0:35     ` Hannes Frederic Sowa
@ 2015-04-02  2:55     ` Cong Wang
  2015-04-02 12:05       ` Hannes Frederic Sowa
  1 sibling, 1 reply; 15+ messages in thread
From: Cong Wang @ 2015-04-02  2:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Hannes Frederic Sowa, netdev, Jiri Pirko

On Wed, Apr 1, 2015 at 5:27 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2015-04-01 at 17:06 -0700, Cong Wang wrote:
>
>> Shouldn't these skb's be orphaned for tunnel cases? Or we still have
>> to keep skb->sk for other valid use?
>
> skb should not be orphaned, until the very last stage.
>
> Many layers depend on this, really.
>
> Simply ask the question to yourself :
>
> What if I do not associate skb to a socket at first. What possibly
> breaks ?
>
> orphaning skb just because they traverse a tunnel would be quite
> horrible.
>

I didn't make it clear, I meant to say "resetting the skb->sk to tunnel
socket". I know sch_fq uses skb->sk for flow classification, but
if skb->sk still points to the user-space socket, its meaning is already
changed after encapsulation, with regarding to this, all traffic goes
out of this tunnel should be one flow.

This also means ipv6 should not use any socket setting for routing
lookup, since after resetting skb->sk in encapsulation skb's belong
to one kernel socket which doesn't have any socket options set.

As I said, there might be still some useful information from their
original skb->sk, I of course never audit all the skb->sk usage
in kernel.

Essentially, carrying L4 information to L2 does make things complicated
when udp tunnels are involved.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-02  1:53           ` Eric Dumazet
@ 2015-04-02 11:34             ` Hannes Frederic Sowa
  2015-04-02 16:24               ` David Miller
  0 siblings, 1 reply; 15+ messages in thread
From: Hannes Frederic Sowa @ 2015-04-02 11:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Cong Wang, netdev, Jiri Pirko

On Thu, Apr 2, 2015, at 03:53, Eric Dumazet wrote:
> On Thu, 2015-04-02 at 02:56 +0200, Hannes Frederic Sowa wrote:
> > 
> > On Thu, Apr 2, 2015, at 02:48, Eric Dumazet wrote:
> > > On Thu, 2015-04-02 at 02:35 +0200, Hannes Frederic Sowa wrote:
> > > > On Thu, Apr 2, 2015, at 02:27, Eric Dumazet wrote: 
> > > > > orphaning skb just because they traverse a tunnel would be quite
> > > > > horrible.
> > > > 
> > > > Agreed, but we have some bits in the skb->sk pointer left for signaling
> > > > we are only keeping it around for destructor and upper layer
> > > > notifications. Destructors should be the only ones having to deal with
> > > > skb->sk and they can mask the bit. That would touch a lot of NULL
> > > > checks, though.
> > > 
> > > Have you checked net/sched/sch_fq.c per chance ?
> > > 
> > > skb->sk is not an opaque value.
> > 
> > Do you think that skb->sk access through skb->__sk & ~0x1ULL would slow
> > down the code too much?
> 
> I was only replying to your claim "Destructors should be the only ones
> having to deal with skb->sk", which looked wrong to me.

You are correct, there might be even more places I need to use this. 

> Adding bit masking would slow down the code, but we did this already for
> skb->dst

Exactly.

My plan is that I actually would propose this patch for -net and stable
as it fixes the problem that currently we oops when running dhclient on
a vxlan interface. As soon as net gets merged into net-next I I would
propose this change if it works out smoothly and looks maintainable.

Thanks,
Hannes

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-02  2:55     ` Cong Wang
@ 2015-04-02 12:05       ` Hannes Frederic Sowa
  0 siblings, 0 replies; 15+ messages in thread
From: Hannes Frederic Sowa @ 2015-04-02 12:05 UTC (permalink / raw)
  To: Cong Wang, Eric Dumazet; +Cc: netdev, Jiri Pirko

On Thu, Apr 2, 2015, at 04:55, Cong Wang wrote:
> On Wed, Apr 1, 2015 at 5:27 PM, Eric Dumazet <eric.dumazet@gmail.com>
> wrote:
> > On Wed, 2015-04-01 at 17:06 -0700, Cong Wang wrote:
> >
> >> Shouldn't these skb's be orphaned for tunnel cases? Or we still have
> >> to keep skb->sk for other valid use?
> >
> > skb should not be orphaned, until the very last stage.
> >
> > Many layers depend on this, really.
> >
> > Simply ask the question to yourself :
> >
> > What if I do not associate skb to a socket at first. What possibly
> > breaks ?
> >
> > orphaning skb just because they traverse a tunnel would be quite
> > horrible.
> >
> 
> I didn't make it clear, I meant to say "resetting the skb->sk to tunnel
> socket". I know sch_fq uses skb->sk for flow classification, but
> if skb->sk still points to the user-space socket, its meaning is already
> changed after encapsulation, with regarding to this, all traffic goes
> out of this tunnel should be one flow.

We cannot block/stop a tunnel submitting more packets into the stack via
its socket. We have to close the feedback loop directly to user space.
If we need this tunnel socket available it has to be passed either as an
additional field in sk_buff or via arguments down the stack.
In case of forwarding this works without skb->sk because we mostly
always run to completion and submit the packet in one softirq call.

> This also means ipv6 should not use any socket setting for routing
> lookup, since after resetting skb->sk in encapsulation skb's belong
> to one kernel socket which doesn't have any socket options set.

Yes, this is right. But besides xfrm6 layer I am currently looking after
I haven't seen problematic code here.

Bye,
Hannes

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-02 11:34             ` Hannes Frederic Sowa
@ 2015-04-02 16:24               ` David Miller
  0 siblings, 0 replies; 15+ messages in thread
From: David Miller @ 2015-04-02 16:24 UTC (permalink / raw)
  To: hannes; +Cc: eric.dumazet, cwang, netdev, jiri

From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Thu, 02 Apr 2015 13:34:08 +0200

> My plan is that I actually would propose this patch for -net and stable
> as it fixes the problem that currently we oops when running dhclient on
> a vxlan interface. As soon as net gets merged into net-next I I would
> propose this change if it works out smoothly and looks maintainable.

Yes, I think this patch is good for -net and -stable, but longer term we
have to do something better.

The most infuriating thing about this mess is that passing the tunnel
socket down is only difficult because of that damn okfn argument to
nf_hook().

And that is _only_ needed for nf_queue reinjection, absolutely nothing
else in the world uses it.

I can't believe how much of a price we pay for that feature :-/

So I've been trying to brainstorm a way to get rid of the okfn argument,
or somehow make adjustments to it's signature more painless.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack
  2015-04-01 15:07 [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack Hannes Frederic Sowa
  2015-04-01 18:40 ` David Miller
  2015-04-02  0:06 ` Cong Wang
@ 2015-04-06 20:14 ` David Miller
  2 siblings, 0 replies; 15+ messages in thread
From: David Miller @ 2015-04-06 20:14 UTC (permalink / raw)
  To: hannes; +Cc: netdev, jiri

From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed,  1 Apr 2015 17:07:44 +0200

> From: "hannes@stressinduktion.org" <hannes@stressinduktion.org>
> 
> We should not consult skb->sk for output decisions in xmit recursion
> levels > 0 in the stack. Otherwise local socket settings could influence
> the result of e.g. tunnel encapsulation process.
> 
> ipv6 does not conform with this in three places:
> 
> 1) ip6_fragment: we do consult ipv6_npinfo for frag_size
> 
> 2) sk_mc_loop in ipv6 uses skb->sk and checks if we should
>    loop the packet back to the local socket
> 
> 3) ip6_skb_dst_mtu could query the settings from the user socket and
>    force a wrong MTU
> 
> Furthermore:
> In sk_mc_loop we could potentially land in WARN_ON(1) if we use a
> PF_PACKET socket ontop of an IPv6-backed vxlan device.
> 
> Reuse xmit_recursion as we are currently only interested in protecting
> tunnel devices.
> 
> Cc: Jiri Pirko <jiri@resnulli.us>
> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

So I'm applying this now for 'net' and queueing it up for -stable.

In net-next, we're going to pass the tunnel socket pointer down as per
the work I've been doing over the weekend.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2015-04-06 20:14 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-01 15:07 [PATCH net-next] ipv6: protect skb->sk accesses from recursive dereference inside the stack Hannes Frederic Sowa
2015-04-01 18:40 ` David Miller
2015-04-01 18:57   ` Hannes Frederic Sowa
2015-04-01 19:26     ` David Miller
2015-04-02  0:06 ` Cong Wang
2015-04-02  0:27   ` Eric Dumazet
2015-04-02  0:35     ` Hannes Frederic Sowa
2015-04-02  0:48       ` Eric Dumazet
2015-04-02  0:56         ` Hannes Frederic Sowa
2015-04-02  1:53           ` Eric Dumazet
2015-04-02 11:34             ` Hannes Frederic Sowa
2015-04-02 16:24               ` David Miller
2015-04-02  2:55     ` Cong Wang
2015-04-02 12:05       ` Hannes Frederic Sowa
2015-04-06 20:14 ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.