All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH nf] netfilter: nf_queue: be more careful with sk refcounts
@ 2022-02-28 16:29 Florian Westphal
  2022-02-28 23:30 ` Joe Stringer
  0 siblings, 1 reply; 5+ messages in thread
From: Florian Westphal @ 2022-02-28 16:29 UTC (permalink / raw)
  To: netfilter-devel; +Cc: Florian Westphal, Joe Stringer, Eric Dumazet

Eric Dumazet says:
  The sock_hold() side seems suspect, because there is no guarantee
  that sk_refcnt is not already 0.

Also, there is a way to wire up skb->sk in a way that skb doesn't hold
a reference on the socket, so we need to check for that as well.

For refcount-less skb->sk case, try to increment the reference count
and then override the destructor.

On failure, we cannot queue the packet and need to indicate an
error.  The packet will be dropped by the caller.

Cc: Joe Stringer <joe@cilium.io>
Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Eric, does that look sane to you?
 Joe, it would be nice if you could check the 'skb_sk_is_prefetched' part.

 Thanks!

 include/net/netfilter/nf_queue.h |  2 +-
 net/netfilter/nf_queue.c         | 25 +++++++++++++++++++++----
 net/netfilter/nfnetlink_queue.c  | 12 +++++++++---
 3 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 9eed51e920e8..980daa6e1e3a 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -37,7 +37,7 @@ void nf_register_queue_handler(const struct nf_queue_handler *qh);
 void nf_unregister_queue_handler(void);
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict);
 
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry);
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry);
 void nf_queue_entry_free(struct nf_queue_entry *entry);
 
 static inline void init_hashrandom(u32 *jhash_initval)
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 5ab0680db445..63d1516816b1 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -96,19 +96,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
 }
 
 /* Bump dev refs so they don't vanish while packet is out */
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
 {
 	struct nf_hook_state *state = &entry->state;
 
+	if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
+		return false;
+
 	dev_hold(state->in);
 	dev_hold(state->out);
-	if (state->sk)
-		sock_hold(state->sk);
 
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	dev_hold(entry->physin);
 	dev_hold(entry->physout);
 #endif
+	return true;
 }
 EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
 
@@ -178,6 +180,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 		break;
 	}
 
+	if (skb_sk_is_prefetched(skb)) {
+		struct sock *sk = skb->sk;
+
+		if (!sk_is_refcounted(sk)) {
+			if (!refcount_inc_not_zero(&sk->sk_refcnt))
+				return -ENOTCONN;
+
+			/* drop refcount on skb_orphan */
+			skb->destructor = sock_edemux;
+		}
+	}
+
 	entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
 	if (!entry)
 		return -ENOMEM;
@@ -196,7 +210,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 
 	__nf_queue_entry_init_physdevs(entry);
 
-	nf_queue_entry_get_refs(entry);
+	if (!nf_queue_entry_get_refs(entry)) {
+		kfree(entry);
+		return -ENOTCONN;
+	}
 
 	switch (entry->state.pf) {
 	case AF_INET:
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index ea2d9c2a44cf..64a6acb6aeae 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -710,9 +710,15 @@ static struct nf_queue_entry *
 nf_queue_entry_dup(struct nf_queue_entry *e)
 {
 	struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
-	if (entry)
-		nf_queue_entry_get_refs(entry);
-	return entry;
+
+	if (!entry)
+		return NULL;
+
+	if (nf_queue_entry_get_refs(entry))
+		return entry;
+
+	kfree(entry);
+	return NULL;
 }
 
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH nf] netfilter: nf_queue: be more careful with sk refcounts
  2022-02-28 16:29 [PATCH nf] netfilter: nf_queue: be more careful with sk refcounts Florian Westphal
@ 2022-02-28 23:30 ` Joe Stringer
  2022-02-28 23:41   ` Florian Westphal
  0 siblings, 1 reply; 5+ messages in thread
From: Joe Stringer @ 2022-02-28 23:30 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel, Joe Stringer, Eric Dumazet

On Mon, Feb 28, 2022 at 8:29 AM Florian Westphal <fw@strlen.de> wrote:
>
> Eric Dumazet says:
>   The sock_hold() side seems suspect, because there is no guarantee
>   that sk_refcnt is not already 0.
>
> Also, there is a way to wire up skb->sk in a way that skb doesn't hold
> a reference on the socket, so we need to check for that as well.
>
> For refcount-less skb->sk case, try to increment the reference count
> and then override the destructor.
>
> On failure, we cannot queue the packet and need to indicate an
> error.  The packet will be dropped by the caller.
>
> Cc: Joe Stringer <joe@cilium.io>
> Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.")

Hi Florian, thanks for the fix.

skb_sk_is_prefetched() was introduced in commit cf7fbe660f2d ("bpf:
Add socket assign support"). You may want to split the hunk below into
a dedicated patch for this reason.

> Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  Eric, does that look sane to you?
>  Joe, it would be nice if you could check the 'skb_sk_is_prefetched' part.

<snip>

> @@ -178,6 +180,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
>                 break;
>         }
>
> +       if (skb_sk_is_prefetched(skb)) {
> +               struct sock *sk = skb->sk;
> +
> +               if (!sk_is_refcounted(sk)) {
> +                       if (!refcount_inc_not_zero(&sk->sk_refcnt))
> +                               return -ENOTCONN;
> +
> +                       /* drop refcount on skb_orphan */
> +                       skb->destructor = sock_edemux;
> +               }
> +       }
> +
>         entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
>         if (!entry)
>                 return -ENOMEM;

I've never heard of someone trying to use socket prefetch /
bpf_sk_assign in conjunction with nf_queue, bit of an unusual case.
Given that `skb_sk_is_prefetched()` relies on the skb->destructor
pointing towards sock_pfree, and this code would change that to
sock_edemux, the difference the patch would make is this: if the
packet is queued and then accepted, the socket prefetch selection
could be ignored. That said, it seems potentially dangerous to fail to
hold the socket reference during the nfqueue operation, so this
proposal is preferable to the alternative. There are possibly ways for
someone to still get such a use case to work even with this patch (eg
via iptables TPROXY target rules), or worst case they could iterate on
this code a little further to ensure this case works (eg by coming up
with a new free func that takes this case into account). More likely,
they would find an alternative solution to their problem that doesn't
involve combining these particular features in the same setup.

I looked closely at this hunk, I didn't look closely at the rest of
the patch. Assuming you split just this hunk into a dedicated patch,
you can add my Ack:

Acked-by: Joe Stringer <joe@cilium.io>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH nf] netfilter: nf_queue: be more careful with sk refcounts
  2022-02-28 23:30 ` Joe Stringer
@ 2022-02-28 23:41   ` Florian Westphal
  2022-03-01  1:14     ` Joe Stringer
  0 siblings, 1 reply; 5+ messages in thread
From: Florian Westphal @ 2022-02-28 23:41 UTC (permalink / raw)
  To: Joe Stringer; +Cc: Florian Westphal, netfilter-devel, Eric Dumazet

Joe Stringer <joe@cilium.io> wrote:
> On Mon, Feb 28, 2022 at 8:29 AM Florian Westphal <fw@strlen.de> wrote:
> >
> > Eric Dumazet says:
> >   The sock_hold() side seems suspect, because there is no guarantee
> >   that sk_refcnt is not already 0.
> >
> > Also, there is a way to wire up skb->sk in a way that skb doesn't hold
> > a reference on the socket, so we need to check for that as well.
> >
> > For refcount-less skb->sk case, try to increment the reference count
> > and then override the destructor.
> >
> > On failure, we cannot queue the packet and need to indicate an
> > error.  The packet will be dropped by the caller.
> >
> > Cc: Joe Stringer <joe@cilium.io>
> > Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.")
> 
> Hi Florian, thanks for the fix.
> 
> skb_sk_is_prefetched() was introduced in commit cf7fbe660f2d ("bpf:
> Add socket assign support"). You may want to split the hunk below into
> a dedicated patch for this reason.

Yes, I see, that helps with backports, will do.

> > +       if (skb_sk_is_prefetched(skb)) {
> > +               struct sock *sk = skb->sk;
> > +
> > +               if (!sk_is_refcounted(sk)) {
> > +                       if (!refcount_inc_not_zero(&sk->sk_refcnt))
> > +                               return -ENOTCONN;
> > +
> > +                       /* drop refcount on skb_orphan */
> > +                       skb->destructor = sock_edemux;
> > +               }
> > +       }
> > +
> >         entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
> >         if (!entry)
> >                 return -ENOMEM;
> 
> I've never heard of someone trying to use socket prefetch /
> bpf_sk_assign in conjunction with nf_queue, bit of an unusual case.

Me neither, but if someone does it, skb->sk leaves rcu protection.

> Given that `skb_sk_is_prefetched()` relies on the skb->destructor
> pointing towards sock_pfree, and this code would change that to
> sock_edemux, the difference the patch would make is this: if the
> packet is queued and then accepted, the socket prefetch selection
> could be ignored.

Hmmm, wait a second, is that because of orphan in input path, i.e.,
that this preselect has to work even across veth/netns crossing?

> I looked closely at this hunk, I didn't look closely at the rest of
> the patch. Assuming you split just this hunk into a dedicated patch,
> you can add my Ack:
> 
> Acked-by: Joe Stringer <joe@cilium.io>

That's what I'll do, thanks Joe!

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH nf] netfilter: nf_queue: be more careful with sk refcounts
  2022-02-28 23:41   ` Florian Westphal
@ 2022-03-01  1:14     ` Joe Stringer
  2022-03-01  1:36       ` Florian Westphal
  0 siblings, 1 reply; 5+ messages in thread
From: Joe Stringer @ 2022-03-01  1:14 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Joe Stringer, netfilter-devel, Eric Dumazet

On Mon, Feb 28, 2022 at 3:41 PM Florian Westphal <fw@strlen.de> wrote:
>
> Joe Stringer <joe@cilium.io> wrote:
> > On Mon, Feb 28, 2022 at 8:29 AM Florian Westphal <fw@strlen.de> wrote:
> > > +       if (skb_sk_is_prefetched(skb)) {
> > > +               struct sock *sk = skb->sk;
> > > +
> > > +               if (!sk_is_refcounted(sk)) {
> > > +                       if (!refcount_inc_not_zero(&sk->sk_refcnt))
> > > +                               return -ENOTCONN;
> > > +
> > > +                       /* drop refcount on skb_orphan */
> > > +                       skb->destructor = sock_edemux;
> > > +               }
> > > +       }
> > > +
> > >         entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
> > >         if (!entry)
> > >                 return -ENOMEM;
> >
> > I've never heard of someone trying to use socket prefetch /
> > bpf_sk_assign in conjunction with nf_queue, bit of an unusual case.
>
> Me neither, but if someone does it, skb->sk leaves rcu protection.
>
> > Given that `skb_sk_is_prefetched()` relies on the skb->destructor
> > pointing towards sock_pfree, and this code would change that to
> > sock_edemux, the difference the patch would make is this: if the
> > packet is queued and then accepted, the socket prefetch selection
> > could be ignored.
>
> Hmmm, wait a second, is that because of orphan in input path, i.e.,
> that this preselect has to work even across veth/netns crossing?

Yes it's linked to the orphan in input path which otherwise breaks the
feature inside the same netns. I'm not sure I follow the question
about veth/netns crossing as this feature can only be used on TC
ingress today. It just prefetches the socket as the packet is received
up the stack from a device.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH nf] netfilter: nf_queue: be more careful with sk refcounts
  2022-03-01  1:14     ` Joe Stringer
@ 2022-03-01  1:36       ` Florian Westphal
  0 siblings, 0 replies; 5+ messages in thread
From: Florian Westphal @ 2022-03-01  1:36 UTC (permalink / raw)
  To: Joe Stringer; +Cc: Florian Westphal, netfilter-devel, Eric Dumazet

Joe Stringer <joe@cilium.io> wrote:
> I'm not sure I follow the question
> about veth/netns crossing as this feature can only be used on TC
> ingress today.

Ah, never mind then.  Thanks for explaining!

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-03-01  1:36 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-28 16:29 [PATCH nf] netfilter: nf_queue: be more careful with sk refcounts Florian Westphal
2022-02-28 23:30 ` Joe Stringer
2022-02-28 23:41   ` Florian Westphal
2022-03-01  1:14     ` Joe Stringer
2022-03-01  1:36       ` Florian Westphal

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.