All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] nf_flowtable: teardown fix race condition
@ 2022-05-09  9:31 Sven Auhagen
  2022-05-09 13:13 ` Oz Shlomo
  2022-05-09 16:47 ` Florian Westphal
  0 siblings, 2 replies; 5+ messages in thread
From: Sven Auhagen @ 2022-05-09  9:31 UTC (permalink / raw)
  To: netfilter-devel; +Cc: pablo, nbd, fw, paulb, ozsh

The nf flowtable teardown forces a tcp state into established state
with the corresponding timeout and is in a race condition with
the conntrack code.
This might happen even though the state is already in a CLOSE or
FIN WAIT state and about to be closed.
In order to process the correct state, a TCP connection needs to be
set to established in the flowtable software and hardware case.
Also this is a bit optimistic as we actually do not check for the
3 way handshake ACK at this point, we do not really have a choice.

This is also fixing a race condition between the ct gc code
and the flowtable teardown where the ct might already be removed
when the flowtable teardown code runs.

Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 87a7388b6c89..898ea2fc833e 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -5,6 +5,7 @@
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
 #include <linux/netdevice.h>
+#include <linux/spinlock.h>
 #include <net/ip.h>
 #include <net/ip6_route.h>
 #include <net/netfilter/nf_tables.h>
@@ -171,30 +172,32 @@ int flow_offload_route_init(struct flow_offload *flow,
 }
 EXPORT_SYMBOL_GPL(flow_offload_route_init);
 
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
-{
-	tcp->state = TCP_CONNTRACK_ESTABLISHED;
-	tcp->seen[0].td_maxwin = 0;
-	tcp->seen[1].td_maxwin = 0;
-}
 
-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 	int l4num = nf_ct_protonum(ct);
 	s32 timeout;
 
+	spin_lock_bh(&ct->lock);
+
 	if (l4num == IPPROTO_TCP) {
-		struct nf_tcp_net *tn = nf_tcp_pernet(net);
+		ct->proto.tcp.seen[0].td_maxwin = 0;
+		ct->proto.tcp.seen[1].td_maxwin = 0;
 
-		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
-		timeout -= tn->offload_timeout;
+		if (nf_conntrack_tcp_established(ct)) {
+			struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+			timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+			timeout -= tn->offload_timeout;
+		}
 	} else if (l4num == IPPROTO_UDP) {
 		struct nf_udp_net *tn = nf_udp_pernet(net);
 
 		timeout = tn->timeouts[UDP_CT_REPLIED];
 		timeout -= tn->offload_timeout;
 	} else {
+		spin_unlock_bh(&ct->lock);
 		return;
 	}
 
@@ -203,18 +206,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 
 	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
 		ct->timeout = nfct_time_stamp + timeout;
-}
 
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-{
-	if (nf_ct_protonum(ct) == IPPROTO_TCP)
-		flow_offload_fixup_tcp(&ct->proto.tcp);
-}
-
-static void flow_offload_fixup_ct(struct nf_conn *ct)
-{
-	flow_offload_fixup_ct_state(ct);
-	flow_offload_fixup_ct_timeout(ct);
+	spin_unlock_bh(&ct->lock);
 }
 
 static void flow_offload_route_release(struct flow_offload *flow)
@@ -354,12 +347,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
 			       nf_flow_offload_rhash_params);
 
-	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
+	flow_offload_fixup_ct(flow->ct);
 
-	if (nf_flow_has_expired(flow))
-		flow_offload_fixup_ct(flow->ct);
-	else
-		flow_offload_fixup_ct_timeout(flow->ct);
+	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
 
 	flow_offload_free(flow);
 }
@@ -367,8 +357,6 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 void flow_offload_teardown(struct flow_offload *flow)
 {
 	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
-
-	flow_offload_fixup_ct_state(flow->ct);
 }
 EXPORT_SYMBOL_GPL(flow_offload_teardown);
 
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 889cf88d3dba..990128cb7a61 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -10,6 +10,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
+#include <linux/spinlock.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
@@ -34,6 +35,13 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
 		return -1;
 	}
 
+	if (unlikely(!test_bit(IPS_ASSURED_BIT, &flow->ct->status))) {
+		spin_lock_bh(&flow->ct->lock);
+		flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
+		spin_unlock_bh(&flow->ct->lock);
+		set_bit(IPS_ASSURED_BIT, &flow->ct->status);
+	}
+
 	return 0;
 }
 
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index b561e0a44a45..63bf1579e75f 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -5,6 +5,7 @@
 #include <linux/rhashtable.h>
 #include <linux/netdevice.h>
 #include <linux/tc_act/tc_csum.h>
+#include <linux/spinlock.h>
 #include <net/flow_offload.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_tables.h>
@@ -953,11 +954,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
 static void flow_offload_work_handler(struct work_struct *work)
 {
 	struct flow_offload_work *offload;
+	struct flow_offload_tuple *tuple;
+	struct flow_offload *flow;
 
 	offload = container_of(work, struct flow_offload_work, work);
 	switch (offload->cmd) {
 		case FLOW_CLS_REPLACE:
 			flow_offload_work_add(offload);
+			/* Set the TCP connection to established or teardown does not work */
+			flow = offload->flow;
+			tuple = &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple;
+			if (tuple->l4proto == IPPROTO_TCP && !test_bit(IPS_ASSURED_BIT, &flow->ct->status)) {
+				spin_lock_bh(&flow->ct->lock);
+				flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
+				spin_unlock_bh(&flow->ct->lock);
+				set_bit(IPS_ASSURED_BIT, &flow->ct->status);
+			}
 			break;
 		case FLOW_CLS_DESTROY:
 			flow_offload_work_del(offload);
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] nf_flowtable: teardown fix race condition
  2022-05-09  9:31 [PATCH] nf_flowtable: teardown fix race condition Sven Auhagen
@ 2022-05-09 13:13 ` Oz Shlomo
  2022-05-09 13:16   ` Sven Auhagen
  2022-05-09 16:47 ` Florian Westphal
  1 sibling, 1 reply; 5+ messages in thread
From: Oz Shlomo @ 2022-05-09 13:13 UTC (permalink / raw)
  To: Sven Auhagen, netfilter-devel; +Cc: pablo, nbd, fw, paulb

Hi Sven,

It seems to me like the issue should be resolved from nft side rather 
than the flowtable side.

On 5/9/2022 12:31 PM, Sven Auhagen wrote:
> The nf flowtable teardown forces a tcp state into established state
> with the corresponding timeout and is in a race condition with
> the conntrack code.
> This might happen even though the state is already in a CLOSE or
> FIN WAIT state and about to be closed.
> In order to process the correct state, a TCP connection needs to be
> set to established in the flowtable software and hardware case.
> Also this is a bit optimistic as we actually do not check for the
> 3 way handshake ACK at this point, we do not really have a choice.
> 
> This is also fixing a race condition between the ct gc code
> and the flowtable teardown where the ct might already be removed
> when the flowtable teardown code runs >
> Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>
> 
> diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> index 87a7388b6c89..898ea2fc833e 100644
> --- a/net/netfilter/nf_flow_table_core.c
> +++ b/net/netfilter/nf_flow_table_core.c
> @@ -5,6 +5,7 @@
>   #include <linux/netfilter.h>
>   #include <linux/rhashtable.h>
>   #include <linux/netdevice.h>
> +#include <linux/spinlock.h>
>   #include <net/ip.h>
>   #include <net/ip6_route.h>
>   #include <net/netfilter/nf_tables.h>
> @@ -171,30 +172,32 @@ int flow_offload_route_init(struct flow_offload *flow,
>   }
>   EXPORT_SYMBOL_GPL(flow_offload_route_init);
>   
> -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> -{
> -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> -	tcp->seen[0].td_maxwin = 0;
> -	tcp->seen[1].td_maxwin = 0;
> -}
>   
> -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> +static void flow_offload_fixup_ct(struct nf_conn *ct)
>   {
>   	struct net *net = nf_ct_net(ct);
>   	int l4num = nf_ct_protonum(ct);
>   	s32 timeout;
>   
> +	spin_lock_bh(&ct->lock);
> +
>   	if (l4num == IPPROTO_TCP) {
> -		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> +		ct->proto.tcp.seen[0].td_maxwin = 0;
> +		ct->proto.tcp.seen[1].td_maxwin = 0;
>   
> -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> -		timeout -= tn->offload_timeout;
> +		if (nf_conntrack_tcp_established(ct)) {
> +			struct nf_tcp_net *tn = nf_tcp_pernet(net);
> +
> +			timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> +			timeout -= tn->offload_timeout;
> +		}
>   	} else if (l4num == IPPROTO_UDP) {
>   		struct nf_udp_net *tn = nf_udp_pernet(net);
>   
>   		timeout = tn->timeouts[UDP_CT_REPLIED];
>   		timeout -= tn->offload_timeout;
>   	} else {
> +		spin_unlock_bh(&ct->lock);
>   		return;
>   	}
>   
> @@ -203,18 +206,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
>   
>   	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
>   		ct->timeout = nfct_time_stamp + timeout;
> -}
>   
> -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> -{
> -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> -		flow_offload_fixup_tcp(&ct->proto.tcp);
> -}
> -
> -static void flow_offload_fixup_ct(struct nf_conn *ct)
> -{
> -	flow_offload_fixup_ct_state(ct);
> -	flow_offload_fixup_ct_timeout(ct);
> +	spin_unlock_bh(&ct->lock);
>   }
>   
>   static void flow_offload_route_release(struct flow_offload *flow)
> @@ -354,12 +347,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
>   			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
>   			       nf_flow_offload_rhash_params);
>   
> -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> +	flow_offload_fixup_ct(flow->ct);
>   
> -	if (nf_flow_has_expired(flow))
> -		flow_offload_fixup_ct(flow->ct);
> -	else
> -		flow_offload_fixup_ct_timeout(flow->ct);
> +	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
>   
>   	flow_offload_free(flow);
>   }
> @@ -367,8 +357,6 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
>   void flow_offload_teardown(struct flow_offload *flow)
>   {
>   	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> -
> -	flow_offload_fixup_ct_state(flow->ct);
>   }
>   EXPORT_SYMBOL_GPL(flow_offload_teardown);
>   
> diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
> index 889cf88d3dba..990128cb7a61 100644
> --- a/net/netfilter/nf_flow_table_ip.c
> +++ b/net/netfilter/nf_flow_table_ip.c
> @@ -10,6 +10,7 @@
>   #include <linux/if_ether.h>
>   #include <linux/if_pppox.h>
>   #include <linux/ppp_defs.h>
> +#include <linux/spinlock.h>
>   #include <net/ip.h>
>   #include <net/ipv6.h>
>   #include <net/ip6_route.h>
> @@ -34,6 +35,13 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
>   		return -1;
>   	}
>   
> +	if (unlikely(!test_bit(IPS_ASSURED_BIT, &flow->ct->status))) {
> +		spin_lock_bh(&flow->ct->lock);
> +		flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> +		spin_unlock_bh(&flow->ct->lock);
> +		set_bit(IPS_ASSURED_BIT, &flow->ct->status);
> +	}
> +
>   	return 0;
>   }
>   
> diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
> index b561e0a44a45..63bf1579e75f 100644
> --- a/net/netfilter/nf_flow_table_offload.c
> +++ b/net/netfilter/nf_flow_table_offload.c
> @@ -5,6 +5,7 @@
>   #include <linux/rhashtable.h>
>   #include <linux/netdevice.h>
>   #include <linux/tc_act/tc_csum.h>
> +#include <linux/spinlock.h>
>   #include <net/flow_offload.h>
>   #include <net/netfilter/nf_flow_table.h>
>   #include <net/netfilter/nf_tables.h>
> @@ -953,11 +954,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
>   static void flow_offload_work_handler(struct work_struct *work)
>   {
>   	struct flow_offload_work *offload;
> +	struct flow_offload_tuple *tuple;
> +	struct flow_offload *flow;
>   
>   	offload = container_of(work, struct flow_offload_work, work);
>   	switch (offload->cmd) {
>   		case FLOW_CLS_REPLACE:
>   			flow_offload_work_add(offload);
> +			/* Set the TCP connection to established or teardown does not work */
> +			flow = offload->flow;
> +			tuple = &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple;
> +			if (tuple->l4proto == IPPROTO_TCP && !test_bit(IPS_ASSURED_BIT, &flow->ct->status)) {
> +				spin_lock_bh(&flow->ct->lock);
> +				flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> +				spin_unlock_bh(&flow->ct->lock);
> +				set_bit(IPS_ASSURED_BIT, &flow->ct->status);
> +			}

Hmm, this looks like a workaround.
Also note that this code is called only when the flowtable 
NF_FLOWTABLE_HW_OFFLOAD bit is set.

>   			break;
>   		case FLOW_CLS_DESTROY:
>   			flow_offload_work_del(offload);

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] nf_flowtable: teardown fix race condition
  2022-05-09 13:13 ` Oz Shlomo
@ 2022-05-09 13:16   ` Sven Auhagen
  0 siblings, 0 replies; 5+ messages in thread
From: Sven Auhagen @ 2022-05-09 13:16 UTC (permalink / raw)
  To: Oz Shlomo; +Cc: netfilter-devel, pablo, nbd, fw, paulb

Hi Oz,

On Mon, May 09, 2022 at 04:13:19PM +0300, Oz Shlomo wrote:
> Hi Sven,
> 
> It seems to me like the issue should be resolved from nft side rather than
> the flowtable side.

this is not possible to be resolved on the nftables side only.

> 
> On 5/9/2022 12:31 PM, Sven Auhagen wrote:
> > The nf flowtable teardown forces a tcp state into established state
> > with the corresponding timeout and is in a race condition with
> > the conntrack code.
> > This might happen even though the state is already in a CLOSE or
> > FIN WAIT state and about to be closed.
> > In order to process the correct state, a TCP connection needs to be
> > set to established in the flowtable software and hardware case.
> > Also this is a bit optimistic as we actually do not check for the
> > 3 way handshake ACK at this point, we do not really have a choice.
> > 
> > This is also fixing a race condition between the ct gc code
> > and the flowtable teardown where the ct might already be removed
> > when the flowtable teardown code runs >
> > Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>
> > 
> > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> > index 87a7388b6c89..898ea2fc833e 100644
> > --- a/net/netfilter/nf_flow_table_core.c
> > +++ b/net/netfilter/nf_flow_table_core.c
> > @@ -5,6 +5,7 @@
> >   #include <linux/netfilter.h>
> >   #include <linux/rhashtable.h>
> >   #include <linux/netdevice.h>
> > +#include <linux/spinlock.h>
> >   #include <net/ip.h>
> >   #include <net/ip6_route.h>
> >   #include <net/netfilter/nf_tables.h>
> > @@ -171,30 +172,32 @@ int flow_offload_route_init(struct flow_offload *flow,
> >   }
> >   EXPORT_SYMBOL_GPL(flow_offload_route_init);
> > -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> > -{
> > -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> > -	tcp->seen[0].td_maxwin = 0;
> > -	tcp->seen[1].td_maxwin = 0;
> > -}
> > -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > +static void flow_offload_fixup_ct(struct nf_conn *ct)
> >   {
> >   	struct net *net = nf_ct_net(ct);
> >   	int l4num = nf_ct_protonum(ct);
> >   	s32 timeout;
> > +	spin_lock_bh(&ct->lock);
> > +
> >   	if (l4num == IPPROTO_TCP) {
> > -		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > +		ct->proto.tcp.seen[0].td_maxwin = 0;
> > +		ct->proto.tcp.seen[1].td_maxwin = 0;
> > -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > -		timeout -= tn->offload_timeout;
> > +		if (nf_conntrack_tcp_established(ct)) {
> > +			struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > +
> > +			timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > +			timeout -= tn->offload_timeout;
> > +		}
> >   	} else if (l4num == IPPROTO_UDP) {
> >   		struct nf_udp_net *tn = nf_udp_pernet(net);
> >   		timeout = tn->timeouts[UDP_CT_REPLIED];
> >   		timeout -= tn->offload_timeout;
> >   	} else {
> > +		spin_unlock_bh(&ct->lock);
> >   		return;
> >   	}
> > @@ -203,18 +206,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> >   	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
> >   		ct->timeout = nfct_time_stamp + timeout;
> > -}
> > -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> > -{
> > -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> > -		flow_offload_fixup_tcp(&ct->proto.tcp);
> > -}
> > -
> > -static void flow_offload_fixup_ct(struct nf_conn *ct)
> > -{
> > -	flow_offload_fixup_ct_state(ct);
> > -	flow_offload_fixup_ct_timeout(ct);
> > +	spin_unlock_bh(&ct->lock);
> >   }
> >   static void flow_offload_route_release(struct flow_offload *flow)
> > @@ -354,12 +347,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> >   			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
> >   			       nf_flow_offload_rhash_params);
> > -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> > +	flow_offload_fixup_ct(flow->ct);
> > -	if (nf_flow_has_expired(flow))
> > -		flow_offload_fixup_ct(flow->ct);
> > -	else
> > -		flow_offload_fixup_ct_timeout(flow->ct);
> > +	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> >   	flow_offload_free(flow);
> >   }
> > @@ -367,8 +357,6 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> >   void flow_offload_teardown(struct flow_offload *flow)
> >   {
> >   	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > -
> > -	flow_offload_fixup_ct_state(flow->ct);
> >   }
> >   EXPORT_SYMBOL_GPL(flow_offload_teardown);
> > diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
> > index 889cf88d3dba..990128cb7a61 100644
> > --- a/net/netfilter/nf_flow_table_ip.c
> > +++ b/net/netfilter/nf_flow_table_ip.c
> > @@ -10,6 +10,7 @@
> >   #include <linux/if_ether.h>
> >   #include <linux/if_pppox.h>
> >   #include <linux/ppp_defs.h>
> > +#include <linux/spinlock.h>
> >   #include <net/ip.h>
> >   #include <net/ipv6.h>
> >   #include <net/ip6_route.h>
> > @@ -34,6 +35,13 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
> >   		return -1;
> >   	}
> > +	if (unlikely(!test_bit(IPS_ASSURED_BIT, &flow->ct->status))) {
> > +		spin_lock_bh(&flow->ct->lock);
> > +		flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> > +		spin_unlock_bh(&flow->ct->lock);
> > +		set_bit(IPS_ASSURED_BIT, &flow->ct->status);
> > +	}
> > +
> >   	return 0;
> >   }
> > diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
> > index b561e0a44a45..63bf1579e75f 100644
> > --- a/net/netfilter/nf_flow_table_offload.c
> > +++ b/net/netfilter/nf_flow_table_offload.c
> > @@ -5,6 +5,7 @@
> >   #include <linux/rhashtable.h>
> >   #include <linux/netdevice.h>
> >   #include <linux/tc_act/tc_csum.h>
> > +#include <linux/spinlock.h>
> >   #include <net/flow_offload.h>
> >   #include <net/netfilter/nf_flow_table.h>
> >   #include <net/netfilter/nf_tables.h>
> > @@ -953,11 +954,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
> >   static void flow_offload_work_handler(struct work_struct *work)
> >   {
> >   	struct flow_offload_work *offload;
> > +	struct flow_offload_tuple *tuple;
> > +	struct flow_offload *flow;
> >   	offload = container_of(work, struct flow_offload_work, work);
> >   	switch (offload->cmd) {
> >   		case FLOW_CLS_REPLACE:
> >   			flow_offload_work_add(offload);
> > +			/* Set the TCP connection to established or teardown does not work */
> > +			flow = offload->flow;
> > +			tuple = &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple;
> > +			if (tuple->l4proto == IPPROTO_TCP && !test_bit(IPS_ASSURED_BIT, &flow->ct->status)) {
> > +				spin_lock_bh(&flow->ct->lock);
> > +				flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> > +				spin_unlock_bh(&flow->ct->lock);
> > +				set_bit(IPS_ASSURED_BIT, &flow->ct->status);
> > +			}
> 
> Hmm, this looks like a workaround.
> Also note that this code is called only when the flowtable
> NF_FLOWTABLE_HW_OFFLOAD bit is set.
> 

Yes, but as explained in my previous email, we need to set the TCP state to
established otherwise we have no chance of fixing up the TCP state in
flow_offload_del or we just do not know what has happened to the TCP state
between the time it was offloaded and it is know beeing processed by nftables
before the flowtable gc runs.



> >   			break;
> >   		case FLOW_CLS_DESTROY:
> >   			flow_offload_work_del(offload);

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] nf_flowtable: teardown fix race condition
  2022-05-09  9:31 [PATCH] nf_flowtable: teardown fix race condition Sven Auhagen
  2022-05-09 13:13 ` Oz Shlomo
@ 2022-05-09 16:47 ` Florian Westphal
  2022-05-09 17:11   ` Sven Auhagen
  1 sibling, 1 reply; 5+ messages in thread
From: Florian Westphal @ 2022-05-09 16:47 UTC (permalink / raw)
  To: Sven Auhagen; +Cc: netfilter-devel, pablo, nbd, fw, paulb, ozsh

Sven Auhagen <Sven.Auhagen@voleatech.de> wrote:
> +	if (unlikely(!test_bit(IPS_ASSURED_BIT, &flow->ct->status))) {
> +		spin_lock_bh(&flow->ct->lock);
> +		flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> +		spin_unlock_bh(&flow->ct->lock);
> +		set_bit(IPS_ASSURED_BIT, &flow->ct->status);

Uh. Whats going on here?  ASSURED bit prevents early-eviction,
it should not be set at random.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] nf_flowtable: teardown fix race condition
  2022-05-09 16:47 ` Florian Westphal
@ 2022-05-09 17:11   ` Sven Auhagen
  0 siblings, 0 replies; 5+ messages in thread
From: Sven Auhagen @ 2022-05-09 17:11 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel, pablo, nbd, paulb, ozsh

On Mon, May 09, 2022 at 06:47:33PM +0200, Florian Westphal wrote:
> Sven Auhagen <Sven.Auhagen@voleatech.de> wrote:
> > +	if (unlikely(!test_bit(IPS_ASSURED_BIT, &flow->ct->status))) {
> > +		spin_lock_bh(&flow->ct->lock);
> > +		flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> > +		spin_unlock_bh(&flow->ct->lock);
> > +		set_bit(IPS_ASSURED_BIT, &flow->ct->status);
> 
> Uh. Whats going on here?  ASSURED bit prevents early-eviction,
> it should not be set at random.

It is not set at random but when the TCP state is set to established.
The problem with the flow offload code at the moment is that it
is setting the TCP state to established on flow teardown disregarding
the current TCP state which might be CLOSED or FIN WAIT and therefore
creating a lot of long living dead state entries.

I need the tcp state to be ESTABLISHED at this point to distinguish
the right cases at flow teardown, because the TCP state at
flow creation is SYN_RECV and it will most likely stay like that
during offload.
It can transition to a different state though if the flow offload code
bumps up a packet to the nftables slow path in case of a processing
error or after flow teardown and before flow delete, because there is
a race condition at the moment.

After talking to Oz today, he rightfully mentioned that the offload
should not be allocated if the TCP state is not established to avoid
the hack here.

I will send a v2 with that implementation.

Best
Sven


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-05-09 17:11 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-09  9:31 [PATCH] nf_flowtable: teardown fix race condition Sven Auhagen
2022-05-09 13:13 ` Oz Shlomo
2022-05-09 13:16   ` Sven Auhagen
2022-05-09 16:47 ` Florian Westphal
2022-05-09 17:11   ` Sven Auhagen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.