netdev.vger.kernel.org archive mirror
* [PATCH net-next v6 0/3] act_ct: Software offload of conntrack_in
@ 2020-03-03 15:57 Paul Blakey
  2020-03-03 15:57 ` [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Paul Blakey @ 2020-03-03 15:57 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

This series adds software offload of connections with an established
ct state, using the NF flow table offload infrastructure. Once such
flows are offloaded, they will no longer pass through conntrack;
instead, act_ct will restore the conntrack info metadata on the skb
to the state it had at the time of the offload event - established.

act_ct maintains an FT instance per ct zone. A flow table entry is
created per ct connection when the connection enters an established
state, and deleted when the connection leaves that state. Once an
entry is created, the FT assumes ownership of the entry and manages
its aging.

On the datapath, the skb is first looked up in the zone's FT before
going into conntrack; if a matching flow is found, the conntrack info
metadata is restored on the skb and the call into conntrack is skipped.

Note that this patchset is part of the connection tracking offload feature.
A series adding hardware offload of connections with an established ct state
will follow this one.
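
For orientation, the control flow described above can be modeled with a
small userspace sketch (this is not the kernel code; zone_ft_lookup() and
conntrack_process() are invented names, and the per-zone FT is reduced to
a single cached entry). The first packets of a connection take the
conntrack path; once the connection is offloaded to the zone's FT,
subsequent packets hit the FT lookup and skip conntrack:

  #include <stdbool.h>
  #include <stdio.h>

  struct ct_info { bool established; };

  /* stand-in for the zone's flow table: at most one cached connection */
  static struct ct_info cached = { .established = true };
  static bool have_cached;

  static struct ct_info *zone_ft_lookup(void)
  {
          return have_cached ? &cached : NULL;
  }

  static struct ct_info conntrack_process(void)
  {
          /* full conntrack run; assume it reaches the established state */
          struct ct_info ci = { .established = true };

          have_cached = true;             /* patch 2: offload to the zone's FT */
          return ci;
  }

  static struct ct_info handle_packet(void)
  {
          struct ct_info *hit = zone_ft_lookup();  /* patch 3: FT lookup first */

          if (hit)
                  return *hit;            /* restore ct metadata, skip conntrack */
          return conntrack_process();     /* slow path until established */
  }

  int main(void)
  {
          int i;

          for (i = 0; i < 3; i++)
                  printf("pkt %d established=%d\n", i,
                         handle_packet().established);
          return 0;
  }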

Changelog:
   v1->v2:
     Removed now unused netfilter patches

Paul Blakey (3):
  net/sched: act_ct: Create nf flow table per zone
  net/sched: act_ct: Offload established connections to flow table
  net/sched: act_ct: Software offload of established flows

 include/net/tc_act/tc_ct.h |   2 +
 net/sched/Kconfig          |   2 +-
 net/sched/act_ct.c         | 345 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 347 insertions(+), 2 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-03 15:57 [PATCH net-next v6 0/3] act_ct: Software offload of conntrack_in Paul Blakey
@ 2020-03-03 15:57 ` Paul Blakey
  2020-03-03 16:51   ` Marcelo Ricardo Leitner
  2020-03-07 20:12   ` Eric Dumazet
  2020-03-03 15:57 ` [PATCH net-next v6 2/3] net/sched: act_ct: Offload established connections to flow table Paul Blakey
  2020-03-03 15:57 ` [PATCH net-next v6 3/3] net/sched: act_ct: Software offload of established flows Paul Blakey
  2 siblings, 2 replies; 14+ messages in thread
From: Paul Blakey @ 2020-03-03 15:57 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

Use the NF flow tables infrastructure for CT offload.

Create a nf flow table per zone.

Next patches will add FT entries to this table, and do
the software offload.
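
As a rough illustration of the lifecycle this patch adds, here is a
minimal userspace sketch of a refcounted get-or-create table keyed by
zone. It is only a model: a linked list stands in for the rhashtable,
and the locking, nf_flowtable init and deferred freeing done by the real
tcf_ct_flow_table_get()/_put() below are left out.

  #include <stdio.h>
  #include <stdlib.h>

  struct zone_ft {
          unsigned short zone;
          unsigned int ref;
          struct zone_ft *next;
  };

  static struct zone_ft *zones;

  static struct zone_ft *zone_ft_get(unsigned short zone)
  {
          struct zone_ft *ft;

          for (ft = zones; ft; ft = ft->next) {
                  if (ft->zone == zone) {         /* existing table: share it */
                          ft->ref++;
                          return ft;
                  }
          }

          ft = calloc(1, sizeof(*ft));            /* first user of this zone */
          if (!ft)
                  return NULL;
          ft->zone = zone;
          ft->ref = 1;
          ft->next = zones;
          zones = ft;
          return ft;
  }

  static void zone_ft_put(struct zone_ft *ft)
  {
          struct zone_ft **p;

          if (--ft->ref)                          /* still referenced */
                  return;

          for (p = &zones; *p; p = &(*p)->next) { /* last ref: unlink and free */
                  if (*p == ft) {
                          *p = ft->next;
                          break;
                  }
          }
          free(ft);
  }

  int main(void)
  {
          struct zone_ft *a = zone_ft_get(1);
          struct zone_ft *b = zone_ft_get(1);     /* same zone, table is shared */

          if (!a || !b)
                  return 1;
          printf("zone %u ref %u\n", a->zone, a->ref);    /* ref == 2 */
          zone_ft_put(b);
          zone_ft_put(a);                         /* last put frees the table */
          return 0;
  }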

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
---
  v4->v5:
    Added reviewed by Jiri, thanks!
  v3->v4:
    Alloc GFP_ATOMIC
  v2->v3:
    Ditch re-locking to alloc, and use atomic allocation
  v1->v2:
    Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
    Free ft on last tc act instance instead of last instance + last offloaded tuple,
    this removes cleanup cb and netfilter patches, and is simpler
    Removed accidental mlx5/core/en_tc.c change
    Removed reviewed by Jiri - patch changed

 include/net/tc_act/tc_ct.h |   2 +
 net/sched/Kconfig          |   2 +-
 net/sched/act_ct.c         | 134 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index a8b1564..cf3492e 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -25,6 +25,8 @@ struct tcf_ct_params {
 	u16 ct_action;
 
 	struct rcu_head rcu;
+
+	struct tcf_ct_flow_table *ct_ft;
 };
 
 struct tcf_ct {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index edde0e5..bfbefb7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY
 
 config NET_ACT_CT
 	tristate "connection tracking tc action"
-	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
+	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
 	help
 	  Say Y here to allow sending the packets to conntrack module.
 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index f685c0d..3321087 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -15,6 +15,7 @@
 #include <linux/pkt_cls.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/rhashtable.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
@@ -24,6 +25,7 @@
 #include <uapi/linux/tc_act/tc_ct.h>
 #include <net/tc_act/tc_ct.h>
 
+#include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_zones.h>
@@ -31,6 +33,108 @@
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #include <uapi/linux/netfilter/nf_nat.h>
 
+static struct workqueue_struct *act_ct_wq;
+static struct rhashtable zones_ht;
+static DEFINE_SPINLOCK(zones_lock);
+
+struct tcf_ct_flow_table {
+	struct rhash_head node; /* In zones tables */
+
+	struct rcu_work rwork;
+	struct nf_flowtable nf_ft;
+	u16 zone;
+	u32 ref;
+
+	bool dying;
+};
+
+static const struct rhashtable_params zones_params = {
+	.head_offset = offsetof(struct tcf_ct_flow_table, node),
+	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
+	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
+	.automatic_shrinking = true,
+};
+
+static struct nf_flowtable_type flowtable_ct = {
+	.owner		= THIS_MODULE,
+};
+
+static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
+{
+	struct tcf_ct_flow_table *ct_ft;
+	int err = -ENOMEM;
+
+	spin_lock_bh(&zones_lock);
+	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
+	if (ct_ft)
+		goto take_ref;
+
+	ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
+	if (!ct_ft)
+		goto err_alloc;
+
+	ct_ft->zone = params->zone;
+	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
+	if (err)
+		goto err_insert;
+
+	ct_ft->nf_ft.type = &flowtable_ct;
+	err = nf_flow_table_init(&ct_ft->nf_ft);
+	if (err)
+		goto err_init;
+
+	__module_get(THIS_MODULE);
+take_ref:
+	params->ct_ft = ct_ft;
+	ct_ft->ref++;
+	spin_unlock_bh(&zones_lock);
+
+	return 0;
+
+err_init:
+	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+err_insert:
+	kfree(ct_ft);
+err_alloc:
+	spin_unlock_bh(&zones_lock);
+	return err;
+}
+
+static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
+{
+	struct tcf_ct_flow_table *ct_ft;
+
+	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
+			     rwork);
+	nf_flow_table_free(&ct_ft->nf_ft);
+	kfree(ct_ft);
+
+	module_put(THIS_MODULE);
+}
+
+static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
+{
+	struct tcf_ct_flow_table *ct_ft = params->ct_ft;
+
+	spin_lock_bh(&zones_lock);
+	if (--params->ct_ft->ref == 0) {
+		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
+		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
+	}
+	spin_unlock_bh(&zones_lock);
+}
+
+static int tcf_ct_flow_tables_init(void)
+{
+	return rhashtable_init(&zones_ht, &zones_params);
+}
+
+static void tcf_ct_flow_tables_uninit(void)
+{
+	rhashtable_destroy(&zones_ht);
+}
+
 static struct tc_action_ops act_ct_ops;
 static unsigned int ct_net_id;
 
@@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head)
 	struct tcf_ct_params *params = container_of(head,
 						    struct tcf_ct_params, rcu);
 
+	tcf_ct_flow_table_put(params);
+
 	if (params->tmpl)
 		nf_conntrack_put(&params->tmpl->ct_general);
 	kfree(params);
@@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
 	if (err)
 		goto cleanup;
 
+	err = tcf_ct_flow_table_get(params);
+	if (err)
+		goto cleanup;
+
 	spin_lock_bh(&c->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	params = rcu_replace_pointer(c->params, params,
@@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list)
 
 static int __init ct_init_module(void)
 {
-	return tcf_register_action(&act_ct_ops, &ct_net_ops);
+	int err;
+
+	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
+	if (!act_ct_wq)
+		return -ENOMEM;
+
+	err = tcf_ct_flow_tables_init();
+	if (err)
+		goto err_tbl_init;
+
+	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
+	if (err)
+		goto err_register;
+
+	return 0;
+
+err_tbl_init:
+	destroy_workqueue(act_ct_wq);
+err_register:
+	tcf_ct_flow_tables_uninit();
+	return err;
 }
 
 static void __exit ct_cleanup_module(void)
 {
 	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+	tcf_ct_flow_tables_uninit();
+	destroy_workqueue(act_ct_wq);
 }
 
 module_init(ct_init_module);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH net-next v6 2/3] net/sched: act_ct: Offload established connections to flow table
  2020-03-03 15:57 [PATCH net-next v6 0/3] act_ct: Software offload of conntrack_in Paul Blakey
  2020-03-03 15:57 ` [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
@ 2020-03-03 15:57 ` Paul Blakey
  2020-03-03 16:51   ` Marcelo Ricardo Leitner
  2020-03-03 15:57 ` [PATCH net-next v6 3/3] net/sched: act_ct: Software offload of established flows Paul Blakey
  2 siblings, 1 reply; 14+ messages in thread
From: Paul Blakey @ 2020-03-03 15:57 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

Add an FT entry when a connection enters an established state, and
delete the entry when the connection leaves that state.

The flow table assumes ownership of the connection. In the following
patch act_ct will look up the ct state from the FT. In future patches,
drivers will register callbacks for FT add/del events and will be
able to use that information to offload the connections.

Note that connection aging is managed by the FT.
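
To make the offload criteria concrete, here is a small userspace sketch
of the same decision as a standalone predicate (the struct and the names
are invented for illustration; the real checks live in
tcf_ct_flow_table_process_conn()/tcf_ct_flow_table_add() below and
operate on nf_conn and ctinfo):

  #include <stdbool.h>
  #include <stdio.h>

  enum proto { PROTO_TCP, PROTO_UDP, PROTO_OTHER };

  struct conn {
          enum proto proto;
          bool established;       /* ctinfo is ESTABLISHED or ESTABLISHED_REPLY */
          bool tcp_established;   /* TCP state machine reached ESTABLISHED */
          bool has_helper;        /* conntrack helper attached (e.g. FTP) */
          bool seq_adjust;        /* NAT sequence adjustment in use */
  };

  static bool should_offload(const struct conn *c)
  {
          if (!c->established)
                  return false;
          if (c->proto == PROTO_OTHER)
                  return false;
          if (c->proto == PROTO_TCP && !c->tcp_established)
                  return false;
          return !c->has_helper && !c->seq_adjust;
  }

  int main(void)
  {
          struct conn tcp_est = { PROTO_TCP, true, true, false, false };
          struct conn ftp     = { PROTO_TCP, true, true, true,  false };

          printf("plain tcp: %d, with helper: %d\n",
                 should_offload(&tcp_est), should_offload(&ftp));
          return 0;
  }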

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 net/sched/act_ct.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 3321087..2ab38431 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -125,6 +125,67 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
 	spin_unlock_bh(&zones_lock);
 }
 
+static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
+				  struct nf_conn *ct,
+				  bool tcp)
+{
+	struct flow_offload *entry;
+	int err;
+
+	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+		return;
+
+	entry = flow_offload_alloc(ct);
+	if (!entry) {
+		WARN_ON_ONCE(1);
+		goto err_alloc;
+	}
+
+	if (tcp) {
+		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+	}
+
+	err = flow_offload_add(&ct_ft->nf_ft, entry);
+	if (err)
+		goto err_add;
+
+	return;
+
+err_add:
+	flow_offload_free(entry);
+err_alloc:
+	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+}
+
+static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
+					   struct nf_conn *ct,
+					   enum ip_conntrack_info ctinfo)
+{
+	bool tcp = false;
+
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+		return;
+
+	switch (nf_ct_protonum(ct)) {
+	case IPPROTO_TCP:
+		tcp = true;
+		if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+			return;
+		break;
+	case IPPROTO_UDP:
+		break;
+	default:
+		return;
+	}
+
+	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
+	    ct->status & IPS_SEQ_ADJUST)
+		return;
+
+	tcf_ct_flow_table_add(ct_ft, ct, tcp);
+}
+
 static int tcf_ct_flow_tables_init(void)
 {
 	return rhashtable_init(&zones_ht, &zones_params);
@@ -578,6 +639,8 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 		nf_conntrack_confirm(skb);
 	}
 
+	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
+
 out_push:
 	skb_push_rcsum(skb, nh_ofs);
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH net-next v6 3/3] net/sched: act_ct: Software offload of established flows
  2020-03-03 15:57 [PATCH net-next v6 0/3] act_ct: Software offload of conntrack_in Paul Blakey
  2020-03-03 15:57 ` [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
  2020-03-03 15:57 ` [PATCH net-next v6 2/3] net/sched: act_ct: Offload established connections to flow table Paul Blakey
@ 2020-03-03 15:57 ` Paul Blakey
  2020-03-03 16:51   ` Marcelo Ricardo Leitner
  2020-03-03 18:32   ` Nikolay Aleksandrov
  2 siblings, 2 replies; 14+ messages in thread
From: Paul Blakey @ 2020-03-03 15:57 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

Offload nf conntrack processing by looking up the 5-tuple in the
zone's flow table.

The nf conntrack module will process the packets until a connection is
in established state. Once in established state, the ct state pointer
(nf_conn) will be restored on the skb from a successful ft lookup.
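
To make the lookup key concrete, here is a userspace sketch of the IPv4
5-tuple extraction in isolation (struct tuple and fill_tuple_ipv4() are
invented names, and the skb/pskb_network_may_pull handling is left out;
the real code is tcf_ct_flow_table_fill_tuple_ipv4() below, which, like
this sketch, bails out on IP options):

  #include <arpa/inet.h>
  #include <netinet/ip.h>
  #include <stdio.h>

  struct tuple {
          unsigned int saddr, daddr;      /* network byte order */
          unsigned short sport, dport;    /* network byte order */
          unsigned char l4proto;
  };

  static int fill_tuple_ipv4(const unsigned char *pkt, size_t len,
                             struct tuple *t)
  {
          const struct iphdr *iph = (const struct iphdr *)pkt;
          const unsigned short *ports;

          if (len < sizeof(*iph) || iph->ihl != 5)        /* no IP options */
                  return 0;
          if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP)
                  return 0;
          if (iph->ttl <= 1 || len < sizeof(*iph) + 4)    /* room for ports */
                  return 0;

          ports = (const unsigned short *)(pkt + sizeof(*iph));
          t->saddr = iph->saddr;
          t->daddr = iph->daddr;
          t->sport = ports[0];
          t->dport = ports[1];
          t->l4proto = iph->protocol;
          return 1;
  }

  int main(void)
  {
          union { unsigned char buf[24]; struct iphdr hdr; } pkt = { { 0 } };
          struct iphdr *iph = &pkt.hdr;
          struct tuple t;

          iph->ihl = 5;
          iph->version = 4;
          iph->ttl = 64;
          iph->protocol = IPPROTO_UDP;
          iph->saddr = htonl(0x0a000001);         /* 10.0.0.1 */
          iph->daddr = htonl(0x0a000002);         /* 10.0.0.2 */
          pkt.buf[20] = 0x30; pkt.buf[21] = 0x39; /* sport 12345 */
          pkt.buf[22] = 0x00; pkt.buf[23] = 0x35; /* dport 53 */

          if (fill_tuple_ipv4(pkt.buf, sizeof(pkt.buf), &t))
                  printf("proto %u sport %u dport %u\n",
                         t.l4proto, ntohs(t.sport), ntohs(t.dport));
          return 0;
  }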

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
Changelog:
  v5->v6:
   Refactor tcp fin/rst check, get tcp header inside filler, and test in caller
   Pull tcp with ports instead of two pulls
  v4->v5:
   Re-read ip/ip6 header after pulling as skb ptrs may change
   Use pskb_network_may_pull instead of pskb_may_pull
  v1->v2:
   Add !skip_add curly braces
   Removed extra setting thoff again
   Check tcp proto outside of tcf_ct_flow_table_check_tcp

 net/sched/act_ct.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 150 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 2ab38431..23eba61 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -186,6 +186,147 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
 	tcf_ct_flow_table_add(ct_ft, ct, tcp);
 }
 
+static bool
+tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
+				  struct flow_offload_tuple *tuple,
+				  struct tcphdr **tcph)
+{
+	struct flow_ports *ports;
+	unsigned int thoff;
+	struct iphdr *iph;
+
+	if (!pskb_network_may_pull(skb, sizeof(*iph)))
+		return false;
+
+	iph = ip_hdr(skb);
+	thoff = iph->ihl * 4;
+
+	if (ip_is_fragment(iph) ||
+	    unlikely(thoff != sizeof(struct iphdr)))
+		return false;
+
+	if (iph->protocol != IPPROTO_TCP &&
+	    iph->protocol != IPPROTO_UDP)
+		return false;
+
+	if (iph->ttl <= 1)
+		return false;
+
+	if (!pskb_network_may_pull(skb, iph->protocol == IPPROTO_TCP ?
+					thoff + sizeof(struct tcphdr) :
+					thoff + sizeof(*ports)))
+		return false;
+
+	iph = ip_hdr(skb);
+	if (iph->protocol == IPPROTO_TCP)
+		*tcph = (void *)(skb_network_header(skb) + thoff);
+
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+	tuple->src_v4.s_addr = iph->saddr;
+	tuple->dst_v4.s_addr = iph->daddr;
+	tuple->src_port = ports->source;
+	tuple->dst_port = ports->dest;
+	tuple->l3proto = AF_INET;
+	tuple->l4proto = iph->protocol;
+
+	return true;
+}
+
+static bool
+tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
+				  struct flow_offload_tuple *tuple,
+				  struct tcphdr **tcph)
+{
+	struct flow_ports *ports;
+	struct ipv6hdr *ip6h;
+	unsigned int thoff;
+
+	if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
+		return false;
+
+	ip6h = ipv6_hdr(skb);
+
+	if (ip6h->nexthdr != IPPROTO_TCP &&
+	    ip6h->nexthdr != IPPROTO_UDP)
+		return false;
+
+	if (ip6h->hop_limit <= 1)
+		return false;
+
+	thoff = sizeof(*ip6h);
+	if (!pskb_network_may_pull(skb, ip6h->nexthdr == IPPROTO_TCP ?
+					thoff + sizeof(struct tcphdr) :
+					thoff + sizeof(*ports)))
+		return false;
+
+	ip6h = ipv6_hdr(skb);
+	if (ip6h->nexthdr == IPPROTO_TCP)
+		*tcph = (void *)(skb_network_header(skb) + thoff);
+
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+	tuple->src_v6 = ip6h->saddr;
+	tuple->dst_v6 = ip6h->daddr;
+	tuple->src_port = ports->source;
+	tuple->dst_port = ports->dest;
+	tuple->l3proto = AF_INET6;
+	tuple->l4proto = ip6h->nexthdr;
+
+	return true;
+}
+
+static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
+				     struct sk_buff *skb,
+				     u8 family)
+{
+	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct flow_offload_tuple tuple = {};
+	enum ip_conntrack_info ctinfo;
+	struct tcphdr *tcph = NULL;
+	struct flow_offload *flow;
+	struct nf_conn *ct;
+	u8 dir;
+
+	/* Previously seen or loopback */
+	ct = nf_ct_get(skb, &ctinfo);
+	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
+		return false;
+
+	switch (family) {
+	case NFPROTO_IPV4:
+		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
+			return false;
+		break;
+	case NFPROTO_IPV6:
+		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	tuplehash = flow_offload_lookup(nf_ft, &tuple);
+	if (!tuplehash)
+		return false;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+	ct = flow->ct;
+
+	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
+		flow_offload_teardown(flow);
+		return false;
+	}
+
+	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
+						    IP_CT_ESTABLISHED_REPLY;
+
+	nf_conntrack_get(&ct->ct_general);
+	nf_ct_set(skb, ct, ctinfo);
+
+	return true;
+}
+
 static int tcf_ct_flow_tables_init(void)
 {
 	return rhashtable_init(&zones_ht, &zones_params);
@@ -554,6 +695,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	struct nf_hook_state state;
 	int nh_ofs, err, retval;
 	struct tcf_ct_params *p;
+	bool skip_add = false;
 	struct nf_conn *ct;
 	u8 family;
 
@@ -603,6 +745,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	 */
 	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
 	if (!cached) {
+		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
+			skip_add = true;
+			goto do_nat;
+		}
+
 		/* Associate skb with specified zone. */
 		if (tmpl) {
 			ct = nf_ct_get(skb, &ctinfo);
@@ -620,6 +767,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 			goto out_push;
 	}
 
+do_nat:
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct)
 		goto out_push;
@@ -637,10 +785,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 		 * even if the connection is already confirmed.
 		 */
 		nf_conntrack_confirm(skb);
+	} else if (!skip_add) {
+		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
 	}
 
-	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
-
 out_push:
 	skb_push_rcsum(skb, nh_ofs);
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-03 15:57 ` [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
@ 2020-03-03 16:51   ` Marcelo Ricardo Leitner
  2020-03-07 20:12   ` Eric Dumazet
  1 sibling, 0 replies; 14+ messages in thread
From: Marcelo Ricardo Leitner @ 2020-03-03 16:51 UTC (permalink / raw)
  To: Paul Blakey
  Cc: Saeed Mahameed, Oz Shlomo, Jakub Kicinski, Vlad Buslov,
	David Miller, netdev, Jiri Pirko, Roi Dayan

On Tue, Mar 03, 2020 at 05:57:50PM +0200, Paul Blakey wrote:
> Use the NF flow tables infrastructure for CT offload.
> 
> Create a nf flow table per zone.
> 
> Next patches will add FT entries to this table, and do
> the software offload.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Reviewed-by: Jiri Pirko <jiri@mellanox.com>

Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

> ---
>   v4->v5:
>     Added reviewed by Jiri, thanks!
>   v3->v4:
>     Alloc GFP_ATOMIC
>   v2->v3:
>     Ditch re-locking to alloc, and use atomic allocation
>   v1->v2:
>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>     this removes cleanup cb and netfilter patches, and is simpler
>     Removed accidental mlx5/core/en_tc.c change
>     Removed reviewed by Jiri - patch changed
> 
>  include/net/tc_act/tc_ct.h |   2 +
>  net/sched/Kconfig          |   2 +-
>  net/sched/act_ct.c         | 134 ++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 136 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
> index a8b1564..cf3492e 100644
> --- a/include/net/tc_act/tc_ct.h
> +++ b/include/net/tc_act/tc_ct.h
> @@ -25,6 +25,8 @@ struct tcf_ct_params {
>  	u16 ct_action;
>  
>  	struct rcu_head rcu;
> +
> +	struct tcf_ct_flow_table *ct_ft;
>  };
>  
>  struct tcf_ct {
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index edde0e5..bfbefb7 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY
>  
>  config NET_ACT_CT
>  	tristate "connection tracking tc action"
> -	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
> +	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
>  	help
>  	  Say Y here to allow sending the packets to conntrack module.
>  
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index f685c0d..3321087 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -15,6 +15,7 @@
>  #include <linux/pkt_cls.h>
>  #include <linux/ip.h>
>  #include <linux/ipv6.h>
> +#include <linux/rhashtable.h>
>  #include <net/netlink.h>
>  #include <net/pkt_sched.h>
>  #include <net/pkt_cls.h>
> @@ -24,6 +25,7 @@
>  #include <uapi/linux/tc_act/tc_ct.h>
>  #include <net/tc_act/tc_ct.h>
>  
> +#include <net/netfilter/nf_flow_table.h>
>  #include <net/netfilter/nf_conntrack.h>
>  #include <net/netfilter/nf_conntrack_core.h>
>  #include <net/netfilter/nf_conntrack_zones.h>
> @@ -31,6 +33,108 @@
>  #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
>  #include <uapi/linux/netfilter/nf_nat.h>
>  
> +static struct workqueue_struct *act_ct_wq;
> +static struct rhashtable zones_ht;
> +static DEFINE_SPINLOCK(zones_lock);
> +
> +struct tcf_ct_flow_table {
> +	struct rhash_head node; /* In zones tables */
> +
> +	struct rcu_work rwork;
> +	struct nf_flowtable nf_ft;
> +	u16 zone;
> +	u32 ref;
> +
> +	bool dying;
> +};
> +
> +static const struct rhashtable_params zones_params = {
> +	.head_offset = offsetof(struct tcf_ct_flow_table, node),
> +	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
> +	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
> +	.automatic_shrinking = true,
> +};
> +
> +static struct nf_flowtable_type flowtable_ct = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +	int err = -ENOMEM;
> +
> +	spin_lock_bh(&zones_lock);
> +	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
> +	if (ct_ft)
> +		goto take_ref;
> +
> +	ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
> +	if (!ct_ft)
> +		goto err_alloc;
> +
> +	ct_ft->zone = params->zone;
> +	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
> +	if (err)
> +		goto err_insert;
> +
> +	ct_ft->nf_ft.type = &flowtable_ct;
> +	err = nf_flow_table_init(&ct_ft->nf_ft);
> +	if (err)
> +		goto err_init;
> +
> +	__module_get(THIS_MODULE);
> +take_ref:
> +	params->ct_ft = ct_ft;
> +	ct_ft->ref++;
> +	spin_unlock_bh(&zones_lock);
> +
> +	return 0;
> +
> +err_init:
> +	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +err_insert:
> +	kfree(ct_ft);
> +err_alloc:
> +	spin_unlock_bh(&zones_lock);
> +	return err;
> +}
> +
> +static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +
> +	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
> +			     rwork);
> +	nf_flow_table_free(&ct_ft->nf_ft);
> +	kfree(ct_ft);
> +
> +	module_put(THIS_MODULE);
> +}
> +
> +static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft = params->ct_ft;
> +
> +	spin_lock_bh(&zones_lock);
> +	if (--params->ct_ft->ref == 0) {
> +		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
> +		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
> +	}
> +	spin_unlock_bh(&zones_lock);
> +}
> +
> +static int tcf_ct_flow_tables_init(void)
> +{
> +	return rhashtable_init(&zones_ht, &zones_params);
> +}
> +
> +static void tcf_ct_flow_tables_uninit(void)
> +{
> +	rhashtable_destroy(&zones_ht);
> +}
> +
>  static struct tc_action_ops act_ct_ops;
>  static unsigned int ct_net_id;
>  
> @@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head)
>  	struct tcf_ct_params *params = container_of(head,
>  						    struct tcf_ct_params, rcu);
>  
> +	tcf_ct_flow_table_put(params);
> +
>  	if (params->tmpl)
>  		nf_conntrack_put(&params->tmpl->ct_general);
>  	kfree(params);
> @@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
>  	if (err)
>  		goto cleanup;
>  
> +	err = tcf_ct_flow_table_get(params);
> +	if (err)
> +		goto cleanup;
> +
>  	spin_lock_bh(&c->tcf_lock);
>  	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
>  	params = rcu_replace_pointer(c->params, params,
> @@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list)
>  
>  static int __init ct_init_module(void)
>  {
> -	return tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	int err;
> +
> +	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
> +	if (!act_ct_wq)
> +		return -ENOMEM;
> +
> +	err = tcf_ct_flow_tables_init();
> +	if (err)
> +		goto err_tbl_init;
> +
> +	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	if (err)
> +		goto err_register;
> +
> +	return 0;
> +
> +err_tbl_init:
> +	destroy_workqueue(act_ct_wq);
> +err_register:
> +	tcf_ct_flow_tables_uninit();
> +	return err;
>  }
>  
>  static void __exit ct_cleanup_module(void)
>  {
>  	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
> +	tcf_ct_flow_tables_uninit();
> +	destroy_workqueue(act_ct_wq);
>  }
>  
>  module_init(ct_init_module);
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 2/3] net/sched: act_ct: Offload established connections to flow table
  2020-03-03 15:57 ` [PATCH net-next v6 2/3] net/sched: act_ct: Offload established connections to flow table Paul Blakey
@ 2020-03-03 16:51   ` Marcelo Ricardo Leitner
  0 siblings, 0 replies; 14+ messages in thread
From: Marcelo Ricardo Leitner @ 2020-03-03 16:51 UTC (permalink / raw)
  To: Paul Blakey
  Cc: Saeed Mahameed, Oz Shlomo, Jakub Kicinski, Vlad Buslov,
	David Miller, netdev, Jiri Pirko, Roi Dayan

On Tue, Mar 03, 2020 at 05:57:51PM +0200, Paul Blakey wrote:
> Add an FT entry when a connection enters an established state, and
> delete the entry when the connection leaves that state.
> 
> The flow table assumes ownership of the connection. In the following
> patch act_ct will look up the ct state from the FT. In future patches,
> drivers will register callbacks for FT add/del events and will be
> able to use that information to offload the connections.
> 
> Note that connection aging is managed by the FT.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Acked-by: Jiri Pirko <jiri@mellanox.com>

Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

> ---
>  net/sched/act_ct.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 63 insertions(+)
> 
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index 3321087..2ab38431 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -125,6 +125,67 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
>  	spin_unlock_bh(&zones_lock);
>  }
>  
> +static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
> +				  struct nf_conn *ct,
> +				  bool tcp)
> +{
> +	struct flow_offload *entry;
> +	int err;
> +
> +	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
> +		return;
> +
> +	entry = flow_offload_alloc(ct);
> +	if (!entry) {
> +		WARN_ON_ONCE(1);
> +		goto err_alloc;
> +	}
> +
> +	if (tcp) {
> +		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
> +		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
> +	}
> +
> +	err = flow_offload_add(&ct_ft->nf_ft, entry);
> +	if (err)
> +		goto err_add;
> +
> +	return;
> +
> +err_add:
> +	flow_offload_free(entry);
> +err_alloc:
> +	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
> +}
> +
> +static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
> +					   struct nf_conn *ct,
> +					   enum ip_conntrack_info ctinfo)
> +{
> +	bool tcp = false;
> +
> +	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
> +		return;
> +
> +	switch (nf_ct_protonum(ct)) {
> +	case IPPROTO_TCP:
> +		tcp = true;
> +		if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
> +			return;
> +		break;
> +	case IPPROTO_UDP:
> +		break;
> +	default:
> +		return;
> +	}
> +
> +	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
> +	    ct->status & IPS_SEQ_ADJUST)
> +		return;
> +
> +	tcf_ct_flow_table_add(ct_ft, ct, tcp);
> +}
> +
>  static int tcf_ct_flow_tables_init(void)
>  {
>  	return rhashtable_init(&zones_ht, &zones_params);
> @@ -578,6 +639,8 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  		nf_conntrack_confirm(skb);
>  	}
>  
> +	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
> +
>  out_push:
>  	skb_push_rcsum(skb, nh_ofs);
>  
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 3/3] net/sched: act_ct: Software offload of established flows
  2020-03-03 15:57 ` [PATCH net-next v6 3/3] net/sched: act_ct: Software offload of established flows Paul Blakey
@ 2020-03-03 16:51   ` Marcelo Ricardo Leitner
  2020-03-03 18:32   ` Nikolay Aleksandrov
  1 sibling, 0 replies; 14+ messages in thread
From: Marcelo Ricardo Leitner @ 2020-03-03 16:51 UTC (permalink / raw)
  To: Paul Blakey
  Cc: Saeed Mahameed, Oz Shlomo, Jakub Kicinski, Vlad Buslov,
	David Miller, netdev, Jiri Pirko, Roi Dayan

On Tue, Mar 03, 2020 at 05:57:52PM +0200, Paul Blakey wrote:
> Offload nf conntrack processing by looking up the 5-tuple in the
> zone's flow table.
> 
> The nf conntrack module will process the packets until a connection is
> in established state. Once in established state, the ct state pointer
> (nf_conn) will be restored on the skb from a successful ft lookup.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Acked-by: Jiri Pirko <jiri@mellanox.com>

Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

> ---
> Changelog:
>   v5->v6:
>    Refactor tcp fin/rst check, get tcp header inside filler, and test in caller
>    Pull tcp with ports instead of two pulls
>   v4->v5:
>    Re-read ip/ip6 header after pulling as skb ptrs may change
>    Use pskb_network_may_pull instead of pskb_may_pull
>   v1->v2:
>    Add !skip_add curly braces
>    Removed extra setting thoff again
>    Check tcp proto outside of tcf_ct_flow_table_check_tcp
> 
>  net/sched/act_ct.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 150 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index 2ab38431..23eba61 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -186,6 +186,147 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
>  	tcf_ct_flow_table_add(ct_ft, ct, tcp);
>  }
>  
> +static bool
> +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
> +				  struct flow_offload_tuple *tuple,
> +				  struct tcphdr **tcph)
> +{
> +	struct flow_ports *ports;
> +	unsigned int thoff;
> +	struct iphdr *iph;
> +
> +	if (!pskb_network_may_pull(skb, sizeof(*iph)))
> +		return false;
> +
> +	iph = ip_hdr(skb);
> +	thoff = iph->ihl * 4;
> +
> +	if (ip_is_fragment(iph) ||
> +	    unlikely(thoff != sizeof(struct iphdr)))
> +		return false;
> +
> +	if (iph->protocol != IPPROTO_TCP &&
> +	    iph->protocol != IPPROTO_UDP)
> +		return false;
> +
> +	if (iph->ttl <= 1)
> +		return false;
> +
> +	if (!pskb_network_may_pull(skb, iph->protocol == IPPROTO_TCP ?
> +					thoff + sizeof(struct tcphdr) :
> +					thoff + sizeof(*ports)))
> +		return false;
> +
> +	iph = ip_hdr(skb);
> +	if (iph->protocol == IPPROTO_TCP)
> +		*tcph = (void *)(skb_network_header(skb) + thoff);
> +
> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
> +	tuple->src_v4.s_addr = iph->saddr;
> +	tuple->dst_v4.s_addr = iph->daddr;
> +	tuple->src_port = ports->source;
> +	tuple->dst_port = ports->dest;
> +	tuple->l3proto = AF_INET;
> +	tuple->l4proto = iph->protocol;
> +
> +	return true;
> +}
> +
> +static bool
> +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
> +				  struct flow_offload_tuple *tuple,
> +				  struct tcphdr **tcph)
> +{
> +	struct flow_ports *ports;
> +	struct ipv6hdr *ip6h;
> +	unsigned int thoff;
> +
> +	if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
> +		return false;
> +
> +	ip6h = ipv6_hdr(skb);
> +
> +	if (ip6h->nexthdr != IPPROTO_TCP &&
> +	    ip6h->nexthdr != IPPROTO_UDP)
> +		return false;
> +
> +	if (ip6h->hop_limit <= 1)
> +		return false;
> +
> +	thoff = sizeof(*ip6h);
> +	if (!pskb_network_may_pull(skb, ip6h->nexthdr == IPPROTO_TCP ?
> +					thoff + sizeof(struct tcphdr) :
> +					thoff + sizeof(*ports)))
> +		return false;
> +
> +	ip6h = ipv6_hdr(skb);
> +	if (ip6h->nexthdr == IPPROTO_TCP)
> +		*tcph = (void *)(skb_network_header(skb) + thoff);
> +
> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
> +	tuple->src_v6 = ip6h->saddr;
> +	tuple->dst_v6 = ip6h->daddr;
> +	tuple->src_port = ports->source;
> +	tuple->dst_port = ports->dest;
> +	tuple->l3proto = AF_INET6;
> +	tuple->l4proto = ip6h->nexthdr;
> +
> +	return true;
> +}
> +
> +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
> +				     struct sk_buff *skb,
> +				     u8 family)
> +{
> +	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
> +	struct flow_offload_tuple_rhash *tuplehash;
> +	struct flow_offload_tuple tuple = {};
> +	enum ip_conntrack_info ctinfo;
> +	struct tcphdr *tcph = NULL;
> +	struct flow_offload *flow;
> +	struct nf_conn *ct;
> +	u8 dir;
> +
> +	/* Previously seen or loopback */
> +	ct = nf_ct_get(skb, &ctinfo);
> +	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
> +		return false;
> +
> +	switch (family) {
> +	case NFPROTO_IPV4:
> +		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
> +			return false;
> +		break;
> +	case NFPROTO_IPV6:
> +		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
> +			return false;
> +		break;
> +	default:
> +		return false;
> +	}
> +
> +	tuplehash = flow_offload_lookup(nf_ft, &tuple);
> +	if (!tuplehash)
> +		return false;
> +
> +	dir = tuplehash->tuple.dir;
> +	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
> +	ct = flow->ct;
> +
> +	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
> +		flow_offload_teardown(flow);
> +		return false;
> +	}
> +
> +	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
> +						    IP_CT_ESTABLISHED_REPLY;
> +
> +	nf_conntrack_get(&ct->ct_general);
> +	nf_ct_set(skb, ct, ctinfo);
> +
> +	return true;
> +}
> +
>  static int tcf_ct_flow_tables_init(void)
>  {
>  	return rhashtable_init(&zones_ht, &zones_params);
> @@ -554,6 +695,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  	struct nf_hook_state state;
>  	int nh_ofs, err, retval;
>  	struct tcf_ct_params *p;
> +	bool skip_add = false;
>  	struct nf_conn *ct;
>  	u8 family;
>  
> @@ -603,6 +745,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  	 */
>  	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
>  	if (!cached) {
> +		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
> +			skip_add = true;
> +			goto do_nat;
> +		}
> +
>  		/* Associate skb with specified zone. */
>  		if (tmpl) {
>  			ct = nf_ct_get(skb, &ctinfo);
> @@ -620,6 +767,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  			goto out_push;
>  	}
>  
> +do_nat:
>  	ct = nf_ct_get(skb, &ctinfo);
>  	if (!ct)
>  		goto out_push;
> @@ -637,10 +785,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  		 * even if the connection is already confirmed.
>  		 */
>  		nf_conntrack_confirm(skb);
> +	} else if (!skip_add) {
> +		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>  	}
>  
> -	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
> -
>  out_push:
>  	skb_push_rcsum(skb, nh_ofs);
>  
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 3/3] net/sched: act_ct: Software offload of established flows
  2020-03-03 15:57 ` [PATCH net-next v6 3/3] net/sched: act_ct: Software offload of established flows Paul Blakey
  2020-03-03 16:51   ` Marcelo Ricardo Leitner
@ 2020-03-03 18:32   ` Nikolay Aleksandrov
  1 sibling, 0 replies; 14+ messages in thread
From: Nikolay Aleksandrov @ 2020-03-03 18:32 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

On 03/03/2020 17:57, Paul Blakey wrote:
> Offload nf conntrack processing by looking up the 5-tuple in the
> zone's flow table.
> 
> The nf conntrack module will process the packets until a connection is
> in established state. Once in established state, the ct state pointer
> (nf_conn) will be restored on the skb from a successful ft lookup.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Acked-by: Jiri Pirko <jiri@mellanox.com>
> ---
> Changelog:
>   v5->v6:
>    Refactor tcp fin/rst check, get tcp header inside filler, and test in caller
>    Pull tcp with ports instead of two pulls
>   v4->v5:
>    Re-read ip/ip6 header after pulling as skb ptrs may change
>    Use pskb_network_may_pull instead of pskb_may_pull
>   v1->v2:
>    Add !skip_add curly braces
>    Removed extra setting thoff again
>    Check tcp proto outside of tcf_ct_flow_table_check_tcp
> 

Looks good to me, thanks! In the future please add all reviewers to the CC list.
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>

>  net/sched/act_ct.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 150 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index 2ab38431..23eba61 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -186,6 +186,147 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
>  	tcf_ct_flow_table_add(ct_ft, ct, tcp);
>  }
>  
> +static bool
> +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
> +				  struct flow_offload_tuple *tuple,
> +				  struct tcphdr **tcph)
> +{
> +	struct flow_ports *ports;
> +	unsigned int thoff;
> +	struct iphdr *iph;
> +
> +	if (!pskb_network_may_pull(skb, sizeof(*iph)))
> +		return false;
> +
> +	iph = ip_hdr(skb);
> +	thoff = iph->ihl * 4;
> +
> +	if (ip_is_fragment(iph) ||
> +	    unlikely(thoff != sizeof(struct iphdr)))
> +		return false;
> +
> +	if (iph->protocol != IPPROTO_TCP &&
> +	    iph->protocol != IPPROTO_UDP)
> +		return false;
> +
> +	if (iph->ttl <= 1)
> +		return false;
> +
> +	if (!pskb_network_may_pull(skb, iph->protocol == IPPROTO_TCP ?
> +					thoff + sizeof(struct tcphdr) :
> +					thoff + sizeof(*ports)))
> +		return false;
> +
> +	iph = ip_hdr(skb);
> +	if (iph->protocol == IPPROTO_TCP)
> +		*tcph = (void *)(skb_network_header(skb) + thoff);
> +
> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
> +	tuple->src_v4.s_addr = iph->saddr;
> +	tuple->dst_v4.s_addr = iph->daddr;
> +	tuple->src_port = ports->source;
> +	tuple->dst_port = ports->dest;
> +	tuple->l3proto = AF_INET;
> +	tuple->l4proto = iph->protocol;
> +
> +	return true;
> +}
> +
> +static bool
> +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
> +				  struct flow_offload_tuple *tuple,
> +				  struct tcphdr **tcph)
> +{
> +	struct flow_ports *ports;
> +	struct ipv6hdr *ip6h;
> +	unsigned int thoff;
> +
> +	if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
> +		return false;
> +
> +	ip6h = ipv6_hdr(skb);
> +
> +	if (ip6h->nexthdr != IPPROTO_TCP &&
> +	    ip6h->nexthdr != IPPROTO_UDP)
> +		return false;
> +
> +	if (ip6h->hop_limit <= 1)
> +		return false;
> +
> +	thoff = sizeof(*ip6h);
> +	if (!pskb_network_may_pull(skb, ip6h->nexthdr == IPPROTO_TCP ?
> +					thoff + sizeof(struct tcphdr) :
> +					thoff + sizeof(*ports)))
> +		return false;
> +
> +	ip6h = ipv6_hdr(skb);
> +	if (ip6h->nexthdr == IPPROTO_TCP)
> +		*tcph = (void *)(skb_network_header(skb) + thoff);
> +
> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
> +	tuple->src_v6 = ip6h->saddr;
> +	tuple->dst_v6 = ip6h->daddr;
> +	tuple->src_port = ports->source;
> +	tuple->dst_port = ports->dest;
> +	tuple->l3proto = AF_INET6;
> +	tuple->l4proto = ip6h->nexthdr;
> +
> +	return true;
> +}
> +
> +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
> +				     struct sk_buff *skb,
> +				     u8 family)
> +{
> +	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
> +	struct flow_offload_tuple_rhash *tuplehash;
> +	struct flow_offload_tuple tuple = {};
> +	enum ip_conntrack_info ctinfo;
> +	struct tcphdr *tcph = NULL;
> +	struct flow_offload *flow;
> +	struct nf_conn *ct;
> +	u8 dir;
> +
> +	/* Previously seen or loopback */
> +	ct = nf_ct_get(skb, &ctinfo);
> +	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
> +		return false;
> +
> +	switch (family) {
> +	case NFPROTO_IPV4:
> +		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
> +			return false;
> +		break;
> +	case NFPROTO_IPV6:
> +		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
> +			return false;
> +		break;
> +	default:
> +		return false;
> +	}
> +
> +	tuplehash = flow_offload_lookup(nf_ft, &tuple);
> +	if (!tuplehash)
> +		return false;
> +
> +	dir = tuplehash->tuple.dir;
> +	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
> +	ct = flow->ct;
> +
> +	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
> +		flow_offload_teardown(flow);
> +		return false;
> +	}
> +
> +	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
> +						    IP_CT_ESTABLISHED_REPLY;
> +
> +	nf_conntrack_get(&ct->ct_general);
> +	nf_ct_set(skb, ct, ctinfo);
> +
> +	return true;
> +}
> +
>  static int tcf_ct_flow_tables_init(void)
>  {
>  	return rhashtable_init(&zones_ht, &zones_params);
> @@ -554,6 +695,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  	struct nf_hook_state state;
>  	int nh_ofs, err, retval;
>  	struct tcf_ct_params *p;
> +	bool skip_add = false;
>  	struct nf_conn *ct;
>  	u8 family;
>  
> @@ -603,6 +745,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  	 */
>  	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
>  	if (!cached) {
> +		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
> +			skip_add = true;
> +			goto do_nat;
> +		}
> +
>  		/* Associate skb with specified zone. */
>  		if (tmpl) {
>  			ct = nf_ct_get(skb, &ctinfo);
> @@ -620,6 +767,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  			goto out_push;
>  	}
>  
> +do_nat:
>  	ct = nf_ct_get(skb, &ctinfo);
>  	if (!ct)
>  		goto out_push;
> @@ -637,10 +785,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  		 * even if the connection is already confirmed.
>  		 */
>  		nf_conntrack_confirm(skb);
> +	} else if (!skip_add) {
> +		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>  	}
>  
> -	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
> -
>  out_push:
>  	skb_push_rcsum(skb, nh_ofs);
>  
> 


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-03 15:57 ` [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
  2020-03-03 16:51   ` Marcelo Ricardo Leitner
@ 2020-03-07 20:12   ` Eric Dumazet
  2020-03-07 20:53     ` Eric Dumazet
  1 sibling, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2020-03-07 20:12 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan



On 3/3/20 7:57 AM, Paul Blakey wrote:
> Use the NF flow tables infrastructure for CT offload.
> 
> Create a nf flow table per zone.
> 
> Next patches will add FT entries to this table, and do
> the software offload.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
> ---
>   v4->v5:
>     Added reviewed by Jiri, thanks!
>   v3->v4:
>     Alloc GFP_ATOMIC
>   v2->v3:
>     Ditch re-locking to alloc, and use atomic allocation
>   v1->v2:
>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>     this removes cleanup cb and netfilter patches, and is simpler
>     Removed accidental mlx5/core/en_tc.c change
>     Removed reviewed by Jiri - patch changed
> 
>  include/net/tc_act/tc_ct.h |   2 +
>  net/sched/Kconfig          |   2 +-
>  net/sched/act_ct.c         | 134 ++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 136 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
> index a8b1564..cf3492e 100644
> --- a/include/net/tc_act/tc_ct.h
> +++ b/include/net/tc_act/tc_ct.h
> @@ -25,6 +25,8 @@ struct tcf_ct_params {
>  	u16 ct_action;
>  
>  	struct rcu_head rcu;
> +
> +	struct tcf_ct_flow_table *ct_ft;
>  };
>  
>  struct tcf_ct {
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index edde0e5..bfbefb7 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY
>  
>  config NET_ACT_CT
>  	tristate "connection tracking tc action"
> -	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
> +	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
>  	help
>  	  Say Y here to allow sending the packets to conntrack module.
>  
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index f685c0d..3321087 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -15,6 +15,7 @@
>  #include <linux/pkt_cls.h>
>  #include <linux/ip.h>
>  #include <linux/ipv6.h>
> +#include <linux/rhashtable.h>
>  #include <net/netlink.h>
>  #include <net/pkt_sched.h>
>  #include <net/pkt_cls.h>
> @@ -24,6 +25,7 @@
>  #include <uapi/linux/tc_act/tc_ct.h>
>  #include <net/tc_act/tc_ct.h>
>  
> +#include <net/netfilter/nf_flow_table.h>
>  #include <net/netfilter/nf_conntrack.h>
>  #include <net/netfilter/nf_conntrack_core.h>
>  #include <net/netfilter/nf_conntrack_zones.h>
> @@ -31,6 +33,108 @@
>  #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
>  #include <uapi/linux/netfilter/nf_nat.h>
>  
> +static struct workqueue_struct *act_ct_wq;
> +static struct rhashtable zones_ht;
> +static DEFINE_SPINLOCK(zones_lock);
> +
> +struct tcf_ct_flow_table {
> +	struct rhash_head node; /* In zones tables */
> +
> +	struct rcu_work rwork;
> +	struct nf_flowtable nf_ft;
> +	u16 zone;
> +	u32 ref;
> +
> +	bool dying;
> +};
> +
> +static const struct rhashtable_params zones_params = {
> +	.head_offset = offsetof(struct tcf_ct_flow_table, node),
> +	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
> +	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
> +	.automatic_shrinking = true,
> +};
> +
> +static struct nf_flowtable_type flowtable_ct = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +	int err = -ENOMEM;
> +
> +	spin_lock_bh(&zones_lock);
> +	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
> +	if (ct_ft)
> +		goto take_ref;
> +
> +	ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
> +	if (!ct_ft)
> +		goto err_alloc;
> +
> +	ct_ft->zone = params->zone;
> +	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
> +	if (err)
> +		goto err_insert;
> +
> +	ct_ft->nf_ft.type = &flowtable_ct;
> +	err = nf_flow_table_init(&ct_ft->nf_ft);

This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)

Since you still hold zones_lock spinlock, a splat should occur.

"BUG: sleeping function called from invalid context in  ..."

DEBUG_ATOMIC_SLEEP=y is your friend.

And it is always a good thing to make sure a patch does not trigger a lockdep splat

CONFIG_PROVE_LOCKING=y


> +	if (err)
> +		goto err_init;
> +
> +	__module_get(THIS_MODULE);
> +take_ref:
> +	params->ct_ft = ct_ft;
> +	ct_ft->ref++;
> +	spin_unlock_bh(&zones_lock);
> +
> +	return 0;
> +
> +err_init:
> +	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +err_insert:
> +	kfree(ct_ft);
> +err_alloc:
> +	spin_unlock_bh(&zones_lock);
> +	return err;
> +}
> +
> +static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +
> +	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
> +			     rwork);
> +	nf_flow_table_free(&ct_ft->nf_ft);
> +	kfree(ct_ft);
> +
> +	module_put(THIS_MODULE);
> +}
> +
> +static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft = params->ct_ft;
> +
> +	spin_lock_bh(&zones_lock);
> +	if (--params->ct_ft->ref == 0) {
> +		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
> +		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
> +	}
> +	spin_unlock_bh(&zones_lock);
> +}
> +
> +static int tcf_ct_flow_tables_init(void)
> +{
> +	return rhashtable_init(&zones_ht, &zones_params);
> +}
> +
> +static void tcf_ct_flow_tables_uninit(void)
> +{
> +	rhashtable_destroy(&zones_ht);
> +}
> +
>  static struct tc_action_ops act_ct_ops;
>  static unsigned int ct_net_id;
>  
> @@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head)
>  	struct tcf_ct_params *params = container_of(head,
>  						    struct tcf_ct_params, rcu);
>  
> +	tcf_ct_flow_table_put(params);
> +
>  	if (params->tmpl)
>  		nf_conntrack_put(&params->tmpl->ct_general);
>  	kfree(params);
> @@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
>  	if (err)
>  		goto cleanup;
>  
> +	err = tcf_ct_flow_table_get(params);
> +	if (err)
> +		goto cleanup;
> +
>  	spin_lock_bh(&c->tcf_lock);
>  	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
>  	params = rcu_replace_pointer(c->params, params,
> @@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list)
>  
>  static int __init ct_init_module(void)
>  {
> -	return tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	int err;
> +
> +	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
> +	if (!act_ct_wq)
> +		return -ENOMEM;
> +
> +	err = tcf_ct_flow_tables_init();
> +	if (err)
> +		goto err_tbl_init;
> +
> +	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	if (err)
> +		goto err_register;
> +
> +	return 0;
> +
> +err_tbl_init:
> +	destroy_workqueue(act_ct_wq);
> +err_register:
> +	tcf_ct_flow_tables_uninit();
> +	return err;
>  }
>  
>  static void __exit ct_cleanup_module(void)
>  {
>  	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
> +	tcf_ct_flow_tables_uninit();
> +	destroy_workqueue(act_ct_wq);
>  }
>  
>  module_init(ct_init_module);
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-07 20:12   ` Eric Dumazet
@ 2020-03-07 20:53     ` Eric Dumazet
  2020-03-08  8:11       ` Paul Blakey
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2020-03-07 20:53 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan



On 3/7/20 12:12 PM, Eric Dumazet wrote:
> 
> 
> On 3/3/20 7:57 AM, Paul Blakey wrote:
>> Use the NF flow tables infrastructure for CT offload.
>>
>> Create a nf flow table per zone.
>>
>> Next patches will add FT entries to this table, and do
>> the software offload.
>>
>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>   v4->v5:
>>     Added reviewed by Jiri, thanks!
>>   v3->v4:
>>     Alloc GFP_ATOMIC
>>   v2->v3:
>>     Ditch re-locking to alloc, and use atomic allocation
>>   v1->v2:
>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>     this removes cleanup cb and netfilter patches, and is simpler
>>     Removed accidental mlx5/core/en_tc.c change
>>     Removed reviewed by Jiri - patch changed
>>
>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
> 
> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
> 
> Since you still hold zones_lock spinlock, a splat should occur.
> 
> "BUG: sleeping function called from invalid context in  ..."
> 
> DEBUG_ATOMIC_SLEEP=y is your friend.
> 
> And it is always a good thing to make sure a patch does not trigger a lockdep splat
> 
> CONFIG_PROVE_LOCKING=y

Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.

I can not test the following fix, any objections before I submit this officially ?

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -35,15 +35,15 @@
 
 static struct workqueue_struct *act_ct_wq;
 static struct rhashtable zones_ht;
-static DEFINE_SPINLOCK(zones_lock);
+static DEFINE_MUTEX(zones_mutex);
 
 struct tcf_ct_flow_table {
        struct rhash_head node; /* In zones tables */
 
        struct rcu_work rwork;
        struct nf_flowtable nf_ft;
+       refcount_t ref;
        u16 zone;
-       u32 ref;
 
        bool dying;
 };
@@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
        struct tcf_ct_flow_table *ct_ft;
        int err = -ENOMEM;
 
-       spin_lock_bh(&zones_lock);
+       mutex_lock(&zones_mutex);
        ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
-       if (ct_ft)
-               goto take_ref;
+       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
+               goto out_unlock;
 
-       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
+       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
        if (!ct_ft)
                goto err_alloc;
+       refcount_set(&ct_ft->ref, 1);
 
        ct_ft->zone = params->zone;
        err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
@@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
                goto err_init;
 
        __module_get(THIS_MODULE);
-take_ref:
+out_unlock:
        params->ct_ft = ct_ft;
-       ct_ft->ref++;
-       spin_unlock_bh(&zones_lock);
+       mutex_unlock(&zones_mutex);
 
        return 0;
 
@@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
 err_insert:
        kfree(ct_ft);
 err_alloc:
-       spin_unlock_bh(&zones_lock);
+       mutex_unlock(&zones_mutex);
        return err;
 }
 
@@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
 {
        struct tcf_ct_flow_table *ct_ft = params->ct_ft;
 
-       spin_lock_bh(&zones_lock);
-       if (--params->ct_ft->ref == 0) {
+       if (refcount_dec_and_test(&params->ct_ft->ref)) {
                rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
                INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
                queue_rcu_work(act_ct_wq, &ct_ft->rwork);
        }
-       spin_unlock_bh(&zones_lock);
 }
 
 static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-07 20:53     ` Eric Dumazet
@ 2020-03-08  8:11       ` Paul Blakey
  2020-03-08  8:15         ` Paul Blakey
  0 siblings, 1 reply; 14+ messages in thread
From: Paul Blakey @ 2020-03-08  8:11 UTC (permalink / raw)
  To: Eric Dumazet, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

IIRC I used spin_lock_bh() because we can be called from a queued-work RCU handler, so I wanted to disable softirqs.

I got a possible-deadlock lockdep splat otherwise.
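A rough sketch of the situation being described, assuming the same lock is reachable from both contexts (simplified, made-up function names; the actual call chain is quoted in the follow-up below):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(zones_lock_example);

/* Process context: the tc action control path takes the lock. */
static void control_path_example(void)
{
        spin_lock_bh(&zones_lock_example);      /* softirqs disabled while held */
        /* ... lookup / insert in the per-zone hashtable ... */
        spin_unlock_bh(&zones_lock_example);
}

/* Softirq context: an RCU callback dropping the last action reference ends
 * up taking the same lock.  With plain spin_lock(), a softirq landing on a
 * CPU that already holds the lock would spin forever, which is what lockdep
 * flags as a possible deadlock; spin_lock_bh() closes that window.
 */
static void rcu_callback_path_example(void)
{
        spin_lock_bh(&zones_lock_example);
        /* ... drop the reference, possibly remove the per-zone table ... */
        spin_unlock_bh(&zones_lock_example);
}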



On 3/7/2020 10:53 PM, Eric Dumazet wrote:

>
> On 3/7/20 12:12 PM, Eric Dumazet wrote:
>>
>> On 3/3/20 7:57 AM, Paul Blakey wrote:
>>> Use the NF flow tables infrastructure for CT offload.
>>>
>>> Create a nf flow table per zone.
>>>
>>> Next patches will add FT entries to this table, and do
>>> the software offload.
>>>
>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>>> ---
>>>   v4->v5:
>>>     Added reviewed by Jiri, thanks!
>>>   v3->v4:
>>>     Alloc GFP_ATOMIC
>>>   v2->v3:
>>>     Ditch re-locking to alloc, and use atomic allocation
>>>   v1->v2:
>>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>>     this removes cleanup cb and netfilter patches, and is simpler
>>>     Removed accidental mlx5/core/en_tc.c change
>>>     Removed reviewed by Jiri - patch changed
>>>
>>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
>>
>> Since you still hold zones_lock spinlock, a splat should occur.
>>
>> "BUG: sleeping function called from invalid context in  ..."
>>
>> DEBUG_ATOMIC_SLEEP=y is your friend.
>>
>> And it is always a good thing to make sure a patch does not trigger a lockdep splat
>>
>> CONFIG_PROVE_LOCKING=y
> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.
>
> I can not test the following fix, any objections before I submit this officially ?
>
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -35,15 +35,15 @@
>  
>  static struct workqueue_struct *act_ct_wq;
>  static struct rhashtable zones_ht;
> -static DEFINE_SPINLOCK(zones_lock);
> +static DEFINE_MUTEX(zones_mutex);
>  
>  struct tcf_ct_flow_table {
>         struct rhash_head node; /* In zones tables */
>  
>         struct rcu_work rwork;
>         struct nf_flowtable nf_ft;
> +       refcount_t ref;
>         u16 zone;
> -       u32 ref;
>  
>         bool dying;
>  };
> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>         struct tcf_ct_flow_table *ct_ft;
>         int err = -ENOMEM;
>  
> -       spin_lock_bh(&zones_lock);
> +       mutex_lock(&zones_mutex);
>         ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
> -       if (ct_ft)
> -               goto take_ref;
> +       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
> +               goto out_unlock;
>  
> -       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
> +       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
>         if (!ct_ft)
>                 goto err_alloc;
> +       refcount_set(&ct_ft->ref, 1);
>  
>         ct_ft->zone = params->zone;
>         err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>                 goto err_init;
>  
>         __module_get(THIS_MODULE);
> -take_ref:
> +out_unlock:
>         params->ct_ft = ct_ft;
> -       ct_ft->ref++;
> -       spin_unlock_bh(&zones_lock);
> +       mutex_unlock(&zones_mutex);
>  
>         return 0;
>  
> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>  err_insert:
>         kfree(ct_ft);
>  err_alloc:
> -       spin_unlock_bh(&zones_lock);
> +       mutex_unlock(&zones_mutex);
>         return err;
>  }
>  
> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
>  {
>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>  
> -       spin_lock_bh(&zones_lock);
> -       if (--params->ct_ft->ref == 0) {
> +       if (refcount_dec_and_test(&params->ct_ft->ref)) {
>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>         }
> -       spin_unlock_bh(&zones_lock);
>  }
>  
>  static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-08  8:11       ` Paul Blakey
@ 2020-03-08  8:15         ` Paul Blakey
  2020-03-08 20:42           ` Eric Dumazet
  0 siblings, 1 reply; 14+ messages in thread
From: Paul Blakey @ 2020-03-08  8:15 UTC (permalink / raw)
  To: Eric Dumazet, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

On 3/8/2020 10:11 AM, Paul Blakey wrote:

> iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq.
>
> I got a possible deadlock splat for that.

Here I meant this call_rcu() path:

static void tcf_ct_cleanup(struct tc_action *a)
{
        struct tcf_ct_params *params;
        struct tcf_ct *c = to_ct(a);

        params = rcu_dereference_protected(c->params, 1);
        if (params)
                call_rcu(&params->rcu, tcf_ct_params_free);
}

static void tcf_ct_params_free(struct rcu_head *head)
{
        struct tcf_ct_params *params = container_of(head,
                                                    struct tcf_ct_params, rcu);

        tcf_ct_flow_table_put(params);

...


>
>
> On 3/7/2020 10:53 PM, Eric Dumazet wrote:
>
>> On 3/7/20 12:12 PM, Eric Dumazet wrote:
>>> On 3/3/20 7:57 AM, Paul Blakey wrote:
>>>> Use the NF flow tables infrastructure for CT offload.
>>>>
>>>> Create a nf flow table per zone.
>>>>
>>>> Next patches will add FT entries to this table, and do
>>>> the software offload.
>>>>
>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>>>> ---
>>>>   v4->v5:
>>>>     Added reviewed by Jiri, thanks!
>>>>   v3->v4:
>>>>     Alloc GFP_ATOMIC
>>>>   v2->v3:
>>>>     Ditch re-locking to alloc, and use atomic allocation
>>>>   v1->v2:
>>>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>>>     this removes cleanup cb and netfilter patches, and is simpler
>>>>     Removed accidental mlx5/core/en_tc.c change
>>>>     Removed reviewed by Jiri - patch changed
>>>>
>>>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
>>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
>>>
>>> Since you still hold zones_lock spinlock, a splat should occur.
>>>
>>> "BUG: sleeping function called from invalid context in  ..."
>>>
>>> DEBUG_ATOMIC_SLEEP=y is your friend.
>>>
>>> And it is always a good thing to make sure a patch does not trigger a lockdep splat
>>>
>>> CONFIG_PROVE_LOCKING=y
>> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.
>>
>> I can not test the following fix, any objections before I submit this officially ?
>>
>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
>> --- a/net/sched/act_ct.c
>> +++ b/net/sched/act_ct.c
>> @@ -35,15 +35,15 @@
>>  
>>  static struct workqueue_struct *act_ct_wq;
>>  static struct rhashtable zones_ht;
>> -static DEFINE_SPINLOCK(zones_lock);
>> +static DEFINE_MUTEX(zones_mutex);
>>  
>>  struct tcf_ct_flow_table {
>>         struct rhash_head node; /* In zones tables */
>>  
>>         struct rcu_work rwork;
>>         struct nf_flowtable nf_ft;
>> +       refcount_t ref;
>>         u16 zone;
>> -       u32 ref;
>>  
>>         bool dying;
>>  };
>> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>         struct tcf_ct_flow_table *ct_ft;
>>         int err = -ENOMEM;
>>  
>> -       spin_lock_bh(&zones_lock);
>> +       mutex_lock(&zones_mutex);
>>         ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
>> -       if (ct_ft)
>> -               goto take_ref;
>> +       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
>> +               goto out_unlock;
>>  
>> -       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
>> +       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
>>         if (!ct_ft)
>>                 goto err_alloc;
>> +       refcount_set(&ct_ft->ref, 1);
>>  
>>         ct_ft->zone = params->zone;
>>         err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
>> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>                 goto err_init;
>>  
>>         __module_get(THIS_MODULE);
>> -take_ref:
>> +out_unlock:
>>         params->ct_ft = ct_ft;
>> -       ct_ft->ref++;
>> -       spin_unlock_bh(&zones_lock);
>> +       mutex_unlock(&zones_mutex);
>>  
>>         return 0;
>>  
>> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>  err_insert:
>>         kfree(ct_ft);
>>  err_alloc:
>> -       spin_unlock_bh(&zones_lock);
>> +       mutex_unlock(&zones_mutex);
>>         return err;
>>  }
>>  
>> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
>>  {
>>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>>  
>> -       spin_lock_bh(&zones_lock);
>> -       if (--params->ct_ft->ref == 0) {
>> +       if (refcount_dec_and_test(&params->ct_ft->ref)) {
>>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>>         }
>> -       spin_unlock_bh(&zones_lock);
>>  }
>>  
>>  static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-08  8:15         ` Paul Blakey
@ 2020-03-08 20:42           ` Eric Dumazet
  2020-03-09  8:02             ` Paul Blakey
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2020-03-08 20:42 UTC (permalink / raw)
  To: Paul Blakey, Eric Dumazet, Saeed Mahameed, Oz Shlomo,
	Jakub Kicinski, Vlad Buslov, David Miller, netdev, Jiri Pirko,
	Roi Dayan



On 3/8/20 12:15 AM, Paul Blakey wrote:
> On 3/8/2020 10:11 AM, Paul Blakey wrote:
> 
>> iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq.
>>
>> I got a possible deadlock splat for that.
> 
> Here I meant this call rcu:
> 
> static void tcf_ct_cleanup(struct tc_action *a)
> {
>         struct tcf_ct_params *params;
>         struct tcf_ct *c = to_ct(a);
> 
>         params = rcu_dereference_protected(c->params, 1);
>         if (params)
>                 call_rcu(&params->rcu, tcf_ct_params_free);
> }
> 

Yes, understood, but to solve this problem we had many other choices that still keep GFP_KERNEL allocations and a mutex for the control path.

Have you read my patch?

By not even trying to take a spinlock in tcf_ct_flow_table_put(),
and instead using a refcount_t for ->ref, we avoid this issue in the first place.

static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
{
        struct tcf_ct_flow_table *ct_ft = params->ct_ft;

        if (refcount_dec_and_test(&params->ct_ft->ref)) {
                rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
                INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
                queue_rcu_work(act_ct_wq, &ct_ft->rwork);
        }
}
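One subtlety worth spelling out, sketched from the get side of the diff above: once the put side no longer serializes on zones_lock, a lookup can race with a concurrent put dropping the last reference, which is what refcount_inc_not_zero() guards against (simplified excerpt, explanatory comments added here):

        ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
        if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
                goto out_unlock;        /* entry still live, reference taken */

        /* Either no table exists for this zone yet, or the one we found has
         * already dropped to zero references and is about to be freed by
         * tcf_ct_flow_table_put(); in both cases allocate a fresh one.
         */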

> static void tcf_ct_params_free(struct rcu_head *head)
> {
>         struct tcf_ct_params *params = container_of(head,
>                                                     struct tcf_ct_params, rcu);
> 
>         tcf_ct_flow_table_put(params);
> 
> ...
> 
> 
>>
>>
>> On 3/7/2020 10:53 PM, Eric Dumazet wrote:
>>
>>> On 3/7/20 12:12 PM, Eric Dumazet wrote:
>>>> On 3/3/20 7:57 AM, Paul Blakey wrote:
>>>>> Use the NF flow tables infrastructure for CT offload.
>>>>>
>>>>> Create a nf flow table per zone.
>>>>>
>>>>> Next patches will add FT entries to this table, and do
>>>>> the software offload.
>>>>>
>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>>>>> ---
>>>>>   v4->v5:
>>>>>     Added reviewed by Jiri, thanks!
>>>>>   v3->v4:
>>>>>     Alloc GFP_ATOMIC
>>>>>   v2->v3:
>>>>>     Ditch re-locking to alloc, and use atomic allocation
>>>>>   v1->v2:
>>>>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>>>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>>>>     this removes cleanup cb and netfilter patches, and is simpler
>>>>>     Removed accidental mlx5/core/en_tc.c change
>>>>>     Removed reviewed by Jiri - patch changed
>>>>>
>>>>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
>>>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
>>>>
>>>> Since you still hold zones_lock spinlock, a splat should occur.
>>>>
>>>> "BUG: sleeping function called from invalid context in  ..."
>>>>
>>>> DEBUG_ATOMIC_SLEEP=y is your friend.
>>>>
>>>> And it is always a good thing to make sure a patch does not trigger a lockdep splat
>>>>
>>>> CONFIG_PROVE_LOCKING=y
>>> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.
>>>
>>> I can not test the following fix, any objections before I submit this officially ?
>>>
>>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>>> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
>>> --- a/net/sched/act_ct.c
>>> +++ b/net/sched/act_ct.c
>>> @@ -35,15 +35,15 @@
>>>  
>>>  static struct workqueue_struct *act_ct_wq;
>>>  static struct rhashtable zones_ht;
>>> -static DEFINE_SPINLOCK(zones_lock);
>>> +static DEFINE_MUTEX(zones_mutex);
>>>  
>>>  struct tcf_ct_flow_table {
>>>         struct rhash_head node; /* In zones tables */
>>>  
>>>         struct rcu_work rwork;
>>>         struct nf_flowtable nf_ft;
>>> +       refcount_t ref;
>>>         u16 zone;
>>> -       u32 ref;
>>>  
>>>         bool dying;
>>>  };
>>> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>         struct tcf_ct_flow_table *ct_ft;
>>>         int err = -ENOMEM;
>>>  
>>> -       spin_lock_bh(&zones_lock);
>>> +       mutex_lock(&zones_mutex);
>>>         ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
>>> -       if (ct_ft)
>>> -               goto take_ref;
>>> +       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
>>> +               goto out_unlock;
>>>  
>>> -       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
>>> +       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
>>>         if (!ct_ft)
>>>                 goto err_alloc;
>>> +       refcount_set(&ct_ft->ref, 1);
>>>  
>>>         ct_ft->zone = params->zone;
>>>         err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
>>> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>                 goto err_init;
>>>  
>>>         __module_get(THIS_MODULE);
>>> -take_ref:
>>> +out_unlock:
>>>         params->ct_ft = ct_ft;
>>> -       ct_ft->ref++;
>>> -       spin_unlock_bh(&zones_lock);
>>> +       mutex_unlock(&zones_mutex);
>>>  
>>>         return 0;
>>>  
>>> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>  err_insert:
>>>         kfree(ct_ft);
>>>  err_alloc:
>>> -       spin_unlock_bh(&zones_lock);
>>> +       mutex_unlock(&zones_mutex);
>>>         return err;
>>>  }
>>>  
>>> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
>>>  {
>>>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>>>  
>>> -       spin_lock_bh(&zones_lock);
>>> -       if (--params->ct_ft->ref == 0) {
>>> +       if (refcount_dec_and_test(&params->ct_ft->ref)) {
>>>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>>>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>>>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>>>         }
>>> -       spin_unlock_bh(&zones_lock);
>>>  }
>>>  
>>>  static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
>>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-08 20:42           ` Eric Dumazet
@ 2020-03-09  8:02             ` Paul Blakey
  0 siblings, 0 replies; 14+ messages in thread
From: Paul Blakey @ 2020-03-09  8:02 UTC (permalink / raw)
  To: Eric Dumazet, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan


On 3/8/2020 10:42 PM, Eric Dumazet wrote:
>
> On 3/8/20 12:15 AM, Paul Blakey wrote:
>> On 3/8/2020 10:11 AM, Paul Blakey wrote:
>>
>>> iirc I did the spin lock bh because we can be called from queue work rcu handler , so I wanted to disable soft irq.
>>>
>>> I got a possible deadlock splat for that.
>> Here I meant this call rcu:
>>
>> static void tcf_ct_cleanup(struct tc_action *a)
>> {
>>         struct tcf_ct_params *params;
>>         struct tcf_ct *c = to_ct(a);
>>
>>         params = rcu_dereference_protected(c->params, 1);
>>         if (params)
>>                 call_rcu(&params->rcu, tcf_ct_params_free);
>> }
>>
> Yes, understood, but to solve this problem we had many other choices,
> and still keeping GFP_KERNEL allocations and a mutex for control path.
>
> Have you read my patch ?
>
> By not even trying to get a spinlock in tcf_ct_flow_table_put(),
> and instead use a refcount for ->ref, we avoid having this issue in the first place.
>
> static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
> {
>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>
>         if (refcount_dec_and_test(&params->ct_ft->ref)) {
>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>         }
> }

Sorry, I missed that; thanks for the fix.

>> static void tcf_ct_params_free(struct rcu_head *head)
>> {
>>         struct tcf_ct_params *params = container_of(head,
>>                                                     struct tcf_ct_params, rcu);
>>
>>         tcf_ct_flow_table_put(params);
>> ...
>>
>>
>>>
>>> On 3/7/2020 10:53 PM, Eric Dumazet wrote:
>>>
>>>> On 3/7/20 12:12 PM, Eric Dumazet wrote:
>>>>> On 3/3/20 7:57 AM, Paul Blakey wrote:
>>>>>> Use the NF flow tables infrastructure for CT offload.
>>>>>>
>>>>>> Create a nf flow table per zone.
>>>>>>
>>>>>> Next patches will add FT entries to this table, and do
>>>>>> the software offload.
>>>>>>
>>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
>>>>>> ---
>>>>>>   v4->v5:
>>>>>>     Added reviewed by Jiri, thanks!
>>>>>>   v3->v4:
>>>>>>     Alloc GFP_ATOMIC
>>>>>>   v2->v3:
>>>>>>     Ditch re-locking to alloc, and use atomic allocation
>>>>>>   v1->v2:
>>>>>>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>>>>>>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>>>>>>     this removes cleanup cb and netfilter patches, and is simpler
>>>>>>     Removed accidental mlx5/core/en_tc.c change
>>>>>>     Removed reviewed by Jiri - patch changed
>>>>>>
>>>>>> +	err = nf_flow_table_init(&ct_ft->nf_ft);
>>>>> This call is going to allocate a rhashtable (GFP_KERNEL allocations that might sleep)
>>>>>
>>>>> Since you still hold zones_lock spinlock, a splat should occur.
>>>>>
>>>>> "BUG: sleeping function called from invalid context in  ..."
>>>>>
>>>>> DEBUG_ATOMIC_SLEEP=y is your friend.
>>>>>
>>>>> And it is always a good thing to make sure a patch does not trigger a lockdep splat
>>>>>
>>>>> CONFIG_PROVE_LOCKING=y
>>>> Also abusing a spinlock and GFP_ATOMIC allocations in control path is highly discouraged.
>>>>
>>>> I can not test the following fix, any objections before I submit this officially ?
>>>>
>>>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>>>> index 23eba61f0f819212a3522c3c63b938d0b8d997e2..3d9e678d7d5336f1746035745b091bea0dcb5fdd 100644
>>>> --- a/net/sched/act_ct.c
>>>> +++ b/net/sched/act_ct.c
>>>> @@ -35,15 +35,15 @@
>>>>  
>>>>  static struct workqueue_struct *act_ct_wq;
>>>>  static struct rhashtable zones_ht;
>>>> -static DEFINE_SPINLOCK(zones_lock);
>>>> +static DEFINE_MUTEX(zones_mutex);
>>>>  
>>>>  struct tcf_ct_flow_table {
>>>>         struct rhash_head node; /* In zones tables */
>>>>  
>>>>         struct rcu_work rwork;
>>>>         struct nf_flowtable nf_ft;
>>>> +       refcount_t ref;
>>>>         u16 zone;
>>>> -       u32 ref;
>>>>  
>>>>         bool dying;
>>>>  };
>>>> @@ -64,14 +64,15 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>>         struct tcf_ct_flow_table *ct_ft;
>>>>         int err = -ENOMEM;
>>>>  
>>>> -       spin_lock_bh(&zones_lock);
>>>> +       mutex_lock(&zones_mutex);
>>>>         ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
>>>> -       if (ct_ft)
>>>> -               goto take_ref;
>>>> +       if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
>>>> +               goto out_unlock;
>>>>  
>>>> -       ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC);
>>>> +       ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
>>>>         if (!ct_ft)
>>>>                 goto err_alloc;
>>>> +       refcount_set(&ct_ft->ref, 1);
>>>>  
>>>>         ct_ft->zone = params->zone;
>>>>         err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
>>>> @@ -84,10 +85,9 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>>                 goto err_init;
>>>>  
>>>>         __module_get(THIS_MODULE);
>>>> -take_ref:
>>>> +out_unlock:
>>>>         params->ct_ft = ct_ft;
>>>> -       ct_ft->ref++;
>>>> -       spin_unlock_bh(&zones_lock);
>>>> +       mutex_unlock(&zones_mutex);
>>>>  
>>>>         return 0;
>>>>  
>>>> @@ -96,7 +96,7 @@ static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
>>>>  err_insert:
>>>>         kfree(ct_ft);
>>>>  err_alloc:
>>>> -       spin_unlock_bh(&zones_lock);
>>>> +       mutex_unlock(&zones_mutex);
>>>>         return err;
>>>>  }
>>>>  
>>>> @@ -116,13 +116,11 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
>>>>  {
>>>>         struct tcf_ct_flow_table *ct_ft = params->ct_ft;
>>>>  
>>>> -       spin_lock_bh(&zones_lock);
>>>> -       if (--params->ct_ft->ref == 0) {
>>>> +       if (refcount_dec_and_test(&params->ct_ft->ref)) {
>>>>                 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
>>>>                 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
>>>>                 queue_rcu_work(act_ct_wq, &ct_ft->rwork);
>>>>         }
>>>> -       spin_unlock_bh(&zones_lock);
>>>>  }
>>>>  
>>>>  static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
>>>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-03-09  8:02 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-03 15:57 [PATCH net-next v6 0/3] act_ct: Software offload of conntrack_in Paul Blakey
2020-03-03 15:57 ` [PATCH net-next v6 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
2020-03-03 16:51   ` Marcelo Ricardo Leitner
2020-03-07 20:12   ` Eric Dumazet
2020-03-07 20:53     ` Eric Dumazet
2020-03-08  8:11       ` Paul Blakey
2020-03-08  8:15         ` Paul Blakey
2020-03-08 20:42           ` Eric Dumazet
2020-03-09  8:02             ` Paul Blakey
2020-03-03 15:57 ` [PATCH net-next v6 2/3] net/sched: act_ct: Offload established connections to flow table Paul Blakey
2020-03-03 16:51   ` Marcelo Ricardo Leitner
2020-03-03 15:57 ` [PATCH net-next v6 3/3] net/sched: act_ct: Software offload of established flows Paul Blakey
2020-03-03 16:51   ` Marcelo Ricardo Leitner
2020-03-03 18:32   ` Nikolay Aleksandrov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).