[PATCH net-next v3 0/3] act_ct: Software offload of conntrack

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH net-next v3 0/3] act_ct: Software offload of conntrack_in
@ 2020-03-03 12:52 Paul Blakey
  2020-03-03 12:52 ` [PATCH net-next v3 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Paul Blakey @ 2020-03-03 12:52 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

This series adds software offload of connections with an established
ct state using the NF flow table offload infrastructure, so
once such flows are offloaded, they will not pass through conntrack
again, and instead act_ct will restore the conntrack info metadata
on the skb to the state it had on the offload event - established.

Act_ct maintains an FT instance per ct zone. Flow table entries
are created, per ct connection, when connections enter an established
state and deleted otherwise. Once an entry is created, the FT assumes
ownership of the entry, and manages it's aging.

On the datapath, first lookup the skb in the zone's FT before going
into conntrack, and if a matching flow is found, restore the conntrack
info metadata on the skb, and skip calling conntrack.

Note that this patchset is part of the connection tracking offload feature.
Hardware offload of connections with an established ct state series will follow
this one.

Changelog:
   v1->v2:
     Removed now unused netfilter patches

Paul Blakey (3):
  net/sched: act_ct: Create nf flow table per zone
  net/sched: act_ct: Offload established connections to flow table
  net/sched: act_ct: Software offload of established flows

 include/net/tc_act/tc_ct.h |   2 +
 net/sched/Kconfig          |   2 +-
 net/sched/act_ct.c         | 353 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 355 insertions(+), 2 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH net-next v3 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-03 12:52 [PATCH net-next v3 0/3] act_ct: Software offload of conntrack_in Paul Blakey
@ 2020-03-03 12:52 ` Paul Blakey
  2020-03-03 13:02   ` Paul Blakey
  2020-03-03 12:52 ` [PATCH net-next v3 2/3] net/sched: act_ct: Offload established connections to flow table Paul Blakey
  2020-03-03 12:52 ` [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows Paul Blakey
  2 siblings, 1 reply; 10+ messages in thread
From: Paul Blakey @ 2020-03-03 12:52 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

Use the NF flow tables infrastructure for CT offload.

Create a nf flow table per zone.

Next patches will add FT entries to this table, and do
the software offload.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
---
Changelog:
  v2->v3:
    Ditch re-locking to alloc, and use atomic allocation
  v1->v2:
    Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
    Free ft on last tc act instance instead of last instance + last offloaded tuple,
    this removes cleanup cb and netfilter patches, and is simpler
    Removed accidental mlx5/core/en_tc.c change
    Removed reviewed by Jiri - patch changed

 include/net/tc_act/tc_ct.h |   2 +
 net/sched/Kconfig          |   2 +-
 net/sched/act_ct.c         | 134 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index a8b1564..cf3492e 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -25,6 +25,8 @@ struct tcf_ct_params {
 	u16 ct_action;
 
 	struct rcu_head rcu;
+
+	struct tcf_ct_flow_table *ct_ft;
 };
 
 struct tcf_ct {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index edde0e5..bfbefb7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY
 
 config NET_ACT_CT
 	tristate "connection tracking tc action"
-	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
+	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
 	help
 	  Say Y here to allow sending the packets to conntrack module.
 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index f685c0d..d36417f 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -15,6 +15,7 @@
 #include <linux/pkt_cls.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/rhashtable.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
@@ -24,6 +25,7 @@
 #include <uapi/linux/tc_act/tc_ct.h>
 #include <net/tc_act/tc_ct.h>
 
+#include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_zones.h>
@@ -31,6 +33,108 @@
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #include <uapi/linux/netfilter/nf_nat.h>
 
+static struct workqueue_struct *act_ct_wq;
+static struct rhashtable zones_ht;
+static DEFINE_SPINLOCK(zones_lock);
+
+struct tcf_ct_flow_table {
+	struct rhash_head node; /* In zones tables */
+
+	struct rcu_work rwork;
+	struct nf_flowtable nf_ft;
+	u16 zone;
+	u32 ref;
+
+	bool dying;
+};
+
+static const struct rhashtable_params zones_params = {
+	.head_offset = offsetof(struct tcf_ct_flow_table, node),
+	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
+	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
+	.automatic_shrinking = true,
+};
+
+static struct nf_flowtable_type flowtable_ct = {
+	.owner		= THIS_MODULE,
+};
+
+static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
+{
+	struct tcf_ct_flow_table *ct_ft;
+	int err = -ENOMEM;
+
+	spin_lock_bh(&zones_lock);
+	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
+	if (ct_ft)
+		goto take_ref;
+
+	ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
+	if (!ct_ft)
+		goto err_alloc;
+
+	ct_ft->zone = params->zone;
+	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
+	if (err)
+		goto err_insert;
+
+	ct_ft->nf_ft.type = &flowtable_ct;
+	err = nf_flow_table_init(&ct_ft->nf_ft);
+	if (err)
+		goto err_init;
+
+	__module_get(THIS_MODULE);
+take_ref:
+	params->ct_ft = ct_ft;
+	ct_ft->ref++;
+	spin_unlock_bh(&zones_lock);
+
+	return 0;
+
+err_init:
+	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+err_insert:
+	kfree(ct_ft);
+err_alloc:
+	spin_unlock_bh(&zones_lock);
+	return err;
+}
+
+static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
+{
+	struct tcf_ct_flow_table *ct_ft;
+
+	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
+			     rwork);
+	nf_flow_table_free(&ct_ft->nf_ft);
+	kfree(ct_ft);
+
+	module_put(THIS_MODULE);
+}
+
+static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
+{
+	struct tcf_ct_flow_table *ct_ft = params->ct_ft;
+
+	spin_lock_bh(&zones_lock);
+	if (--params->ct_ft->ref == 0) {
+		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
+		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
+	}
+	spin_unlock_bh(&zones_lock);
+}
+
+static int tcf_ct_flow_tables_init(void)
+{
+	return rhashtable_init(&zones_ht, &zones_params);
+}
+
+static void tcf_ct_flow_tables_uninit(void)
+{
+	rhashtable_destroy(&zones_ht);
+}
+
 static struct tc_action_ops act_ct_ops;
 static unsigned int ct_net_id;
 
@@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head)
 	struct tcf_ct_params *params = container_of(head,
 						    struct tcf_ct_params, rcu);
 
+	tcf_ct_flow_table_put(params);
+
 	if (params->tmpl)
 		nf_conntrack_put(&params->tmpl->ct_general);
 	kfree(params);
@@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
 	if (err)
 		goto cleanup;
 
+	err = tcf_ct_flow_table_get(params);
+	if (err)
+		goto cleanup;
+
 	spin_lock_bh(&c->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	params = rcu_replace_pointer(c->params, params,
@@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list)
 
 static int __init ct_init_module(void)
 {
-	return tcf_register_action(&act_ct_ops, &ct_net_ops);
+	int err;
+
+	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
+	if (!act_ct_wq)
+		return -ENOMEM;
+
+	err = tcf_ct_flow_tables_init();
+	if (err)
+		goto err_tbl_init;
+
+	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
+	if (err)
+		goto err_register;
+
+	return 0;
+
+err_tbl_init:
+	destroy_workqueue(act_ct_wq);
+err_register:
+	tcf_ct_flow_tables_uninit();
+	return err;
 }
 
 static void __exit ct_cleanup_module(void)
 {
 	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+	tcf_ct_flow_tables_uninit();
+	destroy_workqueue(act_ct_wq);
 }
 
 module_init(ct_init_module);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH net-next v3 2/3] net/sched: act_ct: Offload established connections to flow table
  2020-03-03 12:52 [PATCH net-next v3 0/3] act_ct: Software offload of conntrack_in Paul Blakey
  2020-03-03 12:52 ` [PATCH net-next v3 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
@ 2020-03-03 12:52 ` Paul Blakey
  2020-03-03 12:52 ` [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows Paul Blakey
  2 siblings, 0 replies; 10+ messages in thread
From: Paul Blakey @ 2020-03-03 12:52 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

Add a ft entry when connections enter an established state and delete
the connections when they leave the established state.

The flow table assumes ownership of the connection. In the following
patch act_ct will lookup the ct state from the FT. In future patches,
drivers will register for callbacks for ft add/del events and will be
able to use the information to offload the connections.

Note that connection aging is managed by the FT.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 net/sched/act_ct.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index d36417f..6ad0553 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -125,6 +125,67 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
 	spin_unlock_bh(&zones_lock);
 }
 
+static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
+				  struct nf_conn *ct,
+				  bool tcp)
+{
+	struct flow_offload *entry;
+	int err;
+
+	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+		return;
+
+	entry = flow_offload_alloc(ct);
+	if (!entry) {
+		WARN_ON_ONCE(1);
+		goto err_alloc;
+	}
+
+	if (tcp) {
+		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+	}
+
+	err = flow_offload_add(&ct_ft->nf_ft, entry);
+	if (err)
+		goto err_add;
+
+	return;
+
+err_add:
+	flow_offload_free(entry);
+err_alloc:
+	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+}
+
+static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
+					   struct nf_conn *ct,
+					   enum ip_conntrack_info ctinfo)
+{
+	bool tcp = false;
+
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+		return;
+
+	switch (nf_ct_protonum(ct)) {
+	case IPPROTO_TCP:
+		tcp = true;
+		if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+			return;
+		break;
+	case IPPROTO_UDP:
+		break;
+	default:
+		return;
+	}
+
+	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
+	    ct->status & IPS_SEQ_ADJUST)
+		return;
+
+	tcf_ct_flow_table_add(ct_ft, ct, tcp);
+}
+
 static int tcf_ct_flow_tables_init(void)
 {
 	return rhashtable_init(&zones_ht, &zones_params);
@@ -578,6 +639,8 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 		nf_conntrack_confirm(skb);
 	}
 
+	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
+
 out_push:
 	skb_push_rcsum(skb, nh_ofs);
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows
  2020-03-03 12:52 [PATCH net-next v3 0/3] act_ct: Software offload of conntrack_in Paul Blakey
  2020-03-03 12:52 ` [PATCH net-next v3 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
  2020-03-03 12:52 ` [PATCH net-next v3 2/3] net/sched: act_ct: Offload established connections to flow table Paul Blakey
@ 2020-03-03 12:52 ` Paul Blakey
  2020-03-03 13:10   ` Nikolay Aleksandrov
  2 siblings, 1 reply; 10+ messages in thread
From: Paul Blakey @ 2020-03-03 12:52 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

Offload nf conntrack processing by looking up the 5-tuple in the
zone's flow table.

The nf conntrack module will process the packets until a connection is
in established state. Once in established state, the ct state pointer
(nf_conn) will be restored on the skb from a successful ft lookup.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
Changelog:
  v1->v2:
   Add !skip_add curly braces
   Removed extra setting thoff again
   Check tcp proto outside of tcf_ct_flow_table_check_tcp

 net/sched/act_ct.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 158 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 6ad0553..2017f8f 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -186,6 +186,155 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
 	tcf_ct_flow_table_add(ct_ft, ct, tcp);
 }
 
+static bool
+tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
+				  struct flow_offload_tuple *tuple)
+{
+	struct flow_ports *ports;
+	unsigned int thoff;
+	struct iphdr *iph;
+
+	if (!pskb_may_pull(skb, sizeof(*iph)))
+		return false;
+
+	iph = ip_hdr(skb);
+	thoff = iph->ihl * 4;
+
+	if (ip_is_fragment(iph) ||
+	    unlikely(thoff != sizeof(struct iphdr)))
+		return false;
+
+	if (iph->protocol != IPPROTO_TCP &&
+	    iph->protocol != IPPROTO_UDP)
+		return false;
+
+	if (iph->ttl <= 1)
+		return false;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return false;
+
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v4.s_addr = iph->saddr;
+	tuple->dst_v4.s_addr = iph->daddr;
+	tuple->src_port = ports->source;
+	tuple->dst_port = ports->dest;
+	tuple->l3proto = AF_INET;
+	tuple->l4proto = iph->protocol;
+
+	return true;
+}
+
+static bool
+tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
+				  struct flow_offload_tuple *tuple)
+{
+	struct flow_ports *ports;
+	struct ipv6hdr *ip6h;
+	unsigned int thoff;
+
+	if (!pskb_may_pull(skb, sizeof(*ip6h)))
+		return false;
+
+	ip6h = ipv6_hdr(skb);
+
+	if (ip6h->nexthdr != IPPROTO_TCP &&
+	    ip6h->nexthdr != IPPROTO_UDP)
+		return false;
+
+	if (ip6h->hop_limit <= 1)
+		return false;
+
+	thoff = sizeof(*ip6h);
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return false;
+
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v6 = ip6h->saddr;
+	tuple->dst_v6 = ip6h->daddr;
+	tuple->src_port = ports->source;
+	tuple->dst_port = ports->dest;
+	tuple->l3proto = AF_INET6;
+	tuple->l4proto = ip6h->nexthdr;
+
+	return true;
+}
+
+static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow,
+					struct sk_buff *skb,
+					unsigned int thoff)
+{
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
+		return false;
+
+	tcph = (void *)(skb_network_header(skb) + thoff);
+	if (unlikely(tcph->fin || tcph->rst)) {
+		flow_offload_teardown(flow);
+		return false;
+	}
+
+	return true;
+}
+
+static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
+				     struct sk_buff *skb,
+				     u8 family)
+{
+	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct flow_offload_tuple tuple = {};
+	enum ip_conntrack_info ctinfo;
+	struct flow_offload *flow;
+	struct nf_conn *ct;
+	unsigned int thoff;
+	int ip_proto;
+	u8 dir;
+
+	/* Previously seen or loopback */
+	ct = nf_ct_get(skb, &ctinfo);
+	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
+		return false;
+
+	switch (family) {
+	case NFPROTO_IPV4:
+		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple))
+			return false;
+		break;
+	case NFPROTO_IPV6:
+		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple))
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	tuplehash = flow_offload_lookup(nf_ft, &tuple);
+	if (!tuplehash)
+		return false;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+	ct = flow->ct;
+
+	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
+						    IP_CT_ESTABLISHED_REPLY;
+
+	thoff = ip_hdr(skb)->ihl * 4;
+	ip_proto = ip_hdr(skb)->protocol;
+	if (ip_proto == IPPROTO_TCP &&
+	    !tcf_ct_flow_table_check_tcp(flow, skb, thoff))
+		return false;
+
+	nf_conntrack_get(&ct->ct_general);
+	nf_ct_set(skb, ct, ctinfo);
+
+	return true;
+}
+
 static int tcf_ct_flow_tables_init(void)
 {
 	return rhashtable_init(&zones_ht, &zones_params);
@@ -554,6 +703,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	struct nf_hook_state state;
 	int nh_ofs, err, retval;
 	struct tcf_ct_params *p;
+	bool skip_add = false;
 	struct nf_conn *ct;
 	u8 family;
 
@@ -603,6 +753,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	 */
 	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
 	if (!cached) {
+		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
+			skip_add = true;
+			goto do_nat;
+		}
+
 		/* Associate skb with specified zone. */
 		if (tmpl) {
 			ct = nf_ct_get(skb, &ctinfo);
@@ -620,6 +775,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 			goto out_push;
 	}
 
+do_nat:
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct)
 		goto out_push;
@@ -637,10 +793,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 		 * even if the connection is already confirmed.
 		 */
 		nf_conntrack_confirm(skb);
+	} else if (!skip_add) {
+		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
 	}
 
-	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
-
 out_push:
 	skb_push_rcsum(skb, nh_ofs);
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next v3 1/3] net/sched: act_ct: Create nf flow table per zone
  2020-03-03 12:52 ` [PATCH net-next v3 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
@ 2020-03-03 13:02   ` Paul Blakey
  0 siblings, 0 replies; 10+ messages in thread
From: Paul Blakey @ 2020-03-03 13:02 UTC (permalink / raw)
  To: Saeed Mahameed, Oz Shlomo, Jakub Kicinski, Vlad Buslov,
	David Miller, netdev, Jiri Pirko, Roi Dayan


On 3/3/2020 2:52 PM, Paul Blakey wrote:
> Use the NF flow tables infrastructure for CT offload.
>
> Create a nf flow table per zone.
>
> Next patches will add FT entries to this table, and do
> the software offload.
>
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> ---
> Changelog:
>   v2->v3:
>     Ditch re-locking to alloc, and use atomic allocation
>   v1->v2:
>     Use spin_lock_bh instead of spin_lock, and unlock for alloc (as it can sleep)
>     Free ft on last tc act instance instead of last instance + last offloaded tuple,
>     this removes cleanup cb and netfilter patches, and is simpler
>     Removed accidental mlx5/core/en_tc.c change
>     Removed reviewed by Jiri - patch changed
>
>  include/net/tc_act/tc_ct.h |   2 +
>  net/sched/Kconfig          |   2 +-
>  net/sched/act_ct.c         | 134 ++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 136 insertions(+), 2 deletions(-)
>
> diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
> index a8b1564..cf3492e 100644
> --- a/include/net/tc_act/tc_ct.h
> +++ b/include/net/tc_act/tc_ct.h
> @@ -25,6 +25,8 @@ struct tcf_ct_params {
>  	u16 ct_action;
>  
>  	struct rcu_head rcu;
> +
> +	struct tcf_ct_flow_table *ct_ft;
>  };
>  
>  struct tcf_ct {
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index edde0e5..bfbefb7 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY
>  
>  config NET_ACT_CT
>  	tristate "connection tracking tc action"
> -	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
> +	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
>  	help
>  	  Say Y here to allow sending the packets to conntrack module.
>  
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index f685c0d..d36417f 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -15,6 +15,7 @@
>  #include <linux/pkt_cls.h>
>  #include <linux/ip.h>
>  #include <linux/ipv6.h>
> +#include <linux/rhashtable.h>
>  #include <net/netlink.h>
>  #include <net/pkt_sched.h>
>  #include <net/pkt_cls.h>
> @@ -24,6 +25,7 @@
>  #include <uapi/linux/tc_act/tc_ct.h>
>  #include <net/tc_act/tc_ct.h>
>  
> +#include <net/netfilter/nf_flow_table.h>
>  #include <net/netfilter/nf_conntrack.h>
>  #include <net/netfilter/nf_conntrack_core.h>
>  #include <net/netfilter/nf_conntrack_zones.h>
> @@ -31,6 +33,108 @@
>  #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
>  #include <uapi/linux/netfilter/nf_nat.h>
>  
> +static struct workqueue_struct *act_ct_wq;
> +static struct rhashtable zones_ht;
> +static DEFINE_SPINLOCK(zones_lock);
> +
> +struct tcf_ct_flow_table {
> +	struct rhash_head node; /* In zones tables */
> +
> +	struct rcu_work rwork;
> +	struct nf_flowtable nf_ft;
> +	u16 zone;
> +	u32 ref;
> +
> +	bool dying;
> +};
> +
> +static const struct rhashtable_params zones_params = {
> +	.head_offset = offsetof(struct tcf_ct_flow_table, node),
> +	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
> +	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
> +	.automatic_shrinking = true,
> +};
> +
> +static struct nf_flowtable_type flowtable_ct = {
> +	.owner		= THIS_MODULE,
> +};
> +
> +static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +	int err = -ENOMEM;
> +
> +	spin_lock_bh(&zones_lock);
> +	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
> +	if (ct_ft)
> +		goto take_ref;
> +
> +	ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);

Reverted it back wrong, should be atomic here.

Sending v4, sorry

> +	if (!ct_ft)
> +		goto err_alloc;
> +
> +	ct_ft->zone = params->zone;
> +	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
> +	if (err)
> +		goto err_insert;
> +
> +	ct_ft->nf_ft.type = &flowtable_ct;
> +	err = nf_flow_table_init(&ct_ft->nf_ft);
> +	if (err)
> +		goto err_init;
> +
> +	__module_get(THIS_MODULE);
> +take_ref:
> +	params->ct_ft = ct_ft;
> +	ct_ft->ref++;
> +	spin_unlock_bh(&zones_lock);
> +
> +	return 0;
> +
> +err_init:
> +	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +err_insert:
> +	kfree(ct_ft);
> +err_alloc:
> +	spin_unlock_bh(&zones_lock);
> +	return err;
> +}
> +
> +static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
> +{
> +	struct tcf_ct_flow_table *ct_ft;
> +
> +	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
> +			     rwork);
> +	nf_flow_table_free(&ct_ft->nf_ft);
> +	kfree(ct_ft);
> +
> +	module_put(THIS_MODULE);
> +}
> +
> +static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
> +{
> +	struct tcf_ct_flow_table *ct_ft = params->ct_ft;
> +
> +	spin_lock_bh(&zones_lock);
> +	if (--params->ct_ft->ref == 0) {
> +		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
> +		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
> +		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
> +	}
> +	spin_unlock_bh(&zones_lock);
> +}
> +
> +static int tcf_ct_flow_tables_init(void)
> +{
> +	return rhashtable_init(&zones_ht, &zones_params);
> +}
> +
> +static void tcf_ct_flow_tables_uninit(void)
> +{
> +	rhashtable_destroy(&zones_ht);
> +}
> +
>  static struct tc_action_ops act_ct_ops;
>  static unsigned int ct_net_id;
>  
> @@ -207,6 +311,8 @@ static void tcf_ct_params_free(struct rcu_head *head)
>  	struct tcf_ct_params *params = container_of(head,
>  						    struct tcf_ct_params, rcu);
>  
> +	tcf_ct_flow_table_put(params);
> +
>  	if (params->tmpl)
>  		nf_conntrack_put(&params->tmpl->ct_general);
>  	kfree(params);
> @@ -730,6 +836,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
>  	if (err)
>  		goto cleanup;
>  
> +	err = tcf_ct_flow_table_get(params);
> +	if (err)
> +		goto cleanup;
> +
>  	spin_lock_bh(&c->tcf_lock);
>  	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
>  	params = rcu_replace_pointer(c->params, params,
> @@ -974,12 +1084,34 @@ static void __net_exit ct_exit_net(struct list_head *net_list)
>  
>  static int __init ct_init_module(void)
>  {
> -	return tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	int err;
> +
> +	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
> +	if (!act_ct_wq)
> +		return -ENOMEM;
> +
> +	err = tcf_ct_flow_tables_init();
> +	if (err)
> +		goto err_tbl_init;
> +
> +	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
> +	if (err)
> +		goto err_register;
> +
> +	return 0;
> +
> +err_tbl_init:
> +	destroy_workqueue(act_ct_wq);
> +err_register:
> +	tcf_ct_flow_tables_uninit();
> +	return err;
>  }
>  
>  static void __exit ct_cleanup_module(void)
>  {
>  	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
> +	tcf_ct_flow_tables_uninit();
> +	destroy_workqueue(act_ct_wq);
>  }
>  
>  module_init(ct_init_module);

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows
  2020-03-03 12:52 ` [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows Paul Blakey
@ 2020-03-03 13:10   ` Nikolay Aleksandrov
  2020-03-03 13:31     ` Paul Blakey
  0 siblings, 1 reply; 10+ messages in thread
From: Nikolay Aleksandrov @ 2020-03-03 13:10 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

On 03/03/2020 14:52, Paul Blakey wrote:
> Offload nf conntrack processing by looking up the 5-tuple in the
> zone's flow table.
> 
> The nf conntrack module will process the packets until a connection is
> in established state. Once in established state, the ct state pointer
> (nf_conn) will be restored on the skb from a successful ft lookup.
> 
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Acked-by: Jiri Pirko <jiri@mellanox.com>
> ---
> Changelog:
>   v1->v2:
>    Add !skip_add curly braces
>    Removed extra setting thoff again
>    Check tcp proto outside of tcf_ct_flow_table_check_tcp
> 
>  net/sched/act_ct.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 158 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
> index 6ad0553..2017f8f 100644
> --- a/net/sched/act_ct.c
> +++ b/net/sched/act_ct.c
> @@ -186,6 +186,155 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
>  	tcf_ct_flow_table_add(ct_ft, ct, tcp);
>  }
>  
> +static bool
> +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
> +				  struct flow_offload_tuple *tuple)
> +{
> +	struct flow_ports *ports;
> +	unsigned int thoff;
> +	struct iphdr *iph;
> +
> +	if (!pskb_may_pull(skb, sizeof(*iph)))
> +		return false;
> +
> +	iph = ip_hdr(skb);
> +	thoff = iph->ihl * 4;
> +
> +	if (ip_is_fragment(iph) ||
> +	    unlikely(thoff != sizeof(struct iphdr)))
> +		return false;
> +
> +	if (iph->protocol != IPPROTO_TCP &&
> +	    iph->protocol != IPPROTO_UDP)
> +		return false;
> +
> +	if (iph->ttl <= 1)
> +		return false;
> +
> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
> +		return false;
> +

I think you should reload iph after the pskb_may_pull() call.

> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
> +
> +	tuple->src_v4.s_addr = iph->saddr;
> +	tuple->dst_v4.s_addr = iph->daddr;
> +	tuple->src_port = ports->source;
> +	tuple->dst_port = ports->dest;
> +	tuple->l3proto = AF_INET;
> +	tuple->l4proto = iph->protocol;
> +
> +	return true;
> +}
> +
> +static bool
> +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
> +				  struct flow_offload_tuple *tuple)
> +{
> +	struct flow_ports *ports;
> +	struct ipv6hdr *ip6h;
> +	unsigned int thoff;
> +
> +	if (!pskb_may_pull(skb, sizeof(*ip6h)))
> +		return false;
> +
> +	ip6h = ipv6_hdr(skb);
> +
> +	if (ip6h->nexthdr != IPPROTO_TCP &&
> +	    ip6h->nexthdr != IPPROTO_UDP)
> +		return false;
> +
> +	if (ip6h->hop_limit <= 1)
> +		return false;
> +
> +	thoff = sizeof(*ip6h);
> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
> +		return false;

same here

> +
> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
> +
> +	tuple->src_v6 = ip6h->saddr;
> +	tuple->dst_v6 = ip6h->daddr;
> +	tuple->src_port = ports->source;
> +	tuple->dst_port = ports->dest;
> +	tuple->l3proto = AF_INET6;
> +	tuple->l4proto = ip6h->nexthdr;
> +
> +	return true;
> +}
> +
> +static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow,
> +					struct sk_buff *skb,
> +					unsigned int thoff)
> +{
> +	struct tcphdr *tcph;
> +
> +	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
> +		return false;
> +
> +	tcph = (void *)(skb_network_header(skb) + thoff);
> +	if (unlikely(tcph->fin || tcph->rst)) {
> +		flow_offload_teardown(flow);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
> +				     struct sk_buff *skb,
> +				     u8 family)
> +{
> +	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
> +	struct flow_offload_tuple_rhash *tuplehash;
> +	struct flow_offload_tuple tuple = {};
> +	enum ip_conntrack_info ctinfo;
> +	struct flow_offload *flow;
> +	struct nf_conn *ct;
> +	unsigned int thoff;
> +	int ip_proto;
> +	u8 dir;
> +
> +	/* Previously seen or loopback */
> +	ct = nf_ct_get(skb, &ctinfo);
> +	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
> +		return false;
> +
> +	switch (family) {
> +	case NFPROTO_IPV4:
> +		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple))
> +			return false;
> +		break;
> +	case NFPROTO_IPV6:
> +		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple))
> +			return false;
> +		break;
> +	default:
> +		return false;
> +	}
> +
> +	tuplehash = flow_offload_lookup(nf_ft, &tuple);
> +	if (!tuplehash)
> +		return false;
> +
> +	dir = tuplehash->tuple.dir;
> +	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
> +	ct = flow->ct;
> +
> +	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
> +						    IP_CT_ESTABLISHED_REPLY;
> +
> +	thoff = ip_hdr(skb)->ihl * 4;
> +	ip_proto = ip_hdr(skb)->protocol;
> +	if (ip_proto == IPPROTO_TCP &&
> +	    !tcf_ct_flow_table_check_tcp(flow, skb, thoff))
> +		return false;
> +
> +	nf_conntrack_get(&ct->ct_general);
> +	nf_ct_set(skb, ct, ctinfo);
> +
> +	return true;
> +}
> +
>  static int tcf_ct_flow_tables_init(void)
>  {
>  	return rhashtable_init(&zones_ht, &zones_params);
> @@ -554,6 +703,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  	struct nf_hook_state state;
>  	int nh_ofs, err, retval;
>  	struct tcf_ct_params *p;
> +	bool skip_add = false;
>  	struct nf_conn *ct;
>  	u8 family;
>  
> @@ -603,6 +753,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  	 */
>  	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
>  	if (!cached) {
> +		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
> +			skip_add = true;
> +			goto do_nat;
> +		}
> +
>  		/* Associate skb with specified zone. */
>  		if (tmpl) {
>  			ct = nf_ct_get(skb, &ctinfo);
> @@ -620,6 +775,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  			goto out_push;
>  	}
>  
> +do_nat:
>  	ct = nf_ct_get(skb, &ctinfo);
>  	if (!ct)
>  		goto out_push;
> @@ -637,10 +793,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>  		 * even if the connection is already confirmed.
>  		 */
>  		nf_conntrack_confirm(skb);
> +	} else if (!skip_add) {
> +		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>  	}
>  
> -	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
> -
>  out_push:
>  	skb_push_rcsum(skb, nh_ofs);
>  
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows
  2020-03-03 13:10   ` Nikolay Aleksandrov
@ 2020-03-03 13:31     ` Paul Blakey
  2020-03-03 13:35       ` Nikolay Aleksandrov
  0 siblings, 1 reply; 10+ messages in thread
From: Paul Blakey @ 2020-03-03 13:31 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan


On 3/3/2020 3:10 PM, Nikolay Aleksandrov wrote:
> On 03/03/2020 14:52, Paul Blakey wrote:
>> Offload nf conntrack processing by looking up the 5-tuple in the
>> zone's flow table.
>>
>> The nf conntrack module will process the packets until a connection is
>> in established state. Once in established state, the ct state pointer
>> (nf_conn) will be restored on the skb from a successful ft lookup.
>>
>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>> Acked-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>> Changelog:
>>   v1->v2:
>>    Add !skip_add curly braces
>>    Removed extra setting thoff again
>>    Check tcp proto outside of tcf_ct_flow_table_check_tcp
>>
>>  net/sched/act_ct.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 158 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>> index 6ad0553..2017f8f 100644
>> --- a/net/sched/act_ct.c
>> +++ b/net/sched/act_ct.c
>> @@ -186,6 +186,155 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
>>  	tcf_ct_flow_table_add(ct_ft, ct, tcp);
>>  }
>>  
>> +static bool
>> +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
>> +				  struct flow_offload_tuple *tuple)
>> +{
>> +	struct flow_ports *ports;
>> +	unsigned int thoff;
>> +	struct iphdr *iph;
>> +
>> +	if (!pskb_may_pull(skb, sizeof(*iph)))
>> +		return false;
>> +
>> +	iph = ip_hdr(skb);
>> +	thoff = iph->ihl * 4;
>> +
>> +	if (ip_is_fragment(iph) ||
>> +	    unlikely(thoff != sizeof(struct iphdr)))
>> +		return false;
>> +
>> +	if (iph->protocol != IPPROTO_TCP &&
>> +	    iph->protocol != IPPROTO_UDP)
>> +		return false;
>> +
>> +	if (iph->ttl <= 1)
>> +		return false;
>> +
>> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
>> +		return false;
>> +
> I think you should reload iph after the pskb_may_pull() call.

Good catch, you're right it might change skb->head.

Thanks, will send v5.

Paul.

>
>> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
>> +
>> +	tuple->src_v4.s_addr = iph->saddr;
>> +	tuple->dst_v4.s_addr = iph->daddr;
>> +	tuple->src_port = ports->source;
>> +	tuple->dst_port = ports->dest;
>> +	tuple->l3proto = AF_INET;
>> +	tuple->l4proto = iph->protocol;
>> +
>> +	return true;
>> +}
>> +
>> +static bool
>> +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
>> +				  struct flow_offload_tuple *tuple)
>> +{
>> +	struct flow_ports *ports;
>> +	struct ipv6hdr *ip6h;
>> +	unsigned int thoff;
>> +
>> +	if (!pskb_may_pull(skb, sizeof(*ip6h)))
>> +		return false;
>> +
>> +	ip6h = ipv6_hdr(skb);
>> +
>> +	if (ip6h->nexthdr != IPPROTO_TCP &&
>> +	    ip6h->nexthdr != IPPROTO_UDP)
>> +		return false;
>> +
>> +	if (ip6h->hop_limit <= 1)
>> +		return false;
>> +
>> +	thoff = sizeof(*ip6h);
>> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
>> +		return false;
> same here
>
>> +
>> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
>> +
>> +	tuple->src_v6 = ip6h->saddr;
>> +	tuple->dst_v6 = ip6h->daddr;
>> +	tuple->src_port = ports->source;
>> +	tuple->dst_port = ports->dest;
>> +	tuple->l3proto = AF_INET6;
>> +	tuple->l4proto = ip6h->nexthdr;
>> +
>> +	return true;
>> +}
>> +
>> +static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow,
>> +					struct sk_buff *skb,
>> +					unsigned int thoff)
>> +{
>> +	struct tcphdr *tcph;
>> +
>> +	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
>> +		return false;
>> +
>> +	tcph = (void *)(skb_network_header(skb) + thoff);
>> +	if (unlikely(tcph->fin || tcph->rst)) {
>> +		flow_offload_teardown(flow);
>> +		return false;
>> +	}
>> +
>> +	return true;
>> +}
>> +
>> +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
>> +				     struct sk_buff *skb,
>> +				     u8 family)
>> +{
>> +	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
>> +	struct flow_offload_tuple_rhash *tuplehash;
>> +	struct flow_offload_tuple tuple = {};
>> +	enum ip_conntrack_info ctinfo;
>> +	struct flow_offload *flow;
>> +	struct nf_conn *ct;
>> +	unsigned int thoff;
>> +	int ip_proto;
>> +	u8 dir;
>> +
>> +	/* Previously seen or loopback */
>> +	ct = nf_ct_get(skb, &ctinfo);
>> +	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
>> +		return false;
>> +
>> +	switch (family) {
>> +	case NFPROTO_IPV4:
>> +		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple))
>> +			return false;
>> +		break;
>> +	case NFPROTO_IPV6:
>> +		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple))
>> +			return false;
>> +		break;
>> +	default:
>> +		return false;
>> +	}
>> +
>> +	tuplehash = flow_offload_lookup(nf_ft, &tuple);
>> +	if (!tuplehash)
>> +		return false;
>> +
>> +	dir = tuplehash->tuple.dir;
>> +	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
>> +	ct = flow->ct;
>> +
>> +	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
>> +						    IP_CT_ESTABLISHED_REPLY;
>> +
>> +	thoff = ip_hdr(skb)->ihl * 4;
>> +	ip_proto = ip_hdr(skb)->protocol;
>> +	if (ip_proto == IPPROTO_TCP &&
>> +	    !tcf_ct_flow_table_check_tcp(flow, skb, thoff))
>> +		return false;
>> +
>> +	nf_conntrack_get(&ct->ct_general);
>> +	nf_ct_set(skb, ct, ctinfo);
>> +
>> +	return true;
>> +}
>> +
>>  static int tcf_ct_flow_tables_init(void)
>>  {
>>  	return rhashtable_init(&zones_ht, &zones_params);
>> @@ -554,6 +703,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>  	struct nf_hook_state state;
>>  	int nh_ofs, err, retval;
>>  	struct tcf_ct_params *p;
>> +	bool skip_add = false;
>>  	struct nf_conn *ct;
>>  	u8 family;
>>  
>> @@ -603,6 +753,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>  	 */
>>  	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
>>  	if (!cached) {
>> +		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
>> +			skip_add = true;
>> +			goto do_nat;
>> +		}
>> +
>>  		/* Associate skb with specified zone. */
>>  		if (tmpl) {
>>  			ct = nf_ct_get(skb, &ctinfo);
>> @@ -620,6 +775,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>  			goto out_push;
>>  	}
>>  
>> +do_nat:
>>  	ct = nf_ct_get(skb, &ctinfo);
>>  	if (!ct)
>>  		goto out_push;
>> @@ -637,10 +793,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>  		 * even if the connection is already confirmed.
>>  		 */
>>  		nf_conntrack_confirm(skb);
>> +	} else if (!skip_add) {
>> +		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>>  	}
>>  
>> -	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>> -
>>  out_push:
>>  	skb_push_rcsum(skb, nh_ofs);
>>  
>>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows
  2020-03-03 13:31     ` Paul Blakey
@ 2020-03-03 13:35       ` Nikolay Aleksandrov
  2020-03-03 14:03         ` Paul Blakey
  0 siblings, 1 reply; 10+ messages in thread
From: Nikolay Aleksandrov @ 2020-03-03 13:35 UTC (permalink / raw)
  To: Paul Blakey, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan

On 03/03/2020 15:31, Paul Blakey wrote:
> 
> On 3/3/2020 3:10 PM, Nikolay Aleksandrov wrote:
>> On 03/03/2020 14:52, Paul Blakey wrote:
>>> Offload nf conntrack processing by looking up the 5-tuple in the
>>> zone's flow table.
>>>
>>> The nf conntrack module will process the packets until a connection is
>>> in established state. Once in established state, the ct state pointer
>>> (nf_conn) will be restored on the skb from a successful ft lookup.
>>>
>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>> Acked-by: Jiri Pirko <jiri@mellanox.com>
>>> ---
>>> Changelog:
>>>   v1->v2:
>>>    Add !skip_add curly braces
>>>    Removed extra setting thoff again
>>>    Check tcp proto outside of tcf_ct_flow_table_check_tcp
>>>
>>>  net/sched/act_ct.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 158 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>>> index 6ad0553..2017f8f 100644
>>> --- a/net/sched/act_ct.c
>>> +++ b/net/sched/act_ct.c
>>> @@ -186,6 +186,155 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
>>>  	tcf_ct_flow_table_add(ct_ft, ct, tcp);
>>>  }
>>>  
>>> +static bool
>>> +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
>>> +				  struct flow_offload_tuple *tuple)
>>> +{
>>> +	struct flow_ports *ports;
>>> +	unsigned int thoff;
>>> +	struct iphdr *iph;
>>> +
>>> +	if (!pskb_may_pull(skb, sizeof(*iph)))
>>> +		return false;
>>> +
>>> +	iph = ip_hdr(skb);
>>> +	thoff = iph->ihl * 4;
>>> +
>>> +	if (ip_is_fragment(iph) ||
>>> +	    unlikely(thoff != sizeof(struct iphdr)))
>>> +		return false;
>>> +
>>> +	if (iph->protocol != IPPROTO_TCP &&
>>> +	    iph->protocol != IPPROTO_UDP)
>>> +		return false;
>>> +
>>> +	if (iph->ttl <= 1)
>>> +		return false;
>>> +
>>> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
>>> +		return false;
>>> +
>> I think you should reload iph after the pskb_may_pull() call.
> 
> Good catch, you're right it might change skb->head.
> 
> Thanks, will send v5.
> 
> Paul.
> 

Actually shouldn't the code be using pskb_network_may_pull(), i.e. pskb_may_pull()
with skb_network_offset(skb) + sizeof(network header) ... ?

>>
>>> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
>>> +
>>> +	tuple->src_v4.s_addr = iph->saddr;
>>> +	tuple->dst_v4.s_addr = iph->daddr;
>>> +	tuple->src_port = ports->source;
>>> +	tuple->dst_port = ports->dest;
>>> +	tuple->l3proto = AF_INET;
>>> +	tuple->l4proto = iph->protocol;
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +static bool
>>> +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
>>> +				  struct flow_offload_tuple *tuple)
>>> +{
>>> +	struct flow_ports *ports;
>>> +	struct ipv6hdr *ip6h;
>>> +	unsigned int thoff;
>>> +
>>> +	if (!pskb_may_pull(skb, sizeof(*ip6h)))
>>> +		return false;
>>> +
>>> +	ip6h = ipv6_hdr(skb);
>>> +
>>> +	if (ip6h->nexthdr != IPPROTO_TCP &&
>>> +	    ip6h->nexthdr != IPPROTO_UDP)
>>> +		return false;
>>> +
>>> +	if (ip6h->hop_limit <= 1)
>>> +		return false;
>>> +
>>> +	thoff = sizeof(*ip6h);
>>> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
>>> +		return false;
>> same here
>>
>>> +
>>> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
>>> +
>>> +	tuple->src_v6 = ip6h->saddr;
>>> +	tuple->dst_v6 = ip6h->daddr;
>>> +	tuple->src_port = ports->source;
>>> +	tuple->dst_port = ports->dest;
>>> +	tuple->l3proto = AF_INET6;
>>> +	tuple->l4proto = ip6h->nexthdr;
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow,
>>> +					struct sk_buff *skb,
>>> +					unsigned int thoff)
>>> +{
>>> +	struct tcphdr *tcph;
>>> +
>>> +	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
>>> +		return false;
>>> +
>>> +	tcph = (void *)(skb_network_header(skb) + thoff);
>>> +	if (unlikely(tcph->fin || tcph->rst)) {
>>> +		flow_offload_teardown(flow);
>>> +		return false;
>>> +	}
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
>>> +				     struct sk_buff *skb,
>>> +				     u8 family)
>>> +{
>>> +	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
>>> +	struct flow_offload_tuple_rhash *tuplehash;
>>> +	struct flow_offload_tuple tuple = {};
>>> +	enum ip_conntrack_info ctinfo;
>>> +	struct flow_offload *flow;
>>> +	struct nf_conn *ct;
>>> +	unsigned int thoff;
>>> +	int ip_proto;
>>> +	u8 dir;
>>> +
>>> +	/* Previously seen or loopback */
>>> +	ct = nf_ct_get(skb, &ctinfo);
>>> +	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
>>> +		return false;
>>> +
>>> +	switch (family) {
>>> +	case NFPROTO_IPV4:
>>> +		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple))
>>> +			return false;
>>> +		break;
>>> +	case NFPROTO_IPV6:
>>> +		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple))
>>> +			return false;
>>> +		break;
>>> +	default:
>>> +		return false;
>>> +	}
>>> +
>>> +	tuplehash = flow_offload_lookup(nf_ft, &tuple);
>>> +	if (!tuplehash)
>>> +		return false;
>>> +
>>> +	dir = tuplehash->tuple.dir;
>>> +	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
>>> +	ct = flow->ct;
>>> +
>>> +	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
>>> +						    IP_CT_ESTABLISHED_REPLY;
>>> +
>>> +	thoff = ip_hdr(skb)->ihl * 4;
>>> +	ip_proto = ip_hdr(skb)->protocol;
>>> +	if (ip_proto == IPPROTO_TCP &&
>>> +	    !tcf_ct_flow_table_check_tcp(flow, skb, thoff))
>>> +		return false;
>>> +
>>> +	nf_conntrack_get(&ct->ct_general);
>>> +	nf_ct_set(skb, ct, ctinfo);
>>> +
>>> +	return true;
>>> +}
>>> +
>>>  static int tcf_ct_flow_tables_init(void)
>>>  {
>>>  	return rhashtable_init(&zones_ht, &zones_params);
>>> @@ -554,6 +703,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>  	struct nf_hook_state state;
>>>  	int nh_ofs, err, retval;
>>>  	struct tcf_ct_params *p;
>>> +	bool skip_add = false;
>>>  	struct nf_conn *ct;
>>>  	u8 family;
>>>  
>>> @@ -603,6 +753,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>  	 */
>>>  	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
>>>  	if (!cached) {
>>> +		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
>>> +			skip_add = true;
>>> +			goto do_nat;
>>> +		}
>>> +
>>>  		/* Associate skb with specified zone. */
>>>  		if (tmpl) {
>>>  			ct = nf_ct_get(skb, &ctinfo);
>>> @@ -620,6 +775,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>  			goto out_push;
>>>  	}
>>>  
>>> +do_nat:
>>>  	ct = nf_ct_get(skb, &ctinfo);
>>>  	if (!ct)
>>>  		goto out_push;
>>> @@ -637,10 +793,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>  		 * even if the connection is already confirmed.
>>>  		 */
>>>  		nf_conntrack_confirm(skb);
>>> +	} else if (!skip_add) {
>>> +		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>>>  	}
>>>  
>>> -	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>>> -
>>>  out_push:
>>>  	skb_push_rcsum(skb, nh_ofs);
>>>  
>>>


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows
  2020-03-03 13:35       ` Nikolay Aleksandrov
@ 2020-03-03 14:03         ` Paul Blakey
  2020-03-03 14:15           ` Paul Blakey
  0 siblings, 1 reply; 10+ messages in thread
From: Paul Blakey @ 2020-03-03 14:03 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan


On 3/3/2020 3:35 PM, Nikolay Aleksandrov wrote:
> On 03/03/2020 15:31, Paul Blakey wrote:
>> On 3/3/2020 3:10 PM, Nikolay Aleksandrov wrote:
>>> On 03/03/2020 14:52, Paul Blakey wrote:
>>>> Offload nf conntrack processing by looking up the 5-tuple in the
>>>> zone's flow table.
>>>>
>>>> The nf conntrack module will process the packets until a connection is
>>>> in established state. Once in established state, the ct state pointer
>>>> (nf_conn) will be restored on the skb from a successful ft lookup.
>>>>
>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>> Acked-by: Jiri Pirko <jiri@mellanox.com>
>>>> ---
>>>> Changelog:
>>>>   v1->v2:
>>>>    Add !skip_add curly braces
>>>>    Removed extra setting thoff again
>>>>    Check tcp proto outside of tcf_ct_flow_table_check_tcp
>>>>
>>>>  net/sched/act_ct.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>  1 file changed, 158 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>>>> index 6ad0553..2017f8f 100644
>>>> --- a/net/sched/act_ct.c
>>>> +++ b/net/sched/act_ct.c
>>>> @@ -186,6 +186,155 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
>>>>  	tcf_ct_flow_table_add(ct_ft, ct, tcp);
>>>>  }
>>>>  
>>>> +static bool
>>>> +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
>>>> +				  struct flow_offload_tuple *tuple)
>>>> +{
>>>> +	struct flow_ports *ports;
>>>> +	unsigned int thoff;
>>>> +	struct iphdr *iph;
>>>> +
>>>> +	if (!pskb_may_pull(skb, sizeof(*iph)))
>>>> +		return false;
>>>> +
>>>> +	iph = ip_hdr(skb);
>>>> +	thoff = iph->ihl * 4;
>>>> +
>>>> +	if (ip_is_fragment(iph) ||
>>>> +	    unlikely(thoff != sizeof(struct iphdr)))
>>>> +		return false;
>>>> +
>>>> +	if (iph->protocol != IPPROTO_TCP &&
>>>> +	    iph->protocol != IPPROTO_UDP)
>>>> +		return false;
>>>> +
>>>> +	if (iph->ttl <= 1)
>>>> +		return false;
>>>> +
>>>> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
>>>> +		return false;
>>>> +
>>> I think you should reload iph after the pskb_may_pull() call.
>> Good catch, you're right it might change skb->head.
>>
>> Thanks, will send v5.
>>
>> Paul.
>>
> Actually shouldn't the code be using pskb_network_may_pull(), i.e. pskb_may_pull()
> with skb_network_offset(skb) + sizeof(network header) ... ?

you mean pskb_network_may_pull(skb,  thoff + sizeof(ports)) ?

Should act the same as skb is trimmed to network layer by caller (tcf_ct_skb_network_trim())

I can still do this to be more bullet proof, what do you think?


>
>>>> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
>>>> +
>>>> +	tuple->src_v4.s_addr = iph->saddr;
>>>> +	tuple->dst_v4.s_addr = iph->daddr;
>>>> +	tuple->src_port = ports->source;
>>>> +	tuple->dst_port = ports->dest;
>>>> +	tuple->l3proto = AF_INET;
>>>> +	tuple->l4proto = iph->protocol;
>>>> +
>>>> +	return true;
>>>> +}
>>>> +
>>>> +static bool
>>>> +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
>>>> +				  struct flow_offload_tuple *tuple)
>>>> +{
>>>> +	struct flow_ports *ports;
>>>> +	struct ipv6hdr *ip6h;
>>>> +	unsigned int thoff;
>>>> +
>>>> +	if (!pskb_may_pull(skb, sizeof(*ip6h)))
>>>> +		return false;
>>>> +
>>>> +	ip6h = ipv6_hdr(skb);
>>>> +
>>>> +	if (ip6h->nexthdr != IPPROTO_TCP &&
>>>> +	    ip6h->nexthdr != IPPROTO_UDP)
>>>> +		return false;
>>>> +
>>>> +	if (ip6h->hop_limit <= 1)
>>>> +		return false;
>>>> +
>>>> +	thoff = sizeof(*ip6h);
>>>> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
>>>> +		return false;
>>> same here
>>>
>>>> +
>>>> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
>>>> +
>>>> +	tuple->src_v6 = ip6h->saddr;
>>>> +	tuple->dst_v6 = ip6h->daddr;
>>>> +	tuple->src_port = ports->source;
>>>> +	tuple->dst_port = ports->dest;
>>>> +	tuple->l3proto = AF_INET6;
>>>> +	tuple->l4proto = ip6h->nexthdr;
>>>> +
>>>> +	return true;
>>>> +}
>>>> +
>>>> +static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow,
>>>> +					struct sk_buff *skb,
>>>> +					unsigned int thoff)
>>>> +{
>>>> +	struct tcphdr *tcph;
>>>> +
>>>> +	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
>>>> +		return false;
>>>> +
>>>> +	tcph = (void *)(skb_network_header(skb) + thoff);
>>>> +	if (unlikely(tcph->fin || tcph->rst)) {
>>>> +		flow_offload_teardown(flow);
>>>> +		return false;
>>>> +	}
>>>> +
>>>> +	return true;
>>>> +}
>>>> +
>>>> +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
>>>> +				     struct sk_buff *skb,
>>>> +				     u8 family)
>>>> +{
>>>> +	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
>>>> +	struct flow_offload_tuple_rhash *tuplehash;
>>>> +	struct flow_offload_tuple tuple = {};
>>>> +	enum ip_conntrack_info ctinfo;
>>>> +	struct flow_offload *flow;
>>>> +	struct nf_conn *ct;
>>>> +	unsigned int thoff;
>>>> +	int ip_proto;
>>>> +	u8 dir;
>>>> +
>>>> +	/* Previously seen or loopback */
>>>> +	ct = nf_ct_get(skb, &ctinfo);
>>>> +	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
>>>> +		return false;
>>>> +
>>>> +	switch (family) {
>>>> +	case NFPROTO_IPV4:
>>>> +		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple))
>>>> +			return false;
>>>> +		break;
>>>> +	case NFPROTO_IPV6:
>>>> +		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple))
>>>> +			return false;
>>>> +		break;
>>>> +	default:
>>>> +		return false;
>>>> +	}
>>>> +
>>>> +	tuplehash = flow_offload_lookup(nf_ft, &tuple);
>>>> +	if (!tuplehash)
>>>> +		return false;
>>>> +
>>>> +	dir = tuplehash->tuple.dir;
>>>> +	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
>>>> +	ct = flow->ct;
>>>> +
>>>> +	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
>>>> +						    IP_CT_ESTABLISHED_REPLY;
>>>> +
>>>> +	thoff = ip_hdr(skb)->ihl * 4;
>>>> +	ip_proto = ip_hdr(skb)->protocol;
>>>> +	if (ip_proto == IPPROTO_TCP &&
>>>> +	    !tcf_ct_flow_table_check_tcp(flow, skb, thoff))
>>>> +		return false;
>>>> +
>>>> +	nf_conntrack_get(&ct->ct_general);
>>>> +	nf_ct_set(skb, ct, ctinfo);
>>>> +
>>>> +	return true;
>>>> +}
>>>> +
>>>>  static int tcf_ct_flow_tables_init(void)
>>>>  {
>>>>  	return rhashtable_init(&zones_ht, &zones_params);
>>>> @@ -554,6 +703,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>>  	struct nf_hook_state state;
>>>>  	int nh_ofs, err, retval;
>>>>  	struct tcf_ct_params *p;
>>>> +	bool skip_add = false;
>>>>  	struct nf_conn *ct;
>>>>  	u8 family;
>>>>  
>>>> @@ -603,6 +753,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>>  	 */
>>>>  	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
>>>>  	if (!cached) {
>>>> +		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
>>>> +			skip_add = true;
>>>> +			goto do_nat;
>>>> +		}
>>>> +
>>>>  		/* Associate skb with specified zone. */
>>>>  		if (tmpl) {
>>>>  			ct = nf_ct_get(skb, &ctinfo);
>>>> @@ -620,6 +775,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>>  			goto out_push;
>>>>  	}
>>>>  
>>>> +do_nat:
>>>>  	ct = nf_ct_get(skb, &ctinfo);
>>>>  	if (!ct)
>>>>  		goto out_push;
>>>> @@ -637,10 +793,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>>  		 * even if the connection is already confirmed.
>>>>  		 */
>>>>  		nf_conntrack_confirm(skb);
>>>> +	} else if (!skip_add) {
>>>> +		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>>>>  	}
>>>>  
>>>> -	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>>>> -
>>>>  out_push:
>>>>  	skb_push_rcsum(skb, nh_ofs);
>>>>  
>>>>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows
  2020-03-03 14:03         ` Paul Blakey
@ 2020-03-03 14:15           ` Paul Blakey
  0 siblings, 0 replies; 10+ messages in thread
From: Paul Blakey @ 2020-03-03 14:15 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Saeed Mahameed, Oz Shlomo, Jakub Kicinski,
	Vlad Buslov, David Miller, netdev, Jiri Pirko, Roi Dayan


On 3/3/2020 4:03 PM, Paul Blakey wrote:
> On 3/3/2020 3:35 PM, Nikolay Aleksandrov wrote:
>> On 03/03/2020 15:31, Paul Blakey wrote:
>>> On 3/3/2020 3:10 PM, Nikolay Aleksandrov wrote:
>>>> On 03/03/2020 14:52, Paul Blakey wrote:
>>>>> Offload nf conntrack processing by looking up the 5-tuple in the
>>>>> zone's flow table.
>>>>>
>>>>> The nf conntrack module will process the packets until a connection is
>>>>> in established state. Once in established state, the ct state pointer
>>>>> (nf_conn) will be restored on the skb from a successful ft lookup.
>>>>>
>>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>>> Acked-by: Jiri Pirko <jiri@mellanox.com>
>>>>> ---
>>>>> Changelog:
>>>>>   v1->v2:
>>>>>    Add !skip_add curly braces
>>>>>    Removed extra setting thoff again
>>>>>    Check tcp proto outside of tcf_ct_flow_table_check_tcp
>>>>>
>>>>>  net/sched/act_ct.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>  1 file changed, 158 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
>>>>> index 6ad0553..2017f8f 100644
>>>>> --- a/net/sched/act_ct.c
>>>>> +++ b/net/sched/act_ct.c
>>>>> @@ -186,6 +186,155 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
>>>>>  	tcf_ct_flow_table_add(ct_ft, ct, tcp);
>>>>>  }
>>>>>  
>>>>> +static bool
>>>>> +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
>>>>> +				  struct flow_offload_tuple *tuple)
>>>>> +{
>>>>> +	struct flow_ports *ports;
>>>>> +	unsigned int thoff;
>>>>> +	struct iphdr *iph;
>>>>> +
>>>>> +	if (!pskb_may_pull(skb, sizeof(*iph)))
>>>>> +		return false;
>>>>> +
>>>>> +	iph = ip_hdr(skb);
>>>>> +	thoff = iph->ihl * 4;
>>>>> +
>>>>> +	if (ip_is_fragment(iph) ||
>>>>> +	    unlikely(thoff != sizeof(struct iphdr)))
>>>>> +		return false;
>>>>> +
>>>>> +	if (iph->protocol != IPPROTO_TCP &&
>>>>> +	    iph->protocol != IPPROTO_UDP)
>>>>> +		return false;
>>>>> +
>>>>> +	if (iph->ttl <= 1)
>>>>> +		return false;
>>>>> +
>>>>> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
>>>>> +		return false;
>>>>> +
>>>> I think you should reload iph after the pskb_may_pull() call.
>>> Good catch, you're right it might change skb->head.
>>>
>>> Thanks, will send v5.
>>>
>>> Paul.
>>>
>> Actually shouldn't the code be using pskb_network_may_pull(), i.e. pskb_may_pull()
>> with skb_network_offset(skb) + sizeof(network header) ... ?
> you mean pskb_network_may_pull(skb,  thoff + sizeof(ports)) ?
>
> Should act the same as skb is trimmed to network layer by caller (tcf_ct_skb_network_trim())
>
> I can still do this to be more bullet proof, what do you think?

Since  I'm getting the ports in ref to network header, ill do the pull in ref to that as well.


>
>
>>>>> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
>>>>> +
>>>>> +	tuple->src_v4.s_addr = iph->saddr;
>>>>> +	tuple->dst_v4.s_addr = iph->daddr;
>>>>> +	tuple->src_port = ports->source;
>>>>> +	tuple->dst_port = ports->dest;
>>>>> +	tuple->l3proto = AF_INET;
>>>>> +	tuple->l4proto = iph->protocol;
>>>>> +
>>>>> +	return true;
>>>>> +}
>>>>> +
>>>>> +static bool
>>>>> +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
>>>>> +				  struct flow_offload_tuple *tuple)
>>>>> +{
>>>>> +	struct flow_ports *ports;
>>>>> +	struct ipv6hdr *ip6h;
>>>>> +	unsigned int thoff;
>>>>> +
>>>>> +	if (!pskb_may_pull(skb, sizeof(*ip6h)))
>>>>> +		return false;
>>>>> +
>>>>> +	ip6h = ipv6_hdr(skb);
>>>>> +
>>>>> +	if (ip6h->nexthdr != IPPROTO_TCP &&
>>>>> +	    ip6h->nexthdr != IPPROTO_UDP)
>>>>> +		return false;
>>>>> +
>>>>> +	if (ip6h->hop_limit <= 1)
>>>>> +		return false;
>>>>> +
>>>>> +	thoff = sizeof(*ip6h);
>>>>> +	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
>>>>> +		return false;
>>>> same here
>>>>
>>>>> +
>>>>> +	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
>>>>> +
>>>>> +	tuple->src_v6 = ip6h->saddr;
>>>>> +	tuple->dst_v6 = ip6h->daddr;
>>>>> +	tuple->src_port = ports->source;
>>>>> +	tuple->dst_port = ports->dest;
>>>>> +	tuple->l3proto = AF_INET6;
>>>>> +	tuple->l4proto = ip6h->nexthdr;
>>>>> +
>>>>> +	return true;
>>>>> +}
>>>>> +
>>>>> +static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow,
>>>>> +					struct sk_buff *skb,
>>>>> +					unsigned int thoff)
>>>>> +{
>>>>> +	struct tcphdr *tcph;
>>>>> +
>>>>> +	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
>>>>> +		return false;
>>>>> +
>>>>> +	tcph = (void *)(skb_network_header(skb) + thoff);
>>>>> +	if (unlikely(tcph->fin || tcph->rst)) {
>>>>> +		flow_offload_teardown(flow);
>>>>> +		return false;
>>>>> +	}
>>>>> +
>>>>> +	return true;
>>>>> +}
>>>>> +
>>>>> +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
>>>>> +				     struct sk_buff *skb,
>>>>> +				     u8 family)
>>>>> +{
>>>>> +	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
>>>>> +	struct flow_offload_tuple_rhash *tuplehash;
>>>>> +	struct flow_offload_tuple tuple = {};
>>>>> +	enum ip_conntrack_info ctinfo;
>>>>> +	struct flow_offload *flow;
>>>>> +	struct nf_conn *ct;
>>>>> +	unsigned int thoff;
>>>>> +	int ip_proto;
>>>>> +	u8 dir;
>>>>> +
>>>>> +	/* Previously seen or loopback */
>>>>> +	ct = nf_ct_get(skb, &ctinfo);
>>>>> +	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
>>>>> +		return false;
>>>>> +
>>>>> +	switch (family) {
>>>>> +	case NFPROTO_IPV4:
>>>>> +		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple))
>>>>> +			return false;
>>>>> +		break;
>>>>> +	case NFPROTO_IPV6:
>>>>> +		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple))
>>>>> +			return false;
>>>>> +		break;
>>>>> +	default:
>>>>> +		return false;
>>>>> +	}
>>>>> +
>>>>> +	tuplehash = flow_offload_lookup(nf_ft, &tuple);
>>>>> +	if (!tuplehash)
>>>>> +		return false;
>>>>> +
>>>>> +	dir = tuplehash->tuple.dir;
>>>>> +	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
>>>>> +	ct = flow->ct;
>>>>> +
>>>>> +	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
>>>>> +						    IP_CT_ESTABLISHED_REPLY;
>>>>> +
>>>>> +	thoff = ip_hdr(skb)->ihl * 4;
>>>>> +	ip_proto = ip_hdr(skb)->protocol;
>>>>> +	if (ip_proto == IPPROTO_TCP &&
>>>>> +	    !tcf_ct_flow_table_check_tcp(flow, skb, thoff))
>>>>> +		return false;
>>>>> +
>>>>> +	nf_conntrack_get(&ct->ct_general);
>>>>> +	nf_ct_set(skb, ct, ctinfo);
>>>>> +
>>>>> +	return true;
>>>>> +}
>>>>> +
>>>>>  static int tcf_ct_flow_tables_init(void)
>>>>>  {
>>>>>  	return rhashtable_init(&zones_ht, &zones_params);
>>>>> @@ -554,6 +703,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>>>  	struct nf_hook_state state;
>>>>>  	int nh_ofs, err, retval;
>>>>>  	struct tcf_ct_params *p;
>>>>> +	bool skip_add = false;
>>>>>  	struct nf_conn *ct;
>>>>>  	u8 family;
>>>>>  
>>>>> @@ -603,6 +753,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>>>  	 */
>>>>>  	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
>>>>>  	if (!cached) {
>>>>> +		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
>>>>> +			skip_add = true;
>>>>> +			goto do_nat;
>>>>> +		}
>>>>> +
>>>>>  		/* Associate skb with specified zone. */
>>>>>  		if (tmpl) {
>>>>>  			ct = nf_ct_get(skb, &ctinfo);
>>>>> @@ -620,6 +775,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>>>  			goto out_push;
>>>>>  	}
>>>>>  
>>>>> +do_nat:
>>>>>  	ct = nf_ct_get(skb, &ctinfo);
>>>>>  	if (!ct)
>>>>>  		goto out_push;
>>>>> @@ -637,10 +793,10 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
>>>>>  		 * even if the connection is already confirmed.
>>>>>  		 */
>>>>>  		nf_conntrack_confirm(skb);
>>>>> +	} else if (!skip_add) {
>>>>> +		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>>>>>  	}
>>>>>  
>>>>> -	tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
>>>>> -
>>>>>  out_push:
>>>>>  	skb_push_rcsum(skb, nh_ofs);
>>>>>  
>>>>>

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2020-03-03 14:16 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-03 12:52 [PATCH net-next v3 0/3] act_ct: Software offload of conntrack_in Paul Blakey
2020-03-03 12:52 ` [PATCH net-next v3 1/3] net/sched: act_ct: Create nf flow table per zone Paul Blakey
2020-03-03 13:02   ` Paul Blakey
2020-03-03 12:52 ` [PATCH net-next v3 2/3] net/sched: act_ct: Offload established connections to flow table Paul Blakey
2020-03-03 12:52 ` [PATCH net-next v3 3/3] net/sched: act_ct: Software offload of established flows Paul Blakey
2020-03-03 13:10   ` Nikolay Aleksandrov
2020-03-03 13:31     ` Paul Blakey
2020-03-03 13:35       ` Nikolay Aleksandrov
2020-03-03 14:03         ` Paul Blakey
2020-03-03 14:15           ` Paul Blakey

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).