* [PATCH RFC,WIP 1/5] netfilter: nf_conntrack: move nf_ct_netns_{get,put}() to core
2017-11-03 15:26 [PATCH RFC,WIP 0/5] Flow offload infrastructure Pablo Neira Ayuso
@ 2017-11-03 15:26 ` Pablo Neira Ayuso
2017-11-03 15:30 ` Florian Westphal
2017-11-03 15:26 ` [PATCH RFC,WIP 2/5] netfilter: add software flow offload infrastructure Pablo Neira Ayuso
` (5 subsequent siblings)
6 siblings, 1 reply; 14+ messages in thread
From: Pablo Neira Ayuso @ 2017-11-03 15:26 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev
So we can call this from other expressions that need conntrack in place
to work.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conntrack_proto.c | 37 ++++++++++++++++++++++++++++++++++--
net/netfilter/nft_ct.c | 39 +++-----------------------------------
2 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index b3e489c859ec..4379f1244154 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -125,7 +125,7 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
-int nf_ct_netns_get(struct net *net, u8 nfproto)
+static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
{
const struct nf_conntrack_l3proto *l3proto;
int ret;
@@ -150,9 +150,33 @@ int nf_ct_netns_get(struct net *net, u8 nfproto)
return ret;
}
+
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+ int err;
+
+ if (nfproto == NFPROTO_INET) {
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
+ if (err < 0)
+ goto err1;
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
+ if (err < 0)
+ goto err2;
+ } else {
+ err = nf_ct_netns_do_get(net, nfproto);
+ if (err < 0)
+ goto err1;
+ }
+ return 0;
+
+err2:
+ nf_ct_netns_put(net, NFPROTO_IPV4);
+err1:
+ return err;
+}
EXPORT_SYMBOL_GPL(nf_ct_netns_get);
-void nf_ct_netns_put(struct net *net, u8 nfproto)
+static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
{
const struct nf_conntrack_l3proto *l3proto;
@@ -171,6 +195,15 @@ void nf_ct_netns_put(struct net *net, u8 nfproto)
nf_ct_l3proto_module_put(nfproto);
}
+
+void nf_ct_netns_put(struct net *net, uint8_t nfproto)
+{
+ if (nfproto == NFPROTO_INET) {
+ nf_ct_netns_do_put(net, NFPROTO_IPV4);
+ nf_ct_netns_do_put(net, NFPROTO_IPV6);
+ } else
+ nf_ct_netns_do_put(net, nfproto);
+}
EXPORT_SYMBOL_GPL(nf_ct_netns_put);
const struct nf_conntrack_l4proto *
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index bd0975d7dd6f..2647b895f4b0 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -312,39 +312,6 @@ static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
[NFTA_CT_SREG] = { .type = NLA_U32 },
};
-static int nft_ct_netns_get(struct net *net, uint8_t family)
-{
- int err;
-
- if (family == NFPROTO_INET) {
- err = nf_ct_netns_get(net, NFPROTO_IPV4);
- if (err < 0)
- goto err1;
- err = nf_ct_netns_get(net, NFPROTO_IPV6);
- if (err < 0)
- goto err2;
- } else {
- err = nf_ct_netns_get(net, family);
- if (err < 0)
- goto err1;
- }
- return 0;
-
-err2:
- nf_ct_netns_put(net, NFPROTO_IPV4);
-err1:
- return err;
-}
-
-static void nft_ct_netns_put(struct net *net, uint8_t family)
-{
- if (family == NFPROTO_INET) {
- nf_ct_netns_put(net, NFPROTO_IPV4);
- nf_ct_netns_put(net, NFPROTO_IPV6);
- } else
- nf_ct_netns_put(net, family);
-}
-
#ifdef CONFIG_NF_CONNTRACK_ZONES
static void nft_ct_tmpl_put_pcpu(void)
{
@@ -489,7 +456,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- err = nft_ct_netns_get(ctx->net, ctx->afi->family);
+ err = nf_ct_netns_get(ctx->net, ctx->afi->family);
if (err < 0)
return err;
@@ -583,7 +550,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
if (err < 0)
goto err1;
- err = nft_ct_netns_get(ctx->net, ctx->afi->family);
+ err = nf_ct_netns_get(ctx->net, ctx->afi->family);
if (err < 0)
goto err1;
@@ -606,7 +573,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
struct nft_ct *priv = nft_expr_priv(expr);
__nft_ct_set_destroy(ctx, priv);
- nft_ct_netns_put(ctx->net, ctx->afi->family);
+ nf_ct_netns_put(ctx->net, ctx->afi->family);
}
static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
--
2.11.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH RFC,WIP 2/5] netfilter: add software flow offload infrastructure
2017-11-03 15:26 [PATCH RFC,WIP 0/5] Flow offload infrastructure Pablo Neira Ayuso
2017-11-03 15:26 ` [PATCH RFC,WIP 1/5] netfilter: nf_conntrack: move nf_ct_netns_{get,put}() to core Pablo Neira Ayuso
@ 2017-11-03 15:26 ` Pablo Neira Ayuso
2017-11-03 20:32 ` Florian Westphal
2017-11-03 15:26 ` [PATCH RFC,WIP 3/5] netfilter: nf_flow_offload: integration with conntrack Pablo Neira Ayuso
` (4 subsequent siblings)
6 siblings, 1 reply; 14+ messages in thread
From: Pablo Neira Ayuso @ 2017-11-03 15:26 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev
This patch adds the generic software flow offload infrastructure. This
allows users to configure fast path for established flows that will not
follow the classic forwarding path.
This adds a new hook at netfilter ingress for each existing interface.
For each packet that hits the hook, we look up for an existing flow in
the table, if there is a hit, the packet is forwarded by using the
gateway and interfaces that are cached in the flow table entry.
This comes with a kernel thread to release flow table entries if no
packets are seen after a little while, so the flow table entry is
released.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/flow_offload.h | 67 +++++++
net/netfilter/Kconfig | 7 +
net/netfilter/Makefile | 3 +
net/netfilter/nf_flow_offload.c | 386 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 463 insertions(+)
create mode 100644 include/net/flow_offload.h
create mode 100644 net/netfilter/nf_flow_offload.c
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
new file mode 100644
index 000000000000..30bfca7ed3f1
--- /dev/null
+++ b/include/net/flow_offload.h
@@ -0,0 +1,67 @@
+#ifndef _FLOW_OFFLOAD_H
+#define _FLOW_OFFLOAD_H
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/rhashtable.h>
+#include <linux/rcupdate.h>
+
+enum flow_offload_tuple_dir {
+ FLOW_OFFLOAD_DIR_ORIGINAL,
+ FLOW_OFFLOAD_DIR_REPLY,
+ __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY,
+};
+#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1)
+
+struct flow_offload_tuple {
+ union {
+ struct in_addr src_v4;
+ struct in6_addr src_v6;
+ };
+ union {
+ struct in_addr dst_v4;
+ struct in6_addr dst_v6;
+ };
+ struct {
+ __be16 src_port;
+ __be16 dst_port;
+ };
+
+ u8 l3proto;
+ u8 l4proto;
+ u8 dir;
+
+ int iifidx;
+ int oifidx;
+
+ union {
+ __be32 gateway;
+ struct in6_addr gateway6;
+ };
+};
+
+struct flow_offload_tuple_rhash {
+ struct rhash_head node;
+ struct flow_offload_tuple tuple;
+};
+
+#define FLOW_OFFLOAD_SNAT 0x1
+#define FLOW_OFFLOAD_DNAT 0x2
+#define FLOW_OFFLOAD_HW 0x4
+
+struct flow_offload {
+ struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
+ u32 flags;
+ union {
+ /* Your private driver data here. */
+ u32 timeout;
+ };
+ struct rcu_head rcu_head;
+};
+
+int flow_offload_add(struct flow_offload *flow);
+void flow_offload_del(struct flow_offload *flow);
+struct flow_offload_tuple_rhash *flow_offload_lookup(struct flow_offload_tuple *tuple);
+
+#endif /* _FLOW_OFFLOAD_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e4a13cc8a2e7..f022ca91f49d 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -436,6 +436,13 @@ config NETFILTER_SYNPROXY
endif # NF_CONNTRACK
+config NF_FLOW_OFFLOAD
+ tristate "Netfilter Generic Flow Offload (GFO) module"
+ help
+ This option adds the flow table core infrastructure.
+
+ To compile it as a module, choose M here.
+
config NF_TABLES
select NETFILTER_NETLINK
tristate "Netfilter nf_tables support"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index d3891c93edd6..518f54113e06 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -69,6 +69,9 @@ obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
# generic packet duplication from netdev family
obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
+# generic flow table
+obj-$(CONFIG_NF_FLOW_OFFLOAD)+= nf_flow_offload.o
+
# nf_tables
nf_tables-objs := nf_tables_core.o nf_tables_api.o nf_tables_trace.o \
nft_immediate.o nft_cmp.o nft_range.o nft_bitwise.o \
diff --git a/net/netfilter/nf_flow_offload.c b/net/netfilter/nf_flow_offload.c
new file mode 100644
index 000000000000..c967b29d11a6
--- /dev/null
+++ b/net/netfilter/nf_flow_offload.c
@@ -0,0 +1,386 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/neighbour.h>
+#include <net/flow_offload.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
+
+static struct rhashtable flow_table;
+
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple *tuple = data;
+
+ return jhash(tuple, offsetof(struct flow_offload_tuple, l4proto), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple_rhash *tuplehash = data;
+
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, l4proto), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct flow_offload_tuple_rhash *x = ptr;
+ const struct flow_offload_tuple *tuple = arg->key;
+
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, l4proto)))
+ return 1;
+
+ return 0;
+}
+
+static const struct rhashtable_params flow_offload_rhash_params = {
+ .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
+ .hashfn = flow_offload_hash,
+ .obj_hashfn = flow_offload_hash_obj,
+ .obj_cmpfn = flow_offload_hash_cmp,
+ .automatic_shrinking = true,
+};
+
+#define NF_FLOW_LIFETIME 15
+
+int flow_offload_add(struct flow_offload *flow)
+{
+ flow->timeout = (u32)jiffies;
+
+ rhashtable_insert_fast(&flow_table, &flow->tuplehash[0].node,
+ flow_offload_rhash_params);
+ rhashtable_insert_fast(&flow_table, &flow->tuplehash[1].node,
+ flow_offload_rhash_params);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(flow_offload_add);
+
+void flow_offload_del(struct flow_offload *flow)
+{
+ rhashtable_remove_fast(&flow_table, &flow->tuplehash[0].node,
+ flow_offload_rhash_params);
+ rhashtable_remove_fast(&flow_table, &flow->tuplehash[1].node,
+ flow_offload_rhash_params);
+ kfree_rcu(flow, rcu_head);
+}
+EXPORT_SYMBOL_GPL(flow_offload_del);
+
+struct flow_offload_tuple_rhash *
+flow_offload_lookup(struct flow_offload_tuple *tuple)
+{
+ return rhashtable_lookup_fast(&flow_table, tuple,
+ flow_offload_rhash_params);
+}
+EXPORT_SYMBOL_GPL(flow_offload_lookup);
+
+static void nf_flow_offload_work_gc(struct work_struct *work);
+
+static DECLARE_DEFERRABLE_WORK(nf_flow_offload_gc,
+ nf_flow_offload_work_gc);
+
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+ return (__s32)(flow->timeout - (u32)jiffies) <= 0;
+}
+
+static void nf_flow_offload_work_gc(struct work_struct *work)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct rhashtable_iter hti;
+ struct flow_offload *flow;
+ int err, counter = 0;
+
+ rhashtable_walk_init(&flow_table, &hti, GFP_KERNEL);
+ err = rhashtable_walk_start(&hti);
+ if (err && err != -EAGAIN)
+ goto out;
+
+ while ((tuplehash = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(tuplehash)) {
+ err = PTR_ERR(tuplehash);
+ if (err != -EAGAIN)
+ goto out;
+
+ continue;
+ }
+ if (tuplehash->tuple.dir)
+ continue;
+
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+ if (nf_flow_has_expired(flow))
+ flow_offload_del(flow);
+
+ counter++;
+ }
+
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+out:
+ queue_delayed_work(system_power_efficient_wq, &nf_flow_offload_gc,
+ msecs_to_jiffies(1000));
+}
+
+static int nf_flow_snat_tcp(struct iphdr *iph,
+ const struct flow_offload *flow,
+ struct sk_buff *skb,
+ unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
+
+ return 0;
+}
+
+static int nf_flow_snat_udp(struct iphdr *iph,
+ const struct flow_offload *flow,
+ struct sk_buff *skb,
+ unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace4(&udph->check, skb, addr,
+ new_addr, true);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+static int nf_flow_snat(struct iphdr *iph,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir, struct sk_buff *skb)
+{
+ __be32 new_addr, addr;
+ unsigned int thoff;
+
+ if (skb_try_make_writable(skb, sizeof(*iph)))
+ return NF_DROP;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = iph->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+ iph->saddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = iph->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+ iph->daddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+ csum_replace4(&iph->check, addr, new_addr);
+
+ ip_decrease_ttl(iph);
+
+ thoff = iph->ihl * 4;
+
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ if (nf_flow_snat_tcp(iph, flow, skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_snat_udp(iph, flow, skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ }
+
+ return 0;
+}
+
+/* Similar to rt_nexthop(). */
+static inline __be32 nf_flow_nexthop(__be32 nexthop, __be32 daddr)
+{
+ if (nexthop)
+ return nexthop;
+
+ return daddr;
+}
+
+struct flow_ports {
+ __be16 src, dst;
+};
+
+static int nf_flow_tuple_ip(struct iphdr *iph, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
+{
+ struct flow_ports *ports;
+ unsigned int thoff;
+
+ if (iph->protocol != IPPROTO_TCP &&
+ iph->protocol != IPPROTO_UDP)
+ return -1;
+
+ thoff = iph->ihl * 4;
+ if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ return -1;
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+ tuple->src_v4.s_addr = iph->saddr;
+ tuple->dst_v4.s_addr = iph->daddr;
+ tuple->src_port = ports->src;
+ tuple->dst_port = ports->dst;
+ tuple->l3proto = AF_INET;
+ tuple->l4proto = iph->protocol;
+
+ return 0;
+}
+
+#define NF_FLOW_TIMEOUT (30 * HZ)
+
+static unsigned int
+nf_flow_offload_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct flow_offload_tuple tuple = {};
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ struct iphdr *iph;
+ __be32 nexthop;
+ int err;
+
+ switch (skb->protocol) {
+ case cpu_to_be16(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(*iph)))
+ return NF_ACCEPT;
+
+ iph = ip_hdr(skb);
+ if (ip_is_fragment(iph))
+ return NF_ACCEPT;
+
+ err = nf_flow_tuple_ip(iph, skb, &tuple);
+ if (err < 0)
+ return NF_ACCEPT;
+ break;
+ default:
+ return NF_ACCEPT;
+ }
+
+ tuplehash = flow_offload_lookup(&tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(&init_net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ flow = container_of(tuplehash, struct flow_offload,
+ tuplehash[tuplehash->tuple.dir]);
+
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+
+ if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ nf_flow_snat(iph, flow, tuplehash->tuple.dir, skb) < 0)
+ return NF_DROP;
+
+ skb->dev = outdev;
+ nexthop = nf_flow_nexthop(tuplehash->tuple.gateway, iph->daddr);
+
+ neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+ return NF_STOLEN;
+}
+
+static LIST_HEAD(nf_flow_hook_list);
+
+struct nf_flow_hook_entry {
+ struct list_head head;
+ struct nf_hook_ops ops;
+};
+
+static int __init nf_flow_offload_module_init(void)
+{
+ struct rhashtable_params params = flow_offload_rhash_params;
+ struct nf_hook_ops flow_offload_hook = {
+ .hook = nf_flow_offload_hook,
+ .pf = NFPROTO_NETDEV,
+ .hooknum = NF_NETDEV_INGRESS,
+ .priority = -100,
+ };
+ struct nf_flow_hook_entry *entry;
+ struct net_device *dev;
+ int err;
+
+ params.key_len = offsetof(struct flow_offload_tuple, dir);
+ err = rhashtable_init(&flow_table, ¶ms);
+ if (err < 0)
+ return err;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ rtnl_unlock();
+ return -ENOMEM;
+ }
+ entry->ops = flow_offload_hook;
+ entry->ops.dev = dev;
+ list_add_tail(&entry->head, &nf_flow_hook_list);
+
+ err = nf_register_net_hook(&init_net, &entry->ops);
+ if (err < 0)
+ return err;
+
+ pr_info("register flow table for device %s\n", dev->name);
+ }
+ rtnl_unlock();
+
+ queue_delayed_work(system_power_efficient_wq, &nf_flow_offload_gc,
+ msecs_to_jiffies(1000));
+ return err;
+}
+
+static void flow_offload_destroy(void *ptr, void *arg)
+{
+ kfree(ptr);
+}
+
+static void __exit nf_flow_offload_module_exit(void)
+{
+ struct nf_flow_hook_entry *entry, *next;
+
+ cancel_delayed_work_sync(&nf_flow_offload_gc);
+ list_for_each_entry_safe(entry, next, &nf_flow_hook_list, head) {
+ pr_info("unregister flow table for device %s\n",
+ entry->ops.dev->name);
+ nf_unregister_net_hook(&init_net, &entry->ops);
+ list_del(&entry->head);
+ kfree(entry);
+ }
+ rhashtable_free_and_destroy(&flow_table, flow_offload_destroy, NULL);
+}
+
+module_init(nf_flow_offload_module_init);
+module_exit(nf_flow_offload_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
--
2.11.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH RFC,WIP 2/5] netfilter: add software flow offload infrastructure
2017-11-03 15:26 ` [PATCH RFC,WIP 2/5] netfilter: add software flow offload infrastructure Pablo Neira Ayuso
@ 2017-11-03 20:32 ` Florian Westphal
0 siblings, 0 replies; 14+ messages in thread
From: Florian Westphal @ 2017-11-03 20:32 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netfilter-devel, netdev
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> +static int __init nf_flow_offload_module_init(void)
> +{
> + struct rhashtable_params params = flow_offload_rhash_params;
> + struct nf_hook_ops flow_offload_hook = {
> + .hook = nf_flow_offload_hook,
> + .pf = NFPROTO_NETDEV,
> + .hooknum = NF_NETDEV_INGRESS,
> + .priority = -100,
Magic number. Should this be documented in nft?
Alternatively we could reject NETDEV_INGRESS base chains from
userspace if prio < 0 to prevent userspace rules from messing
with this flow offload infrastructure.
I guess the rationale of using auto-builtin hook is to avoid
forcing users to configure this with nftables rules?
> + rtnl_lock();
> + for_each_netdev(&init_net, dev) {
> + entry = kmalloc(sizeof(*entry), GFP_KERNEL);
> + if (!entry) {
> + rtnl_unlock();
> + return -ENOMEM;
This would need error unwinding (Unregistering the already-registered
hooks).
> + err = nf_register_net_hook(&init_net, &entry->ops);
> + if (err < 0)
> + return err;
And here as well.
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH RFC,WIP 3/5] netfilter: nf_flow_offload: integration with conntrack
2017-11-03 15:26 [PATCH RFC,WIP 0/5] Flow offload infrastructure Pablo Neira Ayuso
2017-11-03 15:26 ` [PATCH RFC,WIP 1/5] netfilter: nf_conntrack: move nf_ct_netns_{get,put}() to core Pablo Neira Ayuso
2017-11-03 15:26 ` [PATCH RFC,WIP 2/5] netfilter: add software flow offload infrastructure Pablo Neira Ayuso
@ 2017-11-03 15:26 ` Pablo Neira Ayuso
2017-11-03 19:49 ` Florian Westphal
2017-11-03 15:26 ` [PATCH RFC,WIP 4/5] netfilter: nf_tables: flow offload expression Pablo Neira Ayuso
` (3 subsequent siblings)
6 siblings, 1 reply; 14+ messages in thread
From: Pablo Neira Ayuso @ 2017-11-03 15:26 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev
This patch adds the IPS_OFFLOAD status bit, this new bit tells us that
the conntrack entry is owned by the flow offload infrastructure. The
timer of such conntrack entries is stopped - the conntrack garbage
collector skips them - and they display no internal state in the case of
TCP flows.
# cat /proc/net/nf_conntrack
ipv4 2 tcp 6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2
Note the [OFFLOAD] tag in the listing.
Conntrack entries that have been offloaded to the flow table
infrastructure cannot be deleted/flushed via ctnetlink. The flow table
infrastructure is also responsible for releasing this conntrack entry.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
Instead of nf_flow_release_ct(), I'd rather keep a pointer reference to
the conntrack object from the flow_offload entry, so we can skip the
conntrack look up.
include/net/netfilter/nf_conntrack.h | 3 +-
include/uapi/linux/netfilter/nf_conntrack_common.h | 4 +++
net/netfilter/nf_conntrack_core.c | 7 ++++-
net/netfilter/nf_conntrack_netlink.c | 15 ++++++++-
net/netfilter/nf_conntrack_proto_tcp.c | 3 ++
net/netfilter/nf_conntrack_standalone.c | 12 +++++---
net/netfilter/nf_flow_offload.c | 36 ++++++++++++++++++++--
7 files changed, 71 insertions(+), 9 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 8f3bd30511de..9af4bb0c2f46 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -272,7 +272,8 @@ static inline unsigned long nf_ct_expires(const struct nf_conn *ct)
static inline bool nf_ct_is_expired(const struct nf_conn *ct)
{
- return (__s32)(ct->timeout - nfct_time_stamp) <= 0;
+ return (__s32)(ct->timeout - nfct_time_stamp) <= 0 &&
+ !test_bit(IPS_OFFLOAD_BIT, &ct->status);
}
/* use after obtaining a reference count */
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index dc947e59d03a..6b463b88182d 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -100,6 +100,10 @@ enum ip_conntrack_status {
IPS_HELPER_BIT = 13,
IPS_HELPER = (1 << IPS_HELPER_BIT),
+ /* Conntrack has been offloaded to flow table. */
+ IPS_OFFLOAD_BIT = 14,
+ IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT),
+
/* Be careful here, modifying these bits can make things messy,
* so don't let users modify them directly.
*/
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 01130392b7c0..48f36c4fb756 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -901,6 +901,9 @@ static unsigned int early_drop_list(struct net *net,
hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
+ if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
+ continue;
+
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
continue;
@@ -1011,12 +1014,14 @@ static void gc_worker(struct work_struct *work)
tmp = nf_ct_tuplehash_to_ctrack(h);
scanned++;
+ if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
+ continue;
+
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
expired_count++;
continue;
}
-
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index de4053d84364..79a74aec7c1e 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1105,6 +1105,14 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
.len = NF_CT_LABELS_MAX_SIZE },
};
+static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
+{
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return 0;
+
+ return ctnetlink_filter_match(ct, data);
+}
+
static int ctnetlink_flush_conntrack(struct net *net,
const struct nlattr * const cda[],
u32 portid, int report)
@@ -1117,7 +1125,7 @@ static int ctnetlink_flush_conntrack(struct net *net,
return PTR_ERR(filter);
}
- nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter,
+ nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
portid, report);
kfree(filter);
@@ -1163,6 +1171,11 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
ct = nf_ct_tuplehash_to_ctrack(h);
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+ nf_ct_put(ct);
+ return -EBUSY;
+ }
+
if (cda[CTA_ID]) {
u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
if (id != (u32)(unsigned long)ct) {
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index cba1c6ffe51a..156f529d1668 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
/* Print out the private part of the conntrack. */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return;
+
seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
}
#endif
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5a101caa3e12..46d32baad095 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -309,10 +309,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
WARN_ON(!l4proto);
ret = -ENOSPC;
- seq_printf(s, "%-8s %u %-8s %u %ld ",
+ seq_printf(s, "%-8s %u %-8s %u ",
l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
- l4proto_name(l4proto->l4proto), nf_ct_protonum(ct),
- nf_ct_expires(ct) / HZ);
+ l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
+
+ if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ);
if (l4proto->print_conntrack)
l4proto->print_conntrack(s, ct);
@@ -339,7 +341,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
goto release;
- if (test_bit(IPS_ASSURED_BIT, &ct->status))
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ seq_puts(s, "[OFFLOAD] ");
+ else if (test_bit(IPS_ASSURED_BIT, &ct->status))
seq_puts(s, "[ASSURED] ");
if (seq_has_overflowed(s))
diff --git a/net/netfilter/nf_flow_offload.c b/net/netfilter/nf_flow_offload.c
index c967b29d11a6..f4a3fbe11b69 100644
--- a/net/netfilter/nf_flow_offload.c
+++ b/net/netfilter/nf_flow_offload.c
@@ -13,6 +13,9 @@
#include <linux/udp.h>
#include <linux/icmpv6.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
static struct rhashtable flow_table;
static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
@@ -91,6 +94,34 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
return (__s32)(flow->timeout - (u32)jiffies) <= 0;
}
+static void nf_flow_release_ct(const struct flow_offload_tuple_rhash *th)
+{
+ struct nf_conntrack_tuple tuple = {};
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_zone zone;
+ struct nf_conn *ct;
+
+ nf_ct_zone_init(&zone, NF_CT_DEFAULT_ZONE_ID,
+ NF_CT_DEFAULT_ZONE_DIR, 0);
+
+ tuple.src.u3.ip = th->tuple.src_v4.s_addr;
+ tuple.dst.u3.ip = th->tuple.dst_v4.s_addr;
+ tuple.src.u.all = th->tuple.src_port;
+ tuple.dst.u.all = th->tuple.dst_port;
+ tuple.src.l3num = th->tuple.l3proto;
+ tuple.dst.protonum = th->tuple.l4proto;
+ tuple.dst.dir = IP_CT_DIR_ORIGINAL;
+
+ h = nf_conntrack_find_get(&init_net, &zone, &tuple);
+ if (!h) {
+ pr_err("cannot find conntrack for flow hash %p\n", th);
+ return;
+ }
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ nf_ct_delete(ct, 0, 0);
+ nf_ct_put(ct);
+}
+
static void nf_flow_offload_work_gc(struct work_struct *work)
{
struct flow_offload_tuple_rhash *tuplehash;
@@ -116,9 +147,10 @@ static void nf_flow_offload_work_gc(struct work_struct *work)
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
- if (nf_flow_has_expired(flow))
+ if (nf_flow_has_expired(flow)) {
flow_offload_del(flow);
-
+ nf_flow_release_ct(tuplehash);
+ }
counter++;
}
--
2.11.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH RFC,WIP 3/5] netfilter: nf_flow_offload: integration with conntrack
2017-11-03 15:26 ` [PATCH RFC,WIP 3/5] netfilter: nf_flow_offload: integration with conntrack Pablo Neira Ayuso
@ 2017-11-03 19:49 ` Florian Westphal
0 siblings, 0 replies; 14+ messages in thread
From: Florian Westphal @ 2017-11-03 19:49 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netfilter-devel, netdev
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> This patch adds the IPS_OFFLOAD status bit, this new bit tells us that
> the conntrack entry is owned by the flow offload infrastructure. The
> timer of such conntrack entries is stopped - the conntrack garbage
> collector skips them - and they display no internal state in the case of
> TCP flows.
>
> Conntrack entries that have been offloaded to the flow table
> infrastructure cannot be deleted/flushed via ctnetlink. The flow table
> infrastructure is also responsible for releasing this conntrack entry.
>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
> Instead of nf_flow_release_ct(), I'd rather keep a pointer reference to
> the conntrack object from the flow_offload entry, so we can skip the
> conntrack look up.
I agree, this would make sense.
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 8f3bd30511de..9af4bb0c2f46 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -272,7 +272,8 @@ static inline unsigned long nf_ct_expires(const struct nf_conn *ct)
>
> static inline bool nf_ct_is_expired(const struct nf_conn *ct)
> {
> - return (__s32)(ct->timeout - nfct_time_stamp) <= 0;
> + return (__s32)(ct->timeout - nfct_time_stamp) <= 0 &&
> + !test_bit(IPS_OFFLOAD_BIT, &ct->status);
An alternative would be to not touch nf_ct_is_expired() and instead ...
> }
>
> @@ -1011,12 +1014,14 @@ static void gc_worker(struct work_struct *work)
> tmp = nf_ct_tuplehash_to_ctrack(h);
>
> scanned++;
> + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
> + continue;
... advance/refresh ct->timeout from gc worker, i.e.
if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
ct->timeout = nfct_time_stamp + (1 DAY);
continue;
}
Would prevent normal path to ever see offloaded entry
as 'timed out', without having to check for the flag in lookup path
(OTOH the check should not be an issue either because lookup path
has to access ct->status anyway).
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH RFC,WIP 4/5] netfilter: nf_tables: flow offload expression
2017-11-03 15:26 [PATCH RFC,WIP 0/5] Flow offload infrastructure Pablo Neira Ayuso
` (2 preceding siblings ...)
2017-11-03 15:26 ` [PATCH RFC,WIP 3/5] netfilter: nf_flow_offload: integration with conntrack Pablo Neira Ayuso
@ 2017-11-03 15:26 ` Pablo Neira Ayuso
2017-11-04 1:19 ` Florian Westphal
2017-11-03 15:26 ` [PATCH RFC,WIP 5/5] netfilter: nft_flow_offload: add ndo hooks for hardware offload Pablo Neira Ayuso
` (2 subsequent siblings)
6 siblings, 1 reply; 14+ messages in thread
From: Pablo Neira Ayuso @ 2017-11-03 15:26 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev
Add new instruction for the nf_tables VM that allows us to specify what
flows are offloaded. This has an explicit dependency with the conntrack
subsystem.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/uapi/linux/netfilter/nf_tables.h | 9 +
net/netfilter/Kconfig | 7 +
net/netfilter/Makefile | 1 +
net/netfilter/nft_flow_offload.c | 331 +++++++++++++++++++++++++++++++
4 files changed, 348 insertions(+)
create mode 100644 net/netfilter/nft_flow_offload.c
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 871afa4871bf..2edde548de68 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -948,6 +948,15 @@ enum nft_ct_attributes {
};
#define NFTA_CT_MAX (__NFTA_CT_MAX - 1)
+/**
+ * enum nft_ct_offload_attributes - ct offload expression attributes
+ */
+enum nft_offload_attributes {
+ NFTA_CT_OFFLOAD_UNSPEC,
+ __NFTA_CT_OFFLOAD_MAX,
+};
+#define NFTA_CT_OFFLOAD_MAX (__NFTA_CT_OFFLOAD_MAX - 1)
+
enum nft_limit_type {
NFT_LIMIT_PKTS,
NFT_LIMIT_PKT_BYTES
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f022ca91f49d..0a5c33cfaeb8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -504,6 +504,13 @@ config NFT_CT
This option adds the "ct" expression that you can use to match
connection tracking information such as the flow state.
+config NFT_FLOW_OFFLOAD
+ depends on NF_CONNTRACK
+ tristate "Netfilter nf_tables hardware flow offload module"
+ help
+ This option adds the "flow_offload" expression that you can use to
+ choose what flows are placed into the hardware.
+
config NFT_SET_RBTREE
tristate "Netfilter nf_tables rbtree set module"
help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 518f54113e06..801ce5c25e5d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_NFT_META) += nft_meta.o
obj-$(CONFIG_NFT_RT) += nft_rt.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
obj-$(CONFIG_NFT_CT) += nft_ct.o
+obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o
obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
obj-$(CONFIG_NFT_NAT) += nft_nat.o
obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
new file mode 100644
index 000000000000..d38d185a19a5
--- /dev/null
+++ b/net/netfilter/nft_flow_offload.c
@@ -0,0 +1,331 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+
+union flow_gateway {
+ __be32 ip;
+ struct in6_addr ip6;
+};
+
+static int flow_offload_iterate_cleanup(struct nf_conn *ct, void *data)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct flow_offload_tuple tuple = {};
+ struct net_device *indev = data;
+ struct flow_offload *flow;
+
+ if (!test_and_clear_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return 0;
+
+ tuple.src_v4 = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
+ tuple.dst_v4 = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
+ tuple.src_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
+ tuple.dst_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
+ tuple.l3proto = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ tuple.l4proto = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+
+ tuplehash = flow_offload_lookup(&tuple);
+ BUG_ON(!tuplehash);
+
+ if (indev && tuplehash->tuple.iifidx != indev->ifindex)
+ return 0;
+
+ flow = container_of(tuplehash, struct flow_offload,
+ tuplehash[tuplehash->tuple.dir]);
+
+ flow_offload_del(flow);
+
+ /* Do not remove this conntrack from table. */
+ return 0;
+}
+
+static void flow_offload_cleanup(struct net *net,
+ const struct net_device *dev)
+{
+ nf_ct_iterate_cleanup_net(net, flow_offload_iterate_cleanup,
+ (void *)dev, 0, 0);
+}
+
+static int flow_offload_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
+
+ flow_offload_cleanup(dev_net(dev), dev);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block flow_offload_netdev_notifier = {
+ .notifier_call = flow_offload_netdev_event,
+};
+
+static struct flow_offload *
+flow_offload_alloc(const struct nf_conn *ct, int iifindex, int oifindex,
+ union flow_gateway *orig_gateway,
+ union flow_gateway *reply_gateway)
+{
+ struct flow_offload *flow;
+
+ flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+ if (!flow)
+ return NULL;
+
+ switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
+ case NFPROTO_IPV4:
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.gateway =
+ orig_gateway->ip;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.gateway =
+ reply_gateway->ip;
+ break;
+ case NFPROTO_IPV6:
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.gateway6 =
+ orig_gateway->ip6;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.gateway6 =
+ reply_gateway->ip6;
+ break;
+ }
+
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir = FLOW_OFFLOAD_DIR_ORIGINAL;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir = FLOW_OFFLOAD_DIR_REPLY;
+
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = oifindex;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx = iifindex;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = iifindex;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx = oifindex;
+
+ if (ct->status & IPS_SRC_NAT)
+ flow->flags |= FLOW_OFFLOAD_SNAT;
+ else if (ct->status & IPS_DST_NAT)
+ flow->flags |= FLOW_OFFLOAD_DNAT;
+
+ return flow;
+}
+
+static int nft_flow_route(const struct nft_pktinfo *pkt,
+ const struct nf_conn *ct,
+ union flow_gateway *orig_gw,
+ union flow_gateway *reply_gw)
+{
+ const struct dst_entry *reply_dst = skb_dst(pkt->skb);
+ struct dst_entry *orig_dst;
+ const struct nf_afinfo *ai;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
+ break;
+ }
+
+ ai = nf_get_afinfo(nft_pf(pkt));
+ if (ai) {
+ ai->route(nft_net(pkt), &orig_dst, &fl, false);
+ if (!orig_dst)
+ return -ENOENT;
+ }
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4: {
+ const struct rtable *orig_rt = (const struct rtable *)orig_dst;
+ const struct rtable *reply_rt =
+ (const struct rtable *)reply_dst;
+
+ orig_gw->ip = orig_rt->rt_gateway;
+ reply_gw->ip = reply_rt->rt_gateway;
+ break;
+ }
+ case NFPROTO_IPV6:
+ break;
+ default:
+ break;
+ }
+
+ dst_release(orig_dst);
+
+ return 0;
+}
+
+static void nft_flow_offload_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ union flow_gateway orig_gateway, reply_gateway;
+ struct net_device *outdev = pkt->xt.state->out;
+ struct net_device *indev = pkt->xt.state->in;
+ enum ip_conntrack_info ctinfo;
+ struct flow_offload *flow;
+ struct nf_conn *ct;
+ int ret;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+ if (!ct)
+ goto out;
+
+ switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ break;
+ default:
+ goto out;
+ }
+
+ if (test_bit(IPS_HELPER_BIT, &ct->status))
+ goto out;
+
+ if (ctinfo == IP_CT_NEW ||
+ ctinfo == IP_CT_RELATED)
+ goto out;
+
+ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+ goto out;
+
+ if (nft_flow_route(pkt, ct, &orig_gateway, &reply_gateway) < 0)
+ goto err1;
+
+ flow = flow_offload_alloc(ct, indev->ifindex, outdev->ifindex,
+ &orig_gateway, &reply_gateway);
+ if (!flow)
+ goto err1;
+
+ ret = flow_offload_add(flow);
+ if (ret < 0)
+ goto err2;
+
+ return;
+err2:
+ kfree(flow);
+err1:
+ clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+out:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_flow_offload_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ unsigned int hook_mask = (1 << NF_INET_FORWARD);
+
+ return nft_chain_validate_hooks(ctx->chain, hook_mask);
+}
+
+static int nft_flow_offload_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ return nf_ct_netns_get(ctx->net, ctx->afi->family);
+}
+
+static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, ctx->afi->family);
+}
+
+static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ return 0;
+}
+
+struct nft_expr_type nft_flow_offload_type;
+static const struct nft_expr_ops nft_flow_offload_ops = {
+ .type = &nft_flow_offload_type,
+ .size = NFT_EXPR_SIZE(0),
+ .eval = nft_flow_offload_eval,
+ .init = nft_flow_offload_init,
+ .destroy = nft_flow_offload_destroy,
+ .validate = nft_flow_offload_validate,
+ .dump = nft_flow_offload_dump,
+};
+
+struct nft_expr_type nft_flow_offload_type __read_mostly = {
+ .name = "flow_offload",
+ .ops = &nft_flow_offload_ops,
+ .maxattr = NFTA_CT_OFFLOAD_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_flow_offload_module_init(void)
+{
+ register_netdevice_notifier(&flow_offload_netdev_notifier);
+
+ return nft_register_expr(&nft_flow_offload_type);
+}
+
+static void __exit nft_flow_offload_module_exit(void)
+{
+ struct net *net;
+
+ nft_unregister_expr(&nft_flow_offload_type);
+ unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+ rtnl_lock();
+ for_each_net(net)
+ flow_offload_cleanup(net, NULL);
+ rtnl_unlock();
+}
+
+module_init(nft_flow_offload_module_init);
+module_exit(nft_flow_offload_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("flow_offload");
--
2.11.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH RFC,WIP 5/5] netfilter: nft_flow_offload: add ndo hooks for hardware offload
2017-11-03 15:26 [PATCH RFC,WIP 0/5] Flow offload infrastructure Pablo Neira Ayuso
` (3 preceding siblings ...)
2017-11-03 15:26 ` [PATCH RFC,WIP 4/5] netfilter: nf_tables: flow offload expression Pablo Neira Ayuso
@ 2017-11-03 15:26 ` Pablo Neira Ayuso
2017-11-03 20:56 ` Florian Westphal
2017-11-11 12:49 ` Felix Fietkau
2017-11-04 4:49 ` [PATCH RFC,WIP 0/5] Flow offload infrastructure Florian Fainelli
2017-11-14 0:52 ` Jakub Kicinski
6 siblings, 2 replies; 14+ messages in thread
From: Pablo Neira Ayuso @ 2017-11-03 15:26 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev
This patch adds the infrastructure to offload flows to hardware, in case
the nic/switch comes with built-in flow tables capabilities.
If the hardware does not come with hardware flow tables, or its flow
tables have limitations in terms of features, this falls back to the
generic software flow table implementation.
The software flow table aging thread skips entries that reside in the
hardware, so the hardware will also be responsible for releasing these
flow table entries.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/netdevice.h | 4 ++
net/netfilter/nf_flow_offload.c | 3 ++
net/netfilter/nft_flow_offload.c | 99 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 106 insertions(+)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f535779d9dc1..0787f53374b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -826,6 +826,8 @@ struct xfrmdev_ops {
};
#endif
+struct flow_offload;
+
/*
* This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are
@@ -1281,6 +1283,8 @@ struct net_device_ops {
int (*ndo_bridge_dellink)(struct net_device *dev,
struct nlmsghdr *nlh,
u16 flags);
+ int (*ndo_flow_add)(struct flow_offload *flow);
+ int (*ndo_flow_del)(struct flow_offload *flow);
int (*ndo_change_carrier)(struct net_device *dev,
bool new_carrier);
int (*ndo_get_phys_port_id)(struct net_device *dev,
diff --git a/net/netfilter/nf_flow_offload.c b/net/netfilter/nf_flow_offload.c
index f4a3fbe11b69..ac5786976dbb 100644
--- a/net/netfilter/nf_flow_offload.c
+++ b/net/netfilter/nf_flow_offload.c
@@ -147,6 +147,9 @@ static void nf_flow_offload_work_gc(struct work_struct *work)
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+ if (flow->flags & FLOW_OFFLOAD_HW)
+ continue;
+
if (nf_flow_has_expired(flow)) {
flow_offload_del(flow);
nf_flow_release_ct(tuplehash);
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index d38d185a19a5..0cb194a0aaab 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -17,6 +17,22 @@ union flow_gateway {
struct in6_addr ip6;
};
+static void flow_hw_offload_del(struct flow_offload *flow)
+{
+ struct net_device *indev;
+ int ret;
+
+ rtnl_lock();
+ indev = __dev_get_by_index(&init_net, flow->tuplehash[0].tuple.iifidx);
+ WARN_ON(!indev);
+
+ if (indev->netdev_ops->ndo_flow_del) {
+ ret = indev->netdev_ops->ndo_flow_del(flow);
+ WARN_ON(ret < 0);
+ }
+ rtnl_unlock();
+}
+
static int flow_offload_iterate_cleanup(struct nf_conn *ct, void *data)
{
struct flow_offload_tuple_rhash *tuplehash;
@@ -44,14 +60,40 @@ static int flow_offload_iterate_cleanup(struct nf_conn *ct, void *data)
tuplehash[tuplehash->tuple.dir]);
flow_offload_del(flow);
+ if (flow->flags & FLOW_OFFLOAD_HW)
+ flow_hw_offload_del(flow);
/* Do not remove this conntrack from table. */
return 0;
}
+static LIST_HEAD(flow_hw_offload_pending_list);
+static DEFINE_SPINLOCK(flow_hw_offload_lock);
+
+struct flow_hw_offload {
+ struct list_head list;
+ struct flow_offload *flow;
+ struct nf_conn *ct;
+};
+
static void flow_offload_cleanup(struct net *net,
const struct net_device *dev)
{
+ struct flow_hw_offload *offload, *next;
+
+ spin_lock_bh(&flow_hw_offload_lock);
+ list_for_each_entry_safe(offload, next, &flow_hw_offload_pending_list, list) {
+ if (dev == NULL ||
+ offload->flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx == dev->ifindex ||
+ offload->flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx == dev->ifindex)
+ continue;
+
+ nf_conntrack_put(&offload->ct->ct_general);
+ list_del(&offload->list);
+ kfree(offload);
+ }
+ spin_unlock_bh(&flow_hw_offload_lock);
+
nf_ct_iterate_cleanup_net(net, flow_offload_iterate_cleanup,
(void *)dev, 0, 0);
}
@@ -156,6 +198,43 @@ flow_offload_alloc(const struct nf_conn *ct, int iifindex, int oifindex,
return flow;
}
+static int do_flow_offload(struct flow_offload *flow)
+{
+ struct net_device *indev;
+ int ret, ifindex;
+
+ rtnl_lock();
+ ifindex = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx;
+ indev = __dev_get_by_index(&init_net, ifindex);
+ WARN_ON(!indev);
+
+ ret = indev->netdev_ops->ndo_flow_add(flow);
+ rtnl_unlock();
+
+ if (ret >= 0)
+ flow->flags |= FLOW_OFFLOAD_HW;
+
+ return ret;
+}
+
+static struct delayed_work nft_flow_offload_dwork;
+
+static void flow_offload_work(struct work_struct *work)
+{
+ struct flow_hw_offload *offload, *next;
+
+ spin_lock_bh(&flow_hw_offload_lock);
+ list_for_each_entry_safe(offload, next, &flow_hw_offload_pending_list, list) {
+ do_flow_offload(offload->flow);
+ nf_conntrack_put(&offload->ct->ct_general);
+ list_del(&offload->list);
+ kfree(offload);
+ }
+ spin_unlock_bh(&flow_hw_offload_lock);
+
+ queue_delayed_work(system_power_efficient_wq, &nft_flow_offload_dwork, HZ);
+}
+
static int nft_flow_route(const struct nft_pktinfo *pkt,
const struct nf_conn *ct,
union flow_gateway *orig_gw,
@@ -211,6 +290,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
union flow_gateway orig_gateway, reply_gateway;
struct net_device *outdev = pkt->xt.state->out;
struct net_device *indev = pkt->xt.state->in;
+ struct flow_hw_offload *offload;
enum ip_conntrack_info ctinfo;
struct flow_offload *flow;
struct nf_conn *ct;
@@ -250,6 +330,21 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
if (ret < 0)
goto err2;
+ if (!indev->netdev_ops->ndo_flow_add)
+ return;
+
+ offload = kmalloc(sizeof(struct flow_hw_offload), GFP_ATOMIC);
+ if (!offload)
+ return;
+
+ nf_conntrack_get(&ct->ct_general);
+ offload->ct = ct;
+ offload->flow = flow;
+
+ spin_lock_bh(&flow_hw_offload_lock);
+ list_add_tail(&offload->list, &flow_hw_offload_pending_list);
+ spin_unlock_bh(&flow_hw_offload_lock);
+
return;
err2:
kfree(flow);
@@ -308,6 +403,9 @@ static int __init nft_flow_offload_module_init(void)
{
register_netdevice_notifier(&flow_offload_netdev_notifier);
+ INIT_DEFERRABLE_WORK(&nft_flow_offload_dwork, flow_offload_work);
+ queue_delayed_work(system_power_efficient_wq, &nft_flow_offload_dwork, HZ);
+
return nft_register_expr(&nft_flow_offload_type);
}
@@ -316,6 +414,7 @@ static void __exit nft_flow_offload_module_exit(void)
struct net *net;
nft_unregister_expr(&nft_flow_offload_type);
+ cancel_delayed_work_sync(&nft_flow_offload_dwork);
unregister_netdevice_notifier(&flow_offload_netdev_notifier);
rtnl_lock();
for_each_net(net)
--
2.11.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH RFC,WIP 5/5] netfilter: nft_flow_offload: add ndo hooks for hardware offload
2017-11-03 15:26 ` [PATCH RFC,WIP 5/5] netfilter: nft_flow_offload: add ndo hooks for hardware offload Pablo Neira Ayuso
@ 2017-11-03 20:56 ` Florian Westphal
2017-11-11 12:49 ` Felix Fietkau
1 sibling, 0 replies; 14+ messages in thread
From: Florian Westphal @ 2017-11-03 20:56 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netfilter-devel, netdev
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> +static void flow_offload_work(struct work_struct *work)
> +{
> + struct flow_hw_offload *offload, *next;
> +
> + spin_lock_bh(&flow_hw_offload_lock);
> + list_for_each_entry_safe(offload, next, &flow_hw_offload_pending_list, list) {
> + do_flow_offload(offload->flow);
This should not offload flows that already have DYING bit set.
> + nf_conntrack_put(&offload->ct->ct_general);
> + list_del(&offload->list);
> + kfree(offload);
> + }
> + spin_unlock_bh(&flow_hw_offload_lock);
> +
> + queue_delayed_work(system_power_efficient_wq, &nft_flow_offload_dwork, HZ);
> +}
Missed this on first round, 1 second is quite large.
[..]
> static int nft_flow_route(const struct nft_pktinfo *pkt,
> const struct nf_conn *ct,
> union flow_gateway *orig_gw,
> @@ -211,6 +290,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
> union flow_gateway orig_gateway, reply_gateway;
> struct net_device *outdev = pkt->xt.state->out;
> struct net_device *indev = pkt->xt.state->in;
> + struct flow_hw_offload *offload;
> enum ip_conntrack_info ctinfo;
> struct flow_offload *flow;
> struct nf_conn *ct;
> @@ -250,6 +330,21 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
> if (ret < 0)
> goto err2;
>
> + if (!indev->netdev_ops->ndo_flow_add)
> + return;
> +
> + offload = kmalloc(sizeof(struct flow_hw_offload), GFP_ATOMIC);
> + if (!offload)
> + return;
> +
> + nf_conntrack_get(&ct->ct_general);
> + offload->ct = ct;
> + offload->flow = flow;
> +
> + spin_lock_bh(&flow_hw_offload_lock);
> + list_add_tail(&offload->list, &flow_hw_offload_pending_list);
> + spin_unlock_bh(&flow_hw_offload_lock);
> +
> return;
So this aims for lazy offloading (up to 1 second delay).
Is this intentional, e.g. to avoid offloading short-lived 'RR' flows?
I would have expected this to schedule the workqueue here, and not use
delayed wq at all (i.e., also no self-rescheduling from
flow_offload_work()).
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH RFC,WIP 5/5] netfilter: nft_flow_offload: add ndo hooks for hardware offload
2017-11-03 15:26 ` [PATCH RFC,WIP 5/5] netfilter: nft_flow_offload: add ndo hooks for hardware offload Pablo Neira Ayuso
2017-11-03 20:56 ` Florian Westphal
@ 2017-11-11 12:49 ` Felix Fietkau
1 sibling, 0 replies; 14+ messages in thread
From: Felix Fietkau @ 2017-11-11 12:49 UTC (permalink / raw)
To: Pablo Neira Ayuso, netfilter-devel; +Cc: netdev
On 2017-11-03 16:26, Pablo Neira Ayuso wrote:
> This patch adds the infrastructure to offload flows to hardware, in case
> the nic/switch comes with built-in flow tables capabilities.
>
> If the hardware comes with not hardware flow tables or they have
> limitations in terms of features, this falls back to the software
> generic flow table implementation.
>
> The software flow table aging thread skips entries that resides in the
> hardware, so the hardware will be responsible for releasing this flow
> table entry too.
>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Hi Pablo,
I'd like to start playing with those patches in OpenWrt/LEDE soon. I'm
also considering making a patch that adds iptables support.
For that to work, I think it would be a good idea to keep the code that
tries to offload flows to hardware in nf_flow_offload.c instead, so that
it can be shared with iptables integration.
By the way, do you have a git tree where you keep the current version of
your patch set?
Thanks,
- Felix
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH RFC,WIP 0/5] Flow offload infrastructure
2017-11-03 15:26 [PATCH RFC,WIP 0/5] Flow offload infrastructure Pablo Neira Ayuso
` (4 preceding siblings ...)
2017-11-03 15:26 ` [PATCH RFC,WIP 5/5] netfilter: nft_flow_offload: add ndo hooks for hardware offload Pablo Neira Ayuso
@ 2017-11-04 4:49 ` Florian Fainelli
2017-11-14 0:52 ` Jakub Kicinski
6 siblings, 0 replies; 14+ messages in thread
From: Florian Fainelli @ 2017-11-04 4:49 UTC (permalink / raw)
To: Pablo Neira Ayuso, netfilter-devel; +Cc: netdev
Hi Pablo,
On 11/03/2017 08:26 AM, Pablo Neira Ayuso wrote:
> Hi,
>
> This patch adds the flow offload infrastructure for Netfilter. This adds
> a new 'nf_flow_offload' module that registers a hook at ingress. Every
> packet that hits the flow table is forwarded to where the flow table
> entry specifies in terms of destination/gateway and netdevice. In case
> of flow table miss, the packet follows the classic forward path.
>
> This flow table is populated via the new nftables VM action
> 'flow_offload', so the user can selectively specify what flows are
> placed into the flow table, an example ruleset would look like this:
>
> table inet x {
> chain y {
> type filter hook forward priority 0; policy accept;
> ip protocol tcp flow offload counter
> counter
> }
> }
>
> The 'flow offload' action adds the flow entry once the flow is in
> established state, according to the connection tracking definition, ie.
> we have seen traffic in both directions. Therefore, only initial packets
> of the flow follow the classic forwarding path.
>
> * Patch 1/5 is nothing really interesting, just a little preparation change.
>
> * Patch 2/5 adds a software flow table representation. It uses the
> rhashtable and an API to operate with it, it also introduces the
> 'struct flow_offload' that represents a flow table entry. There's a
> garbage collector kernel thread that cleans up entries for which we
> have not seen any packet for a while.
>
> * Patch 3/5 Just adds the missing bits to integrate the software flow
> table with conntrack. The software flow table owns the conntrack
> object, so it is basically responsible for releasing it. Conntrack
> entries that have been offloaded in the conntrack table will look like
> this:
>
> ipv4 2 tcp 6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] use=2
>
> * Patch 4/5 adds the extension for nf_tables that can be used to select
> what flows are offloaded through policy.
>
> * Patch 5/5 Switches and NICs come with built-in flow table, I've been
> observing out of tree patches in OpenWRT/LEDE to integrate this into
> Netfilter for a little while. This patch adds the ndo hooks to
> populate hardware flow table. This patchs a workqueue to configure
> from user context - we need to hold the mdio mutex for this. There
> will be a little time until packets will follow the hardware path.
> So packets will be following the software flow table path for a little
> while until the start going through hardware.
>
> I'm measuring here that the software flow table forwarding path is 2.5
> faster than the classic forwarding path in my testbed.
>
> TODO, still many things:
>
> * Only IPv4 at this time.
> * Only IPv4 SNAT is supported.
> * No netns support yet.
> * Missing netlink interface to operate with the flow table, to force the
> handover of flow to the software path.
> * Higher configurability, instead of registering the flow table
> inconditionally, add an interface to specify software flow table
> properties.
> * No flow counters at this time.
>
> This should serve a number of usecases where we can rely on this kernel
> bypass. Packets that need fragmentation / PMTU / IP option handling /
> ... and any specific handling, then we should pass them up to the
> forwarding classic path.
>
> Comments welcome,
A lot of us have been waiting for this for some time, so thanks a lot
for posting the patches. At first glance this seems to cover most of the
HW that I know about out there and it does so without that much code
added, which is great. Did you have a particular platform that you
experimented with, and if so, should we expect patches to be posted to
see how it integrates with real hardware?
Thanks!
> Thanks.
>
> Pablo Neira Ayuso (5):
> netfilter: nf_conntrack: move nf_ct_netns_{get,put}() to core
> netfilter: add software flow offload infrastructure
> netfilter: nf_flow_offload: integration with conntrack
> netfilter: nf_tables: flow offload expression
> netfilter: nft_flow_offload: add ndo hooks for hardware offload
>
> include/linux/netdevice.h | 4 +
> include/net/flow_offload.h | 67 ++++
> include/net/netfilter/nf_conntrack.h | 3 +-
> include/uapi/linux/netfilter/nf_conntrack_common.h | 4 +
> include/uapi/linux/netfilter/nf_tables.h | 9 +
> net/netfilter/Kconfig | 14 +
> net/netfilter/Makefile | 4 +
> net/netfilter/nf_conntrack_core.c | 7 +-
> net/netfilter/nf_conntrack_netlink.c | 15 +-
> net/netfilter/nf_conntrack_proto.c | 37 +-
> net/netfilter/nf_conntrack_proto_tcp.c | 3 +
> net/netfilter/nf_conntrack_standalone.c | 12 +-
> net/netfilter/nf_flow_offload.c | 421 ++++++++++++++++++++
> net/netfilter/nft_ct.c | 39 +-
> net/netfilter/nft_flow_offload.c | 430 +++++++++++++++++++++
> 15 files changed, 1024 insertions(+), 45 deletions(-)
> create mode 100644 include/net/flow_offload.h
> create mode 100644 net/netfilter/nf_flow_offload.c
> create mode 100644 net/netfilter/nft_flow_offload.c
>
--
Florian
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH RFC,WIP 0/5] Flow offload infrastructure
2017-11-03 15:26 [PATCH RFC,WIP 0/5] Flow offload infrastructure Pablo Neira Ayuso
` (5 preceding siblings ...)
2017-11-04 4:49 ` [PATCH RFC,WIP 0/5] Flow offload infrastructure Florian Fainelli
@ 2017-11-14 0:52 ` Jakub Kicinski
6 siblings, 0 replies; 14+ messages in thread
From: Jakub Kicinski @ 2017-11-14 0:52 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netfilter-devel, netdev
On Fri, 3 Nov 2017 16:26:31 +0100, Pablo Neira Ayuso wrote:
> I'm measuring here that the software flow table forwarding path is 2.5
> faster than the classic forwarding path in my testbed.
>
> TODO, still many things:
>
> * Only IPv4 at this time.
> * Only IPv4 SNAT is supported.
> * No netns support yet.
> * Missing netlink interface to operate with the flow table, to force the
> handover of flow to the software path.
> * Higher configurability, instead of registering the flow table
> inconditionally, add an interface to specify software flow table
> properties.
> * No flow counters at this time.
>
> This should serve a number of usecases where we can rely on this kernel
> bypass. Packets that need fragmentation / PMTU / IP option handling /
> ... and any specific handling, then we should pass them up to the
> forwarding classic path.
I didn't realize it from this patch set, but it was mentioned at the
conference that this patch set is completely stateless. I.e. things
like TCP window tracking are not included here. IMHO that's a big
concern, because offloading flows is trivial when compared to state
sync. IMHO state sync is *the* challenge in implementing connection
tracking offload...
^ permalink raw reply [flat|nested] 14+ messages in thread