All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch net-next v3] tc: introduce OpenFlow classifier
@ 2015-04-09 12:58 Jiri Pirko
  2015-04-09 13:00 ` [patch iproute2 v3] tc: add support for " Jiri Pirko
  2015-04-09 21:34 ` [patch net-next v3] tc: introduce " David Miller
  0 siblings, 2 replies; 18+ messages in thread
From: Jiri Pirko @ 2015-04-09 12:58 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, tgraf, jesse

This patch introduces OpenFlow-based filter. So far, the very essential
packet fields are supported (according to OpenFlow v1.4 spec).

This patch is only the first step. There is a lot of potential performance
improvements possible to implement. Also a lot of features are missing
now. They will be addressed in follow-up patches.

To the name of this classifier, I believe that "cls_openflow" is pretty
accurate. It is actually a OpenFlow classifier.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
v2->v3:
- prepare masted key for faster matching
- use one mask per one cls_of_head as suggested by Thomas Graf
- Thomas Graf suggested to use hash lookup from the very beginning,
  so use rthashtable to store masked keys and do lookup in classify op

v1->v2:
- Added note to Kconfig about no relation to other OpenFlow items than
  classification suggested by Thomas Graf
- fixed TCA_BASIC_CLASSID c&p typo noticed by Jamal
- avoided union for tp suggested by Jamal
---
 include/uapi/linux/pkt_cls.h |  31 ++
 net/sched/Kconfig            |  17 +
 net/sched/Makefile           |   1 +
 net/sched/cls_openflow.c     | 791 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 840 insertions(+)
 create mode 100644 net/sched/cls_openflow.c

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index bf08e76..910898c 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -404,6 +404,37 @@ enum {
 
 #define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
 
+/* OpenFlow classifier */
+
+enum {
+	TCA_OF_UNSPEC,
+	TCA_OF_CLASSID,
+	TCA_OF_POLICE,
+	TCA_OF_INDEV,
+	TCA_OF_ACT,
+	TCA_OF_KEY_ETH_DST,		/* ETH_ALEN */
+	TCA_OF_KEY_ETH_DST_MASK,	/* ETH_ALEN */
+	TCA_OF_KEY_ETH_SRC,		/* ETH_ALEN */
+	TCA_OF_KEY_ETH_SRC_MASK,	/* ETH_ALEN */
+	TCA_OF_KEY_ETH_TYPE,		/* be16 */
+	TCA_OF_KEY_IP_PROTO,		/* u8 */
+	TCA_OF_KEY_IPV4_SRC,		/* be32 */
+	TCA_OF_KEY_IPV4_SRC_MASK,	/* be32 */
+	TCA_OF_KEY_IPV4_DST,		/* be32 */
+	TCA_OF_KEY_IPV4_DST_MASK,	/* be32 */
+	TCA_OF_KEY_IPV6_SRC,		/* struct in6_addr */
+	TCA_OF_KEY_IPV6_SRC_MASK,	/* struct in6_addr */
+	TCA_OF_KEY_IPV6_DST,		/* struct in6_addr */
+	TCA_OF_KEY_IPV6_DST_MASK,	/* struct in6_addr */
+	TCA_OF_KEY_TCP_SRC,		/* be16 */
+	TCA_OF_KEY_TCP_DST,		/* be16 */
+	TCA_OF_KEY_UDP_SRC,		/* be16 */
+	TCA_OF_KEY_UDP_DST,		/* be16 */
+	__TCA_OF_MAX,
+};
+
+#define TCA_OF_MAX (__TCA_OF_MAX - 1)
+
 /* Extended Matches */
 
 struct tcf_ematch_tree_hdr {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2274e72..9126387 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -477,6 +477,23 @@ config NET_CLS_BPF
 	  To compile this code as a module, choose M here: the module will
 	  be called cls_bpf.
 
+config NET_CLS_OPENFLOW
+	tristate "OpenFlow classifier"
+	select NET_CLS
+	---help---
+	  If you say Y here, you will be able to classify packets based on
+	  a configurable combination of packet keys and masks according to
+	  OpenFlow standard.
+
+	  Please note that although the name of this classifier is "openflow",
+	  there is no relation to the OpenFlow wire protocol itself or any of
+	  the other OpenFlow specific concepts such as flow tables, group
+	  tables, counters, actions, etc. The only part taken from OpenFlow
+	  standard are match fields in packet that are used for classification.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called cls_openflow.
+
 config NET_EMATCH
 	bool "Extended Matches"
 	select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 7ca7f4c..5faa9ca 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o
 obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o
 obj-$(CONFIG_NET_CLS_CGROUP)	+= cls_cgroup.o
 obj-$(CONFIG_NET_CLS_BPF)	+= cls_bpf.o
+obj-$(CONFIG_NET_CLS_OPENFLOW)	+= cls_openflow.o
 obj-$(CONFIG_NET_EMATCH)	+= ematch.o
 obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o
 obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o
diff --git a/net/sched/cls_openflow.c b/net/sched/cls_openflow.c
new file mode 100644
index 0000000..91af663
--- /dev/null
+++ b/net/sched/cls_openflow.c
@@ -0,0 +1,791 @@
+/*
+ * net/sched/cls_openflow.c		OpenFlow classifier
+ *
+ * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/rhashtable.h>
+
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+
+#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
+#include <net/ip.h>
+
+struct of_flow_key {
+	int	indev_ifindex;
+	struct {
+		u8	src[ETH_ALEN];
+		u8	dst[ETH_ALEN];
+		__be16	type;
+	} eth;
+	struct {
+		u8	proto;
+	} ip;
+	union {
+		struct {
+			__be32 src;
+			__be32 dst;
+		} ipv4;
+		struct {
+			struct in6_addr src;
+			struct in6_addr dst;
+		} ipv6;
+	};
+	struct {
+		__be16 src;
+		__be16 dst;
+	} tp;
+} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
+
+struct of_flow_mask_range {
+	unsigned short int start;
+	unsigned short int end;
+};
+
+struct of_flow_mask {
+	struct of_flow_key key;
+	struct of_flow_mask_range range;
+	struct rcu_head	rcu;
+};
+
+struct cls_of_head {
+	struct rhashtable ht;
+	struct of_flow_mask mask;
+	u32 hgen;
+	bool mask_assigned;
+	struct list_head filters;
+	struct rhashtable_params ht_params;
+	struct rcu_head rcu;
+};
+
+struct cls_of_filter {
+	struct rhash_head ht_node;
+	struct of_flow_key mkey;
+	struct tcf_exts exts;
+	struct tcf_result res;
+	struct of_flow_key key;
+	struct list_head list;
+	u32 handle;
+	struct rcu_head	rcu;
+};
+
+static unsigned short int of_mask_range(const struct of_flow_mask *mask)
+{
+	return mask->range.end - mask->range.start;
+}
+
+static void of_mask_update_range(struct of_flow_mask *mask)
+{
+	const u8 *bytes = (const u8 *) &mask->key;
+	size_t size = sizeof(mask->key);
+	size_t i, first = 0, last = size - 1;
+
+	for (i = 0; i < sizeof(mask->key); i++) {
+		if (bytes[i]) {
+			if (!first && i)
+				first = i;
+			last = i;
+		}
+	}
+	mask->range.start = rounddown(first, sizeof(long));
+	mask->range.end = roundup(last, sizeof(long));
+}
+
+static void *of_key_get_start(struct of_flow_key *key,
+			      const struct of_flow_mask *mask)
+{
+	return (u8 *) key + mask->range.start;
+}
+
+static int __check_header(struct sk_buff *skb, int len)
+{
+	if (unlikely(skb->len < len))
+		return -EINVAL;
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return -ENOMEM;
+	return 0;
+}
+
+static int of_extract_ipv4(struct sk_buff *skb, struct of_flow_key *key)
+{
+	unsigned int iph_off = skb_network_offset(skb);
+	struct iphdr *iph;
+	unsigned int iph_len;
+	int err;
+
+	err = __check_header(skb, iph_off + sizeof(*iph));
+	if (unlikely(err))
+		goto errout;
+
+	iph_len = ip_hdrlen(skb);
+	if (unlikely(iph_len < sizeof(*iph) ||
+		     skb->len < iph_off + iph_len)) {
+		err = -EINVAL;
+		goto errout;
+	}
+
+	iph = ip_hdr(skb);
+	key->ipv4.src = iph->saddr;
+	key->ipv4.dst = iph->daddr;
+	key->ip.proto = iph->protocol;
+
+	skb_set_transport_header(skb, iph_off + iph_len);
+	return 0;
+
+errout:
+	memset(&key->ip, 0, sizeof(key->ip));
+	memset(&key->ipv4, 0, sizeof(key->ipv4));
+	return err;
+}
+
+static int of_extract_ipv6(struct sk_buff *skb, struct of_flow_key *key)
+{
+	unsigned int iph_off = skb_network_offset(skb);
+	int payload_off;
+	struct ipv6hdr *iph;
+	uint8_t nexthdr;
+	__be16 frag_off;
+	int err;
+
+	err = __check_header(skb, iph_off + sizeof(*iph));
+	if (unlikely(err))
+		goto errout;
+
+	iph = ipv6_hdr(skb);
+	nexthdr = iph->nexthdr;
+	payload_off = (u8 *) (iph + 1) - skb->data;
+
+	key->ip.proto = NEXTHDR_NONE;
+	key->ipv6.src = iph->saddr;
+	key->ipv6.dst = iph->daddr;
+
+	payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr, &frag_off);
+	if (unlikely(payload_off < 0)) {
+		err = -EINVAL;
+		goto errout;
+	}
+
+	key->ip.proto = nexthdr;
+	skb_set_transport_header(skb, payload_off);
+	return 0;
+
+errout:
+	memset(&key->ip, 0, sizeof(key->ip));
+	memset(&key->ipv6, 0, sizeof(key->ipv6));
+	return err;
+}
+
+static bool __tcphdr_ok(struct sk_buff *skb)
+{
+	int tcph_off = skb_transport_offset(skb);
+	int tcph_len;
+
+	if (unlikely(!pskb_may_pull(skb, tcph_off + sizeof(struct tcphdr))))
+		return false;
+
+	tcph_len = tcp_hdrlen(skb);
+	if (unlikely(tcph_len < sizeof(struct tcphdr) ||
+		     skb->len < tcph_off + tcph_len))
+		return false;
+
+	return true;
+}
+
+static bool __udphdr_ok(struct sk_buff *skb)
+{
+	return pskb_may_pull(skb, skb_transport_offset(skb) +
+				  sizeof(struct udphdr));
+}
+
+static void of_extract_tp(struct sk_buff *skb, struct of_flow_key *key)
+{
+	if (key->ip.proto == IPPROTO_TCP) {
+		if (__tcphdr_ok(skb)) {
+			struct tcphdr *tcp = tcp_hdr(skb);
+
+			key->tp.src = tcp->source;
+			key->tp.dst = tcp->dest;
+		} else {
+			memset(&key->tp, 0, sizeof(key->tp));
+		}
+
+	} else if (key->ip.proto == IPPROTO_UDP) {
+		if (__udphdr_ok(skb)) {
+			struct udphdr *udp = udp_hdr(skb);
+
+			key->tp.src = udp->source;
+			key->tp.dst = udp->dest;
+		} else {
+			memset(&key->tp, 0, sizeof(key->tp));
+		}
+	}
+}
+
+static void of_extract_key(struct sk_buff *skb, struct of_flow_key *key)
+{
+	struct ethhdr *eth;
+	int err;
+
+	key->indev_ifindex = skb->skb_iif;
+
+	eth = eth_hdr(skb);
+	ether_addr_copy(key->eth.src, eth->h_source);
+	ether_addr_copy(key->eth.dst, eth->h_dest);
+
+	key->eth.type = skb->protocol;
+	if (key->eth.type == htons(ETH_P_IP)) {
+		err = of_extract_ipv4(skb, key);
+		if (likely(!err))
+			of_extract_tp(skb, key);
+	} else if (key->eth.type == htons(ETH_P_IPV6)) {
+		err = of_extract_ipv6(skb, key);
+		if (likely(!err))
+			of_extract_tp(skb, key);
+	}
+}
+
+static void of_set_masked_key(struct of_flow_key *mkey, struct of_flow_key *key,
+			      struct of_flow_mask *mask)
+{
+	const long *lkey = of_key_get_start(key, mask);
+	const long *lmask = of_key_get_start(&mask->key, mask);
+	long *lmkey = of_key_get_start(mkey, mask);
+	int i;
+
+	for (i = 0; i < of_mask_range(mask); i += sizeof(long))
+		*lmkey++ = *lkey++ & *lmask++;
+}
+
+static int of_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+		       struct tcf_result *res)
+{
+	struct cls_of_head *head = rcu_dereference_bh(tp->root);
+	struct cls_of_filter *f;
+	struct of_flow_key skb_key;
+	struct of_flow_key skb_mkey;
+
+	of_extract_key(skb, &skb_key);
+	of_set_masked_key(&skb_mkey, &skb_key, &head->mask);
+
+	f = rhashtable_lookup_fast(&head->ht,
+				   of_key_get_start(&skb_mkey, &head->mask),
+				   head->ht_params);
+	if (f) {
+		*res = f->res;
+		return tcf_exts_exec(skb, &f->exts, res);
+	}
+	return -1;
+}
+
+static int of_init(struct tcf_proto *tp)
+{
+	struct cls_of_head *head;
+
+	head = kzalloc(sizeof(*head), GFP_KERNEL);
+	if (!head)
+		return -ENOBUFS;
+
+	INIT_LIST_HEAD_RCU(&head->filters);
+	rcu_assign_pointer(tp->root, head);
+
+	return 0;
+}
+
+static void of_destroy_filter(struct rcu_head *head)
+{
+	struct cls_of_filter *f = container_of(head, struct cls_of_filter, rcu);
+
+	tcf_exts_destroy(&f->exts);
+	kfree(f);
+}
+
+static bool of_destroy(struct tcf_proto *tp, bool force)
+{
+	struct cls_of_head *head = rtnl_dereference(tp->root);
+	struct cls_of_filter *f, *next;
+
+	if (!force && !list_empty(&head->filters))
+		return false;
+
+	list_for_each_entry_safe(f, next, &head->filters, list) {
+		list_del_rcu(&f->list);
+		call_rcu(&f->rcu, of_destroy_filter);
+	}
+	RCU_INIT_POINTER(tp->root, NULL);
+	if (head->mask_assigned)
+		rhashtable_destroy(&head->ht);
+	kfree_rcu(head, rcu);
+	return true;
+}
+
+static unsigned long of_get(struct tcf_proto *tp, u32 handle)
+{
+	struct cls_of_head *head = rtnl_dereference(tp->root);
+	struct cls_of_filter *f;
+
+	list_for_each_entry(f, &head->filters, list)
+		if (f->handle == handle)
+			return (unsigned long) f;
+	return 0;
+}
+
+static const struct nla_policy of_policy[TCA_OF_MAX + 1] = {
+	[TCA_OF_UNSPEC]			= { .type = NLA_UNSPEC },
+	[TCA_OF_CLASSID]		= { .type = NLA_U32 },
+	[TCA_OF_INDEV]			= { .type = NLA_STRING,
+					    .len = IFNAMSIZ },
+	[TCA_OF_KEY_ETH_DST]		= { .len = ETH_ALEN },
+	[TCA_OF_KEY_ETH_DST_MASK]	= { .len = ETH_ALEN },
+	[TCA_OF_KEY_ETH_SRC]		= { .len = ETH_ALEN },
+	[TCA_OF_KEY_ETH_SRC_MASK]	= { .len = ETH_ALEN },
+	[TCA_OF_KEY_ETH_TYPE]		= { .type = NLA_U16 },
+	[TCA_OF_KEY_IP_PROTO]		= { .type = NLA_U8 },
+	[TCA_OF_KEY_IPV4_SRC]		= { .type = NLA_U32 },
+	[TCA_OF_KEY_IPV4_SRC_MASK]	= { .type = NLA_U32 },
+	[TCA_OF_KEY_IPV4_DST]		= { .type = NLA_U32 },
+	[TCA_OF_KEY_IPV4_DST_MASK]	= { .type = NLA_U32 },
+	[TCA_OF_KEY_IPV6_SRC]		= { .len = sizeof(struct in6_addr) },
+	[TCA_OF_KEY_IPV6_SRC_MASK]	= { .len = sizeof(struct in6_addr) },
+	[TCA_OF_KEY_IPV6_DST]		= { .len = sizeof(struct in6_addr) },
+	[TCA_OF_KEY_IPV6_DST_MASK]	= { .len = sizeof(struct in6_addr) },
+	[TCA_OF_KEY_TCP_SRC]		= { .type = NLA_U16 },
+	[TCA_OF_KEY_TCP_DST]		= { .type = NLA_U16 },
+	[TCA_OF_KEY_TCP_SRC]		= { .type = NLA_U16 },
+	[TCA_OF_KEY_TCP_DST]		= { .type = NLA_U16 },
+};
+
+static void of_set_key_val(struct nlattr **tb,
+			   void *val, int val_type,
+			   void *mask, int mask_type, int len)
+{
+	if (!tb[val_type])
+		return;
+	memcpy(val, nla_data(tb[val_type]), len);
+	if (mask_type == TCA_OF_UNSPEC || !tb[mask_type])
+		memset(mask, 0xff, len);
+	else
+		memcpy(mask, nla_data(tb[mask_type]), len);
+}
+
+static int of_set_key(struct net *net, struct nlattr **tb,
+		      struct of_flow_key *key, struct of_flow_key *mask)
+{
+	int err;
+
+	if (tb[TCA_OF_INDEV]) {
+		err = tcf_change_indev(net, tb[TCA_OF_INDEV]);
+		if (err < 0)
+			return err;
+		key->indev_ifindex = err;
+		mask->indev_ifindex = 0xffffffff;
+	}
+
+	of_set_key_val(tb, key->eth.dst, TCA_OF_KEY_ETH_DST,
+		       mask->eth.dst, TCA_OF_KEY_ETH_DST_MASK,
+		       sizeof(key->eth.dst));
+	of_set_key_val(tb, key->eth.src, TCA_OF_KEY_ETH_SRC,
+		       mask->eth.src, TCA_OF_KEY_ETH_SRC_MASK,
+		       sizeof(key->eth.src));
+	of_set_key_val(tb, &key->eth.type, TCA_OF_KEY_ETH_TYPE,
+		       &mask->eth.type, TCA_OF_UNSPEC,
+		       sizeof(key->eth.type));
+	if (key->eth.type == htons(ETH_P_IP) ||
+	    key->eth.type == htons(ETH_P_IPV6)) {
+		of_set_key_val(tb, &key->ip.proto, TCA_OF_KEY_IP_PROTO,
+			       &mask->ip.proto, TCA_OF_UNSPEC,
+			       sizeof(key->ip.proto));
+	}
+	if (key->eth.type == htons(ETH_P_IP)) {
+		of_set_key_val(tb, &key->ipv4.src, TCA_OF_KEY_IPV4_SRC,
+			       &mask->ipv4.src, TCA_OF_KEY_IPV4_SRC_MASK,
+			       sizeof(key->ipv4.src));
+		of_set_key_val(tb, &key->ipv4.dst, TCA_OF_KEY_IPV4_DST,
+			       &mask->ipv4.dst, TCA_OF_KEY_IPV4_DST_MASK,
+			       sizeof(key->ipv4.dst));
+	} else if (key->eth.type == htons(ETH_P_IPV6)) {
+		of_set_key_val(tb, &key->ipv6.src, TCA_OF_KEY_IPV6_SRC,
+			       &mask->ipv6.src, TCA_OF_KEY_IPV6_SRC_MASK,
+			       sizeof(key->ipv6.src));
+		of_set_key_val(tb, &key->ipv6.dst, TCA_OF_KEY_IPV6_DST,
+			       &mask->ipv6.dst, TCA_OF_KEY_IPV6_DST_MASK,
+			       sizeof(key->ipv6.dst));
+	}
+	if (key->ip.proto == IPPROTO_TCP) {
+		of_set_key_val(tb, &key->tp.src, TCA_OF_KEY_TCP_SRC,
+			       &mask->tp.src, TCA_OF_UNSPEC,
+			       sizeof(key->tp.src));
+		of_set_key_val(tb, &key->tp.dst, TCA_OF_KEY_TCP_DST,
+			       &mask->tp.dst, TCA_OF_UNSPEC,
+			       sizeof(key->tp.dst));
+	} else if (key->ip.proto == IPPROTO_UDP) {
+		of_set_key_val(tb, &key->tp.src, TCA_OF_KEY_UDP_SRC,
+			       &mask->tp.src, TCA_OF_UNSPEC,
+			       sizeof(key->tp.src));
+		of_set_key_val(tb, &key->tp.dst, TCA_OF_KEY_UDP_DST,
+			       &mask->tp.dst, TCA_OF_UNSPEC,
+			       sizeof(key->tp.dst));
+	}
+
+	return 0;
+}
+
+static bool of_mask_eq(struct of_flow_mask *mask1,
+		       struct of_flow_mask *mask2)
+{
+	const long *lmask1 = of_key_get_start(&mask1->key, mask1);
+	const long *lmask2 = of_key_get_start(&mask2->key, mask2);
+
+	return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) &&
+	       !memcmp(lmask1, lmask2, of_mask_range(mask1));
+}
+
+static const struct rhashtable_params of_ht_params = {
+	.key_offset = offsetof(struct cls_of_filter, mkey), /* base offset */
+	.head_offset = offsetof(struct cls_of_filter, ht_node),
+	.automatic_shrinking = true,
+};
+
+static int of_init_hashtable(struct cls_of_head *head,
+			     struct of_flow_mask *mask)
+{
+	head->ht_params = of_ht_params;
+	head->ht_params.key_len = of_mask_range(mask);
+	head->ht_params.key_offset += mask->range.start;
+
+	return rhashtable_init(&head->ht, &head->ht_params);
+}
+
+static int of_check_assign_mask(struct cls_of_head *head,
+				struct of_flow_mask *mask)
+{
+	int err;
+
+	if (head->mask_assigned) {
+		if (!of_mask_eq(&head->mask, mask))
+			return -EINVAL;
+		else
+			return 0;
+	}
+
+	/* Mask is not assigned yet. So assign it and init hashtable
+	 * according to that.
+	 */
+	err = of_init_hashtable(head, mask);
+	if (err)
+		return err;
+	memcpy(&head->mask, mask, sizeof(head->mask));
+	head->mask_assigned = true;
+	return 0;
+}
+
+static int of_set_parms(struct net *net, struct tcf_proto *tp,
+			struct cls_of_filter *f, struct of_flow_mask *mask,
+			unsigned long base, struct nlattr **tb,
+			struct nlattr *est, bool ovr)
+{
+	struct tcf_exts e;
+	int err;
+
+	tcf_exts_init(&e, TCA_OF_ACT, TCA_OF_POLICE);
+	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_OF_CLASSID]) {
+		f->res.classid = nla_get_u32(tb[TCA_OF_CLASSID]);
+		tcf_bind_filter(tp, &f->res, base);
+	}
+
+	err = of_set_key(net, tb, &f->key, &mask->key);
+	if (err)
+		goto errout;
+
+	of_mask_update_range(mask);
+	of_set_masked_key(&f->mkey, &f->key, mask);
+
+	tcf_exts_change(tp, &f->exts, &e);
+
+	return 0;
+errout:
+	tcf_exts_destroy(&e);
+	return err;
+}
+
+static u32 of_grab_new_handle(struct tcf_proto *tp,
+			      struct cls_of_head *head)
+{
+	unsigned int i = 0x80000000;
+	u32 handle;
+
+	do {
+		if (++head->hgen == 0x7FFFFFFF)
+			head->hgen = 1;
+	} while (--i > 0 && of_get(tp, head->hgen));
+
+	if (unlikely(i == 0)) {
+		pr_err("Insufficient number of handles\n");
+		handle = 0;
+	} else {
+		handle = head->hgen;
+	}
+
+	return handle;
+}
+
+static int of_change(struct net *net, struct sk_buff *in_skb,
+		     struct tcf_proto *tp, unsigned long base,
+		     u32 handle, struct nlattr **tca,
+		     unsigned long *arg, bool ovr)
+{
+	struct cls_of_head *head = rtnl_dereference(tp->root);
+	struct cls_of_filter *fold = (struct cls_of_filter *) *arg;
+	struct cls_of_filter *fnew;
+	struct nlattr *tb[TCA_OF_MAX + 1];
+	struct of_flow_mask mask = {};
+	int err;
+
+	if (!tca[TCA_OPTIONS])
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_OF_MAX, tca[TCA_OPTIONS], of_policy);
+	if (err < 0)
+		return err;
+
+	if (fold && handle && fold->handle != handle)
+		return -EINVAL;
+
+	fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+	if (!fnew)
+		return -ENOBUFS;
+
+	tcf_exts_init(&fnew->exts, TCA_OF_ACT, TCA_OF_POLICE);
+
+	if (!handle) {
+		handle = of_grab_new_handle(tp, head);
+		if (!handle) {
+			err = -EINVAL;
+			goto errout;
+		}
+	}
+	fnew->handle = handle;
+
+	err = of_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
+	if (err)
+		goto errout;
+
+	err = of_check_assign_mask(head, &mask);
+	if (err)
+		goto errout;
+
+	err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
+				     head->ht_params);
+	if (err)
+		goto errout;
+	if (fold)
+		rhashtable_remove_fast(&head->ht, &fold->ht_node,
+				       head->ht_params);
+
+	*arg = (unsigned long) fnew;
+
+	if (fold) {
+		list_replace_rcu(&fnew->list, &fold->list);
+		tcf_unbind_filter(tp, &fold->res);
+		call_rcu(&fold->rcu, of_destroy_filter);
+	} else {
+		list_add_tail_rcu(&fnew->list, &head->filters);
+	}
+
+	return 0;
+
+errout:
+	kfree(fnew);
+	return err;
+}
+
+static int of_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	struct cls_of_head *head = rtnl_dereference(tp->root);
+	struct cls_of_filter *f = (struct cls_of_filter *) arg;
+
+	rhashtable_remove_fast(&head->ht, &f->ht_node,
+			       head->ht_params);
+	list_del_rcu(&f->list);
+	tcf_unbind_filter(tp, &f->res);
+	call_rcu(&f->rcu, of_destroy_filter);
+	return 0;
+}
+
+static void of_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct cls_of_head *head = rtnl_dereference(tp->root);
+	struct cls_of_filter *f;
+
+	list_for_each_entry_rcu(f, &head->filters, list) {
+		if (arg->count < arg->skip)
+			goto skip;
+		if (arg->fn(tp, (unsigned long) f, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+skip:
+		arg->count++;
+	}
+}
+
+static int of_dump_key_val(struct sk_buff *skb,
+			   void *val, int val_type,
+			   void *mask, int mask_type, int len)
+{
+	int err;
+
+	if (!memchr_inv(mask, 0, len))
+		return 0;
+	err = nla_put(skb, val_type, len, val);
+	if (err)
+		return err;
+	if (mask_type != TCA_OF_UNSPEC) {
+		err = nla_put(skb, mask_type, len, mask);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int of_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+		   struct sk_buff *skb, struct tcmsg *t)
+{
+	struct cls_of_head *head = rtnl_dereference(tp->root);
+	struct cls_of_filter *f = (struct cls_of_filter *) fh;
+	struct nlattr *nest;
+	struct of_flow_key *key, *mask;
+
+	if (!f)
+		return skb->len;
+
+	t->tcm_handle = f->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	if (f->res.classid &&
+	    nla_put_u32(skb, TCA_OF_CLASSID, f->res.classid))
+		goto nla_put_failure;
+
+	key = &f->key;
+	mask = &head->mask.key;
+
+	if (mask->indev_ifindex) {
+		struct net_device *dev;
+
+		dev = __dev_get_by_index(net, key->indev_ifindex);
+		if (dev && nla_put_string(skb, TCA_OF_INDEV, dev->name))
+			goto nla_put_failure;
+	}
+
+	if (of_dump_key_val(skb, key->eth.dst, TCA_OF_KEY_ETH_DST,
+			    mask->eth.dst, TCA_OF_KEY_ETH_DST_MASK,
+			    sizeof(key->eth.dst)) ||
+	    of_dump_key_val(skb, key->eth.src, TCA_OF_KEY_ETH_SRC,
+			    mask->eth.src, TCA_OF_KEY_ETH_SRC_MASK,
+			    sizeof(key->eth.src)) ||
+	    of_dump_key_val(skb, &key->eth.type, TCA_OF_KEY_ETH_TYPE,
+			    &mask->eth.type, TCA_OF_UNSPEC,
+			    sizeof(key->eth.type)))
+		goto nla_put_failure;
+	if ((key->eth.type == htons(ETH_P_IP) ||
+	     key->eth.type == htons(ETH_P_IPV6)) &&
+	    of_dump_key_val(skb, &key->ip.proto, TCA_OF_KEY_IP_PROTO,
+			    &mask->ip.proto, TCA_OF_UNSPEC,
+			    sizeof(key->ip.proto)))
+		goto nla_put_failure;
+
+	if (key->eth.type == htons(ETH_P_IP) &&
+	    (of_dump_key_val(skb, &key->ipv4.src, TCA_OF_KEY_IPV4_SRC,
+			     &mask->ipv4.src, TCA_OF_KEY_IPV4_SRC_MASK,
+			     sizeof(key->ipv4.src)) ||
+	     of_dump_key_val(skb, &key->ipv4.dst, TCA_OF_KEY_IPV4_DST,
+			     &mask->ipv4.dst, TCA_OF_KEY_IPV4_DST_MASK,
+			     sizeof(key->ipv4.dst))))
+		goto nla_put_failure;
+	else if (key->eth.type == htons(ETH_P_IPV6) &&
+		 (of_dump_key_val(skb, &key->ipv6.src, TCA_OF_KEY_IPV6_SRC,
+				  &mask->ipv6.src, TCA_OF_KEY_IPV6_SRC_MASK,
+				  sizeof(key->ipv6.src)) ||
+		  of_dump_key_val(skb, &key->ipv6.dst, TCA_OF_KEY_IPV6_DST,
+				  &mask->ipv6.dst, TCA_OF_KEY_IPV6_DST_MASK,
+				  sizeof(key->ipv6.dst))))
+		goto nla_put_failure;
+
+	if (key->ip.proto == IPPROTO_TCP &&
+	    (of_dump_key_val(skb, &key->tp.src, TCA_OF_KEY_TCP_SRC,
+			     &mask->tp.src, TCA_OF_UNSPEC,
+			     sizeof(key->tp.src)) ||
+	     of_dump_key_val(skb, &key->tp.dst, TCA_OF_KEY_TCP_DST,
+			     &mask->tp.dst, TCA_OF_UNSPEC,
+			     sizeof(key->tp.dst))))
+		goto nla_put_failure;
+	else if (key->ip.proto == IPPROTO_UDP &&
+		 (of_dump_key_val(skb, &key->tp.src, TCA_OF_KEY_UDP_SRC,
+				  &mask->tp.src, TCA_OF_UNSPEC,
+				  sizeof(key->tp.src)) ||
+		  of_dump_key_val(skb, &key->tp.dst, TCA_OF_KEY_UDP_DST,
+				  &mask->tp.dst, TCA_OF_UNSPEC,
+				  sizeof(key->tp.dst))))
+		goto nla_put_failure;
+
+	if (tcf_exts_dump(skb, &f->exts))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct tcf_proto_ops cls_of_ops __read_mostly = {
+	.kind		= "openflow",
+	.classify	= of_classify,
+	.init		= of_init,
+	.destroy	= of_destroy,
+	.get		= of_get,
+	.change		= of_change,
+	.delete		= of_delete,
+	.walk		= of_walk,
+	.dump		= of_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init cls_of_init(void)
+{
+	return register_tcf_proto_ops(&cls_of_ops);
+}
+
+static void __exit cls_of_exit(void)
+{
+	unregister_tcf_proto_ops(&cls_of_ops);
+}
+
+module_init(cls_of_init);
+module_exit(cls_of_exit);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
+MODULE_DESCRIPTION("OpenFlow classifier");
+MODULE_LICENSE("GPL v2");
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [patch iproute2 v3] tc: add support for OpenFlow classifier
  2015-04-09 12:58 [patch net-next v3] tc: introduce OpenFlow classifier Jiri Pirko
@ 2015-04-09 13:00 ` Jiri Pirko
  2015-04-09 21:34 ` [patch net-next v3] tc: introduce " David Miller
  1 sibling, 0 replies; 18+ messages in thread
From: Jiri Pirko @ 2015-04-09 13:00 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, tgraf, jesse

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
v2->v3:
- added note about masks and prios

v1->v2:
- no change
---
 include/linux/pkt_cls.h |  31 +++
 tc/Makefile             |   1 +
 tc/f_openflow.c         | 554 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 586 insertions(+)
 create mode 100644 tc/f_openflow.c

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index bf08e76..910898c 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -404,6 +404,37 @@ enum {
 
 #define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
 
+/* OpenFlow classifier */
+
+enum {
+	TCA_OF_UNSPEC,
+	TCA_OF_CLASSID,
+	TCA_OF_POLICE,
+	TCA_OF_INDEV,
+	TCA_OF_ACT,
+	TCA_OF_KEY_ETH_DST,		/* ETH_ALEN */
+	TCA_OF_KEY_ETH_DST_MASK,	/* ETH_ALEN */
+	TCA_OF_KEY_ETH_SRC,		/* ETH_ALEN */
+	TCA_OF_KEY_ETH_SRC_MASK,	/* ETH_ALEN */
+	TCA_OF_KEY_ETH_TYPE,		/* be16 */
+	TCA_OF_KEY_IP_PROTO,		/* u8 */
+	TCA_OF_KEY_IPV4_SRC,		/* be32 */
+	TCA_OF_KEY_IPV4_SRC_MASK,	/* be32 */
+	TCA_OF_KEY_IPV4_DST,		/* be32 */
+	TCA_OF_KEY_IPV4_DST_MASK,	/* be32 */
+	TCA_OF_KEY_IPV6_SRC,		/* struct in6_addr */
+	TCA_OF_KEY_IPV6_SRC_MASK,	/* struct in6_addr */
+	TCA_OF_KEY_IPV6_DST,		/* struct in6_addr */
+	TCA_OF_KEY_IPV6_DST_MASK,	/* struct in6_addr */
+	TCA_OF_KEY_TCP_SRC,		/* be16 */
+	TCA_OF_KEY_TCP_DST,		/* be16 */
+	TCA_OF_KEY_UDP_SRC,		/* be16 */
+	TCA_OF_KEY_UDP_DST,		/* be16 */
+	__TCA_OF_MAX,
+};
+
+#define TCA_OF_MAX (__TCA_OF_MAX - 1)
+
 /* Extended Matches */
 
 struct tcf_ematch_tree_hdr {
diff --git a/tc/Makefile b/tc/Makefile
index 2eff082..778732f 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -30,6 +30,7 @@ TCMODULES += f_basic.o
 TCMODULES += f_bpf.o
 TCMODULES += f_flow.o
 TCMODULES += f_cgroup.o
+TCMODULES += f_openflow.o
 TCMODULES += q_dsmark.o
 TCMODULES += q_gred.o
 TCMODULES += f_tcindex.o
diff --git a/tc/f_openflow.c b/tc/f_openflow.c
new file mode 100644
index 0000000..1c43c23
--- /dev/null
+++ b/tc/f_openflow.c
@@ -0,0 +1,554 @@
+/*
+ * f_openflow.c		OpenFlow Classifier
+ *
+ *		This program is free software; you can distribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:     Jiri Pirko <jiri@resnulli.us>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <string.h>
+#include <net/if.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+
+#include "utils.h"
+#include "tc_util.h"
+#include "rt_names.h"
+
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... openflow [ MATCH-LIST ]\n");
+	fprintf(stderr, "                    [ action ACTION-SPEC ] [ classid CLASSID ]\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where: MATCH-LIST := [ MATCH-LIST ] MATCH\n");
+	fprintf(stderr, "       MATCH      := [ indev DEV-NAME | \n");
+	fprintf(stderr, "                       dst_mac MAC-ADDR | \n");
+	fprintf(stderr, "                       src_mac MAC-ADDR | \n");
+	fprintf(stderr, "                       eth_type [ipv4 | ipv6 | ETH-TYPE ] | \n");
+	fprintf(stderr, "                       ip_proto [tcp | udp | IP-PROTO ] | \n");
+	fprintf(stderr, "                       dst_ip [ IPV4-ADDR | IPV6-ADDR ] | \n");
+	fprintf(stderr, "                       src_ip [ IPV4-ADDR | IPV6-ADDR ] | \n");
+	fprintf(stderr, "                       dst_port PORT-NUMBER | \n");
+	fprintf(stderr, "                       src_port PORT-NUMBER | \n");
+	fprintf(stderr,	"       FILTERID := X:Y:Z\n");
+	fprintf(stderr,	"       ACTION-SPEC := ... look at individual actions\n");
+	fprintf(stderr,	"\n");
+	fprintf(stderr,	"NOTE: CLASSID, ETH-TYPE, IP-PROTO are parsed as hexadecimal input.\n");
+	fprintf(stderr,	"NOTE: There can be only used one mask per one prio. If user needs\n");
+	fprintf(stderr,	"      to specify different mask, he has to use different prio.\n");
+}
+
+static int openflow_parse_eth_addr(char *str, int addr_type, int mask_type,
+				   struct nlmsghdr *n)
+{
+	int ret;
+	char addr[ETH_ALEN];
+
+	ret = ll_addr_a2n(addr, sizeof(addr), str);
+	if (ret < 0)
+		return -1;
+	addattr_l(n, MAX_MSG, addr_type, addr, sizeof(addr));
+	memset(addr, 0xff, ETH_ALEN);
+	addattr_l(n, MAX_MSG, mask_type, addr, sizeof(addr));
+	return 0;
+}
+
+static int openflow_parse_eth_type(char *str, int type, __be16 *p_eth_type,
+				   struct nlmsghdr *n)
+{
+	int ret;
+	__be16 eth_type;
+
+	if (matches(str, "ipv4") == 0) {
+		eth_type = htons(ETH_P_IP);
+	} else if (matches(str, "ipv6") == 0) {
+		eth_type = htons(ETH_P_IPV6);
+	} else {
+		__u16 tmp;
+
+		ret = get_u16(&tmp, str, 16);
+		if (ret)
+			return -1;
+		eth_type = htons(tmp);
+	}
+	addattr16(n, MAX_MSG, type, eth_type);
+	*p_eth_type = eth_type;
+	return 0;
+}
+
+static int openflow_parse_ip_proto(char *str, __be16 eth_type, int type,
+				   __u8 *p_ip_proto, struct nlmsghdr *n)
+{
+	int ret;
+	__u8 ip_proto;
+
+	if (eth_type != htons(ETH_P_IP) && eth_type != htons(ETH_P_IPV6)) {
+		fprintf(stderr, "Illegal \"eth_type\" for ip proto\n");
+		return -1;
+	}
+	if (matches(str, "tcp") == 0) {
+		ip_proto = IPPROTO_TCP;
+	} else if (matches(str, "udp") == 0) {
+		ip_proto = IPPROTO_UDP;
+	} else {
+		ret = get_u8(&ip_proto, str, 16);
+		if (ret)
+			return -1;
+	}
+	addattr8(n, MAX_MSG, type, ip_proto);
+	*p_ip_proto = ip_proto;
+	return 0;
+}
+
+static int openflow_parse_ip_addr(char *str, __be16 eth_type,
+				  int addr4_type, int mask4_type,
+				  int addr6_type, int mask6_type,
+				  struct nlmsghdr *n)
+{
+	int ret;
+	inet_prefix addr;
+	int family;
+	int bits;
+	int i;
+
+	if (eth_type == htons(ETH_P_IP)) {
+		family = AF_INET;
+	} else if (eth_type == htons(ETH_P_IPV6)) {
+		family = AF_INET6;
+	} else {
+		fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
+		return -1;
+	}
+
+	ret = get_prefix(&addr, str, family);
+	if (ret)
+		return -1;
+
+	if (addr.family != family)
+		return -1;
+
+	addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
+		  addr.data, addr.bytelen);
+
+	memset(addr.data, 0xff, addr.bytelen);
+	bits = addr.bitlen;
+	for (i = 0; i < addr.bytelen / 4; i++) {
+		if (!bits) {
+			addr.data[i] = 0;
+		} else if (bits / 32 >= 1) {
+			bits -= 32;
+		} else {
+			addr.data[i] <<= 32 - bits;
+			addr.data[i] = htonl(addr.data[i]);
+			bits = 0;
+		}
+	}
+
+	addattr_l(n, MAX_MSG, addr.family == AF_INET ? mask4_type : mask6_type,
+		  addr.data, addr.bytelen);
+
+	return 0;
+}
+
+static int openflow_parse_port(char *str, __u8 ip_port,
+			       int tcp_type, int udp_type, struct nlmsghdr *n)
+{
+	int ret;
+	int type;
+	__be16 port;
+
+	if (ip_port == IPPROTO_TCP) {
+		type = tcp_type;
+	} else if (ip_port == IPPROTO_UDP) {
+		type = udp_type;
+	} else {
+		fprintf(stderr, "Illegal \"ip_proto\" for port\n");
+		return -1;
+	}
+
+	ret = get_u16(&port, str, 10);
+	if (ret)
+		return -1;
+
+
+	addattr16(n, MAX_MSG, type, htons(port));
+
+	return 0;
+}
+
+static int openflow_parse_opt(struct filter_util *qu, char *handle,
+			      int argc, char **argv, struct nlmsghdr *n)
+{
+	int ret;
+	struct tcmsg *t = NLMSG_DATA(n);
+	struct rtattr *tail;
+	__be16 eth_type = 0;
+	__u8 ip_proto = 0xff;
+
+	if (argc == 0)
+		return 0;
+
+	if (handle) {
+		ret = get_u32(&t->tcm_handle, handle, 0);
+		if (ret) {
+			fprintf(stderr, "Illegal \"handle\"\n");
+			return -1;
+		}
+	}
+
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n, MAX_MSG, TCA_OPTIONS, NULL, 0);
+
+	while (argc > 0) {
+		if (matches(*argv, "classid") == 0 ||
+		    matches(*argv, "flowid") == 0) {
+			unsigned handle;
+
+			NEXT_ARG();
+			ret = get_tc_classid(&handle, *argv);
+			if (ret) {
+				fprintf(stderr, "Illegal \"classid\"\n");
+				return -1;
+			}
+			addattr_l(n, MAX_MSG, TCA_OF_CLASSID, &handle, 4);
+		} else if (matches(*argv, "police") == 0) {
+			NEXT_ARG();
+			ret = parse_police(&argc, &argv, TCA_OF_POLICE, n);
+			if (ret) {
+				fprintf(stderr, "Illegal \"police\"\n");
+				return -1;
+			}
+			continue;
+		} else if (matches(*argv, "indev") == 0) {
+			char ifname[IFNAMSIZ];
+
+			NEXT_ARG();
+			memset(ifname, 0, sizeof(ifname));
+			strncpy(ifname, *argv, sizeof(ifname) - 1);
+			addattrstrz(n, MAX_MSG, TCA_OF_INDEV, ifname);
+		} else if (matches(*argv, "dst_mac") == 0) {
+			NEXT_ARG();
+			ret = openflow_parse_eth_addr(*argv,
+						      TCA_OF_KEY_ETH_DST,
+						      TCA_OF_KEY_ETH_DST_MASK,
+						      n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"dst_mac\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "src_mac") == 0) {
+			NEXT_ARG();
+			ret = openflow_parse_eth_addr(*argv,
+						      TCA_OF_KEY_ETH_SRC,
+						      TCA_OF_KEY_ETH_SRC_MASK,
+						      n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"src_mac\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "eth_type") == 0) {
+			NEXT_ARG();
+			ret = openflow_parse_eth_type(*argv,
+						      TCA_OF_KEY_ETH_TYPE,
+						      &eth_type, n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"eth_type\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "ip_proto") == 0) {
+			NEXT_ARG();
+			ret = openflow_parse_ip_proto(*argv, eth_type,
+						      TCA_OF_KEY_IP_PROTO,
+						      &ip_proto, n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"ip_proto\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "dst_ip") == 0) {
+			NEXT_ARG();
+			ret = openflow_parse_ip_addr(*argv, eth_type,
+						     TCA_OF_KEY_IPV4_DST,
+						     TCA_OF_KEY_IPV4_DST_MASK,
+						     TCA_OF_KEY_IPV6_DST,
+						     TCA_OF_KEY_IPV6_DST_MASK,
+						     n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"dst_ip\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "src_ip") == 0) {
+			NEXT_ARG();
+			ret = openflow_parse_ip_addr(*argv, eth_type,
+						     TCA_OF_KEY_IPV4_SRC,
+						     TCA_OF_KEY_IPV4_SRC_MASK,
+						     TCA_OF_KEY_IPV6_SRC,
+						     TCA_OF_KEY_IPV6_SRC_MASK,
+						     n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"src_ip\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "dst_port") == 0) {
+			NEXT_ARG();
+			ret = openflow_parse_port(*argv, ip_proto,
+						  TCA_OF_KEY_TCP_DST,
+						  TCA_OF_KEY_UDP_DST, n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"dst_port\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "src_port") == 0) {
+			NEXT_ARG();
+			ret = openflow_parse_port(*argv, ip_proto,
+						  TCA_OF_KEY_TCP_SRC,
+						  TCA_OF_KEY_UDP_SRC, n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"src_port\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "action") == 0) {
+			NEXT_ARG();
+			ret = parse_action(&argc, &argv, TCA_OF_ACT, n);
+			if (ret) {
+				fprintf(stderr, "Illegal \"action\"\n");
+				return -1;
+			}
+			continue;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail;
+
+	return 0;
+}
+
+static int __mask_bits(char *addr, size_t len)
+{
+	int bits = 0;
+	bool hole = false;
+	int i;
+	int j;
+
+	for (i = 0; i < len; i++, addr++) {
+		for (j = 7; j >= 0; j--) {
+			if (((*addr) >> j) & 0x1) {
+				if (hole)
+					return -1;
+				bits++;
+			} else if (bits) {
+				hole = true;
+			} else{
+				return -1;
+			}
+		}
+	}
+	return bits;
+}
+
+static void openflow_print_eth_addr(FILE *f, char *name,
+				    struct rtattr *addr_attr,
+				    struct rtattr *mask_attr)
+{
+	SPRINT_BUF(b1);
+	int bits;
+
+	if (!addr_attr || RTA_PAYLOAD(addr_attr) != ETH_ALEN)
+		return;
+	fprintf(f, "\n  %s %s", name, ll_addr_n2a(RTA_DATA(addr_attr), ETH_ALEN,
+						  0, b1, sizeof(b1)));
+	if (!mask_attr || RTA_PAYLOAD(mask_attr) != ETH_ALEN)
+		return;
+	bits = __mask_bits(RTA_DATA(mask_attr), ETH_ALEN);
+	if (bits < 0)
+		fprintf(f, "/%s", ll_addr_n2a(RTA_DATA(mask_attr), ETH_ALEN,
+					      0, b1, sizeof(b1)));
+	else if (bits < ETH_ALEN * 8)
+		fprintf(f, "/%d", bits);
+}
+
+static void openflow_print_eth_type(FILE *f, __be16 *p_eth_type,
+				    struct rtattr *eth_type_attr)
+{
+	__be16 eth_type;
+
+	if (!eth_type_attr)
+		return;
+
+	eth_type = rta_getattr_u16(eth_type_attr);
+	fprintf(f, "\n  eth_type ");
+	if (eth_type == htons(ETH_P_IP))
+		fprintf(f, "ipv4");
+	else if (eth_type == htons(ETH_P_IPV6))
+		fprintf(f, "ipv6");
+	else
+		fprintf(f, "%04x", ntohs(eth_type));
+	*p_eth_type = eth_type;
+}
+
+static void openflow_print_ip_proto(FILE *f, __u8 *p_ip_proto,
+				    struct rtattr *ip_proto_attr)
+{
+	__u8 ip_proto;
+
+	if (!ip_proto_attr)
+		return;
+
+	ip_proto = rta_getattr_u8(ip_proto_attr);
+	fprintf(f, "\n  ip_proto ");
+	if (ip_proto == IPPROTO_TCP)
+		fprintf(f, "tcp");
+	else if (ip_proto == IPPROTO_UDP)
+		fprintf(f, "udp");
+	else
+		fprintf(f, "%02x", ip_proto);
+	*p_ip_proto = ip_proto;
+}
+
+static void openflow_print_ip_addr(FILE *f, char *name, __be16 eth_type,
+				   struct rtattr *addr4_attr,
+				   struct rtattr *mask4_attr,
+				   struct rtattr *addr6_attr,
+				   struct rtattr *mask6_attr)
+{
+	SPRINT_BUF(b1);
+	struct rtattr *addr_attr;
+	struct rtattr *mask_attr;
+	int family;
+	size_t len;
+	int bits;
+
+	if (eth_type == htons(ETH_P_IP)) {
+		family = AF_INET;
+		addr_attr = addr4_attr;
+		mask_attr = mask4_attr;
+		len = 4;
+	} else if (eth_type == htons(ETH_P_IPV6)) {
+		family = AF_INET6;
+		addr_attr = addr6_attr;
+		mask_attr = mask6_attr;
+		len = 16;
+	} else {
+		return;
+	}
+	if (!addr_attr || RTA_PAYLOAD(addr_attr) != len)
+		return;
+	fprintf(f, "\n  %s %s", name, rt_addr_n2a(family,
+						  RTA_PAYLOAD(addr_attr),
+						  RTA_DATA(addr_attr),
+						  b1, sizeof(b1)));
+	if (!mask_attr || RTA_PAYLOAD(mask_attr) != len)
+		return;
+	bits = __mask_bits(RTA_DATA(mask_attr), len);
+	if (bits < 0)
+		fprintf(f, "/%s", rt_addr_n2a(family,
+					      RTA_PAYLOAD(mask_attr),
+					      RTA_DATA(mask_attr),
+					      b1, sizeof(b1)));
+	else if (bits < len * 8)
+		fprintf(f, "/%d", bits);
+}
+
+static void openflow_print_port(FILE *f, char *name, __u8 ip_proto,
+				struct rtattr *tcp_attr,
+				struct rtattr *udp_attr)
+{
+	struct rtattr *attr;
+
+	if (ip_proto == IPPROTO_TCP)
+		attr = tcp_attr;
+	else if (ip_proto == IPPROTO_UDP)
+		attr = udp_attr;
+	else
+		return;
+	if (!attr)
+		return;
+	fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
+}
+
+static int openflow_print_opt(struct filter_util *qu, FILE *f,
+			      struct rtattr *opt, __u32 handle)
+{
+	struct rtattr *tb[TCA_OF_MAX + 1];
+	__be16 eth_type = 0;
+	__u8 ip_proto = 0xff;
+
+	if (!opt)
+		return 0;
+
+	parse_rtattr_nested(tb, TCA_OF_MAX, opt);
+
+	if (handle)
+		fprintf(f, "handle 0x%x ", handle);
+
+	if (tb[TCA_OF_CLASSID]) {
+		SPRINT_BUF(b1);
+		fprintf(f, "classid %s ",
+			sprint_tc_classid(rta_getattr_u32(tb[TCA_OF_CLASSID]), b1));
+	}
+
+	if (tb[TCA_OF_POLICE]) {
+		fprintf(f, "\n");
+		tc_print_police(f, tb[TCA_OF_POLICE]);
+	}
+
+	if (tb[TCA_FW_INDEV]) {
+		struct rtattr *attr = tb[TCA_FW_INDEV];
+
+		fprintf(f, "\n  indev %s", rta_getattr_str(attr));
+	}
+
+	openflow_print_eth_addr(f, "dst_mac", tb[TCA_OF_KEY_ETH_DST],
+				tb[TCA_OF_KEY_ETH_DST_MASK]);
+	openflow_print_eth_addr(f, "src_mac", tb[TCA_OF_KEY_ETH_SRC],
+				tb[TCA_OF_KEY_ETH_SRC_MASK]);
+
+	openflow_print_eth_type(f, &eth_type, tb[TCA_OF_KEY_ETH_TYPE]);
+	openflow_print_ip_proto(f, &ip_proto, tb[TCA_OF_KEY_IP_PROTO]);
+
+	openflow_print_ip_addr(f, "dst_ip", eth_type,
+			       tb[TCA_OF_KEY_IPV4_DST],
+			       tb[TCA_OF_KEY_IPV4_DST_MASK],
+			       tb[TCA_OF_KEY_IPV6_DST],
+			       tb[TCA_OF_KEY_IPV6_DST_MASK]);
+
+	openflow_print_ip_addr(f, "src_ip", eth_type,
+			       tb[TCA_OF_KEY_IPV4_SRC],
+			       tb[TCA_OF_KEY_IPV4_SRC_MASK],
+			       tb[TCA_OF_KEY_IPV6_SRC],
+			       tb[TCA_OF_KEY_IPV6_SRC_MASK]);
+
+	openflow_print_port(f, "dst_port", ip_proto,
+			    tb[TCA_OF_KEY_TCP_DST],
+			    tb[TCA_OF_KEY_UDP_DST]);
+
+	openflow_print_port(f, "src_port", ip_proto,
+			    tb[TCA_OF_KEY_TCP_SRC],
+			    tb[TCA_OF_KEY_UDP_SRC]);
+
+	if (tb[TCA_OF_ACT]) {
+		tc_print_action(f, tb[TCA_OF_ACT]);
+	}
+
+	return 0;
+}
+
+struct filter_util openflow_filter_util = {
+	.id = "openflow",
+	.parse_fopt = openflow_parse_opt,
+	.print_fopt = openflow_print_opt,
+};
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-09 12:58 [patch net-next v3] tc: introduce OpenFlow classifier Jiri Pirko
  2015-04-09 13:00 ` [patch iproute2 v3] tc: add support for " Jiri Pirko
@ 2015-04-09 21:34 ` David Miller
  2015-04-10  9:12   ` Jiri Pirko
  1 sibling, 1 reply; 18+ messages in thread
From: David Miller @ 2015-04-09 21:34 UTC (permalink / raw)
  To: jiri; +Cc: netdev, jhs, tgraf, jesse

From: Jiri Pirko <jiri@resnulli.us>
Date: Thu,  9 Apr 2015 14:58:07 +0200

> This patch introduces OpenFlow-based filter. So far, the very essential
> packet fields are supported (according to OpenFlow v1.4 spec).
> 
> This patch is only the first step. There is a lot of potential performance
> improvements possible to implement. Also a lot of features are missing
> now. They will be addressed in follow-up patches.
> 
> To the name of this classifier, I believe that "cls_openflow" is pretty
> accurate. It is actually a OpenFlow classifier.
> 
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>

I'm not so sure what my opinion is about whether we should
even have an openflow classifier or not (I find major aspects
of OpenFLOW extremely distasteful, it's basically pushing the
SDK agenda of several major chip vendors).

However I am sure that I majorly object to having yet another flow
parsing engine.  Therefore, at least adjust this code to use our flow
dissector and datastructures.  Adjust the flow dissector to fit your
needs, if necessary.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-09 21:34 ` [patch net-next v3] tc: introduce " David Miller
@ 2015-04-10  9:12   ` Jiri Pirko
  2015-04-10  9:30     ` Daniel Borkmann
                       ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Jiri Pirko @ 2015-04-10  9:12 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, jhs, tgraf, jesse

Thu, Apr 09, 2015 at 11:34:23PM CEST, davem@davemloft.net wrote:
>From: Jiri Pirko <jiri@resnulli.us>
>Date: Thu,  9 Apr 2015 14:58:07 +0200
>
>> This patch introduces OpenFlow-based filter. So far, the very essential
>> packet fields are supported (according to OpenFlow v1.4 spec).
>> 
>> This patch is only the first step. There is a lot of potential performance
>> improvements possible to implement. Also a lot of features are missing
>> now. They will be addressed in follow-up patches.
>> 
>> To the name of this classifier, I believe that "cls_openflow" is pretty
>> accurate. It is actually a OpenFlow classifier.
>> 
>> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
>
>I'm not so sure what my opinion is about whether we should
>even have an openflow classifier or not (I find major aspects
>of OpenFLOW extremely distasteful, it's basically pushing the
>SDK agenda of several major chip vendors).

Yep, I don't like OpenFlow as well. But anyway, we already have
OpenFlow-based classifier in kernel - in OVS code.

The thing is, why don't have it in tc? It does not cost anything. And
having it, people used to use ovs kernel DP will be able to migrate
easily to tc.

We can loose the name and name this cls_ngflow or something like that if
that helps.

>
>However I am sure that I majorly object to having yet another flow
>parsing engine.  Therefore, at least adjust this code to use our flow
>dissector and datastructures.  Adjust the flow dissector to fit your
>needs, if necessary.

Yep, Thomas already suggested the merge. The thing is, cls_flow uses
linked list for doing lookups. That is not scalable. in cls_openflow I
use rhashtable. Using rhashtable in cls_flow would break the existing
assumption that first inrested filter would be first hit.

So that would lead into major dual code in cls_flow. So I believe that
it is better to do it in separate cls_openflow (do one thing and do it
right).

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-10  9:12   ` Jiri Pirko
@ 2015-04-10  9:30     ` Daniel Borkmann
  2015-04-10 11:46       ` Jiri Pirko
  2015-04-10 10:03     ` Thomas Graf
  2015-04-10 12:23     ` David Miller
  2 siblings, 1 reply; 18+ messages in thread
From: Daniel Borkmann @ 2015-04-10  9:30 UTC (permalink / raw)
  To: Jiri Pirko, David Miller; +Cc: netdev, jhs, tgraf, jesse

On 04/10/2015 11:12 AM, Jiri Pirko wrote:
...
>> However I am sure that I majorly object to having yet another flow
>> parsing engine.  Therefore, at least adjust this code to use our flow
>> dissector and datastructures.  Adjust the flow dissector to fit your
>> needs, if necessary.
>
> Yep, Thomas already suggested the merge. The thing is, cls_flow uses
> linked list for doing lookups. That is not scalable. in cls_openflow I
> use rhashtable. Using rhashtable in cls_flow would break the existing
> assumption that first inrested filter would be first hit.
>
> So that would lead into major dual code in cls_flow. So I believe that
> it is better to do it in separate cls_openflow (do one thing and do it
> right).

Would it be possible to 1) reuse the majority of the existing internal
cls_flow code as much as possible, integrate the remaining openflow
parts into it, and then 2) just register a 2nd classifier kind, a la ...

static struct tcf_proto_ops cls_flow_ht_ops __read_mostly = {
	.kind		= "flow-ht",
	.classify	= flow_classify_ht,
	.init		= flow_init,
	.destroy	= flow_destroy,
	.change		= flow_change,
	.delete		= flow_delete,
	.get		= flow_get,
	.dump		= flow_dump,
	.walk		= flow_walk,
	.owner		= THIS_MODULE,
};

... so you could do the non-linear rhashtable matching from there, but
without much duplication?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-10  9:12   ` Jiri Pirko
  2015-04-10  9:30     ` Daniel Borkmann
@ 2015-04-10 10:03     ` Thomas Graf
  2015-04-10 12:23     ` David Miller
  2 siblings, 0 replies; 18+ messages in thread
From: Thomas Graf @ 2015-04-10 10:03 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: David Miller, netdev, jhs, jesse

On 04/10/15 at 11:12am, Jiri Pirko wrote:
> Thu, Apr 09, 2015 at 11:34:23PM CEST, davem@davemloft.net wrote:
> >I'm not so sure what my opinion is about whether we should
> >even have an openflow classifier or not (I find major aspects
> >of OpenFLOW extremely distasteful, it's basically pushing the
> >SDK agenda of several major chip vendors).
> 
> Yep, I don't like OpenFlow as well. But anyway, we already have
> OpenFlow-based classifier in kernel - in OVS code.

Can we stop referring to OpenFlow? There is really no relation ;-)
In fact, there are several open source projects which utilize the
OVS kernel datapath *without* caring about OpenFlow at all. A good
example of this is Midonet.

> The thing is, why don't have it in tc? It does not cost anything. And
> having it, people used to use ovs kernel DP will be able to migrate
> easily to tc.

That part I agree with. We should have a single common flow dissector
which is used for all purposes. After all, it's just parsing and
wildcard matching through hash tables. If that flow dissector is not
rich enough for someone, eBPF offers a more programmable approach and
we already see that work progressing nicely. We should also
consolidate as much of the actions as possible.

As I stated in a private email to a couple of days ago. I've started
working on ripping out most of the OVS vport logic and make all OVS
bridge ports be regular net_devices (most of them already were). This
will get rid of most of the OVS specific tunnel handling logic and
should allow to implement flow based tunnels with iproute2 as well.
The OVS datapath structure will just become net_device private data
and all ports will be regular slaves.

Adding slaves to an OVS bridge will be possible through iproute2
and we can even expose OVS bridges as regular Linux bridges ithrough
AF_BRIDGE *if* we want. Although I'm not sure how feasiable that is
given that most Linux bridge knobs including all the STP config
does not apply to OVS bridges.

I think everybody agrees that we should melt as much as possible and
stop reimplementing everything from scratch in isolated silos for
every bridge flavour that pops up.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-10  9:30     ` Daniel Borkmann
@ 2015-04-10 11:46       ` Jiri Pirko
  2015-04-10 13:48         ` Jamal Hadi Salim
  0 siblings, 1 reply; 18+ messages in thread
From: Jiri Pirko @ 2015-04-10 11:46 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: David Miller, netdev, jhs, tgraf, jesse

Fri, Apr 10, 2015 at 11:30:06AM CEST, daniel@iogearbox.net wrote:
>On 04/10/2015 11:12 AM, Jiri Pirko wrote:
>...
>>>However I am sure that I majorly object to having yet another flow
>>>parsing engine.  Therefore, at least adjust this code to use our flow
>>>dissector and datastructures.  Adjust the flow dissector to fit your
>>>needs, if necessary.
>>
>>Yep, Thomas already suggested the merge. The thing is, cls_flow uses
>>linked list for doing lookups. That is not scalable. in cls_openflow I
>>use rhashtable. Using rhashtable in cls_flow would break the existing
>>assumption that first inrested filter would be first hit.
>>
>>So that would lead into major dual code in cls_flow. So I believe that
>>it is better to do it in separate cls_openflow (do one thing and do it
>>right).
>
>Would it be possible to 1) reuse the majority of the existing internal
>cls_flow code as much as possible, integrate the remaining openflow
>parts into it, and then 2) just register a 2nd classifier kind, a la ...
>
>static struct tcf_proto_ops cls_flow_ht_ops __read_mostly = {
>	.kind		= "flow-ht",
>	.classify	= flow_classify_ht,
>	.init		= flow_init,
>	.destroy	= flow_destroy,
>	.change		= flow_change,
>	.delete		= flow_delete,
>	.get		= flow_get,
>	.dump		= flow_dump,
>	.walk		= flow_walk,
>	.owner		= THIS_MODULE,
>};
>
>... so you could do the non-linear rhashtable matching from there, but
>without much duplication?

I might be missing something, but to me, the codes of cls_flow and
cls_openflow are very different. Merging cls_openflow with for example
cls_fw makes similar sense to me.

cls_flow is no match-action classifier. All skbs are hashed into classid
of pre-defined range. You cannot speficy explicit action for match.

On the other hand cls_openflow is match-action classifier (similar to
for example bpf - might make more sense to me to merge it with bpf).

What I say is, lets do things clearly, separate, not "overmerge" stuff.

I will definitelly loose name "openflow" for my future submission to not
to confuse people.

Thanks!

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-10  9:12   ` Jiri Pirko
  2015-04-10  9:30     ` Daniel Borkmann
  2015-04-10 10:03     ` Thomas Graf
@ 2015-04-10 12:23     ` David Miller
  2015-04-10 12:45       ` Jiri Pirko
  2 siblings, 1 reply; 18+ messages in thread
From: David Miller @ 2015-04-10 12:23 UTC (permalink / raw)
  To: jiri; +Cc: netdev, jhs, tgraf, jesse

From: Jiri Pirko <jiri@resnulli.us>
Date: Fri, 10 Apr 2015 11:12:03 +0200

> Thu, Apr 09, 2015 at 11:34:23PM CEST, davem@davemloft.net wrote:
>>However I am sure that I majorly object to having yet another flow
>>parsing engine.  Therefore, at least adjust this code to use our flow
>>dissector and datastructures.  Adjust the flow dissector to fit your
>>needs, if necessary.
> 
> Yep, Thomas already suggested the merge. The thing is, cls_flow uses
> linked list for doing lookups. That is not scalable. in cls_openflow I
> use rhashtable. Using rhashtable in cls_flow would break the existing
> assumption that first inrested filter would be first hit.

I'm talking about using net/core/flow_dissect.c's interfaces instead
of your by-hand header parsing.  This has nothing to do with cls_flow

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-10 12:23     ` David Miller
@ 2015-04-10 12:45       ` Jiri Pirko
  2015-04-11 16:12         ` Alexei Starovoitov
  0 siblings, 1 reply; 18+ messages in thread
From: Jiri Pirko @ 2015-04-10 12:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, jhs, tgraf, jesse

Fri, Apr 10, 2015 at 02:23:59PM CEST, davem@davemloft.net wrote:
>From: Jiri Pirko <jiri@resnulli.us>
>Date: Fri, 10 Apr 2015 11:12:03 +0200
>
>> Thu, Apr 09, 2015 at 11:34:23PM CEST, davem@davemloft.net wrote:
>>>However I am sure that I majorly object to having yet another flow
>>>parsing engine.  Therefore, at least adjust this code to use our flow
>>>dissector and datastructures.  Adjust the flow dissector to fit your
>>>needs, if necessary.
>> 
>> Yep, Thomas already suggested the merge. The thing is, cls_flow uses
>> linked list for doing lookups. That is not scalable. in cls_openflow I
>> use rhashtable. Using rhashtable in cls_flow would break the existing
>> assumption that first inrested filter would be first hit.
>
>I'm talking about using net/core/flow_dissect.c's interfaces instead
>of your by-hand header parsing.  This has nothing to do with cls_flow

Okay. That was misunderstanding. I was thinking about using existing
flow_dissect. There are couple things which I'm scared of:
- there are eventually many fields to be added to dissection function and to
  the structure as well. Not sure how acceptable that would be for
  performance reasons when flow_dissect is used by different users...
- ipv6 addresses are hashed into int. Not sure how to resolve it, maybe to
  have another field for unhashed addresses

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-10 11:46       ` Jiri Pirko
@ 2015-04-10 13:48         ` Jamal Hadi Salim
  0 siblings, 0 replies; 18+ messages in thread
From: Jamal Hadi Salim @ 2015-04-10 13:48 UTC (permalink / raw)
  To: Jiri Pirko, Daniel Borkmann; +Cc: David Miller, netdev, tgraf, jesse

On 04/10/15 07:46, Jiri Pirko wrote:
> Fri, Apr 10, 2015 at 11:30:06AM CEST, daniel@iogearbox.net wrote:
>> On 04/10/2015 11:12 AM, Jiri Pirko wrote:
>> ...

> I might be missing something, but to me, the codes of cls_flow and
> cls_openflow are very different. Merging cls_openflow with for example
> cls_fw makes similar sense to me.
>
> cls_flow is no match-action classifier. All skbs are hashed into classid
> of pre-defined range. You cannot speficy explicit action for match.
>
> On the other hand cls_openflow is match-action classifier (similar to
> for example bpf - might make more sense to me to merge it with bpf).
>
> What I say is, lets do things clearly, separate, not "overmerge" stuff.
>
> I will definitelly loose name "openflow" for my future submission to not
> to confuse people.
>

I think dropping the term "openflow" would be reasonable. I am not sure
if allowing for this to be part of cls_flow is reasonable.
I do think it makes sense to add another classifier that would do what
Dave was asking for. Such a classifier will have more advantages than
anything else imo since it will get most of the classification almost
for free.

cheers,
jamal

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-10 12:45       ` Jiri Pirko
@ 2015-04-11 16:12         ` Alexei Starovoitov
  2015-04-12  7:53           ` Jiri Pirko
  0 siblings, 1 reply; 18+ messages in thread
From: Alexei Starovoitov @ 2015-04-11 16:12 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: David Miller, netdev, jhs, tgraf, jesse

On Fri, Apr 10, 2015 at 02:45:17PM +0200, Jiri Pirko wrote:
> Okay. That was misunderstanding. I was thinking about using existing
> flow_dissect. There are couple things which I'm scared of:
> - there are eventually many fields to be added to dissection function and to
>   the structure as well. Not sure how acceptable that would be for
>   performance reasons when flow_dissect is used by different users...

I share the same concern. I think flow_dissect is too performance
critical to reuse by expanding 'struct flow_keys'.
I think it would be better to generalize ovs's key_extract() into
common piece of code that TC classifier and ovs datapath can use.
It uses kernel internal 'struct sw_flow_key' which we can tweak to
accommodate more users. It's already gigantic at 392 bytes, so
split and a bit of diet would help too.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-11 16:12         ` Alexei Starovoitov
@ 2015-04-12  7:53           ` Jiri Pirko
  2015-04-12 23:44             ` David Miller
  0 siblings, 1 reply; 18+ messages in thread
From: Jiri Pirko @ 2015-04-12  7:53 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: David Miller, netdev, jhs, tgraf, jesse

Sat, Apr 11, 2015 at 06:12:25PM CEST, alexei.starovoitov@gmail.com wrote:
>On Fri, Apr 10, 2015 at 02:45:17PM +0200, Jiri Pirko wrote:
>> Okay. That was misunderstanding. I was thinking about using existing
>> flow_dissect. There are couple things which I'm scared of:
>> - there are eventually many fields to be added to dissection function and to
>>   the structure as well. Not sure how acceptable that would be for
>>   performance reasons when flow_dissect is used by different users...
>
>I share the same concern. I think flow_dissect is too performance
>critical to reuse by expanding 'struct flow_keys'.
>I think it would be better to generalize ovs's key_extract() into
>common piece of code that TC classifier and ovs datapath can use.
>It uses kernel internal 'struct sw_flow_key' which we can tweak to
>accommodate more users. It's already gigantic at 392 bytes, so
>split and a bit of diet would help too.

Yep, those are few next topics on my agenda.

Thanks

Jiri

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-12  7:53           ` Jiri Pirko
@ 2015-04-12 23:44             ` David Miller
  2015-04-13  0:12               ` Alexei Starovoitov
  0 siblings, 1 reply; 18+ messages in thread
From: David Miller @ 2015-04-12 23:44 UTC (permalink / raw)
  To: jiri; +Cc: alexei.starovoitov, netdev, jhs, tgraf, jesse

From: Jiri Pirko <jiri@resnulli.us>
Date: Sun, 12 Apr 2015 09:53:51 +0200

> Sat, Apr 11, 2015 at 06:12:25PM CEST, alexei.starovoitov@gmail.com wrote:
>>On Fri, Apr 10, 2015 at 02:45:17PM +0200, Jiri Pirko wrote:
>>> Okay. That was misunderstanding. I was thinking about using existing
>>> flow_dissect. There are couple things which I'm scared of:
>>> - there are eventually many fields to be added to dissection function and to
>>>   the structure as well. Not sure how acceptable that would be for
>>>   performance reasons when flow_dissect is used by different users...
>>
>>I share the same concern. I think flow_dissect is too performance
>>critical to reuse by expanding 'struct flow_keys'.
>>I think it would be better to generalize ovs's key_extract() into
>>common piece of code that TC classifier and ovs datapath can use.
>>It uses kernel internal 'struct sw_flow_key' which we can tweak to
>>accommodate more users. It's already gigantic at 392 bytes, so
>>split and a bit of diet would help too.
> 
> Yep, those are few next topics on my agenda.

This argument kinda ignores the fact that full flow dissection is run
on _every_ single RX packet on basically all Intel chipsets.

Therefore, I cannot take seriously someone saying that it is too much
overhead for a classifier.

And if it is that expensive, fix it, because that helps everyone not
just your special thing.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-12 23:44             ` David Miller
@ 2015-04-13  0:12               ` Alexei Starovoitov
  2015-04-13  0:36                 ` David Miller
  0 siblings, 1 reply; 18+ messages in thread
From: Alexei Starovoitov @ 2015-04-13  0:12 UTC (permalink / raw)
  To: David Miller; +Cc: jiri, netdev, jhs, tgraf, jesse

On Sun, Apr 12, 2015 at 07:44:43PM -0400, David Miller wrote:
> From: Jiri Pirko <jiri@resnulli.us>
> Date: Sun, 12 Apr 2015 09:53:51 +0200
> 
> > Sat, Apr 11, 2015 at 06:12:25PM CEST, alexei.starovoitov@gmail.com wrote:
> >>On Fri, Apr 10, 2015 at 02:45:17PM +0200, Jiri Pirko wrote:
> >>> Okay. That was misunderstanding. I was thinking about using existing
> >>> flow_dissect. There are couple things which I'm scared of:
> >>> - there are eventually many fields to be added to dissection function and to
> >>>   the structure as well. Not sure how acceptable that would be for
> >>>   performance reasons when flow_dissect is used by different users...
> >>
> >>I share the same concern. I think flow_dissect is too performance
> >>critical to reuse by expanding 'struct flow_keys'.
> >>I think it would be better to generalize ovs's key_extract() into
> >>common piece of code that TC classifier and ovs datapath can use.
> >>It uses kernel internal 'struct sw_flow_key' which we can tweak to
> >>accommodate more users. It's already gigantic at 392 bytes, so
> >>split and a bit of diet would help too.
> > 
> > Yep, those are few next topics on my agenda.
> 
> This argument kinda ignores the fact that full flow dissection is run
> on _every_ single RX packet on basically all Intel chipsets.
> 
> Therefore, I cannot take seriously someone saying that it is too much
> overhead for a classifier.

I was taking about different thing. skb_flow_dissect() today is fast,
because it needs to copy very few fields into flow_keys, whereas
Jiri's classifier and ovs's key_extract copy pretty much everything
they see in the packet into gigantic sw_flow_key and that gigantic
struct is used a lookup key. Parsing itself is cheap, comparing to
copying everything.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-13  0:12               ` Alexei Starovoitov
@ 2015-04-13  0:36                 ` David Miller
  2015-04-13  1:03                   ` Alexei Starovoitov
  0 siblings, 1 reply; 18+ messages in thread
From: David Miller @ 2015-04-13  0:36 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: jiri, netdev, jhs, tgraf, jesse

From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Date: Sun, 12 Apr 2015 17:12:13 -0700

> On Sun, Apr 12, 2015 at 07:44:43PM -0400, David Miller wrote:
>> From: Jiri Pirko <jiri@resnulli.us>
>> Date: Sun, 12 Apr 2015 09:53:51 +0200
>> 
>> > Sat, Apr 11, 2015 at 06:12:25PM CEST, alexei.starovoitov@gmail.com wrote:
>> >>On Fri, Apr 10, 2015 at 02:45:17PM +0200, Jiri Pirko wrote:
>> >>> Okay. That was misunderstanding. I was thinking about using existing
>> >>> flow_dissect. There are couple things which I'm scared of:
>> >>> - there are eventually many fields to be added to dissection function and to
>> >>>   the structure as well. Not sure how acceptable that would be for
>> >>>   performance reasons when flow_dissect is used by different users...
>> >>
>> >>I share the same concern. I think flow_dissect is too performance
>> >>critical to reuse by expanding 'struct flow_keys'.
>> >>I think it would be better to generalize ovs's key_extract() into
>> >>common piece of code that TC classifier and ovs datapath can use.
>> >>It uses kernel internal 'struct sw_flow_key' which we can tweak to
>> >>accommodate more users. It's already gigantic at 392 bytes, so
>> >>split and a bit of diet would help too.
>> > 
>> > Yep, those are few next topics on my agenda.
>> 
>> This argument kinda ignores the fact that full flow dissection is run
>> on _every_ single RX packet on basically all Intel chipsets.
>> 
>> Therefore, I cannot take seriously someone saying that it is too much
>> overhead for a classifier.
> 
> I was taking about different thing. skb_flow_dissect() today is fast,
> because it needs to copy very few fields into flow_keys, whereas
> Jiri's classifier and ovs's key_extract copy pretty much everything
> they see in the packet into gigantic sw_flow_key and that gigantic
> struct is used a lookup key. Parsing itself is cheap, comparing to
> copying everything.

It's easy to parameterize what flow dissector does and share code.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-13  0:36                 ` David Miller
@ 2015-04-13  1:03                   ` Alexei Starovoitov
  2015-04-13  8:26                     ` Thomas Graf
  0 siblings, 1 reply; 18+ messages in thread
From: Alexei Starovoitov @ 2015-04-13  1:03 UTC (permalink / raw)
  To: David Miller; +Cc: jiri, netdev, jhs, tgraf, jesse

On Sun, Apr 12, 2015 at 08:36:18PM -0400, David Miller wrote:
> 
> It's easy to parameterize what flow dissector does and share code.

yes. Though I wouldn't pessimize ixgbe rx path even by single
cycle to accommodate openflow spec. As far as I understand the
skb_flow_dissect() job from driver point of view is to figure
out header length for copybreak, which includes parsing encaps,
whereas openflow stops at outer header. Even with this difference,
it surely can be parameterized. Would be great to share if it
doesn't cost rx anything.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-13  1:03                   ` Alexei Starovoitov
@ 2015-04-13  8:26                     ` Thomas Graf
  2015-04-13 14:34                       ` Jiri Pirko
  0 siblings, 1 reply; 18+ messages in thread
From: Thomas Graf @ 2015-04-13  8:26 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: David Miller, jiri, netdev, jhs, jesse

On 04/12/15 at 06:03pm, Alexei Starovoitov wrote:
> On Sun, Apr 12, 2015 at 08:36:18PM -0400, David Miller wrote:
> > 
> > It's easy to parameterize what flow dissector does and share code.
> 
> yes. Though I wouldn't pessimize ixgbe rx path even by single
> cycle to accommodate openflow spec. As far as I understand the
> skb_flow_dissect() job from driver point of view is to figure
> out header length for copybreak, which includes parsing encaps,
> whereas openflow stops at outer header. Even with this difference,
> it surely can be parameterized. Would be great to share if it
> doesn't cost rx anything.

A flow dissector that only extracts what's needed would benefit
everyone. I guess static keys could help here as the flow field
selection is definitely a slow path operation.

Even OVS could benefit from this eventually, it would just need
to do a full extract before the key is sent to user space.

Also, the logic to generate the wildcard mask and match on it
should be generalized as well.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch net-next v3] tc: introduce OpenFlow classifier
  2015-04-13  8:26                     ` Thomas Graf
@ 2015-04-13 14:34                       ` Jiri Pirko
  0 siblings, 0 replies; 18+ messages in thread
From: Jiri Pirko @ 2015-04-13 14:34 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Alexei Starovoitov, David Miller, netdev, jhs, jesse

Mon, Apr 13, 2015 at 10:26:00AM CEST, tgraf@suug.ch wrote:
>On 04/12/15 at 06:03pm, Alexei Starovoitov wrote:
>> On Sun, Apr 12, 2015 at 08:36:18PM -0400, David Miller wrote:
>> > 
>> > It's easy to parameterize what flow dissector does and share code.
>> 
>> yes. Though I wouldn't pessimize ixgbe rx path even by single
>> cycle to accommodate openflow spec. As far as I understand the
>> skb_flow_dissect() job from driver point of view is to figure
>> out header length for copybreak, which includes parsing encaps,
>> whereas openflow stops at outer header. Even with this difference,
>> it surely can be parameterized. Would be great to share if it
>> doesn't cost rx anything.
>
>A flow dissector that only extracts what's needed would benefit
>everyone. I guess static keys could help here as the flow field
>selection is definitely a slow path operation.

Hmm. Ok. I'll try to cook some patch in order to extend
skb_flow_dissect. Stay tuned.


>
>Even OVS could benefit from this eventually, it would just need
>to do a full extract before the key is sent to user space.
>
>Also, the logic to generate the wildcard mask and match on it
>should be generalized as well.

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2015-04-13 14:34 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-09 12:58 [patch net-next v3] tc: introduce OpenFlow classifier Jiri Pirko
2015-04-09 13:00 ` [patch iproute2 v3] tc: add support for " Jiri Pirko
2015-04-09 21:34 ` [patch net-next v3] tc: introduce " David Miller
2015-04-10  9:12   ` Jiri Pirko
2015-04-10  9:30     ` Daniel Borkmann
2015-04-10 11:46       ` Jiri Pirko
2015-04-10 13:48         ` Jamal Hadi Salim
2015-04-10 10:03     ` Thomas Graf
2015-04-10 12:23     ` David Miller
2015-04-10 12:45       ` Jiri Pirko
2015-04-11 16:12         ` Alexei Starovoitov
2015-04-12  7:53           ` Jiri Pirko
2015-04-12 23:44             ` David Miller
2015-04-13  0:12               ` Alexei Starovoitov
2015-04-13  0:36                 ` David Miller
2015-04-13  1:03                   ` Alexei Starovoitov
2015-04-13  8:26                     ` Thomas Graf
2015-04-13 14:34                       ` Jiri Pirko

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.