All of lore.kernel.org
 help / color / mirror / Atom feed
* [net-next 0/6] Add Geneve tunnel protocol support
@ 2014-10-02  8:04 Andy Zhou
  2014-10-02  8:04 ` [net-next 1/6] net: Add Geneve tunneling protocol driver Andy Zhou
                   ` (5 more replies)
  0 siblings, 6 replies; 11+ messages in thread
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Andy Zhou

This patch series adds kernel support for Geneve (Generic Network
Virtualization Encapsulation) based on Geneve IETF draft: 
http://www.ietf.org/id/draft-gross-geneve-01.txt

Patch 1 implements the Geneve tunneling protocol driver.

Patches 2-6 add openvswitch support for creating and using
Geneve tunnels from OVS user space.


Andy Zhou (1):
  net: Add Geneve tunneling protocol driver

Jesse Gross (5):
  openvswitch: Eliminate memset() from flow_extract.
  openvswitch: Add support for matching on OAM packets.
  openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure.
  openvswitch: Factor out allocation and verification of actions.
  openvswitch: Add support for Geneve tunneling.

 include/net/geneve.h             |   91 ++++++++++
 include/net/ip_tunnels.h         |   19 +-
 include/uapi/linux/openvswitch.h |    5 +-
 net/ipv4/Kconfig                 |   14 ++
 net/ipv4/Makefile                |    1 +
 net/ipv4/geneve.c                |  373 ++++++++++++++++++++++++++++++++++++++
 net/openvswitch/Kconfig          |   11 ++
 net/openvswitch/Makefile         |    4 +
 net/openvswitch/actions.c        |    5 +-
 net/openvswitch/datapath.c       |   44 +++--
 net/openvswitch/datapath.h       |    2 +-
 net/openvswitch/flow.c           |   76 ++++++--
 net/openvswitch/flow.h           |   48 +++--
 net/openvswitch/flow_netlink.c   |  227 +++++++++++++++++++----
 net/openvswitch/vport-geneve.c   |  236 ++++++++++++++++++++++++
 net/openvswitch/vport-gre.c      |   16 +-
 net/openvswitch/vport-vxlan.c    |   10 +-
 net/openvswitch/vport.c          |    9 +-
 net/openvswitch/vport.h          |    3 +-
 19 files changed, 1093 insertions(+), 101 deletions(-)
 create mode 100644 include/net/geneve.h
 create mode 100644 net/ipv4/geneve.c
 create mode 100644 net/openvswitch/vport-geneve.c

-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [net-next 1/6] net: Add Geneve tunneling protocol driver
  2014-10-02  8:04 [net-next 0/6] Add Geneve tunnel protocol support Andy Zhou
@ 2014-10-02  8:04 ` Andy Zhou
  2014-10-03 13:44   ` Nicolas Dichtel
  2014-10-02  8:04 ` [net-next 2/6] openvswitch: Eliminate memset() from flow_extract Andy Zhou
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 11+ messages in thread
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Andy Zhou, Jesse Gross

This adds device-level support for Geneve -- Generic Network
Virtualization Encapsulation. The protocol is documented at
http://tools.ietf.org/html/draft-gross-geneve-01

Only protocol layer Geneve support is provided by this driver.
Openvswitch can be used to configure, set up and tear down
functional Geneve tunnels.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 include/net/geneve.h     |   91 +++++++++++
 include/net/ip_tunnels.h |    2 +
 net/ipv4/Kconfig         |   14 ++
 net/ipv4/Makefile        |    1 +
 net/ipv4/geneve.c        |  373 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 481 insertions(+)
 create mode 100644 include/net/geneve.h
 create mode 100644 net/ipv4/geneve.c

diff --git a/include/net/geneve.h b/include/net/geneve.h
new file mode 100644
index 0000000..ce98865
--- /dev/null
+++ b/include/net/geneve.h
@@ -0,0 +1,91 @@
+#ifndef __NET_GENEVE_H
+#define __NET_GENEVE_H  1
+
+#include <net/udp_tunnel.h>
+
+struct geneve_sock;
+
+typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb);
+
+struct geneve_sock {
+	struct hlist_node	hlist;
+	geneve_rcv_t		*rcv;
+	void			*rcv_data;
+	struct work_struct	del_work;
+	struct socket		*sock;
+	struct rcu_head		rcu;
+	atomic_t		refcnt;
+	struct udp_offload	udp_offloads;
+};
+
+/* Geneve Header:
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |Ver|  Opt Len  |O|C|    Rsvd.  |          Protocol Type        |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |        Virtual Network Identifier (VNI)       |    Reserved   |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                    Variable Length Options                    |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Option Header:
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |          Option Class         |      Type     |R|R|R| Length  |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                      Variable Option Data                     |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+
+struct geneve_opt {
+	__be16	opt_class;
+	u8	type;
+#ifdef __LITTLE_ENDIAN_BITFIELD
+	u8	length:5;
+	u8	r3:1;
+	u8	r2:1;
+	u8	r1:1;
+#else
+	u8	r1:1;
+	u8	r2:1;
+	u8	r3:1;
+	u8	length:5;
+#endif
+	u8	opt_data[];
+};
+
+#define GENEVE_CRIT_OPT_TYPE (1 << 7)
+
+struct genevehdr {
+#ifdef __LITTLE_ENDIAN_BITFIELD
+	u8 opt_len:6;
+	u8 ver:2;
+	u8 rsvd1:6;
+	u8 critical:1;
+	u8 oam:1;
+#else
+	u8 ver:2;
+	u8 opt_len:6;
+	u8 oam:1;
+	u8 critical:1;
+	u8 rsvd1:6;
+#endif
+	__be16 proto_type;
+	u8 vni[3];
+	u8 rsvd2;
+	struct geneve_opt options[];
+};
+
+#define GENEVE_VER 0
+#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
+
+struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
+				    geneve_rcv_t *rcv, void *data,
+				    bool no_share, bool ipv6);
+
+void geneve_sock_release(struct geneve_sock *vs);
+
+int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
+		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
+		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
+		    bool xnet);
+#endif
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 7f538ba..a9ce155 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -95,6 +95,8 @@ struct ip_tunnel {
 #define TUNNEL_VERSION	__cpu_to_be16(0x40)
 #define TUNNEL_NO_KEY	__cpu_to_be16(0x80)
 #define TUNNEL_DONT_FRAGMENT    __cpu_to_be16(0x0100)
+#define TUNNEL_OAM	__cpu_to_be16(0x0200)
+#define TUNNEL_CRIT_OPT	__cpu_to_be16(0x0400)
 
 struct tnl_ptk_info {
 	__be16 flags;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 69fb378..15ce6b0 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -453,6 +453,20 @@ config TCP_CONG_BIC
 	increase provides TCP friendliness.
 	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
 
+config GENEVE
+       tristate "Generic Network Virtualization Encapsulation (Geneve)"
+       depends on INET
+       select NET_IP_TUNNEL
+       select NET_UDP_TUNNEL
+       ---help---
+	  This allows one to create Geneve virtual interfaces that provide
+	  Layer 2 Networks over Layer 3 Networks. Geneve is often used
+	  to tunnel virtual network infrastructure in virtualized environments.
+	  For more information see:
+	    http://tools.ietf.org/html/draft-gross-geneve-01
+
+	  To compile this driver as a module, choose M here: the module will be called geneve.
+
 config TCP_CONG_CUBIC
 	tristate "CUBIC TCP"
 	default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index d810578..518c04e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_GENEVE) += geneve.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
new file mode 100644
index 0000000..f008c55
--- /dev/null
+++ b/net/ipv4/geneve.c
@@ -0,0 +1,373 @@
+/*
+ * Geneve: Generic Network Virtualization Encapsulation
+ *
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/hash.h>
+#include <linux/ethtool.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/geneve.h>
+#include <net/protocol.h>
+#include <net/udp_tunnel.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#endif
+
+#define PORT_HASH_BITS 8
+#define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
+
+/* per-network namespace private data for this module */
+struct geneve_net {
+	struct hlist_head	sock_list[PORT_HASH_SIZE];
+	spinlock_t		sock_lock;   /* Protects sock_list */
+};
+
+static int geneve_net_id;
+
+static struct workqueue_struct *geneve_wq;
+
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+	return (struct genevehdr *)(udp_hdr(skb) + 1);
+}
+
+static struct hlist_head *gs_head(struct net *net, __be16 port)
+{
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
+
+	return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
+}
+
+/* Find geneve socket based on network namespace and UDP port */
+static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port)
+{
+	struct geneve_sock *gs;
+
+	hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) {
+		if (inet_sk(gs->sock->sk)->inet_sport == port)
+			return gs;
+	}
+
+	return NULL;
+}
+
+static void geneve_build_header(struct genevehdr *geneveh,
+				__be16 tun_flags, u8 vni[3],
+				u8 options_len, u8 *options)
+{
+	geneveh->ver = GENEVE_VER;
+	geneveh->opt_len = options_len / 4;
+	geneveh->oam = !!(tun_flags & TUNNEL_OAM);
+	geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
+	geneveh->rsvd1 = 0;
+	memcpy(geneveh->vni, vni, 3);
+	geneveh->proto_type = htons(ETH_P_TEB);
+	geneveh->rsvd2 = 0;
+
+	memcpy(geneveh->options, options, options_len);
+}
+
+/* Transmit a fully formatted Geneve frame.
+ *
+ * When calling this function, skb->data should point
+ * to the geneve header which is fully formed.
+ *
+ * This function will add other UDP tunnel headers.
+ */
+int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
+		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
+		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
+		    bool xnet)
+{
+	struct genevehdr *gnvh;
+	int min_headroom;
+	int err;
+
+	skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx);
+
+	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
+			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+
+	err = skb_cow_head(skb, min_headroom);
+	if (unlikely(err))
+		return err;
+
+	if (vlan_tx_tag_present(skb)) {
+		if (unlikely(!__vlan_put_tag(skb,
+					     skb->vlan_proto,
+					     vlan_tx_tag_get(skb)))) {
+			err = -ENOMEM;
+			return err;
+		}
+		skb->vlan_tci = 0;
+	}
+
+	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
+	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
+
+	return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
+				   tos, ttl, df, src_port, dst_port, xnet);
+}
+EXPORT_SYMBOL_GPL(geneve_xmit_skb);
+
+static void geneve_notify_add_rx_port(struct geneve_sock *gs)
+{
+	struct sock *sk = gs->sock->sk;
+	sa_family_t sa_family = sk->sk_family;
+	int err;
+
+	if (sa_family == AF_INET) {
+		err = udp_add_offload(&gs->udp_offloads);
+		if (err)
+			pr_warn("geneve: udp_add_offload failed with status %d\n",
+				err);
+	}
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct genevehdr *geneveh;
+	struct geneve_sock *gs;
+	int opts_len;
+
+	/* Need Geneve and inner Ethernet header to be present */
+	if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
+		goto error;
+
+	/* Return packets with reserved bits set */
+	geneveh = geneve_hdr(skb);
+
+	if (unlikely(geneveh->ver != GENEVE_VER))
+		goto error;
+
+	if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
+		goto error;
+
+	opts_len = geneveh->opt_len * 4;
+	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
+				 htons(ETH_P_TEB)))
+		goto drop;
+
+	gs = rcu_dereference_sk_user_data(sk);
+	if (!gs)
+		goto drop;
+
+	gs->rcv(gs, skb);
+	return 0;
+
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+
+error:
+	/* Let the UDP layer deal with the skb */
+	return 1;
+}
+
+static void geneve_del_work(struct work_struct *work)
+{
+	struct geneve_sock *gs = container_of(work, struct geneve_sock,
+					      del_work);
+
+	udp_tunnel_sock_release(gs->sock);
+	kfree_rcu(gs, rcu);
+}
+
+static struct socket *geneve_create_sock(struct net *net, bool ipv6,
+					 __be16 port)
+{
+	struct socket *sock;
+	struct udp_port_cfg udp_conf;
+	int err;
+
+	memset(&udp_conf, 0, sizeof(udp_conf));
+
+	if (ipv6) {
+		udp_conf.family = AF_INET6;
+	} else {
+		udp_conf.family = AF_INET;
+		udp_conf.local_ip.s_addr = INADDR_ANY;
+	}
+
+	udp_conf.local_udp_port = port;
+
+	/* Open UDP socket */
+	err = udp_sock_create(net, &udp_conf, &sock);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	return sock;
+}
+
+/* Create new listen socket if needed */
+static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
+						geneve_rcv_t *rcv, void *data,
+						bool ipv6)
+{
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
+	struct geneve_sock *gs;
+	struct socket *sock;
+	struct udp_tunnel_sock_cfg tunnel_cfg;
+
+	gs = kzalloc(sizeof(*gs), GFP_KERNEL);
+	if (!gs)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_WORK(&gs->del_work, geneve_del_work);
+
+	sock = geneve_create_sock(net, ipv6, port);
+	if (IS_ERR(sock)) {
+		kfree(gs);
+		return ERR_CAST(sock);
+	}
+
+	gs->sock = sock;
+	atomic_set(&gs->refcnt, 1);
+	gs->rcv = rcv;
+	gs->rcv_data = data;
+
+	/* Initialize the geneve udp offloads structure */
+	gs->udp_offloads.port = port;
+	gs->udp_offloads.callbacks.gro_receive = NULL;
+	gs->udp_offloads.callbacks.gro_complete = NULL;
+
+	spin_lock(&gn->sock_lock);
+	hlist_add_head_rcu(&gs->hlist, gs_head(net, port));
+	geneve_notify_add_rx_port(gs);
+	spin_unlock(&gn->sock_lock);
+
+	/* Mark socket as an encapsulation socket */
+	tunnel_cfg.sk_user_data = gs;
+	tunnel_cfg.encap_type = 1;
+	tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
+	tunnel_cfg.encap_destroy = NULL;
+	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
+
+	return gs;
+}
+
+struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
+				    geneve_rcv_t *rcv, void *data,
+				    bool no_share, bool ipv6)
+{
+	struct geneve_sock *gs;
+
+	gs = geneve_socket_create(net, port, rcv, data, ipv6);
+	if (!IS_ERR(gs))
+		return gs;
+
+	if (no_share)	/* Return error if sharing is not allowed. */
+		return ERR_PTR(-EINVAL);
+
+	gs = geneve_find_sock(net, port);
+	if (gs) {
+		if (gs->rcv == rcv)
+			atomic_inc(&gs->refcnt);
+		else
+			gs = ERR_PTR(-EBUSY);
+	} else {
+		gs = ERR_PTR(-EINVAL);
+	}
+
+	return gs;
+}
+EXPORT_SYMBOL_GPL(geneve_sock_add);
+
+void geneve_sock_release(struct geneve_sock *gs)
+{
+	if (!atomic_dec_and_test(&gs->refcnt))
+		return;
+
+	queue_work(geneve_wq, &gs->del_work);
+}
+EXPORT_SYMBOL_GPL(geneve_sock_release);
+
+static __net_init int geneve_init_net(struct net *net)
+{
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
+	unsigned int h;
+
+	spin_lock_init(&gn->sock_lock);
+
+	for (h = 0; h < PORT_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&gn->sock_list[h]);
+
+	return 0;
+}
+
+static struct pernet_operations geneve_net_ops = {
+	.init = geneve_init_net,
+	.exit = NULL,
+	.id   = &geneve_net_id,
+	.size = sizeof(struct geneve_net),
+};
+
+static int __init geneve_init_module(void)
+{
+	int rc;
+
+	geneve_wq = alloc_workqueue("geneve", 0, 0);
+	if (!geneve_wq)
+		return -ENOMEM;
+
+	rc = register_pernet_subsys(&geneve_net_ops);
+	if (rc)
+		return rc;
+
+	pr_info("Geneve driver\n");
+
+	return 0;
+}
+late_initcall(geneve_init_module);
+
+static void __exit geneve_cleanup_module(void)
+{
+	destroy_workqueue(geneve_wq);
+}
+module_exit(geneve_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
+MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
+MODULE_ALIAS_RTNL_LINK("geneve");
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next 2/6] openvswitch: Eliminate memset() from flow_extract.
  2014-10-02  8:04 [net-next 0/6] Add Geneve tunnel protocol support Andy Zhou
  2014-10-02  8:04 ` [net-next 1/6] net: Add Geneve tunneling protocol driver Andy Zhou
@ 2014-10-02  8:04 ` Andy Zhou
  2014-10-02  8:04 ` [net-next 3/6] openvswitch: Add support for matching on OAM packets Andy Zhou
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Andy Zhou

From: Jesse Gross <jesse@nicira.com>

As new protocols are added, the size of the flow key tends to
increase although few protocols care about all of the fields. In
order to optimize this for hashing and matching, OVS uses a variable
length portion of the key. However, when fields are extracted from
the packet we must still zero out the entire key.

This is no longer necessary now that OVS implements masking. Any
fields (or holes in the structure) which are not part of a given
protocol will be by definition not part of the mask and zeroed out
during lookup. Furthermore, since masking already uses variable
length keys this zeroing operation automatically benefits as well.

In principle, the only thing that needs to be done at this point
is to remove the memset() at the beginning of flow extraction. However,
some fields assume that they are initialized to zero, which now must be
done explicitly. In addition, in the event of an error we must also
zero out corresponding fields to signal that there is no valid data
present. These increase the total amount of code but very little of
it is executed in non-error situations.

Removing the memset() reduces the profile of ovs_flow_extract()
from 0.64% to 0.56% when tested with large packets on a 10G link.

Suggested-by: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 net/openvswitch/flow.c |   54 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 9 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 4010423..913bdc1 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -462,6 +462,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 	 * update skb->csum here.
 	 */
 
+	key->eth.tci = 0;
 	if (vlan_tx_tag_present(skb))
 		key->eth.tci = htons(skb->vlan_tci);
 	else if (eth->h_proto == htons(ETH_P_8021Q))
@@ -482,6 +483,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 
 		error = check_iphdr(skb);
 		if (unlikely(error)) {
+			memset(&key->ip, 0, sizeof(key->ip));
+			memset(&key->ipv4, 0, sizeof(key->ipv4));
 			if (error == -EINVAL) {
 				skb->transport_header = skb->network_header;
 				error = 0;
@@ -503,8 +506,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 			return 0;
 		}
 		if (nh->frag_off & htons(IP_MF) ||
-			 skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+			skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
+		else
+			key->ip.frag = OVS_FRAG_TYPE_NONE;
 
 		/* Transport layer. */
 		if (key->ip.proto == IPPROTO_TCP) {
@@ -513,18 +518,25 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				key->tp.src = tcp->source;
 				key->tp.dst = tcp->dest;
 				key->tp.flags = TCP_FLAGS_BE16(tcp);
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
+
 		} else if (key->ip.proto == IPPROTO_UDP) {
 			if (udphdr_ok(skb)) {
 				struct udphdr *udp = udp_hdr(skb);
 				key->tp.src = udp->source;
 				key->tp.dst = udp->dest;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == IPPROTO_SCTP) {
 			if (sctphdr_ok(skb)) {
 				struct sctphdr *sctp = sctp_hdr(skb);
 				key->tp.src = sctp->source;
 				key->tp.dst = sctp->dest;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == IPPROTO_ICMP) {
 			if (icmphdr_ok(skb)) {
@@ -534,33 +546,44 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				 * them in 16-bit network byte order. */
 				key->tp.src = htons(icmp->type);
 				key->tp.dst = htons(icmp->code);
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		}
 
-	} else if ((key->eth.type == htons(ETH_P_ARP) ||
-		   key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) {
+	} else if (key->eth.type == htons(ETH_P_ARP) ||
+		   key->eth.type == htons(ETH_P_RARP)) {
 		struct arp_eth_header *arp;
 
 		arp = (struct arp_eth_header *)skb_network_header(skb);
 
-		if (arp->ar_hrd == htons(ARPHRD_ETHER)
-				&& arp->ar_pro == htons(ETH_P_IP)
-				&& arp->ar_hln == ETH_ALEN
-				&& arp->ar_pln == 4) {
+		if (arphdr_ok(skb) &&
+		    arp->ar_hrd == htons(ARPHRD_ETHER) &&
+		    arp->ar_pro == htons(ETH_P_IP) &&
+		    arp->ar_hln == ETH_ALEN &&
+		    arp->ar_pln == 4) {
 
 			/* We only match on the lower 8 bits of the opcode. */
 			if (ntohs(arp->ar_op) <= 0xff)
 				key->ip.proto = ntohs(arp->ar_op);
+			else
+				key->ip.proto = 0;
+
 			memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
 			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
 			ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha);
 			ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha);
+		} else {
+			memset(&key->ip, 0, sizeof(key->ip));
+			memset(&key->ipv4, 0, sizeof(key->ipv4));
 		}
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		int nh_len;             /* IPv6 Header + Extensions */
 
 		nh_len = parse_ipv6hdr(skb, key);
 		if (unlikely(nh_len < 0)) {
+			memset(&key->ip, 0, sizeof(key->ip));
+			memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr));
 			if (nh_len == -EINVAL) {
 				skb->transport_header = skb->network_header;
 				error = 0;
@@ -582,24 +605,32 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				key->tp.src = tcp->source;
 				key->tp.dst = tcp->dest;
 				key->tp.flags = TCP_FLAGS_BE16(tcp);
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == NEXTHDR_UDP) {
 			if (udphdr_ok(skb)) {
 				struct udphdr *udp = udp_hdr(skb);
 				key->tp.src = udp->source;
 				key->tp.dst = udp->dest;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == NEXTHDR_SCTP) {
 			if (sctphdr_ok(skb)) {
 				struct sctphdr *sctp = sctp_hdr(skb);
 				key->tp.src = sctp->source;
 				key->tp.dst = sctp->dest;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		} else if (key->ip.proto == NEXTHDR_ICMP) {
 			if (icmp6hdr_ok(skb)) {
 				error = parse_icmpv6(skb, key, nh_len);
 				if (error)
 					return error;
+			} else {
+				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		}
 	}
@@ -615,13 +646,19 @@ int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key,
 			 struct sk_buff *skb, struct sw_flow_key *key)
 {
 	/* Extract metadata from packet. */
-	memset(key, 0, sizeof(*key));
 	if (tun_key)
 		memcpy(&key->tun_key, tun_key, sizeof(key->tun_key));
+	else
+		memset(&key->tun_key, 0, sizeof(key->tun_key));
 
 	key->phy.priority = skb->priority;
 	key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
 	key->phy.skb_mark = skb->mark;
+	key->ovs_flow_hash = 0;
+	key->recirc_id = 0;
+
+	/* Flags are always used as part of stats */
+	key->tp.flags = 0;
 
 	return key_extract(skb, key);
 }
@@ -632,7 +669,6 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr,
 {
 	int err;
 
-	memset(key, 0, sizeof(*key));
 	/* Extract metadata from netlink attributes. */
 	err = ovs_nla_get_flow_metadata(attr, key);
 	if (err)
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next 3/6] openvswitch: Add support for matching on OAM packets.
  2014-10-02  8:04 [net-next 0/6] Add Geneve tunnel protocol support Andy Zhou
  2014-10-02  8:04 ` [net-next 1/6] net: Add Geneve tunneling protocol driver Andy Zhou
  2014-10-02  8:04 ` [net-next 2/6] openvswitch: Eliminate memset() from flow_extract Andy Zhou
@ 2014-10-02  8:04 ` Andy Zhou
  2014-10-02  8:04 ` [net-next 4/6] openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure Andy Zhou
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Andy Zhou

From: Jesse Gross <jesse@nicira.com>

Some tunnel formats have mechanisms for indicating that packets are
OAM frames that should be handled specially (either as high priority or
not forwarded beyond an endpoint). This provides support for allowing
those types of packets to be matched.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 include/uapi/linux/openvswitch.h |    1 +
 net/openvswitch/datapath.c       |    1 +
 net/openvswitch/flow_netlink.c   |   17 ++++++++++++-----
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index f7fc507..7c06106 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -309,6 +309,7 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_TTL,                /* u8 Tunnel IP TTL. */
 	OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT,      /* No argument, set DF. */
 	OVS_TUNNEL_KEY_ATTR_CSUM,               /* No argument. CSUM packet. */
+	OVS_TUNNEL_KEY_ATTR_OAM,                /* No argument. OAM frame.  */
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9e3a2fa..f6bd93d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -369,6 +369,7 @@ static size_t key_attr_size(void)
 		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TTL */
 		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
 		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */
+		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_OAM */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index f4c8daa..22c855f 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -346,6 +346,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 			[OVS_TUNNEL_KEY_ATTR_TTL] = 1,
 			[OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
 			[OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
+			[OVS_TUNNEL_KEY_ATTR_OAM] = 0,
 		};
 
 		if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
@@ -390,6 +391,9 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 		case OVS_TUNNEL_KEY_ATTR_CSUM:
 			tun_flags |= TUNNEL_CSUM;
 			break;
+		case OVS_TUNNEL_KEY_ATTR_OAM:
+			tun_flags |= TUNNEL_OAM;
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -431,21 +435,24 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
 	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
 		return -EMSGSIZE;
 	if (output->ipv4_src &&
-		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
+	    nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
 		return -EMSGSIZE;
 	if (output->ipv4_dst &&
-		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
+	    nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
 		return -EMSGSIZE;
 	if (output->ipv4_tos &&
-		nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+	    nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
 		return -EMSGSIZE;
 	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
 		return -EMSGSIZE;
 	if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
-		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
+	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
 		return -EMSGSIZE;
 	if ((output->tun_flags & TUNNEL_CSUM) &&
-		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+		return -EMSGSIZE;
+	if ((output->tun_flags & TUNNEL_OAM) &&
+	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
 		return -EMSGSIZE;
 
 	nla_nest_end(skb, nla);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next 4/6] openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure.
  2014-10-02  8:04 [net-next 0/6] Add Geneve tunnel protocol support Andy Zhou
                   ` (2 preceding siblings ...)
  2014-10-02  8:04 ` [net-next 3/6] openvswitch: Add support for matching on OAM packets Andy Zhou
@ 2014-10-02  8:04 ` Andy Zhou
  2014-10-02  8:04 ` [net-next 5/6] openvswitch: Factor out allocation and verification of actions Andy Zhou
  2014-10-02  8:04 ` [net-next 6/6] openvswitch: Add support for Geneve tunneling Andy Zhou
  5 siblings, 0 replies; 11+ messages in thread
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Andy Zhou

From: Jesse Gross <jesse@nicira.com>

Currently, the flow information that is matched for tunnels and
the tunnel data passed around with packets is the same. However,
as additional information is added this is not necessarily desirable,
as in the case of pointers.

This adds a new structure for tunnel metadata which currently contains
only the existing struct. This change is purely internal to the kernel
since the current OVS_KEY_ATTR_IPV4_TUNNEL is simply a compressed version
of OVS_KEY_ATTR_TUNNEL that is translated at flow setup.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 include/uapi/linux/openvswitch.h |    2 +-
 net/openvswitch/actions.c        |    5 +++--
 net/openvswitch/datapath.h       |    2 +-
 net/openvswitch/flow.c           |    6 +++---
 net/openvswitch/flow.h           |   30 +++++++++++++++++-------------
 net/openvswitch/flow_netlink.c   |   38 +++++++++++++++++++++++++++++++-------
 net/openvswitch/vport-gre.c      |   16 +++++++++-------
 net/openvswitch/vport-vxlan.c    |   10 +++++-----
 net/openvswitch/vport.c          |    6 +++---
 net/openvswitch/vport.h          |    2 +-
 10 files changed, 74 insertions(+), 43 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 7c06106..6753032 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -294,7 +294,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */
 
 #ifdef __KERNEL__
-	OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
+	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ovs_tunnel_info */
 #endif
 	__OVS_KEY_ATTR_MAX
 };
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 6932a42..006886d 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -590,8 +590,8 @@ static int execute_set_action(struct sk_buff *skb,
 		skb->mark = nla_get_u32(nested_attr);
 		break;
 
-	case OVS_KEY_ATTR_IPV4_TUNNEL:
-		OVS_CB(skb)->egress_tun_key = nla_data(nested_attr);
+	case OVS_KEY_ATTR_TUNNEL_INFO:
+		OVS_CB(skb)->egress_tun_info = nla_data(nested_attr);
 		break;
 
 	case OVS_KEY_ATTR_ETHERNET:
@@ -778,6 +778,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 	acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
 
 	this_cpu_inc(exec_actions_level);
+	OVS_CB(skb)->egress_tun_info = NULL;
 	err = do_execute_actions(dp, skb, key,
 				 acts->actions, acts->actions_len);
 
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index ac3f3df..9741354 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -102,8 +102,8 @@ struct datapath {
  */
 struct ovs_skb_cb {
 	struct sw_flow		*flow;
+	struct ovs_tunnel_info  *egress_tun_info;
 	struct vport		*input_vport;
-	struct ovs_key_ipv4_tunnel  *egress_tun_key;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
 
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 913bdc1..2924cb3 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -642,12 +642,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
 	return key_extract(skb, key);
 }
 
-int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key,
+int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info,
 			 struct sk_buff *skb, struct sw_flow_key *key)
 {
 	/* Extract metadata from packet. */
-	if (tun_key)
-		memcpy(&key->tun_key, tun_key, sizeof(key->tun_key));
+	if (tun_info)
+		memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
 	else
 		memset(&key->tun_key, 0, sizeof(key->tun_key));
 
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 0f5db4e..fe5a71b 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -49,20 +49,24 @@ struct ovs_key_ipv4_tunnel {
 	u8   ipv4_ttl;
 } __packed __aligned(4); /* Minimize padding. */
 
-static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
-					 const struct iphdr *iph, __be64 tun_id,
-					 __be16 tun_flags)
+struct ovs_tunnel_info {
+	struct ovs_key_ipv4_tunnel tunnel;
+};
+
+static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
+					  const struct iphdr *iph,
+					  __be64 tun_id, __be16 tun_flags)
 {
-	tun_key->tun_id = tun_id;
-	tun_key->ipv4_src = iph->saddr;
-	tun_key->ipv4_dst = iph->daddr;
-	tun_key->ipv4_tos = iph->tos;
-	tun_key->ipv4_ttl = iph->ttl;
-	tun_key->tun_flags = tun_flags;
+	tun_info->tunnel.tun_id = tun_id;
+	tun_info->tunnel.ipv4_src = iph->saddr;
+	tun_info->tunnel.ipv4_dst = iph->daddr;
+	tun_info->tunnel.ipv4_tos = iph->tos;
+	tun_info->tunnel.ipv4_ttl = iph->ttl;
+	tun_info->tunnel.tun_flags = tun_flags;
 
 	/* clear struct padding. */
-	memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0,
-	       sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE);
+	memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0,
+	       sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
 }
 
 struct sw_flow_key {
@@ -190,8 +194,8 @@ void ovs_flow_stats_clear(struct sw_flow *);
 u64 ovs_flow_used_time(unsigned long flow_jiffies);
 
 int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key,
-			 struct sk_buff *skb, struct sw_flow_key *key);
+int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb,
+			 struct sw_flow_key *key);
 /* Extract key from packet coming from userspace. */
 int ovs_flow_key_extract_userspace(const struct nlattr *attr,
 				   struct sk_buff *skb,
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 22c855f..5d6194d 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1148,13 +1148,14 @@ out:
 	return  (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
 }
 
-static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
+static struct nlattr *__add_action(struct sw_flow_actions **sfa,
+				   int attrtype, void *data, int len)
 {
 	struct nlattr *a;
 
 	a = reserve_sfa_size(sfa, nla_attr_size(len));
 	if (IS_ERR(a))
-		return PTR_ERR(a);
+		return a;
 
 	a->nla_type = attrtype;
 	a->nla_len = nla_attr_size(len);
@@ -1163,6 +1164,18 @@ static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, in
 		memcpy(nla_data(a), data, len);
 	memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
 
+	return a;
+}
+
+static int add_action(struct sw_flow_actions **sfa, int attrtype,
+		      void *data, int len)
+{
+	struct nlattr *a;
+
+	a = __add_action(sfa, attrtype, data, len);
+	if (IS_ERR(a))
+		return PTR_ERR(a);
+
 	return 0;
 }
 
@@ -1268,6 +1281,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 {
 	struct sw_flow_match match;
 	struct sw_flow_key key;
+	struct ovs_tunnel_info *tun_info;
+	struct nlattr *a;
 	int err, start;
 
 	ovs_match_init(&match, &key, NULL);
@@ -1279,8 +1294,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	if (start < 0)
 		return start;
 
-	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
-			sizeof(match.key->tun_key));
+	a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
+			 sizeof(*tun_info));
+	if (IS_ERR(a))
+		return PTR_ERR(a);
+
+	tun_info = nla_data(a);
+	tun_info->tunnel = key.tun_key;
+
 	add_nested_action_end(*sfa, start);
 
 	return err;
@@ -1563,17 +1584,20 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 	int err;
 
 	switch (key_type) {
-	case OVS_KEY_ATTR_IPV4_TUNNEL:
+	case OVS_KEY_ATTR_TUNNEL_INFO: {
+		struct ovs_tunnel_info *tun_info = nla_data(ovs_key);
+
 		start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
 		if (!start)
 			return -EMSGSIZE;
 
-		err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
-					     nla_data(ovs_key));
+		err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
+					 nla_data(ovs_key));
 		if (err)
 			return err;
 		nla_nest_end(skb, start);
 		break;
+	}
 	default:
 		if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
 			return -EMSGSIZE;
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 309cca6..fe768bd 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -63,8 +63,10 @@ static __be16 filter_tnl_flags(__be16 flags)
 static struct sk_buff *__build_header(struct sk_buff *skb,
 				      int tunnel_hlen)
 {
-	const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->egress_tun_key;
 	struct tnl_ptk_info tpi;
+	const struct ovs_key_ipv4_tunnel *tun_key;
+
+	tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
 
 	skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
 	if (IS_ERR(skb))
@@ -92,7 +94,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
 static int gre_rcv(struct sk_buff *skb,
 		   const struct tnl_ptk_info *tpi)
 {
-	struct ovs_key_ipv4_tunnel tun_key;
+	struct ovs_tunnel_info tun_info;
 	struct ovs_net *ovs_net;
 	struct vport *vport;
 	__be64 key;
@@ -103,10 +105,10 @@ static int gre_rcv(struct sk_buff *skb,
 		return PACKET_REJECT;
 
 	key = key_to_tunnel_id(tpi->key, tpi->seq);
-	ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key,
-			      filter_tnl_flags(tpi->flags));
+	ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key,
+			       filter_tnl_flags(tpi->flags));
 
-	ovs_vport_receive(vport, skb, &tun_key);
+	ovs_vport_receive(vport, skb, &tun_info);
 	return PACKET_RCVD;
 }
 
@@ -137,12 +139,12 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 	__be16 df;
 	int err;
 
-	if (unlikely(!OVS_CB(skb)->egress_tun_key)) {
+	if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
 		err = -EINVAL;
 		goto error;
 	}
 
-	tun_key = OVS_CB(skb)->egress_tun_key;
+	tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
 	/* Route lookup */
 	memset(&fl, 0, sizeof(fl));
 	fl.daddr = tun_key->ipv4_dst;
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index f19539b..5fbff2c 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -58,7 +58,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
 /* Called with rcu_read_lock and BH disabled. */
 static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
 {
-	struct ovs_key_ipv4_tunnel tun_key;
+	struct ovs_tunnel_info tun_info;
 	struct vport *vport = vs->data;
 	struct iphdr *iph;
 	__be64 key;
@@ -66,9 +66,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
 	/* Save outer tunnel values */
 	iph = ip_hdr(skb);
 	key = cpu_to_be64(ntohl(vx_vni) >> 8);
-	ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
+	ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY);
 
-	ovs_vport_receive(vport, skb, &tun_key);
+	ovs_vport_receive(vport, skb, &tun_info);
 }
 
 static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
@@ -147,12 +147,12 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 	__be16 df;
 	int err;
 
-	if (unlikely(!OVS_CB(skb)->egress_tun_key)) {
+	if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
 		err = -EINVAL;
 		goto error;
 	}
 
-	tun_key = OVS_CB(skb)->egress_tun_key;
+	tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
 	/* Route lookup */
 	memset(&fl, 0, sizeof(fl));
 	fl.daddr = tun_key->ipv4_dst;
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 5df8377..3e50ee8 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -432,7 +432,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb)
  * skb->data should point to the Ethernet header.
  */
 void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
-		       struct ovs_key_ipv4_tunnel *tun_key)
+		       struct ovs_tunnel_info *tun_info)
 {
 	struct pcpu_sw_netstats *stats;
 	struct sw_flow_key key;
@@ -445,9 +445,9 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 	u64_stats_update_end(&stats->syncp);
 
 	OVS_CB(skb)->input_vport = vport;
-	OVS_CB(skb)->egress_tun_key = NULL;
+	OVS_CB(skb)->egress_tun_info = NULL;
 	/* Extract flow from 'skb' into 'key'. */
-	error = ovs_flow_key_extract(tun_key, skb, &key);
+	error = ovs_flow_key_extract(tun_info, skb, &key);
 	if (unlikely(error)) {
 		kfree_skb(skb);
 		return;
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 0efd62f..e28964a 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -207,7 +207,7 @@ static inline struct vport *vport_from_priv(void *priv)
 }
 
 void ovs_vport_receive(struct vport *, struct sk_buff *,
-		       struct ovs_key_ipv4_tunnel *);
+		       struct ovs_tunnel_info *);
 
 /* List of statically compiled vport implementations.  Don't forget to also
  * add yours to the list at the top of vport.c. */
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next 5/6] openvswitch: Factor out allocation and verification of actions.
  2014-10-02  8:04 [net-next 0/6] Add Geneve tunnel protocol support Andy Zhou
                   ` (3 preceding siblings ...)
  2014-10-02  8:04 ` [net-next 4/6] openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure Andy Zhou
@ 2014-10-02  8:04 ` Andy Zhou
  2014-10-02  8:04 ` [net-next 6/6] openvswitch: Add support for Geneve tunneling Andy Zhou
  5 siblings, 0 replies; 11+ messages in thread
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Andy Zhou

From: Jesse Gross <jesse@nicira.com>

As the size of the flow key grows, it can put some pressure on the
stack. This is particularly true in ovs_flow_cmd_set(), which needs several
copies of the key on the stack. One of those uses is logically separate,
so this factors it out to reduce stack pressure and improve readability.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 net/openvswitch/datapath.c |   38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index f6bd93d..010125c 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -933,11 +933,34 @@ error:
 	return error;
 }
 
+static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
+						const struct sw_flow_key *key,
+						const struct sw_flow_mask *mask)
+{
+	struct sw_flow_actions *acts;
+	struct sw_flow_key masked_key;
+	int error;
+
+	acts = ovs_nla_alloc_flow_actions(nla_len(a));
+	if (IS_ERR(acts))
+		return acts;
+
+	ovs_flow_mask_key(&masked_key, key, mask);
+	error = ovs_nla_copy_actions(a, &masked_key, 0, &acts);
+	if (error) {
+		OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
+		kfree(acts);
+		return ERR_PTR(error);
+	}
+
+	return acts;
+}
+
 static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nlattr **a = info->attrs;
 	struct ovs_header *ovs_header = info->userhdr;
-	struct sw_flow_key key, masked_key;
+	struct sw_flow_key key;
 	struct sw_flow *flow;
 	struct sw_flow_mask mask;
 	struct sk_buff *reply = NULL;
@@ -959,17 +982,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 
 	/* Validate actions. */
 	if (a[OVS_FLOW_ATTR_ACTIONS]) {
-		acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
-		error = PTR_ERR(acts);
-		if (IS_ERR(acts))
+		acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask);
+		if (IS_ERR(acts)) {
+			error = PTR_ERR(acts);
 			goto error;
-
-		ovs_flow_mask_key(&masked_key, &key, &mask);
-		error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
-					     &masked_key, 0, &acts);
-		if (error) {
-			OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
-			goto err_kfree_acts;
 		}
 	}
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [net-next 6/6] openvswitch: Add support for Geneve tunneling.
  2014-10-02  8:04 [net-next 0/6] Add Geneve tunnel protocol support Andy Zhou
                   ` (4 preceding siblings ...)
  2014-10-02  8:04 ` [net-next 5/6] openvswitch: Factor out allocation and verification of actions Andy Zhou
@ 2014-10-02  8:04 ` Andy Zhou
  2014-10-02 23:04   ` Tom Herbert
  5 siblings, 1 reply; 11+ messages in thread
From: Andy Zhou @ 2014-10-02  8:04 UTC (permalink / raw)
  To: davem; +Cc: netdev, Jesse Gross, Andy Zhou

From: Jesse Gross <jesse@nicira.com>

The Openvswitch implementation is completely agnostic to the options
that are in use and can handle newly defined options without
further work. It does this by simply matching on a byte array
of options and allowing userspace to set up flows on this array.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 include/net/ip_tunnels.h         |   21 ++--
 include/uapi/linux/openvswitch.h |    2 +
 net/openvswitch/Kconfig          |   11 ++
 net/openvswitch/Makefile         |    4 +
 net/openvswitch/datapath.c       |    5 +-
 net/openvswitch/flow.c           |   20 +++-
 net/openvswitch/flow.h           |   20 +++-
 net/openvswitch/flow_netlink.c   |  176 +++++++++++++++++++++++-----
 net/openvswitch/vport-geneve.c   |  236 ++++++++++++++++++++++++++++++++++++++
 net/openvswitch/vport-gre.c      |    2 +-
 net/openvswitch/vport-vxlan.c    |    2 +-
 net/openvswitch/vport.c          |    3 +
 net/openvswitch/vport.h          |    1 +
 13 files changed, 461 insertions(+), 42 deletions(-)
 create mode 100644 net/openvswitch/vport-geneve.c

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index a9ce155..5bc6ede 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -86,17 +86,18 @@ struct ip_tunnel {
 	struct gro_cells	gro_cells;
 };
 
-#define TUNNEL_CSUM	__cpu_to_be16(0x01)
-#define TUNNEL_ROUTING	__cpu_to_be16(0x02)
-#define TUNNEL_KEY	__cpu_to_be16(0x04)
-#define TUNNEL_SEQ	__cpu_to_be16(0x08)
-#define TUNNEL_STRICT	__cpu_to_be16(0x10)
-#define TUNNEL_REC	__cpu_to_be16(0x20)
-#define TUNNEL_VERSION	__cpu_to_be16(0x40)
-#define TUNNEL_NO_KEY	__cpu_to_be16(0x80)
+#define TUNNEL_CSUM		__cpu_to_be16(0x01)
+#define TUNNEL_ROUTING		__cpu_to_be16(0x02)
+#define TUNNEL_KEY		__cpu_to_be16(0x04)
+#define TUNNEL_SEQ		__cpu_to_be16(0x08)
+#define TUNNEL_STRICT		__cpu_to_be16(0x10)
+#define TUNNEL_REC		__cpu_to_be16(0x20)
+#define TUNNEL_VERSION		__cpu_to_be16(0x40)
+#define TUNNEL_NO_KEY		__cpu_to_be16(0x80)
 #define TUNNEL_DONT_FRAGMENT    __cpu_to_be16(0x0100)
-#define TUNNEL_OAM	__cpu_to_be16(0x0200)
-#define TUNNEL_CRIT_OPT	__cpu_to_be16(0x0400)
+#define TUNNEL_OAM		__cpu_to_be16(0x0200)
+#define TUNNEL_CRIT_OPT		__cpu_to_be16(0x0400)
+#define TUNNEL_OPTIONS_PRESENT	__cpu_to_be16(0x0800)
 
 struct tnl_ptk_info {
 	__be16 flags;
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 6753032..435eabc 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -192,6 +192,7 @@ enum ovs_vport_type {
 	OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
 	OVS_VPORT_TYPE_GRE,      /* GRE tunnel. */
 	OVS_VPORT_TYPE_VXLAN,	 /* VXLAN tunnel. */
+	OVS_VPORT_TYPE_GENEVE,	 /* Geneve tunnel. */
 	__OVS_VPORT_TYPE_MAX
 };
 
@@ -310,6 +311,7 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT,      /* No argument, set DF. */
 	OVS_TUNNEL_KEY_ATTR_CSUM,               /* No argument. CSUM packet. */
 	OVS_TUNNEL_KEY_ATTR_OAM,                /* No argument. OAM frame.  */
+	OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,        /* Array of Geneve options. */
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 6ecf491..ba3bb82 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -54,3 +54,14 @@ config OPENVSWITCH_VXLAN
 	  Say N to exclude this support and reduce the binary size.
 
 	  If unsure, say Y.
+
+config OPENVSWITCH_GENEVE
+	bool "Open vSwitch Geneve tunneling support"
+	depends on INET
+	depends on OPENVSWITCH
+	depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m)
+	default y
+	---help---
+	  If you say Y here, then Open vSwitch will be able to create Geneve vports.
+
+	  Say N to exclude this support and reduce the binary size.
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 3591cb5..9a33a27 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -15,6 +15,10 @@ openvswitch-y := \
 	vport-internal_dev.o \
 	vport-netdev.o
 
+ifneq ($(CONFIG_OPENVSWITCH_GENEVE),)
+openvswitch-y += vport-geneve.o
+endif
+
 ifneq ($(CONFIG_OPENVSWITCH_VXLAN),)
 openvswitch-y += vport-vxlan.o
 endif
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 010125c..2e31d9e 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -370,6 +370,7 @@ static size_t key_attr_size(void)
 		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
 		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */
 		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_OAM */
+		  + nla_total_size(256)   /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
@@ -556,10 +557,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 
 	err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
 				   &flow->key, 0, &acts);
-	rcu_assign_pointer(flow->sf_acts, acts);
 	if (err)
 		goto err_flow_free;
 
+	rcu_assign_pointer(flow->sf_acts, acts);
+
+	OVS_CB(packet)->egress_tun_info = NULL;
 	OVS_CB(packet)->flow = flow;
 	packet->priority = flow->key.phy.priority;
 	packet->mark = flow->key.phy.skb_mark;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 2924cb3..62db02b 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -448,6 +448,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 	int error;
 	struct ethhdr *eth;
 
+	/* Flags are always used as part of stats */
+	key->tp.flags = 0;
+
 	skb_reset_mac_header(skb);
 
 	/* Link layer.  We are guaranteed to have at least the 14 byte Ethernet
@@ -646,10 +649,23 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info,
 			 struct sk_buff *skb, struct sw_flow_key *key)
 {
 	/* Extract metadata from packet. */
-	if (tun_info)
+	if (tun_info) {
 		memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
-	else
+
+		if (tun_info->options) {
+			BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
+						   8)) - 1
+					> sizeof(key->tun_opts));
+			memcpy(GENEVE_OPTS(key, tun_info->options_len),
+			       tun_info->options, tun_info->options_len);
+			key->tun_opts_len = tun_info->options_len;
+		} else {
+			key->tun_opts_len = 0;
+		}
+	} else  {
+		key->tun_opts_len = 0;
 		memset(&key->tun_key, 0, sizeof(key->tun_key));
+	}
 
 	key->phy.priority = skb->priority;
 	key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index fe5a71b..7181331 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -51,11 +51,24 @@ struct ovs_key_ipv4_tunnel {
 
 struct ovs_tunnel_info {
 	struct ovs_key_ipv4_tunnel tunnel;
+	struct geneve_opt *options;
+	u8 options_len;
 };
 
+/* Store options at the end of the array if they are less than the
+ * maximum size. This allows us to get the benefits of variable length
+ * matching for small options.
+ */
+#define GENEVE_OPTS(flow_key, opt_len)	\
+	((struct geneve_opt *)((flow_key)->tun_opts + \
+			       FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \
+			       opt_len))
+
 static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
 					  const struct iphdr *iph,
-					  __be64 tun_id, __be16 tun_flags)
+					  __be64 tun_id, __be16 tun_flags,
+					  struct geneve_opt *opts,
+					  u8 opts_len)
 {
 	tun_info->tunnel.tun_id = tun_id;
 	tun_info->tunnel.ipv4_src = iph->saddr;
@@ -67,9 +80,14 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
 	/* clear struct padding. */
 	memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0,
 	       sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
+
+	tun_info->options = opts;
+	tun_info->options_len = opts_len;
 }
 
 struct sw_flow_key {
+	u8 tun_opts[255];
+	u8 tun_opts_len;
 	struct ovs_key_ipv4_tunnel tun_key;  /* Encapsulating tunnel key. */
 	struct {
 		u32	priority;	/* Packet QoS priority. */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 5d6194d..368f233 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -42,6 +42,7 @@
 #include <linux/icmp.h>
 #include <linux/icmpv6.h>
 #include <linux/rculist.h>
+#include <net/geneve.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/ndisc.h>
@@ -88,18 +89,20 @@ static void update_range__(struct sw_flow_match *match,
 		}                                                           \
 	} while (0)
 
-#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
-	do { \
-		update_range__(match, offsetof(struct sw_flow_key, field),  \
-				len, is_mask);                              \
-		if (is_mask) {						    \
-			if ((match)->mask)				    \
-				memcpy(&(match)->mask->key.field, value_p, len);\
-		} else {                                                    \
-			memcpy(&(match)->key->field, value_p, len);         \
-		}                                                           \
+#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask)	    \
+	do {								    \
+		update_range__(match, offset, len, is_mask);		    \
+		if (is_mask)						    \
+			memcpy((u8 *)&(match)->mask->key + offset, value_p, \
+			       len);					    \
+		else							    \
+			memcpy((u8 *)(match)->key + offset, value_p, len);  \
 	} while (0)
 
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask)		      \
+	SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \
+				  value_p, len, is_mask)
+
 static u16 range_n_bytes(const struct sw_flow_key_range *range)
 {
 	return range->end - range->start;
@@ -335,6 +338,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 	int rem;
 	bool ttl = false;
 	__be16 tun_flags = 0;
+	unsigned long opt_key_offset;
 
 	nla_for_each_nested(a, attr, rem) {
 		int type = nla_type(a);
@@ -347,6 +351,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 			[OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
 			[OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
 			[OVS_TUNNEL_KEY_ATTR_OAM] = 0,
+			[OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1,
 		};
 
 		if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
@@ -355,7 +360,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 			return -EINVAL;
 		}
 
-		if (ovs_tunnel_key_lens[type] != nla_len(a)) {
+		if (ovs_tunnel_key_lens[type] != nla_len(a) &&
+		    ovs_tunnel_key_lens[type] != -1) {
 			OVS_NLERR("IPv4 tunnel attribute type has unexpected "
 				  " length (type=%d, length=%d, expected=%d).\n",
 				  type, nla_len(a), ovs_tunnel_key_lens[type]);
@@ -394,7 +400,60 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 		case OVS_TUNNEL_KEY_ATTR_OAM:
 			tun_flags |= TUNNEL_OAM;
 			break;
+		case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
+			tun_flags |= TUNNEL_OPTIONS_PRESENT;
+			if (nla_len(a) > sizeof(match->key->tun_opts)) {
+				OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n",
+					  nla_len(a),
+					  sizeof(match->key->tun_opts));
+				return -EINVAL;
+			}
+
+			if (nla_len(a) % 4 != 0) {
+				OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n",
+					  nla_len(a));
+				return -EINVAL;
+			}
+
+			/* We need to record the length of the options passed
+			 * down, otherwise packets with the same format but
+			 * additional options will be silently matched.
+			 */
+			if (!is_mask) {
+				SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a),
+						false);
+			} else {
+				/* This is somewhat unusual because it looks at
+				 * both the key and mask while parsing the
+				 * attributes (and by extension assumes the key
+				 * is parsed first). Normally, we would verify
+				 * that each is the correct length and that the
+				 * attributes line up in the validate function.
+				 * However, that is difficult because this is
+				 * variable length and we won't have the
+				 * information later.
+				 */
+				if (match->key->tun_opts_len != nla_len(a)) {
+					OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).",
+						  match->key->tun_opts_len,
+						  nla_len(a));
+					return -EINVAL;
+				}
+
+				SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff,
+						true);
+			}
+
+			opt_key_offset = (unsigned long)GENEVE_OPTS(
+					  (struct sw_flow_key *)0,
+					  nla_len(a));
+			SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset,
+						  nla_data(a), nla_len(a),
+						  is_mask);
+			break;
 		default:
+			OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n",
+				  type);
 			return -EINVAL;
 		}
 	}
@@ -421,16 +480,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 	return 0;
 }
 
-static int ipv4_tun_to_nlattr(struct sk_buff *skb,
-			      const struct ovs_key_ipv4_tunnel *tun_key,
-			      const struct ovs_key_ipv4_tunnel *output)
+static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
+				const struct ovs_key_ipv4_tunnel *output,
+				const struct geneve_opt *tun_opts,
+				int swkey_tun_opts_len)
 {
-	struct nlattr *nla;
-
-	nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
-	if (!nla)
-		return -EMSGSIZE;
-
 	if (output->tun_flags & TUNNEL_KEY &&
 	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
 		return -EMSGSIZE;
@@ -454,12 +508,35 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
 	if ((output->tun_flags & TUNNEL_OAM) &&
 	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
 		return -EMSGSIZE;
+	if (tun_opts &&
+	    nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
+		    swkey_tun_opts_len, tun_opts))
+		return -EMSGSIZE;
 
-	nla_nest_end(skb, nla);
 	return 0;
 }
 
 
+static int ipv4_tun_to_nlattr(struct sk_buff *skb,
+			      const struct ovs_key_ipv4_tunnel *output,
+			      const struct geneve_opt *tun_opts,
+			      int swkey_tun_opts_len)
+{
+	struct nlattr *nla;
+	int err;
+
+	nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
+	if (!nla)
+		return -EMSGSIZE;
+
+	err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len);
+	if (err)
+		return err;
+
+	nla_nest_end(skb, nla);
+	return 0;
+}
+
 static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
 				 const struct nlattr **a, bool is_mask)
 {
@@ -905,9 +982,16 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
 	if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
 		goto nla_put_failure;
 
-	if ((swkey->tun_key.ipv4_dst || is_mask) &&
-	    ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
-		goto nla_put_failure;
+	if ((swkey->tun_key.ipv4_dst || is_mask)) {
+		const struct geneve_opt *opts = NULL;
+
+		if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
+			opts = GENEVE_OPTS(output, swkey->tun_opts_len);
+
+		if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
+				       swkey->tun_opts_len))
+			goto nla_put_failure;
+	}
 
 	if (swkey->phy.in_port == DP_MAX_PORTS) {
 		if (is_mask && (output->phy.in_port == 0xffff))
@@ -1290,17 +1374,55 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	if (err)
 		return err;
 
+	if (key.tun_opts_len) {
+		struct geneve_opt *option = GENEVE_OPTS(&key,
+							key.tun_opts_len);
+		int opts_len = key.tun_opts_len;
+		bool crit_opt = false;
+
+		while (opts_len > 0) {
+			int len;
+
+			if (opts_len < sizeof(*option))
+				return -EINVAL;
+
+			len = sizeof(*option) + option->length * 4;
+			if (len > opts_len)
+				return -EINVAL;
+
+			crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE);
+
+			option = (struct geneve_opt *)((u8 *)option + len);
+			opts_len -= len;
+		};
+
+		key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0;
+	};
+
 	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
 	if (start < 0)
 		return start;
 
 	a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
-			 sizeof(*tun_info));
+			 sizeof(*tun_info) + key.tun_opts_len);
 	if (IS_ERR(a))
 		return PTR_ERR(a);
 
 	tun_info = nla_data(a);
 	tun_info->tunnel = key.tun_key;
+	tun_info->options_len = key.tun_opts_len;
+
+	if (tun_info->options_len) {
+		/* We need to store the options in the action itself since
+		 * everything else will go away after flow setup. We can append
+		 * it to tun_info and then point there.
+		 */
+		memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len),
+		       key.tun_opts_len);
+		tun_info->options = (struct geneve_opt *)(tun_info + 1);
+	} else {
+		tun_info->options = NULL;
+	}
 
 	add_nested_action_end(*sfa, start);
 
@@ -1592,7 +1714,9 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 			return -EMSGSIZE;
 
 		err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
-					 nla_data(ovs_key));
+					 tun_info->options_len ?
+						tun_info->options : NULL,
+					 tun_info->options_len);
 		if (err)
 			return err;
 		nla_nest_end(skb, start);
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
new file mode 100644
index 0000000..5572d48
--- /dev/null
+++ b/net/openvswitch/vport-geneve.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/version.h>
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/rculist.h>
+#include <linux/udp.h>
+#include <linux/if_vlan.h>
+
+#include <net/geneve.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/udp.h>
+#include <net/xfrm.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+/**
+ * struct geneve_port - Keeps track of open UDP ports
+ * @gs: The Geneve socket created for this port number.
+ * @name: vport name.
+ */
+struct geneve_port {
+	struct geneve_sock *gs;
+	char name[IFNAMSIZ];
+};
+
+
+/* Return the vport_priv() data of @vport viewed as a struct geneve_port. */
+static inline struct geneve_port *geneve_vport(const struct vport *vport)
+{
+	struct geneve_port *priv = vport_priv(vport);
+
+	return priv;
+}
+
+/* Return the Geneve header of @skb, taken to start right after the UDP
+ * header.
+ */
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+	struct udphdr *uh = udp_hdr(skb);
+
+	return (struct genevehdr *)(uh + 1);
+}
+
+/* Convert 64 bit tunnel ID to 24 bit VNI.
+ *
+ * The VNI is taken from the three least significant bytes of the
+ * network-order @tun_id and written most significant byte first to @vni.
+ */
+static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
+{
+	u64 raw = (__force u64)tun_id;
+
+#ifdef __BIG_ENDIAN
+	vni[0] = (__u8)(raw >> 16);
+	vni[1] = (__u8)(raw >> 8);
+	vni[2] = (__u8)raw;
+#else
+	/* In-memory bytes 5..7 of the big-endian value sit in bits 40..63. */
+	vni[0] = (__u8)(raw >> 40);
+	vni[1] = (__u8)(raw >> 48);
+	vni[2] = (__u8)(raw >> 56);
+#endif
+}
+
+/* Convert 24 bit VNI to 64 bit tunnel ID.
+ *
+ * @vni holds the VNI most significant byte first and is only read.  The
+ * result carries it in the three least significant bytes of a
+ * network-order value.
+ */
+static __be64 vni_to_tunnel_id(const __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+	/* Host order equals network order here; annotate it as such. */
+	return (__force __be64)(((u64)vni[0] << 16) |
+				((u64)vni[1] << 8) |
+				(u64)vni[2]);
+#else
+	return (__force __be64)(((__force u64)vni[0] << 40) |
+				((__force u64)vni[1] << 48) |
+				((__force u64)vni[2] << 56));
+#endif
+}
+
+/* Receive callback passed to geneve_sock_add(): build the OVS tunnel
+ * metadata from the IP and Geneve headers of @skb and hand the packet to
+ * the vport stored in @gs->rcv_data.
+ */
+static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
+{
+	struct genevehdr *gnvh = geneve_hdr(skb);
+	struct vport *vport = gs->rcv_data;
+	struct ovs_tunnel_info tun_info;
+	__be16 flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT;
+	__be64 tun_id;
+	int opts_len;
+
+	if (udp_hdr(skb)->check != 0)
+		flags |= TUNNEL_CSUM;
+	if (gnvh->oam)
+		flags |= TUNNEL_OAM;
+	if (gnvh->critical)
+		flags |= TUNNEL_CRIT_OPT;
+
+	/* Scale opt_len by 4: the options length is handed on in bytes. */
+	opts_len = gnvh->opt_len * 4;
+	tun_id = vni_to_tunnel_id(gnvh->vni);
+
+	ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), tun_id, flags,
+			       gnvh->options, opts_len);
+
+	ovs_vport_receive(vport, skb, &tun_info);
+}
+
+/* Report the UDP port of this vport's Geneve socket.
+ *
+ * Appends OVS_TUNNEL_ATTR_DST_PORT (host byte order) to @skb.
+ * Returns 0 on success or -EMSGSIZE if nla_put_u16() fails.
+ */
+static int geneve_get_options(const struct vport *vport,
+			      struct sk_buff *skb)
+{
+	struct geneve_port *geneve_port = geneve_vport(vport);
+	struct inet_sock *inet = inet_sk(geneve_port->gs->sock->sk);
+	u16 dst_port;
+
+	/* inet_sport is big-endian; the attribute carries host order. */
+	dst_port = ntohs(inet->inet_sport);
+	if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, dst_port))
+		return -EMSGSIZE;
+	return 0;
+}
+
+/* Tear down a Geneve vport: release its Geneve socket, then hand the
+ * vport to ovs_vport_deferred_free().
+ */
+static void geneve_tnl_destroy(struct vport *vport)
+{
+	geneve_sock_release(geneve_vport(vport)->gs);
+	ovs_vport_deferred_free(vport);
+}
+
+/* Create a Geneve tunnel vport.
+ *
+ * @parms->options must contain a nested OVS_TUNNEL_ATTR_DST_PORT attribute
+ * holding a u16 UDP port in host byte order; it selects the port passed to
+ * geneve_sock_add().
+ *
+ * Returns the new vport, or an ERR_PTR(): -EINVAL when the port attribute
+ * is missing or has the wrong size, otherwise whatever ovs_vport_alloc()
+ * or geneve_sock_add() reported.
+ */
+static struct vport *geneve_tnl_create(const struct vport_parms *parms)
+{
+	struct net *net = ovs_dp_get_net(parms->dp);
+	struct nlattr *options = parms->options;
+	struct geneve_port *geneve_port;
+	struct geneve_sock *gs;
+	struct vport *vport;
+	struct nlattr *a;
+	u16 dst_port;
+
+	if (!options)
+		return ERR_PTR(-EINVAL);
+
+	/* Require destination port from userspace. */
+	a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
+	if (!a || nla_len(a) != sizeof(u16))
+		return ERR_PTR(-EINVAL);
+	dst_port = nla_get_u16(a);
+
+	vport = ovs_vport_alloc(sizeof(struct geneve_port),
+				&ovs_geneve_vport_ops, parms);
+	if (IS_ERR(vport))
+		return vport;
+
+	geneve_port = geneve_vport(vport);
+	/* strncpy() does not terminate the destination when the source
+	 * fills it, so copy one byte less and terminate explicitly.
+	 */
+	strncpy(geneve_port->name, parms->name, IFNAMSIZ - 1);
+	geneve_port->name[IFNAMSIZ - 1] = '\0';
+
+	gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0);
+	if (IS_ERR(gs)) {
+		ovs_vport_free(vport);
+		return ERR_CAST(gs);
+	}
+	geneve_port->gs = gs;
+
+	return vport;
+}
+
+/* Encapsulate @skb in Geneve and transmit it using the egress tunnel
+ * metadata attached to the packet.
+ *
+ * Returns the result of geneve_xmit_skb(), -EINVAL if the packet carries
+ * no egress tunnel info, or the error from the route lookup.
+ */
+static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
+{
+	struct geneve_port *geneve_port = geneve_vport(vport);
+	struct net *net = ovs_dp_get_net(vport->dp);
+	struct ovs_tunnel_info *tun_info = OVS_CB(skb)->egress_tun_info;
+	struct ovs_key_ipv4_tunnel *tun_key;
+	struct rtable *rt;
+	struct flowi4 fl;
+	__be16 sport, dport, df;
+	u8 vni[3];
+	int err;
+
+	if (unlikely(!tun_info))
+		return -EINVAL;
+
+	tun_key = &tun_info->tunnel;
+
+	/* The local port of the vport's socket is used as destination port. */
+	dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
+
+	/* Route lookup */
+	memset(&fl, 0, sizeof(fl));
+	fl.daddr = tun_key->ipv4_dst;
+	fl.saddr = tun_key->ipv4_src;
+	fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos);
+	fl.flowi4_mark = skb->mark;
+	fl.flowi4_proto = IPPROTO_UDP;
+
+	rt = ip_route_output_key(net, &fl);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	df = 0;
+	if (tun_key->tun_flags & TUNNEL_DONT_FRAGMENT)
+		df = htons(IP_DF);
+	sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
+	tunnel_id_to_vni(tun_key->tun_id, vni);
+	skb->ignore_df = 1;
+
+	err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
+			      tun_key->ipv4_dst, tun_key->ipv4_tos,
+			      tun_key->ipv4_ttl, df, sport, dport,
+			      tun_key->tun_flags, vni,
+			      tun_info->options_len, (u8 *)tun_info->options,
+			      false);
+	/* The route reference is dropped here only when transmit failed. */
+	if (err < 0)
+		ip_rt_put(rt);
+
+	return err;
+}
+
+/* Return the name stored in the Geneve private data of @vport. */
+static const char *geneve_get_name(const struct vport *vport)
+{
+	return geneve_vport(vport)->name;
+}
+
+/* Geneve vport operations; listed in vport_ops_list[] (vport.c) when
+ * CONFIG_OPENVSWITCH_GENEVE is set.
+ */
+const struct vport_ops ovs_geneve_vport_ops = {
+	.type		= OVS_VPORT_TYPE_GENEVE,
+	.create		= geneve_tnl_create,
+	.destroy	= geneve_tnl_destroy,
+	.get_name	= geneve_get_name,
+	.get_options	= geneve_get_options,
+	.send		= geneve_tnl_send,
+};
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index fe768bd..108b82d 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -106,7 +106,7 @@ static int gre_rcv(struct sk_buff *skb,
 
 	key = key_to_tunnel_id(tpi->key, tpi->seq);
 	ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key,
-			       filter_tnl_flags(tpi->flags));
+			       filter_tnl_flags(tpi->flags), NULL, 0);
 
 	ovs_vport_receive(vport, skb, &tun_info);
 	return PACKET_RCVD;
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 5fbff2c..2735e01 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -66,7 +66,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
 	/* Save outer tunnel values */
 	iph = ip_hdr(skb);
 	key = cpu_to_be64(ntohl(vx_vni) >> 8);
-	ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY);
+	ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0);
 
 	ovs_vport_receive(vport, skb, &tun_info);
 }
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 3e50ee8..53001b0 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -48,6 +48,9 @@ static const struct vport_ops *vport_ops_list[] = {
 #ifdef CONFIG_OPENVSWITCH_VXLAN
 	&ovs_vxlan_vport_ops,
 #endif
+#ifdef CONFIG_OPENVSWITCH_GENEVE
+	&ovs_geneve_vport_ops,
+#endif
 };
 
 /* Protected by RCU read lock for reading, ovs_mutex for writing. */
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index e28964a..8942125 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -215,6 +215,7 @@ extern const struct vport_ops ovs_netdev_vport_ops;
 extern const struct vport_ops ovs_internal_vport_ops;
 extern const struct vport_ops ovs_gre_vport_ops;
 extern const struct vport_ops ovs_vxlan_vport_ops;
+extern const struct vport_ops ovs_geneve_vport_ops;
 
 static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
 				      const void *start, unsigned int len)
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [net-next 6/6] openvswitch: Add support for Geneve tunneling.
  2014-10-02  8:04 ` [net-next 6/6] openvswitch: Add support for Geneve tunneling Andy Zhou
@ 2014-10-02 23:04   ` Tom Herbert
  2014-10-03  1:48     ` Jesse Gross
  0 siblings, 1 reply; 11+ messages in thread
From: Tom Herbert @ 2014-10-02 23:04 UTC (permalink / raw)
  To: Andy Zhou; +Cc: David Miller, Linux Netdev List, Jesse Gross

On Thu, Oct 2, 2014 at 1:04 AM, Andy Zhou <azhou@nicira.com> wrote:
> From: Jesse Gross <jesse@nicira.com>
>
> The Openvswitch implementation is completely agnostic to the options
> that are in use and can handle newly defined options without
> further work. It does this by simply matching on a byte array
> of options and allowing userspace to setup flows on this array.
>
> Signed-off-by: Jesse Gross <jesse@nicira.com>
> Signed-off-by: Andy Zhou <azhou@nicira.com>
> ---
>  include/net/ip_tunnels.h         |   21 ++--
>  include/uapi/linux/openvswitch.h |    2 +
>  net/openvswitch/Kconfig          |   11 ++
>  net/openvswitch/Makefile         |    4 +
>  net/openvswitch/datapath.c       |    5 +-
>  net/openvswitch/flow.c           |   20 +++-
>  net/openvswitch/flow.h           |   20 +++-
>  net/openvswitch/flow_netlink.c   |  176 +++++++++++++++++++++++-----
>  net/openvswitch/vport-geneve.c   |  236 ++++++++++++++++++++++++++++++++++++++
>  net/openvswitch/vport-gre.c      |    2 +-
>  net/openvswitch/vport-vxlan.c    |    2 +-
>  net/openvswitch/vport.c          |    3 +
>  net/openvswitch/vport.h          |    1 +
>  13 files changed, 461 insertions(+), 42 deletions(-)
>  create mode 100644 net/openvswitch/vport-geneve.c
>
> diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
> index a9ce155..5bc6ede 100644
> --- a/include/net/ip_tunnels.h
> +++ b/include/net/ip_tunnels.h
> @@ -86,17 +86,18 @@ struct ip_tunnel {
>         struct gro_cells        gro_cells;
>  };
>
> -#define TUNNEL_CSUM    __cpu_to_be16(0x01)
> -#define TUNNEL_ROUTING __cpu_to_be16(0x02)
> -#define TUNNEL_KEY     __cpu_to_be16(0x04)
> -#define TUNNEL_SEQ     __cpu_to_be16(0x08)
> -#define TUNNEL_STRICT  __cpu_to_be16(0x10)
> -#define TUNNEL_REC     __cpu_to_be16(0x20)
> -#define TUNNEL_VERSION __cpu_to_be16(0x40)
> -#define TUNNEL_NO_KEY  __cpu_to_be16(0x80)
> +#define TUNNEL_CSUM            __cpu_to_be16(0x01)
> +#define TUNNEL_ROUTING         __cpu_to_be16(0x02)
> +#define TUNNEL_KEY             __cpu_to_be16(0x04)
> +#define TUNNEL_SEQ             __cpu_to_be16(0x08)
> +#define TUNNEL_STRICT          __cpu_to_be16(0x10)
> +#define TUNNEL_REC             __cpu_to_be16(0x20)

Just changing whitespace in these?

> +#define TUNNEL_VERSION         __cpu_to_be16(0x40)
> +#define TUNNEL_NO_KEY          __cpu_to_be16(0x80)
>  #define TUNNEL_DONT_FRAGMENT    __cpu_to_be16(0x0100)
> -#define TUNNEL_OAM     __cpu_to_be16(0x0200)
> -#define TUNNEL_CRIT_OPT        __cpu_to_be16(0x0400)
> +#define TUNNEL_OAM             __cpu_to_be16(0x0200)
> +#define TUNNEL_CRIT_OPT                __cpu_to_be16(0x0400)
> +#define TUNNEL_OPTIONS_PRESENT __cpu_to_be16(0x0800)
>
>  struct tnl_ptk_info {
>         __be16 flags;
> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> index 6753032..435eabc 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -192,6 +192,7 @@ enum ovs_vport_type {
>         OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
>         OVS_VPORT_TYPE_GRE,      /* GRE tunnel. */
>         OVS_VPORT_TYPE_VXLAN,    /* VXLAN tunnel. */
> +       OVS_VPORT_TYPE_GENEVE,   /* Geneve tunnel. */
>         __OVS_VPORT_TYPE_MAX
>  };
>
> @@ -310,6 +311,7 @@ enum ovs_tunnel_key_attr {
>         OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT,      /* No argument, set DF. */
>         OVS_TUNNEL_KEY_ATTR_CSUM,               /* No argument. CSUM packet. */
>         OVS_TUNNEL_KEY_ATTR_OAM,                /* No argument. OAM frame.  */
> +       OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,        /* Array of Geneve options. */
>         __OVS_TUNNEL_KEY_ATTR_MAX
>  };
>
> diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
> index 6ecf491..ba3bb82 100644
> --- a/net/openvswitch/Kconfig
> +++ b/net/openvswitch/Kconfig
> @@ -54,3 +54,14 @@ config OPENVSWITCH_VXLAN
>           Say N to exclude this support and reduce the binary size.
>
>           If unsure, say Y.
> +
> +config OPENVSWITCH_GENEVE
> +       bool "Open vSwitch Geneve tunneling support"
> +       depends on INET
> +       depends on OPENVSWITCH
> +       depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m)
> +       default y
> +       ---help---
> +         If you say Y here, then the Open vSwitch will be able to create a geneve vport.
> +
> +         Say N to exclude this support and reduce the binary size.
> diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
> index 3591cb5..9a33a27 100644
> --- a/net/openvswitch/Makefile
> +++ b/net/openvswitch/Makefile
> @@ -15,6 +15,10 @@ openvswitch-y := \
>         vport-internal_dev.o \
>         vport-netdev.o
>
> +ifneq ($(CONFIG_OPENVSWITCH_GENEVE),)
> +openvswitch-y += vport-geneve.o
> +endif
> +
>  ifneq ($(CONFIG_OPENVSWITCH_VXLAN),)
>  openvswitch-y += vport-vxlan.o
>  endif
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 010125c..2e31d9e 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -370,6 +370,7 @@ static size_t key_attr_size(void)
>                   + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
>                   + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */
>                   + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_OAM */
> +                 + nla_total_size(256)   /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
>                 + nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */
>                 + nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
>                 + nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
> @@ -556,10 +557,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>
>         err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
>                                    &flow->key, 0, &acts);
> -       rcu_assign_pointer(flow->sf_acts, acts);
>         if (err)
>                 goto err_flow_free;
>
> +       rcu_assign_pointer(flow->sf_acts, acts);
> +
> +       OVS_CB(packet)->egress_tun_info = NULL;
>         OVS_CB(packet)->flow = flow;
>         packet->priority = flow->key.phy.priority;
>         packet->mark = flow->key.phy.skb_mark;
> diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
> index 2924cb3..62db02b 100644
> --- a/net/openvswitch/flow.c
> +++ b/net/openvswitch/flow.c
> @@ -448,6 +448,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
>         int error;
>         struct ethhdr *eth;
>
> +       /* Flags are always used as part of stats */
> +       key->tp.flags = 0;
> +
>         skb_reset_mac_header(skb);
>
>         /* Link layer.  We are guaranteed to have at least the 14 byte Ethernet
> @@ -646,10 +649,23 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info,
>                          struct sk_buff *skb, struct sw_flow_key *key)
>  {
>         /* Extract metadata from packet. */
> -       if (tun_info)
> +       if (tun_info) {
>                 memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
> -       else
> +
> +               if (tun_info->options) {
> +                       BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
> +                                                  8)) - 1
> +                                       > sizeof(key->tun_opts));
> +                       memcpy(GENEVE_OPTS(key, tun_info->options_len),
> +                              tun_info->options, tun_info->options_len);
> +                       key->tun_opts_len = tun_info->options_len;
> +               } else {
> +                       key->tun_opts_len = 0;
> +               }
> +       } else  {
> +               key->tun_opts_len = 0;
>                 memset(&key->tun_key, 0, sizeof(key->tun_key));
> +       }
>
>         key->phy.priority = skb->priority;
>         key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
> diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
> index fe5a71b..7181331 100644
> --- a/net/openvswitch/flow.h
> +++ b/net/openvswitch/flow.h
> @@ -51,11 +51,24 @@ struct ovs_key_ipv4_tunnel {
>
>  struct ovs_tunnel_info {
>         struct ovs_key_ipv4_tunnel tunnel;
> +       struct geneve_opt *options;
> +       u8 options_len;
>  };
>
> +/* Store options at the end of the array if they are less than the
> + * maximum size. This allows us to get the benefits of variable length
> + * matching for small options.
> + */
> +#define GENEVE_OPTS(flow_key, opt_len) \
> +       ((struct geneve_opt *)((flow_key)->tun_opts + \
> +                              FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \
> +                              opt_len))
> +
>  static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
>                                           const struct iphdr *iph,
> -                                         __be64 tun_id, __be16 tun_flags)
> +                                         __be64 tun_id, __be16 tun_flags,
> +                                         struct geneve_opt *opts,
> +                                         u8 opts_len)
>  {
>         tun_info->tunnel.tun_id = tun_id;
>         tun_info->tunnel.ipv4_src = iph->saddr;
> @@ -67,9 +80,14 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
>         /* clear struct padding. */
>         memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0,
>                sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
> +
> +       tun_info->options = opts;
> +       tun_info->options_len = opts_len;
>  }
>
>  struct sw_flow_key {
> +       u8 tun_opts[255];
> +       u8 tun_opts_len;
>         struct ovs_key_ipv4_tunnel tun_key;  /* Encapsulating tunnel key. */
>         struct {
>                 u32     priority;       /* Packet QoS priority. */
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index 5d6194d..368f233 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> @@ -42,6 +42,7 @@
>  #include <linux/icmp.h>
>  #include <linux/icmpv6.h>
>  #include <linux/rculist.h>
> +#include <net/geneve.h>
>  #include <net/ip.h>
>  #include <net/ipv6.h>
>  #include <net/ndisc.h>
> @@ -88,18 +89,20 @@ static void update_range__(struct sw_flow_match *match,
>                 }                                                           \
>         } while (0)
>
> -#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
> -       do { \
> -               update_range__(match, offsetof(struct sw_flow_key, field),  \
> -                               len, is_mask);                              \
> -               if (is_mask) {                                              \
> -                       if ((match)->mask)                                  \
> -                               memcpy(&(match)->mask->key.field, value_p, len);\
> -               } else {                                                    \
> -                       memcpy(&(match)->key->field, value_p, len);         \
> -               }                                                           \
> +#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask)            \
> +       do {                                                                \
> +               update_range__(match, offset, len, is_mask);                \
> +               if (is_mask)                                                \
> +                       memcpy((u8 *)&(match)->mask->key + offset, value_p, \
> +                              len);                                        \
> +               else                                                        \
> +                       memcpy((u8 *)(match)->key + offset, value_p, len);  \
>         } while (0)
>
> +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask)                      \
> +       SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \
> +                                 value_p, len, is_mask)
> +
>  static u16 range_n_bytes(const struct sw_flow_key_range *range)
>  {
>         return range->end - range->start;
> @@ -335,6 +338,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
>         int rem;
>         bool ttl = false;
>         __be16 tun_flags = 0;
> +       unsigned long opt_key_offset;
>
>         nla_for_each_nested(a, attr, rem) {
>                 int type = nla_type(a);
> @@ -347,6 +351,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
>                         [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
>                         [OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
>                         [OVS_TUNNEL_KEY_ATTR_OAM] = 0,
> +                       [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1,
>                 };
>
>                 if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
> @@ -355,7 +360,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
>                         return -EINVAL;
>                 }
>
> -               if (ovs_tunnel_key_lens[type] != nla_len(a)) {
> +               if (ovs_tunnel_key_lens[type] != nla_len(a) &&
> +                   ovs_tunnel_key_lens[type] != -1) {
>                         OVS_NLERR("IPv4 tunnel attribute type has unexpected "
>                                   " length (type=%d, length=%d, expected=%d).\n",
>                                   type, nla_len(a), ovs_tunnel_key_lens[type]);
> @@ -394,7 +400,60 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
>                 case OVS_TUNNEL_KEY_ATTR_OAM:
>                         tun_flags |= TUNNEL_OAM;
>                         break;
> +               case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
> +                       tun_flags |= TUNNEL_OPTIONS_PRESENT;
> +                       if (nla_len(a) > sizeof(match->key->tun_opts)) {
> +                               OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n",
> +                                         nla_len(a),
> +                                         sizeof(match->key->tun_opts));
> +                               return -EINVAL;
> +                       }
> +
> +                       if (nla_len(a) % 4 != 0) {
> +                               OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n",
> +                                         nla_len(a));
> +                               return -EINVAL;
> +                       }
> +
> +                       /* We need to record the length of the options passed
> +                        * down, otherwise packets with the same format but
> +                        * additional options will be silently matched.
> +                        */
> +                       if (!is_mask) {
> +                               SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a),
> +                                               false);
> +                       } else {
> +                               /* This is somewhat unusual because it looks at
> +                                * both the key and mask while parsing the
> +                                * attributes (and by extension assumes the key
> +                                * is parsed first). Normally, we would verify
> +                                * that each is the correct length and that the
> +                                * attributes line up in the validate function.
> +                                * However, that is difficult because this is
> +                                * variable length and we won't have the
> +                                * information later.
> +                                */
> +                               if (match->key->tun_opts_len != nla_len(a)) {
> +                                       OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).",
> +                                                 match->key->tun_opts_len,
> +                                                 nla_len(a));
> +                                       return -EINVAL;
> +                               }
> +
> +                               SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff,
> +                                               true);
> +                       }
> +
> +                       opt_key_offset = (unsigned long)GENEVE_OPTS(
> +                                         (struct sw_flow_key *)0,
> +                                         nla_len(a));
> +                       SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset,
> +                                                 nla_data(a), nla_len(a),
> +                                                 is_mask);
> +                       break;
>                 default:
> +                       OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n",
> +                                 type);
>                         return -EINVAL;
>                 }
>         }
> @@ -421,16 +480,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
>         return 0;
>  }
>
> -static int ipv4_tun_to_nlattr(struct sk_buff *skb,
> -                             const struct ovs_key_ipv4_tunnel *tun_key,
> -                             const struct ovs_key_ipv4_tunnel *output)
> +static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
> +                               const struct ovs_key_ipv4_tunnel *output,
> +                               const struct geneve_opt *tun_opts,
> +                               int swkey_tun_opts_len)
>  {
> -       struct nlattr *nla;
> -
> -       nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
> -       if (!nla)
> -               return -EMSGSIZE;
> -
>         if (output->tun_flags & TUNNEL_KEY &&
>             nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
>                 return -EMSGSIZE;
> @@ -454,12 +508,35 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
>         if ((output->tun_flags & TUNNEL_OAM) &&
>             nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
>                 return -EMSGSIZE;
> +       if (tun_opts &&
> +           nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
> +                   swkey_tun_opts_len, tun_opts))
> +               return -EMSGSIZE;
>
> -       nla_nest_end(skb, nla);
>         return 0;
>  }
>
>
> +static int ipv4_tun_to_nlattr(struct sk_buff *skb,
> +                             const struct ovs_key_ipv4_tunnel *output,
> +                             const struct geneve_opt *tun_opts,
> +                             int swkey_tun_opts_len)
> +{
> +       struct nlattr *nla;
> +       int err;
> +
> +       nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
> +       if (!nla)
> +               return -EMSGSIZE;
> +
> +       err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len);
> +       if (err)
> +               return err;
> +
> +       nla_nest_end(skb, nla);
> +       return 0;
> +}
> +
>  static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
>                                  const struct nlattr **a, bool is_mask)
>  {
> @@ -905,9 +982,16 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
>         if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
>                 goto nla_put_failure;
>
> -       if ((swkey->tun_key.ipv4_dst || is_mask) &&
> -           ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
> -               goto nla_put_failure;
> +       if ((swkey->tun_key.ipv4_dst || is_mask)) {
> +               const struct geneve_opt *opts = NULL;
> +
> +               if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
> +                       opts = GENEVE_OPTS(output, swkey->tun_opts_len);
> +
> +               if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
> +                                      swkey->tun_opts_len))
> +                       goto nla_put_failure;
> +       }
>
>         if (swkey->phy.in_port == DP_MAX_PORTS) {
>                 if (is_mask && (output->phy.in_port == 0xffff))
> @@ -1290,17 +1374,55 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
>         if (err)
>                 return err;
>
> +       if (key.tun_opts_len) {
> +               struct geneve_opt *option = GENEVE_OPTS(&key,
> +                                                       key.tun_opts_len);
> +               int opts_len = key.tun_opts_len;
> +               bool crit_opt = false;
> +
> +               while (opts_len > 0) {
> +                       int len;
> +
> +                       if (opts_len < sizeof(*option))
> +                               return -EINVAL;
> +
> +                       len = sizeof(*option) + option->length * 4;
> +                       if (len > opts_len)
> +                               return -EINVAL;
> +
> +                       crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE);
> +
> +                       option = (struct geneve_opt *)((u8 *)option + len);
> +                       opts_len -= len;
> +               };
> +
> +               key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0;
> +       };
> +
>         start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
>         if (start < 0)
>                 return start;
>
>         a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
> -                        sizeof(*tun_info));
> +                        sizeof(*tun_info) + key.tun_opts_len);
>         if (IS_ERR(a))
>                 return PTR_ERR(a);
>
>         tun_info = nla_data(a);
>         tun_info->tunnel = key.tun_key;
> +       tun_info->options_len = key.tun_opts_len;
> +
> +       if (tun_info->options_len) {
> +               /* We need to store the options in the action itself since
> +                * everything else will go away after flow setup. We can append
> +                * it to tun_info and then point there.
> +                */
> +               memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len),
> +                      key.tun_opts_len);
> +               tun_info->options = (struct geneve_opt *)(tun_info + 1);
> +       } else {
> +               tun_info->options = NULL;
> +       }
>
>         add_nested_action_end(*sfa, start);
>
> @@ -1592,7 +1714,9 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
>                         return -EMSGSIZE;
>
>                 err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
> -                                        nla_data(ovs_key));
> +                                        tun_info->options_len ?
> +                                               tun_info->options : NULL,
> +                                        tun_info->options_len);
>                 if (err)
>                         return err;
>                 nla_nest_end(skb, start);
> diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
> new file mode 100644
> index 0000000..5572d48
> --- /dev/null
> +++ b/net/openvswitch/vport-geneve.c
> @@ -0,0 +1,236 @@
> +/*
> + * Copyright (c) 2014 Nicira, Inc.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/version.h>
> +
> +#include <linux/in.h>
> +#include <linux/ip.h>
> +#include <linux/net.h>
> +#include <linux/rculist.h>
> +#include <linux/udp.h>
> +#include <linux/if_vlan.h>
> +
> +#include <net/geneve.h>
> +#include <net/icmp.h>
> +#include <net/ip.h>
> +#include <net/route.h>
> +#include <net/udp.h>
> +#include <net/xfrm.h>
> +
> +#include "datapath.h"
> +#include "vport.h"
> +
> +/**
> + * struct geneve_port - Keeps track of open UDP ports
> + * @gs: The Geneve socket created for this port number.
> + * @name: vport name.
> + */
> +struct geneve_port {
> +       struct geneve_sock *gs;
> +       char name[IFNAMSIZ];
> +};
> +
> +static LIST_HEAD(geneve_ports);
> +
> +static inline struct geneve_port *geneve_vport(const struct vport *vport)
> +{
> +       return vport_priv(vport);
> +}
> +
> +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
> +{
> +       return (struct genevehdr *)(udp_hdr(skb) + 1);
> +}
> +
> +/* Convert 64 bit tunnel ID to 24 bit VNI. */
> +static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
> +{
> +#ifdef __BIG_ENDIAN
> +       vni[0] = (__force __u8)(tun_id >> 16);
> +       vni[1] = (__force __u8)(tun_id >> 8);
> +       vni[2] = (__force __u8)tun_id;
> +#else
> +       vni[0] = (__force __u8)((__force u64)tun_id >> 40);
> +       vni[1] = (__force __u8)((__force u64)tun_id >> 48);
> +       vni[2] = (__force __u8)((__force u64)tun_id >> 56);
> +#endif
> +}
> +
> +/* Convert 24 bit VNI to 64 bit tunnel ID. */
> +static __be64 vni_to_tunnel_id(__u8 *vni)
> +{
> +#ifdef __BIG_ENDIAN
> +       return (vni[0] << 16) | (vni[1] << 8) | vni[2];
> +#else
> +       return (__force __be64)(((__force u64)vni[0] << 40) |
> +                               ((__force u64)vni[1] << 48) |
> +                               ((__force u64)vni[2] << 56));
> +#endif
> +}
> +
> +static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
> +{
> +       struct vport *vport = gs->rcv_data;
> +       struct genevehdr *geneveh = geneve_hdr(skb);
> +       int opts_len;
> +       struct ovs_tunnel_info tun_info;
> +       __be64 key;
> +       __be16 flags;
> +
> +       opts_len = geneveh->opt_len * 4;
> +
> +       flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT |
> +               (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) |
> +               (geneveh->oam ? TUNNEL_OAM : 0) |
> +               (geneveh->critical ? TUNNEL_CRIT_OPT : 0);
> +
> +       key = vni_to_tunnel_id(geneveh->vni);
> +
> +       ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags,
> +                              geneveh->options, opts_len);
> +
> +       ovs_vport_receive(vport, skb, &tun_info);
> +}
> +
> +static int geneve_get_options(const struct vport *vport,
> +                             struct sk_buff *skb)
> +{
> +       struct geneve_port *geneve_port = geneve_vport(vport);
> +       __be16 sport;
> +
> +       sport = ntohs(inet_sk(geneve_port->gs->sock->sk)->inet_sport);
> +       if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, sport))
> +               return -EMSGSIZE;
> +       return 0;
> +}
> +
> +static void geneve_tnl_destroy(struct vport *vport)
> +{
> +       struct geneve_port *geneve_port = geneve_vport(vport);
> +
> +       geneve_sock_release(geneve_port->gs);
> +
> +       ovs_vport_deferred_free(vport);
> +}
> +
> +static struct vport *geneve_tnl_create(const struct vport_parms *parms)
> +{
> +       struct net *net = ovs_dp_get_net(parms->dp);
> +       struct nlattr *options = parms->options;
> +       struct geneve_port *geneve_port;
> +       struct geneve_sock *gs;
> +       struct vport *vport;
> +       struct nlattr *a;
> +       int err;
> +       u16 dst_port;
> +
> +       if (!options) {
> +               err = -EINVAL;
> +               goto error;
> +       }
> +
> +       a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
> +       if (a && nla_len(a) == sizeof(u16)) {
> +               dst_port = nla_get_u16(a);
> +       } else {
> +               /* Require destination port from userspace. */
> +               err = -EINVAL;
> +               goto error;
> +       }
> +
> +       vport = ovs_vport_alloc(sizeof(struct geneve_port),
> +                               &ovs_geneve_vport_ops, parms);
> +       if (IS_ERR(vport))
> +               return vport;
> +
> +       geneve_port = geneve_vport(vport);
> +       strncpy(geneve_port->name, parms->name, IFNAMSIZ);
> +
> +       gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0);
> +       if (IS_ERR(gs)) {
> +               ovs_vport_free(vport);
> +               return (void *)gs;
> +       }
> +       geneve_port->gs = gs;
> +
> +       return vport;
> +error:
> +       return ERR_PTR(err);
> +}
> +
> +static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
> +{
> +       struct ovs_key_ipv4_tunnel *tun_key;
> +       struct ovs_tunnel_info *tun_info;
> +       struct net *net = ovs_dp_get_net(vport->dp);
> +       struct geneve_port *geneve_port = geneve_vport(vport);
> +       __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
> +       __be16 sport;
> +       struct rtable *rt;
> +       struct flowi4 fl;
> +       u8 vni[3];
> +       __be16 df;
> +       int err;
> +
> +       tun_info = OVS_CB(skb)->egress_tun_info;
> +       if (unlikely(!tun_info)) {
> +               err = -EINVAL;
> +               goto error;
> +       }
> +
> +       tun_key = &tun_info->tunnel;
> +
> +       /* Route lookup */
> +       memset(&fl, 0, sizeof(fl));
> +       fl.daddr = tun_key->ipv4_dst;
> +       fl.saddr = tun_key->ipv4_src;
> +       fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos);
> +       fl.flowi4_mark = skb->mark;
> +       fl.flowi4_proto = IPPROTO_UDP;
> +
> +       rt = ip_route_output_key(net, &fl);
> +       if (IS_ERR(rt)) {
> +               err = PTR_ERR(rt);
> +               goto error;
> +       }
> +
> +       df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
> +       sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
> +       tunnel_id_to_vni(tun_key->tun_id, vni);
> +       skb->ignore_df = 1;
> +
> +       err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
> +                             tun_key->ipv4_dst, tun_key->ipv4_tos,
> +                             tun_key->ipv4_ttl, df, sport, dport,
> +                             tun_key->tun_flags, vni,
> +                             tun_info->options_len, (u8 *)tun_info->options,
> +                             false);
> +       if (err < 0)
> +               ip_rt_put(rt);
> +error:
> +       return err;
> +}
> +
> +static const char *geneve_get_name(const struct vport *vport)
> +{
> +       struct geneve_port *geneve_port = geneve_vport(vport);
> +
> +       return geneve_port->name;
> +}
> +
> +const struct vport_ops ovs_geneve_vport_ops = {
> +       .type           = OVS_VPORT_TYPE_GENEVE,
> +       .create         = geneve_tnl_create,
> +       .destroy        = geneve_tnl_destroy,
> +       .get_name       = geneve_get_name,
> +       .get_options    = geneve_get_options,
> +       .send           = geneve_tnl_send,
> +};
> diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
> index fe768bd..108b82d 100644
> --- a/net/openvswitch/vport-gre.c
> +++ b/net/openvswitch/vport-gre.c
> @@ -106,7 +106,7 @@ static int gre_rcv(struct sk_buff *skb,
>
>         key = key_to_tunnel_id(tpi->key, tpi->seq);
>         ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key,
> -                              filter_tnl_flags(tpi->flags));
> +                              filter_tnl_flags(tpi->flags), NULL, 0);
>
>         ovs_vport_receive(vport, skb, &tun_info);
>         return PACKET_RCVD;
> diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
> index 5fbff2c..2735e01 100644
> --- a/net/openvswitch/vport-vxlan.c
> +++ b/net/openvswitch/vport-vxlan.c
> @@ -66,7 +66,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
>         /* Save outer tunnel values */
>         iph = ip_hdr(skb);
>         key = cpu_to_be64(ntohl(vx_vni) >> 8);
> -       ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY);
> +       ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0);
>
>         ovs_vport_receive(vport, skb, &tun_info);
>  }
> diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
> index 3e50ee8..53001b0 100644
> --- a/net/openvswitch/vport.c
> +++ b/net/openvswitch/vport.c
> @@ -48,6 +48,9 @@ static const struct vport_ops *vport_ops_list[] = {
>  #ifdef CONFIG_OPENVSWITCH_VXLAN
>         &ovs_vxlan_vport_ops,
>  #endif
> +#ifdef CONFIG_OPENVSWITCH_GENEVE
> +       &ovs_geneve_vport_ops,
> +#endif
>  };
>
>  /* Protected by RCU read lock for reading, ovs_mutex for writing. */
> diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
> index e28964a..8942125 100644
> --- a/net/openvswitch/vport.h
> +++ b/net/openvswitch/vport.h
> @@ -215,6 +215,7 @@ extern const struct vport_ops ovs_netdev_vport_ops;
>  extern const struct vport_ops ovs_internal_vport_ops;
>  extern const struct vport_ops ovs_gre_vport_ops;
>  extern const struct vport_ops ovs_vxlan_vport_ops;
> +extern const struct vport_ops ovs_geneve_vport_ops;
>
>  static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
>                                       const void *start, unsigned int len)
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [net-next 6/6] openvswitch: Add support for Geneve tunneling.
  2014-10-02 23:04   ` Tom Herbert
@ 2014-10-03  1:48     ` Jesse Gross
  0 siblings, 0 replies; 11+ messages in thread
From: Jesse Gross @ 2014-10-03  1:48 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Andy Zhou, David Miller, Linux Netdev List

On Thu, Oct 2, 2014 at 4:04 PM, Tom Herbert <therbert@google.com> wrote:
> On Thu, Oct 2, 2014 at 1:04 AM, Andy Zhou <azhou@nicira.com> wrote:
>> From: Jesse Gross <jesse@nicira.com>
>>
>> The Openvswitch implementation is completely agnostic to the options
>> that are in use and can handle newly defined options without
>> further work. It does this by simply matching on a byte array
>> of options and allowing userspace to setup flows on this array.
>>
>> Signed-off-by: Jesse Gross <jesse@nicira.com>
>> Signed-off-by: Andy Zhou <azhou@nicira.com>
>> ---
>>  include/net/ip_tunnels.h         |   21 ++--
>>  include/uapi/linux/openvswitch.h |    2 +
>>  net/openvswitch/Kconfig          |   11 ++
>>  net/openvswitch/Makefile         |    4 +
>>  net/openvswitch/datapath.c       |    5 +-
>>  net/openvswitch/flow.c           |   20 +++-
>>  net/openvswitch/flow.h           |   20 +++-
>>  net/openvswitch/flow_netlink.c   |  176 +++++++++++++++++++++++-----
>>  net/openvswitch/vport-geneve.c   |  236 ++++++++++++++++++++++++++++++++++++++
>>  net/openvswitch/vport-gre.c      |    2 +-
>>  net/openvswitch/vport-vxlan.c    |    2 +-
>>  net/openvswitch/vport.c          |    3 +
>>  net/openvswitch/vport.h          |    1 +
>>  13 files changed, 461 insertions(+), 42 deletions(-)
>>  create mode 100644 net/openvswitch/vport-geneve.c
>>
>> diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
>> index a9ce155..5bc6ede 100644
>> --- a/include/net/ip_tunnels.h
>> +++ b/include/net/ip_tunnels.h
>> @@ -86,17 +86,18 @@ struct ip_tunnel {
>>         struct gro_cells        gro_cells;
>>  };
>>
>> -#define TUNNEL_CSUM    __cpu_to_be16(0x01)
>> -#define TUNNEL_ROUTING __cpu_to_be16(0x02)
>> -#define TUNNEL_KEY     __cpu_to_be16(0x04)
>> -#define TUNNEL_SEQ     __cpu_to_be16(0x08)
>> -#define TUNNEL_STRICT  __cpu_to_be16(0x10)
>> -#define TUNNEL_REC     __cpu_to_be16(0x20)
>> -#define TUNNEL_VERSION __cpu_to_be16(0x40)
>> -#define TUNNEL_NO_KEY  __cpu_to_be16(0x80)
>> +#define TUNNEL_CSUM            __cpu_to_be16(0x01)
>> +#define TUNNEL_ROUTING         __cpu_to_be16(0x02)
>> +#define TUNNEL_KEY             __cpu_to_be16(0x04)
>> +#define TUNNEL_SEQ             __cpu_to_be16(0x08)
>> +#define TUNNEL_STRICT          __cpu_to_be16(0x10)
>> +#define TUNNEL_REC             __cpu_to_be16(0x20)
>
> Just changing whitespace in these?

Yeah, it's just reindenting to match the new values.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [net-next 1/6] net: Add Geneve tunneling protocol driver
  2014-10-02  8:04 ` [net-next 1/6] net: Add Geneve tunneling protocol driver Andy Zhou
@ 2014-10-03 13:44   ` Nicolas Dichtel
  2014-10-03 19:45     ` Andy Zhou
  0 siblings, 1 reply; 11+ messages in thread
From: Nicolas Dichtel @ 2014-10-03 13:44 UTC (permalink / raw)
  To: Andy Zhou, davem; +Cc: netdev, Jesse Gross

Le 02/10/2014 10:04, Andy Zhou a écrit :
> This adds a device level support for Geneve -- Generic Network
> Virtualization Encapsulation. The protocol is documented at
> http://tools.ietf.org/html/draft-gross-geneve-01
>
> Only protocol layer Geneve support is provided by this driver.
> Openvswitch can be used for configuring, setting up and tearing down
> functional Geneve tunnels.
Do you plan to add full support (i.e. being able to configure a
geneve netdev interface with iproute2)?

Another small comment below.
>
> Signed-off-by: Jesse Gross <jesse@nicira.com>
> Signed-off-by: Andy Zhou <azhou@nicira.com>
> ---
>   include/net/geneve.h     |   91 +++++++++++
>   include/net/ip_tunnels.h |    2 +
>   net/ipv4/Kconfig         |   14 ++
>   net/ipv4/Makefile        |    1 +
>   net/ipv4/geneve.c        |  373 ++++++++++++++++++++++++++++++++++++++++++++++
>   5 files changed, 481 insertions(+)
>   create mode 100644 include/net/geneve.h
>   create mode 100644 net/ipv4/geneve.c
>
> diff --git a/include/net/geneve.h b/include/net/geneve.h
> new file mode 100644
> index 0000000..ce98865
> --- /dev/null
> +++ b/include/net/geneve.h
> @@ -0,0 +1,91 @@
> +#ifndef __NET_GENEVE_H
> +#define __NET_GENEVE_H  1
> +
> +#include <net/udp_tunnel.h>
> +
> +struct geneve_sock;
> +
> +typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb);
> +
> +struct geneve_sock {
> +	struct hlist_node	hlist;
> +	geneve_rcv_t		*rcv;
> +	void			*rcv_data;
> +	struct work_struct	del_work;
> +	struct socket		*sock;
> +	struct rcu_head		rcu;
> +	atomic_t		refcnt;
> +	struct udp_offload	udp_offloads;
> +};
> +
> +/* Geneve Header:
> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *  |Ver|  Opt Len  |O|C|    Rsvd.  |          Protocol Type        |
> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *  |        Virtual Network Identifier (VNI)       |    Reserved   |
> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *  |                    Variable Length Options                    |
> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *
> + * Option Header:
> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *  |          Option Class         |      Type     |R|R|R| Length  |
> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *  |                      Variable Option Data                     |
> + *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + */
> +
> +struct geneve_opt {
> +	__be16	opt_class;
> +	u8	type;
> +#ifdef __LITTLE_ENDIAN_BITFIELD
> +	u8	length:5;
> +	u8	r3:1;
> +	u8	r2:1;
> +	u8	r1:1;
> +#else
> +	u8	r1:1;
> +	u8	r2:1;
> +	u8	r3:1;
> +	u8	length:5;
> +#endif
> +	u8	opt_data[];
> +};
> +
> +#define GENEVE_CRIT_OPT_TYPE (1 << 7)
> +
> +struct genevehdr {
> +#ifdef __LITTLE_ENDIAN_BITFIELD
> +	u8 opt_len:6;
> +	u8 ver:2;
> +	u8 rsvd1:6;
> +	u8 critical:1;
> +	u8 oam:1;
> +#else
> +	u8 ver:2;
> +	u8 opt_len:6;
> +	u8 oam:1;
> +	u8 critical:1;
> +	u8 rsvd1:6;
> +#endif
> +	__be16 proto_type;
> +	u8 vni[3];
> +	u8 rsvd2;
> +	struct geneve_opt options[];
> +};
> +
> +#define GENEVE_VER 0
> +#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
> +
> +struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
> +				    geneve_rcv_t *rcv, void *data,
> +				    bool no_share, bool ipv6);
> +
> +void geneve_sock_release(struct geneve_sock *vs);
> +
> +int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
> +		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
> +		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
> +		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
> +		    bool xnet);
> +#endif
> diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
> index 7f538ba..a9ce155 100644
> --- a/include/net/ip_tunnels.h
> +++ b/include/net/ip_tunnels.h
> @@ -95,6 +95,8 @@ struct ip_tunnel {
>   #define TUNNEL_VERSION	__cpu_to_be16(0x40)
>   #define TUNNEL_NO_KEY	__cpu_to_be16(0x80)
>   #define TUNNEL_DONT_FRAGMENT    __cpu_to_be16(0x0100)
> +#define TUNNEL_OAM	__cpu_to_be16(0x0200)
> +#define TUNNEL_CRIT_OPT	__cpu_to_be16(0x0400)
>
>   struct tnl_ptk_info {
>   	__be16 flags;
> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
> index 69fb378..15ce6b0 100644
> --- a/net/ipv4/Kconfig
> +++ b/net/ipv4/Kconfig
> @@ -453,6 +453,20 @@ config TCP_CONG_BIC
>   	increase provides TCP friendliness.
>   	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
>
> +config GENEVE
> +       tristate "Generic Network Virtualization Encapsulation (Geneve)"
> +       depends on INET
> +       select NET_IP_TUNNEL
> +       select NET_UDP_TUNNEL
> +       ---help---
Use tabs instead of spaces for the above lines.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [net-next 1/6] net: Add Geneve tunneling protocol driver
  2014-10-03 13:44   ` Nicolas Dichtel
@ 2014-10-03 19:45     ` Andy Zhou
  0 siblings, 0 replies; 11+ messages in thread
From: Andy Zhou @ 2014-10-03 19:45 UTC (permalink / raw)
  To: nicolas.dichtel; +Cc: David Miller, netdev, Jesse Gross

>
> Do you plan to add full support (i.e. being able to configure a
> geneve netdev interface with iproute2)?
>
According to the following email thread, John Linville is looking at adding it.
http://lists.openwall.net/netdev/2014/07/22/198

> Another small comment below.
>
>> +config GENEVE
>> +       tristate "Generic Network Virtualization Encapsulation (Geneve)"
>> +       depends on INET
>> +       select NET_IP_TUNNEL
>> +       select NET_UDP_TUNNEL
>> +       ---help---
>
> Use tabs instead of spaces for the above lines.

Sure, I will fix this.

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2014-10-03 19:45 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-02  8:04 [net-next 0/6] Add Geneve tunnel protocol support Andy Zhou
2014-10-02  8:04 ` [net-next 1/6] net: Add Geneve tunneling protocol driver Andy Zhou
2014-10-03 13:44   ` Nicolas Dichtel
2014-10-03 19:45     ` Andy Zhou
2014-10-02  8:04 ` [net-next 2/6] openvswitch: Eliminate memset() from flow_extract Andy Zhou
2014-10-02  8:04 ` [net-next 3/6] openvswitch: Add support for matching on OAM packets Andy Zhou
2014-10-02  8:04 ` [net-next 4/6] openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure Andy Zhou
2014-10-02  8:04 ` [net-next 5/6] openvswitch: Factor out allocation and verification of actions Andy Zhou
2014-10-02  8:04 ` [net-next 6/6] openvswitch: Add support for Geneve tunneling Andy Zhou
2014-10-02 23:04   ` Tom Herbert
2014-10-03  1:48     ` Jesse Gross

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.