netdev.vger.kernel.org archive mirror
* [PATCH net-next 0/3] VXLAN driver
@ 2012-09-24 18:43 Stephen Hemminger
  2012-09-24 18:43 ` [PATCH net-next 1/3] netlink: add attributes to fdb interface Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 18:43 UTC (permalink / raw)
  To: David Miller, Chris Wright; +Cc: netdev

This patchset implements the Virtual eXtensible Local Area Network
(VXLAN) tunnel. The code draws on GRE, the bridge, and the OVS VXLAN
implementation. So far it has only been tested against the TAP-based
vxlan at:
    https://github.com/upa/vxlan.git

Thanks to Chris for the idea.


* [PATCH net-next 1/3] netlink: add attributes to fdb interface
  2012-09-24 18:43 [PATCH net-next 0/3] VXLAN driver Stephen Hemminger
@ 2012-09-24 18:43 ` Stephen Hemminger
  2012-09-24 18:43 ` [PATCH net-next 2/3] igmp: export symbol ip_mc_leave_group Stephen Hemminger
  2012-09-24 18:43 ` [PATCH net-next 3/3] vxlan: virtual extensible lan Stephen Hemminger
  2 siblings, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 18:43 UTC (permalink / raw)
  To: David Miller, Chris Wright; +Cc: netdev

[-- Attachment #1: fdb-attr.patch --]
[-- Type: text/plain, Size: 4186 bytes --]

Later changes need to be able to refer to the netlink neighbour
attributes when doing fdb_add.
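
As a sketch of what this enables (hypothetical driver code, mirroring
what the vxlan patch later in this series does in vxlan_fdb_add), an
ndo_fdb_add handler can now inspect attributes such as NDA_DST:

	static int example_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
				   struct net_device *dev,
				   const unsigned char *addr, u16 flags)
	{
		__be32 ip;

		/* the neighbour attributes are now visible to the driver */
		if (tb[NDA_DST] == NULL)
			return -EINVAL;
		if (nla_len(tb[NDA_DST]) != sizeof(__be32))
			return -EAFNOSUPPORT;

		ip = nla_get_be32(tb[NDA_DST]);
		/* ... record addr -> ip in the device forwarding table ... */
		return 0;
	}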

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    2 +-
 drivers/net/macvlan.c                         |    2 +-
 include/linux/netdevice.h                     |    4 +++-
 net/bridge/br_fdb.c                           |    3 ++-
 net/bridge/br_private.h                       |    2 +-
 net/core/rtnetlink.c                          |    6 ++++--
 6 files changed, 12 insertions(+), 7 deletions(-)

--- a/drivers/net/macvlan.c	2012-09-17 11:24:34.000000000 -0700
+++ b/drivers/net/macvlan.c	2012-09-17 11:28:13.999165490 -0700
@@ -546,7 +546,7 @@ static int macvlan_vlan_rx_kill_vid(stru
 	return 0;
 }
 
-static int macvlan_fdb_add(struct ndmsg *ndm,
+static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			   struct net_device *dev,
 			   const unsigned char *addr,
 			   u16 flags)
--- a/include/linux/netdevice.h	2012-09-17 11:24:34.000000000 -0700
+++ b/include/linux/netdevice.h	2012-09-17 11:28:13.999165490 -0700
@@ -906,7 +906,8 @@ struct netdev_fcoe_hbainfo {
  *	feature set might be less than what was returned by ndo_fix_features()).
  *	Must return >0 or -errno if it changed dev->features itself.
  *
- * int (*ndo_fdb_add)(struct ndmsg *ndm, struct net_device *dev,
+ * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
+ *		      struct net_device *dev,
  *		      const unsigned char *addr, u16 flags)
  *	Adds an FDB entry to dev for addr.
  * int (*ndo_fdb_del)(struct ndmsg *ndm, struct net_device *dev,
@@ -1016,6 +1017,7 @@ struct net_device_ops {
 	void			(*ndo_neigh_destroy)(struct neighbour *n);
 
 	int			(*ndo_fdb_add)(struct ndmsg *ndm,
+					       struct nlattr *tb[],
 					       struct net_device *dev,
 					       const unsigned char *addr,
 					       u16 flags);
--- a/net/bridge/br_fdb.c	2012-09-17 11:24:34.000000000 -0700
+++ b/net/bridge/br_fdb.c	2012-09-17 11:28:13.999165490 -0700
@@ -608,7 +608,8 @@ static int fdb_add_entry(struct net_brid
 }
 
 /* Add new permanent fdb entry with RTM_NEWNEIGH */
-int br_fdb_add(struct ndmsg *ndm, struct net_device *dev,
+int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+	       struct net_device *dev,
 	       const unsigned char *addr, u16 nlh_flags)
 {
 	struct net_bridge_port *p;
--- a/net/bridge/br_private.h	2012-09-17 11:24:34.000000000 -0700
+++ b/net/bridge/br_private.h	2012-09-17 11:28:13.999165490 -0700
@@ -364,7 +364,7 @@ extern void br_fdb_update(struct net_bri
 extern int br_fdb_delete(struct ndmsg *ndm,
 			 struct net_device *dev,
 			 const unsigned char *addr);
-extern int br_fdb_add(struct ndmsg *nlh,
+extern int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[],
 		      struct net_device *dev,
 		      const unsigned char *addr,
 		      u16 nlh_flags);
--- a/net/core/rtnetlink.c	2012-09-17 11:24:34.000000000 -0700
+++ b/net/core/rtnetlink.c	2012-09-17 11:28:14.003165450 -0700
@@ -2090,7 +2090,8 @@ static int rtnl_fdb_add(struct sk_buff *
 	if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
 	    (dev->priv_flags & IFF_BRIDGE_PORT)) {
 		master = dev->master;
-		err = master->netdev_ops->ndo_fdb_add(ndm, dev, addr,
+		err = master->netdev_ops->ndo_fdb_add(ndm, tb,
+						      dev, addr,
 						      nlh->nlmsg_flags);
 		if (err)
 			goto out;
@@ -2100,7 +2101,8 @@ static int rtnl_fdb_add(struct sk_buff *
 
 	/* Embedded bridge, macvlan, and any other device support */
 	if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_add) {
-		err = dev->netdev_ops->ndo_fdb_add(ndm, dev, addr,
+		err = dev->netdev_ops->ndo_fdb_add(ndm, tb,
+						   dev, addr,
 						   nlh->nlmsg_flags);
 
 		if (!err) {
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c	2012-09-17 11:25:15.000000000 -0700
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c	2012-09-17 11:31:03.325464947 -0700
@@ -6888,7 +6888,7 @@ static int ixgbe_set_features(struct net
 	return 0;
 }
 
-static int ixgbe_ndo_fdb_add(struct ndmsg *ndm,
+static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			     struct net_device *dev,
 			     const unsigned char *addr,
 			     u16 flags)


* [PATCH net-next 2/3] igmp: export symbol ip_mc_leave_group
  2012-09-24 18:43 [PATCH net-next 0/3] VXLAN driver Stephen Hemminger
  2012-09-24 18:43 ` [PATCH net-next 1/3] netlink: add attributes to fdb interface Stephen Hemminger
@ 2012-09-24 18:43 ` Stephen Hemminger
  2012-09-24 18:43 ` [PATCH net-next 3/3] vxlan: virtual extensible lan Stephen Hemminger
  2 siblings, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 18:43 UTC (permalink / raw)
  To: David Miller, Chris Wright; +Cc: netdev

[-- Attachment #1: igmp-leave.patch --]
[-- Type: text/plain, Size: 427 bytes --]

Needed for VXLAN, which calls ip_mc_leave_group() from module code.
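
For reference, the in-kernel calling pattern this export enables (this
is what vxlan_leave_group() in the next patch does):

	struct ip_mreqn mreq = {
		.imr_multiaddr.s_addr = vxlan->gaddr,
	};

	lock_sock(sk);
	err = ip_mc_leave_group(sk, &mreq);	/* module code needs the export */
	release_sock(sk);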

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv4/igmp.c	2012-09-17 17:15:06.747860247 -0700
+++ b/net/ipv4/igmp.c	2012-09-17 17:16:33.554984978 -0700
@@ -1904,6 +1904,7 @@ int ip_mc_leave_group(struct sock *sk, s
 	rtnl_unlock();
 	return ret;
 }
+EXPORT_SYMBOL(ip_mc_leave_group);
 
 int ip_mc_source(int add, int omode, struct sock *sk, struct
 	ip_mreq_source *mreqs, int ifindex)


* [PATCH net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 18:43 [PATCH net-next 0/3] VXLAN driver Stephen Hemminger
  2012-09-24 18:43 ` [PATCH net-next 1/3] netlink: add attributes to fdb interface Stephen Hemminger
  2012-09-24 18:43 ` [PATCH net-next 2/3] igmp: export symbol ip_mc_leave_group Stephen Hemminger
@ 2012-09-24 18:43 ` Stephen Hemminger
  2012-09-24 19:33   ` Eric Dumazet
                     ` (2 more replies)
  2 siblings, 3 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 18:43 UTC (permalink / raw)
  To: David Miller, Chris Wright; +Cc: netdev

[-- Attachment #1: vxlan.patch --]
[-- Type: text/plain, Size: 33509 bytes --]

This is an implementation of Virtual eXtensible Local Area Network
as described in draft RFC:
  http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02

The driver integrates Virtual Tunnel Endpoint (VTEP) functionality,
learning the MAC to IP address mapping from received packets.

This implementation has not been tested for interoperability with
other equipment.
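
For orientation: the encapsulation below is outer IP + UDP + an 8-byte
VXLAN header + the inner Ethernet frame, with the 24-bit VNI carried
in the upper bits of vx_vni. From the driver below:

	struct vxlanhdr {
		__be32 vx_flags;	/* must be htonl(0x08000000) */
		__be32 vx_vni;		/* VNI in upper 24 bits, low byte reserved */
	};

	/* encapsulation in vxlan_xmit() */
	vxh->vx_flags = htonl(VXLAN_FLAGS);
	vxh->vx_vni = htonl(vxlan->vni << 8);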

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 Documentation/networking/vxlan.txt |   36 +
 drivers/net/Kconfig                |   13 
 drivers/net/Makefile               |    1 
 drivers/net/vxlan.c                | 1180 +++++++++++++++++++++++++++++++++++++
 include/linux/if_link.h            |   14 
 5 files changed, 1244 insertions(+)

--- a/drivers/net/Kconfig	2012-09-24 10:56:57.080291529 -0700
+++ b/drivers/net/Kconfig	2012-09-24 11:08:02.865416523 -0700
@@ -149,6 +149,19 @@ config MACVTAP
 	  To compile this driver as a module, choose M here: the module
 	  will be called macvtap.
 
+config VXLAN
+       tristate "Virtual eXtensible Local Area Network (VXLAN)"
+       depends on EXPERIMENTAL
+       ---help---
+	  This allows one to create vxlan virtual interfaces that provide
+	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
+	  to tunnel virtual network infrastructure in virtualized environments.
+	  For more information see:
+	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called macvlan.
+
 config NETCONSOLE
 	tristate "Network console logging support"
 	---help---
--- a/drivers/net/Makefile	2012-09-24 10:56:57.080291529 -0700
+++ b/drivers/net/Makefile	2012-09-24 11:08:02.865416523 -0700
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
 obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VXLAN) += vxlan.o
 
 #
 # Networking Drivers
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/net/vxlan.c	2012-09-24 11:08:02.869416484 -0700
@@ -0,0 +1,1171 @@
+/*
+ * VXLAN: Virtual eXtensible Local Area Network
+ *
+ * Copyright (c) 2012 Vyatta Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * TODO
+ *  - use IANA UDP port number (when defined)
+ *  - IPv6 (not in RFC)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/version.h>
+#include <linux/hash.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define VXLAN_VERSION	"0.0"
+
+#define VNI_HASH_BITS	10
+#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS	8
+#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
+
+#define VXLAN_N_VID	(1u << 24)
+#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
+
+#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */
+#define FDB_AGE_TIME	 (300 * HZ)	/* drop if not used in 5 min */
+
+#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+#define VXLAN_HEADROOM (sizeof(struct iphdr)		\
+			+ sizeof(struct udphdr)		\
+			+ sizeof(struct vxlanhdr))
+
+/* UDP port for VXLAN traffic. */
+static unsigned int vxlan_port __read_mostly = 8472;
+module_param_named(port, vxlan_port, uint, 0);
+MODULE_PARM_DESC(vxlan_port, "Destination UDP port");
+
+/* per-net private data for this module */
+static unsigned int vxlan_net_id;
+struct vxlan_net {
+	struct socket	  *sock;	/* UDP encap socket */
+	struct hlist_head vni_list[VNI_HASH_SIZE];
+};
+
+/* Forwarding table entry */
+struct vxlan_fdb {
+	struct hlist_node hlist;	/* linked list of entries */
+	struct rcu_head	  rcu;
+	unsigned long	  updated;	/* jiffies */
+	unsigned long	  used;
+#define VXLAN_FDB_PERM			0x1
+	u32		  flags;
+	__be32		  remote_ip;
+	u8		  eth_addr[ETH_ALEN];
+};
+
+/* Per-cpu network traffic stats */
+struct vxlan_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+};
+
+/* Pseudo network device */
+struct vxlan_dev {
+	struct hlist_node hlist;
+	struct net_device *dev;
+	struct vxlan_stats __percpu *stats;
+	__u32		  vni;		/* virtual network id */
+	__be32	          gaddr;	/* multicast group */
+	__be32		  saddr;	/* source address */
+	unsigned int      link;		/* link to multicast over */
+	__u8		  tos;		/* TOS override */
+	__u8		  ttl;
+	bool		  learn;
+
+	struct timer_list age_timer;
+	spinlock_t	  hash_lock;
+	struct hlist_head fdb_head[FDB_HASH_SIZE];
+};
+
+/* salt for hash table */
+static u32 vxlan_salt __read_mostly;
+
+static inline struct hlist_head *vni_head(struct net *net, u32 id)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
+}
+
+/* Look up VNI in a per net namespace table */
+static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
+{
+	struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
+		if (vxlan->vni == id)
+			return vxlan;
+	}
+
+	return NULL;
+}
+
+/* Fill in neighbour message in skbuff. */
+static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
+			   const struct vxlan_fdb *fdb,
+			   u32 portid, u32 seq, int type, unsigned int flags)
+{
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	memset(ndm, 0, sizeof(*ndm));
+	ndm->ndm_family	= AF_BRIDGE;
+
+	if (fdb->flags & VXLAN_FDB_PERM)
+		ndm->ndm_state = NUD_PERMANENT;
+	else if (time_before_eq(fdb->used + FDB_AGE_TIME, now))
+		ndm->ndm_state = NUD_STALE;
+	else
+		ndm->ndm_state = NUD_REACHABLE;
+
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
+		goto nla_put_failure;
+
+	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
+	ci.ndm_confirmed = 0;
+	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
+	ci.ndm_refcnt	 = 0;
+
+	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline size_t vxlan_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ndmsg))
+		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
+			     const struct vxlan_fdb *fdb, int type)
+{
+	struct net *net = dev_net(vxlan->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Hash Ethernet address */
+static u32 eth_hash(const unsigned char *addr)
+{
+	/* could be optimized for unaligned access */
+	u32 a = addr[5] << 8 | addr[4];
+	u32 b = addr[3] << 24 | addr[2] << 16 | addr[1] << 8 | addr[0];
+
+	return jhash_2words(a, b, vxlan_salt);
+}
+
+/* Hash chain to use given mac address */
+static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
+						const u8 *mac)
+{
+	return &vxlan->fdb_head[hash_32(eth_hash(mac), FDB_HASH_BITS)];
+}
+
+/* Look up Ethernet address in forwarding table */
+static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
+					const u8 *mac)
+
+{
+	struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
+	struct vxlan_fdb *f;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(f, node, head, hlist) {
+		if (compare_ether_addr(mac, f->eth_addr) == 0)
+			return f;
+	}
+
+	return NULL;
+}
+
+/* Add new entry to forwarding table -- assumes lock held */
+static int vxlan_fdb_create(struct vxlan_dev *vxlan,
+			    const u8 *mac, __be32 ip,
+			    u32 flags)
+{
+	struct vxlan_fdb *f;
+
+	if (unlikely(vxlan_find_mac(vxlan, mac))) {
+		netdev_dbg(vxlan->dev,
+			   "lost race to create %pM\n", mac);
+		return -EEXIST;
+	}
+
+	netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
+
+	f = kmalloc(sizeof(*f), GFP_ATOMIC);
+	if (!f)
+		return -ENOMEM;
+
+	f->remote_ip = ip;
+	memcpy(f->eth_addr, mac, ETH_ALEN);
+	hlist_add_head_rcu(&f->hlist,
+			   vxlan_fdb_head(vxlan, mac));
+	f->updated = f->used = jiffies;
+	f->flags = flags;
+
+	vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
+
+	return 0;
+}
+
+static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
+{
+	netdev_dbg(vxlan->dev,
+		    "delete %pM\n", f->eth_addr);
+
+	vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
+
+	hlist_del_rcu(&f->hlist);
+	kfree_rcu(f, rcu);
+}
+
+/* Add static entry (via netlink) */
+static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			 struct net_device *dev,
+			 const unsigned char *addr, u16 flags)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__be32 ip;
+	int err;
+
+	if (tb[NDA_DST] == NULL)
+		return -EINVAL;
+
+	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
+		return -EAFNOSUPPORT;
+
+	ip = nla_get_be32(tb[NDA_DST]);
+
+	spin_lock_bh(&vxlan->hash_lock);
+	err = vxlan_fdb_create(vxlan, addr, ip, VXLAN_FDB_PERM);
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Delete entry (via netlink) */
+static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err = -ENOENT;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	f = vxlan_find_mac(vxlan, addr);
+	if (f) {
+		vxlan_fdb_destroy(vxlan, f);
+		err = 0;
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Dump forwarding table */
+static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  struct net_device *dev, int idx)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned int h;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct vxlan_fdb *f;
+		struct hlist_node *n;
+
+		hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) {
+			if (idx < cb->args[0])
+				goto skip;
+
+			if (vxlan_fdb_info(skb, vxlan, f,
+					    NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq,
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) < 0)
+				break;
+skip:
+			++idx;
+		}
+	}
+
+	return idx;
+}
+
+/* Watch incoming packets to learn mapping between Ethernet address
+ * and Tunnel endpoint.
+ */
+static void vxlan_snoop(struct net_device *dev,
+			__be32 src_ip, const u8 *src_mac)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err;
+
+	f = vxlan_find_mac(vxlan, src_mac);
+	if (likely(f)) {
+		f->used = jiffies;
+		if (likely(f->remote_ip == src_ip))
+			return;
+
+		f->remote_ip = src_ip;
+		f->updated = jiffies;
+
+		if (net_ratelimit())
+			netdev_info(dev,
+				    "%pM migrated from %pI4 to %pI4\n",
+				    src_mac, &f->remote_ip, &src_ip);
+	} else {
+		/* learned new entry */
+		spin_lock(&vxlan->hash_lock);
+		err = vxlan_fdb_create(vxlan, src_mac, src_ip, 0);
+		spin_unlock(&vxlan->hash_lock);
+	}
+}
+
+
+/* See if multicast group is already in use by other ID */
+static bool vxlan_group_used(struct vxlan_net *vn,
+			     const struct vxlan_dev *this)
+{
+	const struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+	unsigned h;
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
+			if (vxlan == this)
+				continue;
+
+			if (!netif_running(vxlan->dev))
+				continue;
+
+			if (vxlan->gaddr == this->gaddr)
+				return true;
+		}
+
+	return false;
+}
+
+/* kernel equivalent to IP_ADD_MEMBERSHIP */
+static int vxlan_join_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+	int err;
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast join */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+
+/* kernel equivalent to IP_DROP_MEMBERSHIP */
+static int vxlan_leave_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	int err = 0;
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast leave */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_leave_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+/* Propagate ECN from outer IP header to tunneled packet */
+static inline void vxlan_ecn_decap(const struct iphdr *iph, struct sk_buff *skb)
+{
+	if (INET_ECN_is_ce(iph->tos)) {
+		if (skb->protocol == htons(ETH_P_IP))
+			IP_ECN_set_ce(ip_hdr(skb));
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			IP6_ECN_set_ce(ipv6_hdr(skb));
+	}
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct iphdr *oip = ip_hdr(skb);
+	struct vxlanhdr *vxh;
+	struct vxlan_dev *vxlan;
+	struct vxlan_stats *stats;
+	__u32 vni;
+	__be32 saddr = oip->saddr;	/* source address for learning */
+	__be32 daddr = oip->daddr;	/* destination for checking */
+
+	/* pop off outer UDP header */
+	__skb_pull(skb, sizeof(struct udphdr));
+
+	/* Need Vxlan and inner Ethernet header to be present */
+	if (!pskb_may_pull(skb,
+			   sizeof(struct vxlanhdr) + sizeof(struct ethhdr)))
+		goto error;
+
+	/* Drop packets with reserved bits set */
+	vxh = (struct vxlanhdr *) skb->data;
+	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+	    (vxh->vx_vni & htonl(0xff)))
+		goto error;
+
+	__skb_pull(skb, sizeof(struct vxlanhdr));
+
+	/* Is this VNI defined? */
+	vni = ntohl(vxh->vx_vni) >> 8;
+	vxlan = vxlan_find_vni(sock_net(sk), vni);
+	if (!vxlan)
+		goto drop;
+
+	/* Ignore packets if device is not up */
+	if (!netif_running(vxlan->dev))
+		goto drop;
+
+	/* Re-examine inner Ethernet packet */
+	skb->protocol = eth_type_trans(skb, vxlan->dev);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Ignore packet loops (and multicast echo) */
+	if (compare_ether_addr(eth_hdr(skb)->h_source,
+			       vxlan->dev->dev_addr) == 0)
+		goto drop;
+
+	/* Check for multicast group configuration errors */
+	if (IN_MULTICAST(ntohl(daddr)) &&
+	    daddr != vxlan->gaddr) {
+		if (net_ratelimit())
+			netdev_notice(vxlan->dev,
+				      "group address %pI4 does not match\n",
+				      &daddr);
+		goto drop;
+	}
+
+	if (vxlan->learn)
+		vxlan_snoop(skb->dev, saddr, eth_hdr(skb)->h_source);
+
+	stats = this_cpu_ptr(vxlan->stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+
+	__skb_tunnel_rx(skb, vxlan->dev);
+	skb_reset_network_header(skb);
+	vxlan_ecn_decap(oip, skb);
+
+	netif_rx(skb);
+
+	return 0;
+error:
+	/* Put UDP header back */
+	__skb_push(skb, sizeof(struct udphdr));
+
+	return 1;
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Extract dsfield from inner protocol */
+static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
+				   const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return iph->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ipv6_get_dsfield((const struct ipv6hdr *)iph);
+	else
+		return 0;
+}
+
+/* Propagate ECN bits out */
+static inline u8 vxlan_ecn_encap(u8 tos,
+				 const struct iphdr *iph,
+				 const struct sk_buff *skb)
+{
+	u8 inner = vxlan_get_dsfield(iph, skb);
+
+	return INET_ECN_encapsulate(tos, inner);
+}
+
+/* Transmit local packets over Vxlan
+ *
+ * Outer IP header inherits ECN and DF from inner header.
+ * Outer UDP destination is the VXLAN assigned port.
+ *           source port is based on hash of flow if available
+ *                       otherwise use a random value
+ */
+static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct rtable *rt;
+	const struct ethhdr *eth;
+	const struct iphdr *old_iph;
+	struct iphdr *iph;
+	struct vxlanhdr *vxh;
+	struct udphdr *uh;
+	struct flowi4 fl4;
+	struct vxlan_fdb *f;
+	unsigned int pkt_len = skb->len;
+	unsigned int mtu;
+	u32 hash;
+	__be32 dst;
+	__be16 df = 0;
+	__u8 tos, ttl;
+	int err;
+
+	/* Need space for new headers (invalidates iph ptr) */
+	if (skb_cow_head(skb, VXLAN_HEADROOM))
+		goto drop;
+
+	eth = (void *)skb->data;
+	old_iph = ip_hdr(skb);
+
+	if (!is_multicast_ether_addr(eth->h_dest) &&
+	    (f = vxlan_find_mac(vxlan, eth->h_dest)))
+		dst = f->remote_ip;
+	else if (vxlan->gaddr) {
+		dst = vxlan->gaddr;
+	} else
+		goto drop;
+
+	ttl = vxlan->ttl;
+	if (!ttl && IN_MULTICAST(ntohl(dst)))
+		ttl = 1;
+
+	tos = vxlan->tos;
+	if (tos == 1)
+		tos = vxlan_get_dsfield(old_iph, skb);
+
+	hash = skb_get_rxhash(skb);
+
+	rt = ip_route_output_gre(dev_net(dev), &fl4, dst,
+				 vxlan->saddr, vxlan->vni,
+				 RT_TOS(tos), vxlan->link);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &dst);
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+
+	if (rt->dst.dev == dev) {
+		netdev_dbg(dev, "circular route to %pI4\n", &dst);
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	mtu = dst_mtu(&rt->dst) - VXLAN_HEADROOM;
+	/* Do PMTU */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		df |= old_iph->frag_off & htons(IP_DF);
+		if (df && mtu < pkt_len) {
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+				  htonl(mtu));
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#endif
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = htonl(vxlan->vni << 8);
+
+	__skb_push(skb, sizeof(*uh));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = htons(vxlan_port);
+	uh->source = hash ?: random32();
+
+	uh->len = htons(skb->len);
+	uh->check = 0;
+
+	__skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph		= ip_hdr(skb);
+	iph->version	= 4;
+	iph->ihl	= sizeof(struct iphdr) >> 2;
+	iph->frag_off	= df;
+	iph->protocol	= IPPROTO_UDP;
+	iph->tos	= tos;
+	iph->daddr	= fl4.daddr;
+	iph->saddr	= fl4.saddr;
+	iph->ttl	= ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+	/* See __IPTUNNEL_XMIT */
+	skb->ip_summed = CHECKSUM_NONE;
+	ip_select_ident(iph, &rt->dst, NULL);
+
+	err = ip_local_out(skb);
+	if (likely(net_xmit_eval(err) == 0)) {
+		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += pkt_len;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		dev->stats.tx_errors++;
+		dev->stats.tx_aborted_errors++;
+	}
+	return NETDEV_TX_OK;
+
+drop:
+	dev->stats.tx_dropped++;
+	goto tx_free;
+
+tx_error:
+	dev->stats.tx_errors++;
+tx_free:
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Walk the forwarding table and purge stale entries */
+static void vxlan_cleanup(unsigned long arg)
+{
+	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
+	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
+	unsigned int h;
+
+	if (!netif_running(vxlan->dev))
+		return;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			unsigned long timeout = f->used + FDB_AGE_TIME;
+
+			if (f->flags & VXLAN_FDB_PERM)
+				continue;
+
+			if (time_before_eq(timeout, jiffies)) {
+				netdev_dbg(vxlan->dev,
+					   "garbage collect %pM\n",
+					   f->eth_addr);
+				vxlan_fdb_destroy(vxlan, f);
+			} else if (time_before(timeout, next_timer))
+				next_timer = timeout;
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	mod_timer(&vxlan->age_timer, next_timer);
+}
+
+/* Setup stats when device is created */
+static int vxlan_init(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	vxlan->stats = alloc_percpu(struct vxlan_stats);
+	if (!vxlan->stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Start ageing timer and join group when device is brought up */
+static int vxlan_open(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	int err;
+
+	if (vxlan->gaddr) {
+		err = vxlan_join_group(dev);
+		if (err)
+			return err;
+	}
+
+	mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+	return 0;
+}
+
+/* Purge the forwarding table */
+static void vxlan_flush(struct vxlan_dev *vxlan)
+{
+	unsigned h;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			vxlan_fdb_destroy(vxlan, f);
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+}
+
+/* Cleanup timer and forwarding table on shutdown */
+static int vxlan_stop(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (vxlan->gaddr)
+		vxlan_leave_group(dev);
+
+	del_timer_sync(&vxlan->age_timer);
+
+	vxlan_flush(vxlan);
+
+	return 0;
+}
+
+/* Merge per-cpu statistics */
+static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
+					       struct rtnl_link_stats64 *stats)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_stats tmp, sum = { 0 };
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		unsigned int start;
+		const struct vxlan_stats *stats
+			= per_cpu_ptr(vxlan->stats, cpu);
+
+		do {
+			start = u64_stats_fetch_begin_bh(&stats->syncp);
+			memcpy(&tmp, stats, sizeof(tmp));
+		} while (u64_stats_fetch_retry_bh(&stats->syncp, start));
+
+		sum.tx_bytes   += tmp.tx_bytes;
+		sum.tx_packets += tmp.tx_packets;
+		sum.rx_bytes   += tmp.rx_bytes;
+		sum.rx_packets += tmp.rx_packets;
+	}
+
+	stats->tx_bytes   = sum.tx_bytes;
+	stats->tx_packets = sum.tx_packets;
+	stats->rx_bytes   = sum.rx_bytes;
+	stats->rx_packets = sum.rx_packets;
+
+	stats->tx_dropped = dev->stats.tx_dropped;
+	stats->tx_errors  = dev->stats.tx_errors;
+	stats->tx_carrier_errors  = dev->stats.tx_carrier_errors;
+	stats->collisions  = dev->stats.collisions;
+
+	return stats;
+}
+
+/* Stub, nothing needs to be done. */
+static void vxlan_set_multicast_list(struct net_device *dev)
+{
+}
+
+static const struct net_device_ops vxlan_netdev_ops = {
+	.ndo_init		= vxlan_init,
+	.ndo_open		= vxlan_open,
+	.ndo_stop		= vxlan_stop,
+	.ndo_start_xmit		= vxlan_xmit,
+	.ndo_get_stats64	= vxlan_stats64,
+	.ndo_set_rx_mode	= vxlan_set_multicast_list,
+	.ndo_change_mtu		= eth_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_fdb_add		= vxlan_fdb_add,
+	.ndo_fdb_del		= vxlan_fdb_delete,
+	.ndo_fdb_dump		= vxlan_fdb_dump,
+};
+
+/* Info for udev, that this is a virtual tunnel endpoint */
+static struct device_type vxlan_type = {
+	.name = "vxlan",
+};
+
+static void vxlan_free(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	free_percpu(vxlan->stats);
+	free_netdev(dev);
+}
+
+/* Initialize the device structure. */
+static void vxlan_setup(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned h;
+
+	eth_hw_addr_random(dev);
+	ether_setup(dev);
+
+	dev->netdev_ops = &vxlan_netdev_ops;
+	dev->destructor = vxlan_free;
+	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
+
+	dev->tx_queue_len = 0;
+	dev->features	|= NETIF_F_LLTX;
+	dev->features	|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;
+
+	spin_lock_init(&vxlan->hash_lock);
+
+	init_timer_deferrable(&vxlan->age_timer);
+	vxlan->age_timer.function = vxlan_cleanup;
+	vxlan->age_timer.data = (unsigned long) vxlan;
+
+	vxlan->dev = dev;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+}
+
+static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
+	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
+	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
+	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
+};
+
+static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+			pr_debug("invalid link address (not ethernet)\n");
+			return -EINVAL;
+		}
+
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+			pr_debug("invalid all zero ethernet address\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	if (!data)
+		return -EINVAL;
+
+	if (data[IFLA_VXLAN_ID]) {
+		__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+		if (id >= VXLAN_VID_MASK)
+			return -ERANGE;
+	}
+
+	if (data[IFLA_VXLAN_GROUP]) {
+		__be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		if (!IN_MULTICAST(ntohl(gaddr))) {
+			pr_debug("group address is not IPv4 multicast\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	return 0;
+}
+
+static int vxlan_newlink(struct net *net, struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__u32 vni;
+	int err;
+
+	if (!data[IFLA_VXLAN_ID])
+		return -EINVAL;
+
+	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
+	if (vxlan_find_vni(net, vni)) {
+		pr_info("duplicate VNI %u\n", vni);
+		return -EEXIST;
+	}
+	vxlan->vni = vni;
+
+	if (data[IFLA_VXLAN_GROUP])
+		vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+
+	if (data[IFLA_VXLAN_LOCAL])
+		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+
+	if (data[IFLA_VXLAN_LINK])
+		vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]);
+
+	if (data[IFLA_VXLAN_TOS])
+		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
+
+	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
+		vxlan->learn = true;
+
+	err = register_netdevice(dev);
+	if (!err)
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
+
+	return err;
+}
+
+static void vxlan_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	hlist_del_rcu(&vxlan->hlist);
+
+	unregister_netdevice_queue(dev, head);
+}
+
+static size_t vxlan_get_size(const struct net_device *dev)
+{
+
+	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
+		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
+		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
+		0;
+}
+
+static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
+		goto nla_put_failure;
+
+	if (vxlan->gaddr && nla_put_u32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
+		goto nla_put_failure;
+
+	if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
+		goto nla_put_failure;
+
+	if (vxlan->saddr && nla_put_u32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
+	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
+	.kind		= "vxlan",
+	.maxtype	= IFLA_VXLAN_MAX,
+	.policy		= vxlan_policy,
+	.priv_size	= sizeof(struct vxlan_dev),
+	.setup		= vxlan_setup,
+	.validate	= vxlan_validate,
+	.newlink	= vxlan_newlink,
+	.dellink	= vxlan_dellink,
+	.get_size	= vxlan_get_size,
+	.fill_info	= vxlan_fill_info,
+};
+
+static __net_init int vxlan_init_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct sock *sk;
+	struct sockaddr_in addr;
+	int rc;
+	unsigned h;
+
+	/* Create UDP socket for encapsulation receive. */
+	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
+	if (rc < 0) {
+		pr_debug("UDP socket create failed\n");
+		return rc;
+	}
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = INADDR_ANY;
+	addr.sin_port = htons(vxlan_port);
+
+	rc = kernel_bind(vn->sock, (struct sockaddr *) &addr, sizeof(addr));
+	if (rc < 0) {
+		pr_debug("bind for port %u failed %d\n", vxlan_port, rc);
+		sock_release(vn->sock);
+		vn->sock = NULL;
+		return rc;
+	}
+
+	/* Disable multicast loopback */
+	sk = vn->sock->sk;
+	inet_sk(sk)->mc_loop = 0;
+
+	/* Mark socket as an encapsulation socket. */
+	udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP;
+	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
+	udp_encap_enable();
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vn->vni_list[h]);
+
+	return 0;
+}
+
+static __net_exit void vxlan_exit_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	if (vn->sock) {
+		sock_release(vn->sock);
+		vn->sock = NULL;
+	}
+}
+
+static struct pernet_operations vxlan_net_ops = {
+	.init = vxlan_init_net,
+	.exit = vxlan_exit_net,
+	.id   = &vxlan_net_id,
+	.size = sizeof(struct vxlan_net),
+};
+
+static int __init vxlan_init_module(void)
+{
+	int rc;
+
+	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
+
+	rc = register_pernet_device(&vxlan_net_ops);
+	if (rc)
+		goto out1;
+
+	rc = rtnl_link_register(&vxlan_link_ops);
+	if (rc)
+		goto out2;
+
+	return 0;
+
+out2:
+	unregister_pernet_device(&vxlan_net_ops);
+out1:
+	return rc;
+}
+module_init(vxlan_init_module);
+
+static void __exit vxlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&vxlan_link_ops);
+	unregister_pernet_device(&vxlan_net_ops);
+}
+module_exit(vxlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VXLAN_VERSION);
+MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
+MODULE_ALIAS_RTNL_LINK("vxlan");
--- a/include/linux/if_link.h	2012-09-24 10:56:57.080291529 -0700
+++ b/include/linux/if_link.h	2012-09-24 11:08:02.869416484 -0700
@@ -272,6 +272,20 @@ enum macvlan_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VXLAN section */
+enum {
+	IFLA_VXLAN_UNSPEC,
+	IFLA_VXLAN_ID,
+	IFLA_VXLAN_GROUP,
+	IFLA_VXLAN_LINK,
+	IFLA_VXLAN_LOCAL,
+	IFLA_VXLAN_TTL,
+	IFLA_VXLAN_TOS,
+	IFLA_VXLAN_LEARNING,
+	__IFLA_VXLAN_MAX
+};
+#define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/Documentation/networking/vxlan.txt	2012-09-24 11:08:02.869416484 -0700
@@ -0,0 +1,36 @@
+Virtual eXtensible Local Area Networking documentation
+======================================================
+
+The VXLAN protocol is a tunnelling protocol designed to solve the
+problem of the limited number of available VLANs (4096). With VXLAN
+the identifier is expanded to 24 bits (16777216 possible IDs).
+
+It is a draft RFC standard implemented by Cisco Nexus, VMware and
+Brocade. The protocol runs over UDP using a single destination port
+(not yet standardized by IANA). This document describes the Linux
+kernel tunnel device; there is also an implementation of VXLAN for
+Open vSwitch.
+
+Unlike most tunnels, a VXLAN is a 1 to N network, not just point
+to point. A VXLAN device can either dynamically learn the IP address
+of the other end, in a manner similar to a learning bridge, or the
+forwarding entries can be configured statically.
+
+The management of vxlan is done in a similar fashion to its two
+closest neighbors, GRE and VLAN. Configuring VXLAN requires the
+version of iproute2 that matches the kernel release where VXLAN
+was first merged upstream.
+
+1. Create vxlan device
+  # ip link add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
+
+This creates a new device (vxlan0). The device uses the multicast
+group 239.1.1.1 over eth1 to handle packets for which no entry is
+in the forwarding table.
+
+2. Delete vxlan device
+  # ip link delete vxlan0
+
+3. Show vxlan info
+  # ip -d link show vxlan0
+


* Re: [PATCH net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 18:43 ` [PATCH net-next 3/3] vxlan: virtual extensible lan Stephen Hemminger
@ 2012-09-24 19:33   ` Eric Dumazet
  2012-09-24 19:39   ` Eric Dumazet
  2012-09-24 20:58   ` [PATCH " Chris Wright
  2 siblings, 0 replies; 43+ messages in thread
From: Eric Dumazet @ 2012-09-24 19:33 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, Chris Wright, netdev

On Mon, 2012-09-24 at 11:43 -0700, Stephen Hemminger wrote:

> --- a/drivers/net/Kconfig	2012-09-24 10:56:57.080291529 -0700
> +++ b/drivers/net/Kconfig	2012-09-24 11:08:02.865416523 -0700
> @@ -149,6 +149,19 @@ config MACVTAP
>  	  To compile this driver as a module, choose M here: the module
>  	  will be called macvtap.
>  
> +config VXLAN
> +       tristate "Virtual eXtensible Local Area Network (VXLAN)"
> +       depends on EXPERIMENTAL
> +       ---help---
> +	  This allows one to create vxlan virtual interfaces that provide
> +	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
> +	  to tunnel virtual network infrastructure in virtualized environments.
> +	  For more information see:
> +	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
> +
> +	  To compile this driver as a module, choose M here: the module
> +	  will be called macvlan.
> +

copy/paste error, this module won't be called macvlan ;)


* Re: [PATCH net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 18:43 ` [PATCH net-next 3/3] vxlan: virtual extensible lan Stephen Hemminger
  2012-09-24 19:33   ` Eric Dumazet
@ 2012-09-24 19:39   ` Eric Dumazet
  2012-09-24 19:46     ` [PATCHv2 " Stephen Hemminger
  2012-09-24 20:58   ` [PATCH " Chris Wright
  2 siblings, 1 reply; 43+ messages in thread
From: Eric Dumazet @ 2012-09-24 19:39 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, Chris Wright, netdev

On Mon, 2012-09-24 at 11:43 -0700, Stephen Hemminger wrote:

> +/* Callback from net/ipv4/udp.c to receive packets */
> +static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> +{
> +	struct iphdr *oip = ip_hdr(skb);

oip is set to ip_hdr(skb)

> +	struct vxlanhdr *vxh;
> +	struct vxlan_dev *vxlan;
> +	struct vxlan_stats *stats;
> +	__u32 vni;
> +	__be32 saddr = oip->saddr;	/* source address for learning */
> +	__be32 daddr = oip->daddr;	/* destination for checking */
> +
> +	/* pop off outer UDP header */
> +	__skb_pull(skb, sizeof(struct udphdr));
> +
> +	/* Need Vxlan and inner Ethernet header to be present */
> +	if (!pskb_may_pull(skb,
> +			   sizeof(struct vxlanhdr) + sizeof(struct ethhdr)))
> +		goto error;
> +

here oip may point to freed memory, if pskb_may_pull() has to
reallocate skb->head
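
the usual pattern is to (re)load the header pointer only after the
pull, e.g.:

	if (!pskb_may_pull(skb,
			   sizeof(struct vxlanhdr) + sizeof(struct ethhdr)))
		goto error;

	oip = ip_hdr(skb);	/* safe: taken after any head reallocation */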

> +	/* Drop packets with reserved bits set */
> +	vxh = (struct vxlanhdr *) skb->data;
> +	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
> +	    (vxh->vx_vni & htonl(0xff)))
> +		goto error;
> +
> +	__skb_pull(skb, sizeof(struct vxlanhdr));
> +
> +	/* Is this VNI defined? */
> +	vni = ntohl(vxh->vx_vni) >> 8;
> +	vxlan = vxlan_find_vni(sock_net(sk), vni);
> +	if (!vxlan)
> +		goto drop;
> +
> +	/* Ignore packets if device is not up */
> +	if (!netif_running(vxlan->dev))
> +		goto drop;
> +
> +	/* Re-examine inner Ethernet packet */
> +	skb->protocol = eth_type_trans(skb, vxlan->dev);
> +	skb->ip_summed = CHECKSUM_NONE;
> +
> +	/* Ignore packet loops (and multicast echo) */
> +	if (compare_ether_addr(eth_hdr(skb)->h_source,
> +			       vxlan->dev->dev_addr) == 0)
> +		goto drop;
> +
> +	/* Check for multicast group configuration errors */
> +	if (IN_MULTICAST(ntohl(daddr)) &&
> +	    daddr != vxlan->gaddr) {
> +		if (net_ratelimit())
> +			netdev_notice(vxlan->dev,
> +				      "group address %pI4 does not match\n",
> +				      &daddr);
> +		goto drop;
> +	}
> +
> +	if (vxlan->learn)
> +		vxlan_snoop(skb->dev, saddr, eth_hdr(skb)->h_source);
> +
> +	stats = this_cpu_ptr(vxlan->stats);
> +	u64_stats_update_begin(&stats->syncp);
> +	stats->rx_packets++;
> +	stats->rx_bytes += skb->len;
> +	u64_stats_update_end(&stats->syncp);
> +
> +	__skb_tunnel_rx(skb, vxlan->dev);
> +	skb_reset_network_header(skb);


> +	vxlan_ecn_decap(oip, skb);

potential crash


* [PATCHv2 net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 19:39   ` Eric Dumazet
@ 2012-09-24 19:46     ` Stephen Hemminger
  2012-09-24 19:55       ` Eric Dumazet
  2012-09-24 20:09       ` [PATCHv2 " Eric Dumazet
  0 siblings, 2 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 19:46 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: Chris Wright, netdev

Subject: vxlan: virtual extensible lan

This is an implementation of Virtual eXtensible Local Area Network
as described in draft RFC:
  http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02

The driver integrates Virtual Tunnel Endpoint (VTEP) functionality,
learning the MAC to IP address mapping from received packets.

This implementation has not been tested for interoperability with
other equipment.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
v2 - fix reference to possibly freed memory in receive path

 Documentation/networking/vxlan.txt |   36 +
 drivers/net/Kconfig                |   13 
 drivers/net/Makefile               |    1 
 drivers/net/vxlan.c                | 1180 +++++++++++++++++++++++++++++++++++++
 include/linux/if_link.h            |   14 
 5 files changed, 1244 insertions(+)

--- a/drivers/net/Kconfig	2012-09-24 10:56:57.080291529 -0700
+++ b/drivers/net/Kconfig	2012-09-24 12:43:25.298239026 -0700
@@ -149,6 +149,19 @@ config MACVTAP
 	  To compile this driver as a module, choose M here: the module
 	  will be called macvtap.
 
+config VXLAN
+       tristate "Virtual eXtensible Local Area Network (VXLAN)"
+       depends on EXPERIMENTAL
+       ---help---
+	  This allows one to create vxlan virtual interfaces that provide
+	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
+	  to tunnel virtual network infrastructure in virtualized environments.
+	  For more information see:
+	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called vxlan.
+
 config NETCONSOLE
 	tristate "Network console logging support"
 	---help---
--- a/drivers/net/Makefile	2012-09-24 10:56:57.080291529 -0700
+++ b/drivers/net/Makefile	2012-09-24 11:08:02.865416523 -0700
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
 obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VXLAN) += vxlan.o
 
 #
 # Networking Drivers
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/net/vxlan.c	2012-09-24 12:44:19.169695844 -0700
@@ -0,0 +1,1171 @@
+/*
+ * VXLAN: Virtual eXtensible Local Area Network
+ *
+ * Copyright (c) 2012 Vyatta Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * TODO
+ *  - use IANA UDP port number (when defined)
+ *  - IPv6 (not in RFC)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/version.h>
+#include <linux/hash.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define VXLAN_VERSION	"0.0"
+
+#define VNI_HASH_BITS	10
+#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS	8
+#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
+
+#define VXLAN_N_VID	(1u << 24)
+#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
+
+#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */
+#define FDB_AGE_TIME	 (300 * HZ)	/* drop if not used in 5 min */
+
+#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+#define VXLAN_HEADROOM (sizeof(struct iphdr)		\
+			+ sizeof(struct udphdr)		\
+			+ sizeof(struct vxlanhdr))
+
+/* UDP port for VXLAN traffic. */
+static unsigned int vxlan_port __read_mostly = 8472;
+module_param_named(port, vxlan_port, uint, 0);
+MODULE_PARM_DESC(vxlan_port, "Destination UDP port");
+
+/* per-net private data for this module */
+static unsigned int vxlan_net_id;
+struct vxlan_net {
+	struct socket	  *sock;	/* UDP encap socket */
+	struct hlist_head vni_list[VNI_HASH_SIZE];
+};
+
+/* Forwarding table entry */
+struct vxlan_fdb {
+	struct hlist_node hlist;	/* linked list of entries */
+	struct rcu_head	  rcu;
+	unsigned long	  updated;	/* jiffies */
+	unsigned long	  used;
+#define VXLAN_FDB_PERM			0x1
+	u32		  flags;
+	__be32		  remote_ip;
+	u8		  eth_addr[ETH_ALEN];
+};
+
+/* Per-cpu network traffic stats */
+struct vxlan_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+};
+
+/* Pseudo network device */
+struct vxlan_dev {
+	struct hlist_node hlist;
+	struct net_device *dev;
+	struct vxlan_stats __percpu *stats;
+	__u32		  vni;		/* virtual network id */
+	__be32	          gaddr;	/* multicast group */
+	__be32		  saddr;	/* source address */
+	unsigned int      link;		/* link to multicast over */
+	__u8		  tos;		/* TOS override */
+	__u8		  ttl;
+	bool		  learn;
+
+	struct timer_list age_timer;
+	spinlock_t	  hash_lock;
+	struct hlist_head fdb_head[FDB_HASH_SIZE];
+};
+
+/* salt for hash table */
+static u32 vxlan_salt __read_mostly;
+
+static inline struct hlist_head *vni_head(struct net *net, u32 id)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
+}
+
+/* Look up VNI in a per net namespace table */
+static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
+{
+	struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
+		if (vxlan->vni == id)
+			return vxlan;
+	}
+
+	return NULL;
+}
+
+/* Fill in neighbour message in skbuff. */
+static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
+			   const struct vxlan_fdb *fdb,
+			   u32 portid, u32 seq, int type, unsigned int flags)
+{
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	memset(ndm, 0, sizeof(*ndm));
+	ndm->ndm_family	= AF_BRIDGE;
+
+	if (fdb->flags & VXLAN_FDB_PERM)
+		ndm->ndm_state = NUD_PERMANENT;
+	else if (time_before_eq(fdb->used + FDB_AGE_TIME, now))
+		ndm->ndm_state = NUD_STALE;
+	else
+		ndm->ndm_state = NUD_REACHABLE;
+
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
+		goto nla_put_failure;
+
+	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
+	ci.ndm_confirmed = 0;
+	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
+	ci.ndm_refcnt	 = 0;
+
+	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline size_t vxlan_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ndmsg))
+		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
+			     const struct vxlan_fdb *fdb, int type)
+{
+	struct net *net = dev_net(vxlan->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Hash Ethernet address */
+static u32 eth_hash(const unsigned char *addr)
+{
+	/* could be optimized for unaligned access */
+	u32 a = addr[5] << 8 | addr[4];
+	u32 b = addr[3] << 24 | addr[2] << 16 | addr[1] << 8 | addr[0];
+
+	return jhash_2words(a, b, vxlan_salt);
+}
+
+/* Hash chain to use given mac address */
+static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
+						const u8 *mac)
+{
+	return &vxlan->fdb_head[hash_32(eth_hash(mac), FDB_HASH_BITS)];
+}
+
+/* Look up Ethernet address in forwarding table */
+static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
+					const u8 *mac)
+
+{
+	struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
+	struct vxlan_fdb *f;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(f, node, head, hlist) {
+		if (compare_ether_addr(mac, f->eth_addr) == 0)
+			return f;
+	}
+
+	return NULL;
+}
+
+/* Add new entry to forwarding table -- assumes lock held */
+static int vxlan_fdb_create(struct vxlan_dev *vxlan,
+			    const u8 *mac, __be32 ip,
+			    u32 flags)
+{
+	struct vxlan_fdb *f;
+
+	if (unlikely(vxlan_find_mac(vxlan, mac))) {
+		netdev_dbg(vxlan->dev,
+			   "lost race to create %pM\n", mac);
+		return -EEXIST;
+	}
+
+	netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
+
+	f = kmalloc(sizeof(*f), GFP_ATOMIC);
+	if (!f)
+		return -ENOMEM;
+
+	f->remote_ip = ip;
+	memcpy(f->eth_addr, mac, ETH_ALEN);
+	hlist_add_head_rcu(&f->hlist,
+			   vxlan_fdb_head(vxlan, mac));
+	f->updated = f->used = jiffies;
+	f->flags = flags;
+
+	vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
+
+	return 0;
+}
+
+static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
+{
+	netdev_dbg(vxlan->dev,
+		    "delete %pM\n", f->eth_addr);
+
+	vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
+
+	hlist_del_rcu(&f->hlist);
+	kfree_rcu(f, rcu);
+}
+
+/* Add static entry (via netlink) */
+static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			 struct net_device *dev,
+			 const unsigned char *addr, u16 flags)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__be32 ip;
+	int err;
+
+	if (tb[NDA_DST] == NULL)
+		return -EINVAL;
+
+	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
+		return -EAFNOSUPPORT;
+
+	ip = nla_get_be32(tb[NDA_DST]);
+
+	spin_lock_bh(&vxlan->hash_lock);
+	err = vxlan_fdb_create(vxlan, addr, ip, VXLAN_FDB_PERM);
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Delete entry (via netlink) */
+static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err = -ENOENT;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	f = vxlan_find_mac(vxlan, addr);
+	if (f) {
+		vxlan_fdb_destroy(vxlan, f);
+		err = 0;
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Dump forwarding table */
+static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  struct net_device *dev, int idx)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned int h;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct vxlan_fdb *f;
+		struct hlist_node *n;
+
+		hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) {
+			if (idx < cb->args[0])
+				goto skip;
+
+			if (vxlan_fdb_info(skb, vxlan, f,
+					    NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq,
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) < 0)
+				break;
+skip:
+			++idx;
+		}
+	}
+
+	return idx;
+}
+
+/* Watch incoming packets to learn mapping between Ethernet address
+ * and Tunnel endpoint.
+ */
+static void vxlan_snoop(struct net_device *dev,
+			__be32 src_ip, const u8 *src_mac)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err;
+
+	f = vxlan_find_mac(vxlan, src_mac);
+	if (likely(f)) {
+		f->used = jiffies;
+		if (likely(f->remote_ip == src_ip))
+			return;
+
+		f->remote_ip = src_ip;
+		f->updated = jiffies;
+
+		if (net_ratelimit())
+			netdev_info(dev,
+				    "%pM migrated from %pI4 to %pI4\n",
+				    src_mac, &f->remote_ip, &src_ip);
+	} else {
+		/* learned new entry */
+		spin_lock(&vxlan->hash_lock);
+		err = vxlan_fdb_create(vxlan, src_mac, src_ip, 0);
+		spin_unlock(&vxlan->hash_lock);
+	}
+}
+
+
+/* See if multicast group is already in use by other ID */
+static bool vxlan_group_used(struct vxlan_net *vn,
+			     const struct vxlan_dev *this)
+{
+	const struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+	unsigned h;
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
+			if (vxlan == this)
+				continue;
+
+			if (!netif_running(vxlan->dev))
+				continue;
+
+			if (vxlan->gaddr == this->gaddr)
+				return true;
+		}
+
+	return false;
+}
+
+/* kernel equivalent to IP_ADD_MEMBERSHIP */
+static int vxlan_join_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+	int err;
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast join */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+
+/* kernel equivalent to IP_DROP_MEMBERSHIP */
+static int vxlan_leave_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	int err = 0;
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast leave */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_leave_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+/* Propagate ECN from outer IP header to tunneled packet */
+static inline void vxlan_ecn_decap(const struct iphdr *iph, struct sk_buff *skb)
+{
+	if (INET_ECN_is_ce(iph->tos)) {
+		if (skb->protocol == htons(ETH_P_IP))
+			IP_ECN_set_ce(ip_hdr(skb));
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			IP6_ECN_set_ce(ipv6_hdr(skb));
+	}
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct iphdr *oip;
+	struct vxlanhdr *vxh;
+	struct vxlan_dev *vxlan;
+	struct vxlan_stats *stats;
+	__u32 vni;
+
+	/* pop off outer UDP header */
+	__skb_pull(skb, sizeof(struct udphdr));
+
+	/* Need Vxlan and inner Ethernet header to be present */
+	if (!pskb_may_pull(skb,
+			   sizeof(struct vxlanhdr) + sizeof(struct ethhdr)))
+		goto error;
+
+	oip = ip_hdr(skb);
+
+	/* Drop packets with reserved bits set */
+	vxh = (struct vxlanhdr *) skb->data;
+	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+	    (vxh->vx_vni & htonl(0xff)))
+		goto error;
+
+	__skb_pull(skb, sizeof(struct vxlanhdr));
+
+	/* Is this VNI defined? */
+	vni = ntohl(vxh->vx_vni) >> 8;
+	vxlan = vxlan_find_vni(sock_net(sk), vni);
+	if (!vxlan)
+		goto drop;
+
+	/* Ignore packets if device is not up */
+	if (!netif_running(vxlan->dev))
+		goto drop;
+
+	/* Re-examine inner Ethernet packet */
+	skb->protocol = eth_type_trans(skb, vxlan->dev);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Ignore packet loops (and multicast echo) */
+	if (compare_ether_addr(eth_hdr(skb)->h_source,
+			       vxlan->dev->dev_addr) == 0)
+		goto drop;
+
+	/* Check for multicast group configuration errors */
+	if (IN_MULTICAST(ntohl(oip->daddr)) &&
+	    oip->daddr != vxlan->gaddr) {
+		if (net_ratelimit())
+			netdev_notice(vxlan->dev,
+				      "group address %pI4 does not match\n",
+				      &oip->daddr);
+		goto drop;
+	}
+
+	if (vxlan->learn)
+		vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
+
+	stats = this_cpu_ptr(vxlan->stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+
+	__skb_tunnel_rx(skb, vxlan->dev);
+	skb_reset_network_header(skb);
+	vxlan_ecn_decap(oip, skb);
+
+	netif_rx(skb);
+
+	return 0;
+error:
+	/* Put UDP header back */
+	__skb_push(skb, sizeof(struct udphdr));
+
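+	/* non-zero return lets the UDP stack process this as normal UDP */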
+	return 1;
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Extract dsfield from inner protocol */
+static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
+				   const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return iph->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ipv6_get_dsfield((const struct ipv6hdr *)iph);
+	else
+		return 0;
+}
+
+/* Propagate ECN bits out */
+static inline u8 vxlan_ecn_encap(u8 tos,
+				 const struct iphdr *iph,
+				 const struct sk_buff *skb)
+{
+	u8 inner = vxlan_get_dsfield(iph, skb);
+
+	return INET_ECN_encapsulate(tos, inner);
+}
+
+/* Transmit local packets over Vxlan
+ *
+ * Outer IP header inherits ECN and DF from inner header.
+ * Outer UDP destination is the VXLAN assigned port.
+ *           source port is based on hash of flow if available
+ *                       otherwise use a random value
+ */
+static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct rtable *rt;
+	const struct ethhdr *eth;
+	const struct iphdr *old_iph;
+	struct iphdr *iph;
+	struct vxlanhdr *vxh;
+	struct udphdr *uh;
+	struct flowi4 fl4;
+	struct vxlan_fdb *f;
+	unsigned int pkt_len = skb->len;
+	unsigned int mtu;
+	u32 hash;
+	__be32 dst;
+	__be16 df = 0;
+	__u8 tos, ttl;
+	int err;
+
+	/* Need space for new headers (invalidates iph ptr) */
+	if (skb_cow_head(skb, VXLAN_HEADROOM))
+		goto drop;
+
+	eth = (void *)skb->data;
+	old_iph = ip_hdr(skb);
+
+	if (!is_multicast_ether_addr(eth->h_dest) &&
+	    (f = vxlan_find_mac(vxlan, eth->h_dest)))
+		dst = f->remote_ip;
+	else if (vxlan->gaddr) {
+		dst = vxlan->gaddr;
+	} else
+		goto drop;
+
+	ttl = vxlan->ttl;
+	if (!ttl && IN_MULTICAST(ntohl(dst)))
+		ttl = 1;
+
+	tos = vxlan->tos;
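+	/* a tos of 1 means: inherit dsfield from the inner packet */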
+	if (tos == 1)
+		tos = vxlan_get_dsfield(old_iph, skb);
+
+	hash = skb_get_rxhash(skb);
+
+	rt = ip_route_output_gre(dev_net(dev), &fl4, dst,
+				 vxlan->saddr, vxlan->vni,
+				 RT_TOS(tos), vxlan->link);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &dst);
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+
+	if (rt->dst.dev == dev) {
+		netdev_dbg(dev, "circular route to %pI4\n", &dst);
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	mtu = dst_mtu(&rt->dst) - VXLAN_HEADROOM;
+	/* Do PMTU */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		df |= old_iph->frag_off & htons(IP_DF);
+		if (df && mtu < pkt_len) {
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+				  htonl(mtu));
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#endif
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = htonl(vxlan->vni << 8);
+
+	__skb_push(skb, sizeof(*uh));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = htons(vxlan_port);
+	uh->source = hash ?: random32();
+
+	uh->len = htons(skb->len);
+	uh->check = 0;
+
+	__skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph		= ip_hdr(skb);
+	iph->version	= 4;
+	iph->ihl	= sizeof(struct iphdr) >> 2;
+	iph->frag_off	= df;
+	iph->protocol	= IPPROTO_UDP;
+	iph->tos	= tos;
+	iph->daddr	= fl4.daddr;
+	iph->saddr	= fl4.saddr;
+	iph->ttl	= ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+	/* See __IPTUNNEL_XMIT */
+	skb->ip_summed = CHECKSUM_NONE;
+	ip_select_ident(iph, &rt->dst, NULL);
+
+	err = ip_local_out(skb);
+	if (likely(net_xmit_eval(err) == 0)) {
+		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += pkt_len;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		dev->stats.tx_errors++;
+		dev->stats.tx_aborted_errors++;
+	}
+	return NETDEV_TX_OK;
+
+drop:
+	dev->stats.tx_dropped++;
+	goto tx_free;
+
+tx_error:
+	dev->stats.tx_errors++;
+tx_free:
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Walk the forwarding table and purge stale entries */
+static void vxlan_cleanup(unsigned long arg)
+{
+	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
+	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
+	unsigned int h;
+
+	if (!netif_running(vxlan->dev))
+		return;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			unsigned long timeout = f->used + FDB_AGE_TIME;
+
+			if (f->flags & VXLAN_FDB_PERM)
+				continue;
+
+			if (time_before_eq(timeout, jiffies)) {
+				netdev_dbg(vxlan->dev,
+					   "garbage collect %pM\n",
+					   f->eth_addr);
+				vxlan_fdb_destroy(vxlan, f);
+			} else if (time_before(timeout, next_timer))
+				next_timer = timeout;
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	mod_timer(&vxlan->age_timer, next_timer);
+}
+
+/* Setup stats when device is created */
+static int vxlan_init(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	vxlan->stats = alloc_percpu(struct vxlan_stats);
+	if (!vxlan->stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Start ageing timer and join group when device is brought up */
+static int vxlan_open(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	int err;
+
+	if (vxlan->gaddr) {
+		err = vxlan_join_group(dev);
+		if (err)
+			return err;
+	}
+
+	mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+	return 0;
+}
+
+/* Purge the forwarding table */
+static void vxlan_flush(struct vxlan_dev *vxlan)
+{
+	unsigned h;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			vxlan_fdb_destroy(vxlan, f);
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+}
+
+/* Cleanup timer and forwarding table on shutdown */
+static int vxlan_stop(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (vxlan->gaddr)
+		vxlan_leave_group(dev);
+
+	del_timer_sync(&vxlan->age_timer);
+
+	vxlan_flush(vxlan);
+
+	return 0;
+}
+
+/* Merge per-cpu statistics */
+static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
+					       struct rtnl_link_stats64 *stats)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_stats tmp, sum = { 0 };
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		unsigned int start;
+		const struct vxlan_stats *stats
+			= per_cpu_ptr(vxlan->stats, cpu);
+
+		do {
+			start = u64_stats_fetch_begin_bh(&stats->syncp);
+			memcpy(&tmp, stats, sizeof(tmp));
+		} while (u64_stats_fetch_retry_bh(&stats->syncp, start));
+
+		sum.tx_bytes   += tmp.tx_bytes;
+		sum.tx_packets += tmp.tx_packets;
+		sum.rx_bytes   += tmp.rx_bytes;
+		sum.rx_packets += tmp.rx_packets;
+	}
+
+	stats->tx_bytes   = sum.tx_bytes;
+	stats->tx_packets = sum.tx_packets;
+	stats->rx_bytes   = sum.rx_bytes;
+	stats->rx_packets = sum.rx_packets;
+
+	stats->tx_dropped = dev->stats.tx_dropped;
+	stats->tx_errors  = dev->stats.tx_errors;
+	stats->tx_carrier_errors  = dev->stats.tx_carrier_errors;
+	stats->collisions  = dev->stats.collisions;
+
+	return stats;
+}
+
+/* Stub, nothing needs to be done. */
+static void vxlan_set_multicast_list(struct net_device *dev)
+{
+}
+
+static const struct net_device_ops vxlan_netdev_ops = {
+	.ndo_init		= vxlan_init,
+	.ndo_open		= vxlan_open,
+	.ndo_stop		= vxlan_stop,
+	.ndo_start_xmit		= vxlan_xmit,
+	.ndo_get_stats64	= vxlan_stats64,
+	.ndo_set_rx_mode	= vxlan_set_multicast_list,
+	.ndo_change_mtu		= eth_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_fdb_add		= vxlan_fdb_add,
+	.ndo_fdb_del		= vxlan_fdb_delete,
+	.ndo_fdb_dump		= vxlan_fdb_dump,
+};
+
+/* Info for udev, that this is a virtual tunnel endpoint */
+static struct device_type vxlan_type = {
+	.name = "vxlan",
+};
+
+static void vxlan_free(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	free_percpu(vxlan->stats);
+	free_netdev(dev);
+}
+
+/* Initialize the device structure. */
+static void vxlan_setup(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned h;
+
+	eth_hw_addr_random(dev);
+	ether_setup(dev);
+
+	dev->netdev_ops = &vxlan_netdev_ops;
+	dev->destructor = vxlan_free;
+	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
+
+	dev->tx_queue_len = 0;
+	dev->features	|= NETIF_F_LLTX;
+	dev->features	|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;
+
+	spin_lock_init(&vxlan->hash_lock);
+
+	init_timer_deferrable(&vxlan->age_timer);
+	vxlan->age_timer.function = vxlan_cleanup;
+	vxlan->age_timer.data = (unsigned long) vxlan;
+
+	vxlan->dev = dev;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+}
+
+static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
+	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
+	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
+	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
+};
+
+static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+			pr_debug("invalid link address (not ethernet)\n");
+			return -EINVAL;
+		}
+
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+			pr_debug("invalid all zero ethernet address\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	if (!data)
+		return -EINVAL;
+
+	if (data[IFLA_VXLAN_ID]) {
+		__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+		if (id >= VXLAN_VID_MASK)
+			return -ERANGE;
+	}
+
+	if (data[IFLA_VXLAN_GROUP]) {
+		__be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		if (!IN_MULTICAST(ntohl(gaddr))) {
+			pr_debug("group address is not IPv4 multicast\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	return 0;
+}
+
+static int vxlan_newlink(struct net *net, struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__u32 vni;
+	int err;
+
+	if (!data[IFLA_VXLAN_ID])
+		return -EINVAL;
+
+	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
+	if (vxlan_find_vni(net, vni)) {
+		pr_info("duplicate VNI %u\n", vni);
+		return -EEXIST;
+	}
+	vxlan->vni = vni;
+
+	if (data[IFLA_VXLAN_GROUP])
+		vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+
+	if (data[IFLA_VXLAN_LOCAL])
+		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+
+	if (data[IFLA_VXLAN_LINK])
+		vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]);
+
+	if (data[IFLA_VXLAN_TTL])
+		vxlan->ttl  = nla_get_u8(data[IFLA_VXLAN_TTL]);
+
+	if (data[IFLA_VXLAN_TOS])
+		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
+
+	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
+		vxlan->learn = true;
+
+	err = register_netdevice(dev);
+	if (!err)
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
+
+	return err;
+}
+
+static void vxlan_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	hlist_del_rcu(&vxlan->hlist);
+
+	unregister_netdevice_queue(dev, head);
+}
+
+static size_t vxlan_get_size(const struct net_device *dev)
+{
+
+	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
+		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
+		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
+		0;
+}
+
+static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
+		goto nla_put_failure;
+
+	if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
+		goto nla_put_failure;
+
+	if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
+		goto nla_put_failure;
+
+	if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
+	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
+	.kind		= "vxlan",
+	.maxtype	= IFLA_VXLAN_MAX,
+	.policy		= vxlan_policy,
+	.priv_size	= sizeof(struct vxlan_dev),
+	.setup		= vxlan_setup,
+	.validate	= vxlan_validate,
+	.newlink	= vxlan_newlink,
+	.dellink	= vxlan_dellink,
+	.get_size	= vxlan_get_size,
+	.fill_info	= vxlan_fill_info,
+};
+
+static __net_init int vxlan_init_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct sock *sk;
+	struct sockaddr_in addr;
+	int rc;
+	unsigned h;
+
+	/* Create UDP socket for encapsulation receive. */
+	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
+	if (rc < 0) {
+		pr_debug("UDP socket create failed\n");
+		return rc;
+	}
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = INADDR_ANY;
+	addr.sin_port = htons(vxlan_port);
+
+	rc = kernel_bind(vn->sock, (struct sockaddr *) &addr, sizeof(addr));
+	if (rc < 0) {
+		pr_debug("bind for port %u failed %d\n", vxlan_port, rc);
+		sock_release(vn->sock);
+		vn->sock = NULL;
+		return rc;
+	}
+
+	/* Disable multicast loopback */
+	sk = vn->sock->sk;
+	inet_sk(sk)->mc_loop = 0;
+
+	/* Mark socket as an encapsulation socket. */
+	udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP;
+	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
+	udp_encap_enable();
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vn->vni_list[h]);
+
+	return 0;
+}
+
+static __net_exit void vxlan_exit_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	if (vn->sock) {
+		sock_release(vn->sock);
+		vn->sock = NULL;
+	}
+}
+
+static struct pernet_operations vxlan_net_ops = {
+	.init = vxlan_init_net,
+	.exit = vxlan_exit_net,
+	.id   = &vxlan_net_id,
+	.size = sizeof(struct vxlan_net),
+};
+
+static int __init vxlan_init_module(void)
+{
+	int rc;
+
+	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
+
+	rc = register_pernet_device(&vxlan_net_ops);
+	if (rc)
+		goto out1;
+
+	rc = rtnl_link_register(&vxlan_link_ops);
+	if (rc)
+		goto out2;
+
+	return 0;
+
+out2:
+	unregister_pernet_device(&vxlan_net_ops);
+out1:
+	return rc;
+}
+module_init(vxlan_init_module);
+
+static void __exit vxlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&vxlan_link_ops);
+	unregister_pernet_device(&vxlan_net_ops);
+}
+module_exit(vxlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VXLAN_VERSION);
+MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
+MODULE_ALIAS_RTNL_LINK("vxlan");
--- a/include/linux/if_link.h	2012-09-24 10:56:57.080291529 -0700
+++ b/include/linux/if_link.h	2012-09-24 11:08:02.869416484 -0700
@@ -272,6 +272,20 @@ enum macvlan_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VXLAN section */
+enum {
+	IFLA_VXLAN_UNSPEC,
+	IFLA_VXLAN_ID,
+	IFLA_VXLAN_GROUP,
+	IFLA_VXLAN_LINK,
+	IFLA_VXLAN_LOCAL,
+	IFLA_VXLAN_TTL,
+	IFLA_VXLAN_TOS,
+	IFLA_VXLAN_LEARNING,
+	__IFLA_VXLAN_MAX
+};
+#define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/Documentation/networking/vxlan.txt	2012-09-24 11:08:02.869416484 -0700
@@ -0,0 +1,36 @@
+Virtual eXtensible Local Area Networking documentation
+======================================================
+
+The VXLAN protocol is a tunnelling protocol designed to solve the
+problem of the limited number of available VLANs (4096). With VXLAN,
+the identifier is expanded to 24 bits.
+
+It is a draft RFC standard that is implemented by Cisco Nexus,
+VMware and Brocade. The protocol runs over UDP using a single
+destination port (not yet standardized by IANA).
+This document describes the Linux kernel tunnel device;
+there is also an implementation of VXLAN for Open vSwitch.
+
+Unlike most tunnels, a VXLAN is a 1 to N network, not just point
+to point. A VXLAN device can either dynamically learn the IP address
+of the other end, in a manner similar to a learning bridge, or the
+forwarding entries can be configured statically (see the bridge fdb
+examples below).
+
+The management of vxlan is done in a similar fashion to its two
+closest neighbors, GRE and VLAN. Configuring VXLAN requires a
+version of iproute2 that matches the kernel release where VXLAN
+was first merged upstream.
+
+1. Create vxlan device
+  # ip li add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
+
+This creates a new device (vxlan0). The device uses the
+multicast group 239.1.1.1 over eth1 to handle packets for which
+no entry is in the forwarding table.
+
+2. Delete vxlan device
+  # ip link delete vxlan0
+
+3. Show vxlan info
+  # ip -d link show vxlan0
+
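+Since the driver implements the fdb netlink operations, the
+forwarding table can also be inspected and modified with the
+bridge(8) command, assuming an iproute2 build new enough to support
+fdb operations (the MAC and IP addresses below are examples):
+
+4. Create forwarding table entry
+  # bridge fdb add to 00:17:42:8a:b4:05 dst 192.19.0.2 dev vxlan0
+
+5. Delete forwarding table entry
+  # bridge fdb delete 00:17:42:8a:b4:05 dev vxlan0
+
+6. Show forwarding table
+  # bridge fdb show dev vxlan0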

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv2 net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 19:46     ` [PATCHv2 " Stephen Hemminger
@ 2012-09-24 19:55       ` Eric Dumazet
  2012-09-24 20:02         ` [PATCHv3 " Stephen Hemminger
  2012-09-24 20:09       ` [PATCHv2 " Eric Dumazet
  1 sibling, 1 reply; 43+ messages in thread
From: Eric Dumazet @ 2012-09-24 19:55 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, Chris Wright, netdev

On Mon, 2012-09-24 at 12:46 -0700, Stephen Hemminger wrote:

> +	f = vxlan_find_mac(vxlan, src_mac);
> +	if (likely(f)) {
> +		f->used = jiffies;
> +		if (likely(f->remote_ip == src_ip))
> +			return;
> +
> +		f->remote_ip = src_ip;
> +		f->updated = jiffies;
> +
> +		if (net_ratelimit())
> +			netdev_info(dev,
> +				    "%pM migrated from %pI4 to %pI4\n",
> +				    src_mac, &f->remote_ip, &src_ip);


This message is useless, as the previous f->remote_ip content has
already been overwritten.
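
The netdev_info() has to run before f->remote_ip is updated,
something like:

	if (net_ratelimit())
		netdev_info(dev,
			    "%pM migrated from %pI4 to %pI4\n",
			    src_mac, &f->remote_ip, &src_ip);

	f->remote_ip = src_ip;
	f->updated = jiffies;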

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCHv3 net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 19:55       ` Eric Dumazet
@ 2012-09-24 20:02         ` Stephen Hemminger
  2012-09-24 20:24           ` John Fastabend
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 20:02 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Chris Wright, netdev

This is an implementation of Virtual eXtensible Local Area Network
as described in draft RFC:
  http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02

The driver implements Virtual Tunnel Endpoint (VTEP) functionality,
learning the MAC to IP address mapping.

This implementation has not been tested for interoperation with
other equipment.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
v3 - fix ordering of change versus migration message
v2 - fix use of ip header after pskb_may_pull

 Documentation/networking/vxlan.txt |   36 +
 drivers/net/Kconfig                |   13 
 drivers/net/Makefile               |    1 
 drivers/net/vxlan.c                | 1180 +++++++++++++++++++++++++++++++++++++
 include/linux/if_link.h            |   14 
 5 files changed, 1244 insertions(+)

--- a/drivers/net/Kconfig	2012-09-24 10:56:57.080291529 -0700
+++ b/drivers/net/Kconfig	2012-09-24 12:43:25.298239026 -0700
@@ -149,6 +149,19 @@ config MACVTAP
 	  To compile this driver as a module, choose M here: the module
 	  will be called macvtap.
 
+config VXLAN
+       tristate "Virtual eXtensible Local Area Network (VXLAN)"
+       depends on EXPERIMENTAL
+       ---help---
+	  This allows one to create vxlan virtual interfaces that provide
+	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
+	  to tunnel virtual network infrastructure in virtualized environments.
+	  For more information see:
+	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called vxlan.
+
 config NETCONSOLE
 	tristate "Network console logging support"
 	---help---
--- a/drivers/net/Makefile	2012-09-24 10:56:57.080291529 -0700
+++ b/drivers/net/Makefile	2012-09-24 11:08:02.865416523 -0700
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
 obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VXLAN) += vxlan.o
 
 #
 # Networking Drivers
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/net/vxlan.c	2012-09-24 12:58:01.809401221 -0700
@@ -0,0 +1,1171 @@
+/*
+ * VXLAN: Virtual eXtensible Local Area Network
+ *
+ * Copyright (c) 2012 Vyatta Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * TODO
+ *  - use IANA UDP port number (when defined)
+ *  - IPv6 (not in RFC)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/version.h>
+#include <linux/hash.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define VXLAN_VERSION	"0.0"
+
+#define VNI_HASH_BITS	10
+#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS	8
+#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
+
+#define VXLAN_N_VID	(1u << 24)
+#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
+
+#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */
+#define FDB_AGE_TIME	 (300 * HZ)	/* drop if not used in 5 min */
+
+#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+#define VXLAN_HEADROOM (sizeof(struct iphdr)		\
+			+ sizeof(struct udphdr)		\
+			+ sizeof(struct vxlanhdr))
+
+/* UDP port for VXLAN traffic. */
+static unsigned int vxlan_port __read_mostly = 8472;
+module_param_named(port, vxlan_port, uint, 0);
+MODULE_PARM_DESC(port, "Destination UDP port");
+
+/* per-net private data for this module */
+static unsigned int vxlan_net_id;
+struct vxlan_net {
+	struct socket	  *sock;	/* UDP encap socket */
+	struct hlist_head vni_list[VNI_HASH_SIZE];
+};
+
+/* Forwarding table entry */
+struct vxlan_fdb {
+	struct hlist_node hlist;	/* linked list of entries */
+	struct rcu_head	  rcu;
+	unsigned long	  updated;	/* jiffies */
+	unsigned long	  used;
+#define VXLAN_FDB_PERM			0x1
+	u32		  flags;
+	__be32		  remote_ip;
+	u8		  eth_addr[ETH_ALEN];
+};
+
+/* Per-cpu network traffic stats */
+struct vxlan_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+};
+
+/* Pseudo network device */
+struct vxlan_dev {
+	struct hlist_node hlist;
+	struct net_device *dev;
+	struct vxlan_stats __percpu *stats;
+	__u32		  vni;		/* virtual network id */
+	__be32	          gaddr;	/* multicast group */
+	__be32		  saddr;	/* source address */
+	unsigned int      link;		/* link to multicast over */
+	__u8		  tos;		/* TOS override */
+	__u8		  ttl;
+	bool		  learn;
+
+	struct timer_list age_timer;
+	spinlock_t	  hash_lock;
+	struct hlist_head fdb_head[FDB_HASH_SIZE];
+};
+
+/* salt for hash table */
+static u32 vxlan_salt __read_mostly;
+
+static inline struct hlist_head *vni_head(struct net *net, u32 id)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
+}
+
+/* Look up VNI in a per net namespace table */
+static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
+{
+	struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
+		if (vxlan->vni == id)
+			return vxlan;
+	}
+
+	return NULL;
+}
+
+/* Fill in neighbour message in skbuff. */
+static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
+			   const struct vxlan_fdb *fdb,
+			   u32 portid, u32 seq, int type, unsigned int flags)
+{
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	memset(ndm, 0, sizeof(*ndm));
+	ndm->ndm_family	= AF_BRIDGE;
+
+	if (fdb->flags & VXLAN_FDB_PERM)
+		ndm->ndm_state = NUD_PERMANENT;
+	else if (time_before_eq(fdb->used + FDB_AGE_TIME, now))
+		ndm->ndm_state = NUD_STALE;
+	else
+		ndm->ndm_state = NUD_REACHABLE;
+
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
+		goto nla_put_failure;
+
+	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
+	ci.ndm_confirmed = 0;
+	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
+	ci.ndm_refcnt	 = 0;
+
+	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline size_t vxlan_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ndmsg))
+		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
+			     const struct vxlan_fdb *fdb, int type)
+{
+	struct net *net = dev_net(vxlan->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Hash Ethernet address */
+static u32 eth_hash(const unsigned char *addr)
+{
+	/* could be optimized for unaligned access */
+	u32 a = addr[5] << 8 | addr[4];
+	u32 b = addr[3] << 24 | addr[2] << 16 | addr[1] << 8 | addr[0];
+
+	return jhash_2words(a, b, vxlan_salt);
+}
+
+/* Hash chain to use given mac address */
+static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
+						const u8 *mac)
+{
+	return &vxlan->fdb_head[hash_32(eth_hash(mac), FDB_HASH_BITS)];
+}
+
+/* Look up Ethernet address in forwarding table */
+static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
+					const u8 *mac)
+
+{
+	struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
+	struct vxlan_fdb *f;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(f, node, head, hlist) {
+		if (compare_ether_addr(mac, f->eth_addr) == 0)
+			return f;
+	}
+
+	return NULL;
+}
+
+/* Add new entry to forwarding table -- assumes lock held */
+static int vxlan_fdb_create(struct vxlan_dev *vxlan,
+			    const u8 *mac, __be32 ip,
+			    u32 flags)
+{
+	struct vxlan_fdb *f;
+
+	if (unlikely(vxlan_find_mac(vxlan, mac))) {
+		netdev_dbg(vxlan->dev,
+			   "lost race to create %pM\n", mac);
+		return -EEXIST;
+	}
+
+	netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
+
+	f = kmalloc(sizeof(*f), GFP_ATOMIC);
+	if (!f)
+		return -ENOMEM;
+
+	f->remote_ip = ip;
+	memcpy(f->eth_addr, mac, ETH_ALEN);
+	hlist_add_head_rcu(&f->hlist,
+			   vxlan_fdb_head(vxlan, mac));
+	f->updated = f->used = jiffies;
+	f->flags = flags;
+
+	vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
+
+	return 0;
+}
+
+static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
+{
+	netdev_dbg(vxlan->dev,
+		    "delete %pM\n", f->eth_addr);
+
+	vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
+
+	hlist_del_rcu(&f->hlist);
+	kfree_rcu(f, rcu);
+}
+
+/* Add static entry (via netlink) */
+static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			 struct net_device *dev,
+			 const unsigned char *addr, u16 flags)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__be32 ip;
+	int err;
+
+	if (tb[NDA_DST] == NULL)
+		return -EINVAL;
+
+	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
+		return -EAFNOSUPPORT;
+
+	ip = nla_get_be32(tb[NDA_DST]);
+
+	spin_lock_bh(&vxlan->hash_lock);
+	err = vxlan_fdb_create(vxlan, addr, ip, VXLAN_FDB_PERM);
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Delete entry (via netlink) */
+static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err = -ENOENT;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	f = vxlan_find_mac(vxlan, addr);
+	if (f) {
+		vxlan_fdb_destroy(vxlan, f);
+		err = 0;
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Dump forwarding table */
+static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  struct net_device *dev, int idx)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned int h;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct vxlan_fdb *f;
+		struct hlist_node *n;
+
+		hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) {
+			if (idx < cb->args[0])
+				goto skip;
+
+			if (vxlan_fdb_info(skb, vxlan, f,
+					    NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq,
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) < 0)
+				break;
+skip:
+			++idx;
+		}
+	}
+
+	return idx;
+}
+
+/* Watch incoming packets to learn mapping between Ethernet address
+ * and Tunnel endpoint.
+ */
+static void vxlan_snoop(struct net_device *dev,
+			__be32 src_ip, const u8 *src_mac)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err;
+
+	f = vxlan_find_mac(vxlan, src_mac);
+	if (likely(f)) {
+		f->used = jiffies;
+		if (likely(f->remote_ip == src_ip))
+			return;
+
+		if (net_ratelimit())
+			netdev_info(dev,
+				    "%pM migrated from %pI4 to %pI4\n",
+				    src_mac, &f->remote_ip, &src_ip);
+
+		f->remote_ip = src_ip;
+		f->updated = jiffies;
+	} else {
+		/* learned new entry */
+		spin_lock(&vxlan->hash_lock);
+		err = vxlan_fdb_create(vxlan, src_mac, src_ip, 0);
+		spin_unlock(&vxlan->hash_lock);
+	}
+}
+
+
+/* See if multicast group is already in use by other ID */
+static bool vxlan_group_used(struct vxlan_net *vn,
+			     const struct vxlan_dev *this)
+{
+	const struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+	unsigned h;
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
+			if (vxlan == this)
+				continue;
+
+			if (!netif_running(vxlan->dev))
+				continue;
+
+			if (vxlan->gaddr == this->gaddr)
+				return true;
+		}
+
+	return false;
+}
+
+/* kernel equivalent to IP_ADD_MEMBERSHIP */
+static int vxlan_join_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+	int err;
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast join */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+
+/* kernel equivalent to IP_DROP_MEMBERSHIP */
+static int vxlan_leave_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	int err = 0;
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast leave */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_leave_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+/* Propagate ECN from outer IP header to tunneled packet */
+static inline void vxlan_ecn_decap(const struct iphdr *iph, struct sk_buff *skb)
+{
+	if (INET_ECN_is_ce(iph->tos)) {
+		if (skb->protocol == htons(ETH_P_IP))
+			IP_ECN_set_ce(ip_hdr(skb));
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			IP6_ECN_set_ce(ipv6_hdr(skb));
+	}
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct iphdr *oip;
+	struct vxlanhdr *vxh;
+	struct vxlan_dev *vxlan;
+	struct vxlan_stats *stats;
+	__u32 vni;
+
+	/* pop off outer UDP header */
+	__skb_pull(skb, sizeof(struct udphdr));
+
+	/* Need Vxlan and inner Ethernet header to be present */
+	if (!pskb_may_pull(skb,
+			   sizeof(struct vxlanhdr) + sizeof(struct ethhdr)))
+		goto error;
+
+	oip = ip_hdr(skb);
+
+	/* Drop packets with reserved bits set */
+	vxh = (struct vxlanhdr *) skb->data;
+	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+	    (vxh->vx_vni & htonl(0xff)))
+		goto error;
+
+	__skb_pull(skb, sizeof(struct vxlanhdr));
+
+	/* Is this VNI defined? */
+	vni = ntohl(vxh->vx_vni) >> 8;
+	vxlan = vxlan_find_vni(sock_net(sk), vni);
+	if (!vxlan)
+		goto drop;
+
+	/* Ignore packets if device is not up */
+	if (!netif_running(vxlan->dev))
+		goto drop;
+
+	/* Re-examine inner Ethernet packet */
+	skb->protocol = eth_type_trans(skb, vxlan->dev);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Ignore packet loops (and multicast echo) */
+	if (compare_ether_addr(eth_hdr(skb)->h_source,
+			       vxlan->dev->dev_addr) == 0)
+		goto drop;
+
+	/* Check for multicast group configuration errors */
+	if (IN_MULTICAST(ntohl(oip->daddr)) &&
+	    oip->daddr != vxlan->gaddr) {
+		if (net_ratelimit())
+			netdev_notice(vxlan->dev,
+				      "group address %pI4 does not match\n",
+				      &oip->daddr);
+		goto drop;
+	}
+
+	if (vxlan->learn)
+		vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
+
+	stats = this_cpu_ptr(vxlan->stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+
+	__skb_tunnel_rx(skb, vxlan->dev);
+	skb_reset_network_header(skb);
+	vxlan_ecn_decap(oip, skb);
+
+	netif_rx(skb);
+
+	return 0;
+error:
+	/* Put UDP header back */
+	__skb_push(skb, sizeof(struct udphdr));
+
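+	/* non-zero return lets the UDP stack process this as normal UDP */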
+	return 1;
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Extract dsfield from inner protocol */
+static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
+				   const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return iph->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ipv6_get_dsfield((const struct ipv6hdr *)iph);
+	else
+		return 0;
+}
+
+/* Propagate ECN bits out */
+static inline u8 vxlan_ecn_encap(u8 tos,
+				 const struct iphdr *iph,
+				 const struct sk_buff *skb)
+{
+	u8 inner = vxlan_get_dsfield(iph, skb);
+
+	return INET_ECN_encapsulate(tos, inner);
+}
+
+/* Transmit local packets over Vxlan
+ *
+ * Outer IP header inherits ECN and DF from inner header.
+ * Outer UDP destination is the VXLAN assigned port.
+ *           source port is based on hash of flow if available
+ *                       otherwise use a random value
+ */
+static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct rtable *rt;
+	const struct ethhdr *eth;
+	const struct iphdr *old_iph;
+	struct iphdr *iph;
+	struct vxlanhdr *vxh;
+	struct udphdr *uh;
+	struct flowi4 fl4;
+	struct vxlan_fdb *f;
+	unsigned int pkt_len = skb->len;
+	unsigned int mtu;
+	u32 hash;
+	__be32 dst;
+	__be16 df = 0;
+	__u8 tos, ttl;
+	int err;
+
+	/* Need space for new headers (invalidates iph ptr) */
+	if (skb_cow_head(skb, VXLAN_HEADROOM))
+		goto drop;
+
+	eth = (void *)skb->data;
+	old_iph = ip_hdr(skb);
+
+	if (!is_multicast_ether_addr(eth->h_dest) &&
+	    (f = vxlan_find_mac(vxlan, eth->h_dest)))
+		dst = f->remote_ip;
+	else if (vxlan->gaddr) {
+		dst = vxlan->gaddr;
+	} else
+		goto drop;
+
+	ttl = vxlan->ttl;
+	if (!ttl && IN_MULTICAST(ntohl(dst)))
+		ttl = 1;
+
+	tos = vxlan->tos;
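+	/* a tos of 1 means: inherit dsfield from the inner packet */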
+	if (tos == 1)
+		tos = vxlan_get_dsfield(old_iph, skb);
+
+	hash = skb_get_rxhash(skb);
+
+	rt = ip_route_output_gre(dev_net(dev), &fl4, dst,
+				 vxlan->saddr, vxlan->vni,
+				 RT_TOS(tos), vxlan->link);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &dst);
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+
+	if (rt->dst.dev == dev) {
+		netdev_dbg(dev, "circular route to %pI4\n", &dst);
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	mtu = dst_mtu(&rt->dst) - VXLAN_HEADROOM;
+	/* Do PMTU */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		df |= old_iph->frag_off & htons(IP_DF);
+		if (df && mtu < pkt_len) {
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+				  htonl(mtu));
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#endif
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = htonl(vxlan->vni << 8);
+
+	__skb_push(skb, sizeof(*uh));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = htons(vxlan_port);
+	uh->source = hash ?: random32();
+
+	uh->len = htons(skb->len);
+	uh->check = 0;
+
+	__skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph		= ip_hdr(skb);
+	iph->version	= 4;
+	iph->ihl	= sizeof(struct iphdr) >> 2;
+	iph->frag_off	= df;
+	iph->protocol	= IPPROTO_UDP;
+	iph->tos	= tos;
+	iph->daddr	= fl4.daddr;
+	iph->saddr	= fl4.saddr;
+	iph->ttl	= ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+	/* See __IPTUNNEL_XMIT */
+	skb->ip_summed = CHECKSUM_NONE;
+	ip_select_ident(iph, &rt->dst, NULL);
+
+	err = ip_local_out(skb);
+	if (likely(net_xmit_eval(err) == 0)) {
+		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += pkt_len;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		dev->stats.tx_errors++;
+		dev->stats.tx_aborted_errors++;
+	}
+	return NETDEV_TX_OK;
+
+drop:
+	dev->stats.tx_dropped++;
+	goto tx_free;
+
+tx_error:
+	dev->stats.tx_errors++;
+tx_free:
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Walk the forwarding table and purge stale entries */
+static void vxlan_cleanup(unsigned long arg)
+{
+	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
+	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
+	unsigned int h;
+
+	if (!netif_running(vxlan->dev))
+		return;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			unsigned long timeout = f->used + FDB_AGE_TIME;
+
+			if (f->flags & VXLAN_FDB_PERM)
+				continue;
+
+			if (time_before_eq(timeout, jiffies)) {
+				netdev_dbg(vxlan->dev,
+					   "garbage collect %pM\n",
+					   f->eth_addr);
+				vxlan_fdb_destroy(vxlan, f);
+			} else if (time_before(timeout, next_timer))
+				next_timer = timeout;
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	mod_timer(&vxlan->age_timer, next_timer);
+}
+
+/* Setup stats when device is created */
+static int vxlan_init(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	vxlan->stats = alloc_percpu(struct vxlan_stats);
+	if (!vxlan->stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Start ageing timer and join group when device is brought up */
+static int vxlan_open(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	int err;
+
+	if (vxlan->gaddr) {
+		err = vxlan_join_group(dev);
+		if (err)
+			return err;
+	}
+
+	mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+	return 0;
+}
+
+/* Purge the forwarding table */
+static void vxlan_flush(struct vxlan_dev *vxlan)
+{
+	unsigned h;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			vxlan_fdb_destroy(vxlan, f);
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+}
+
+/* Cleanup timer and forwarding table on shutdown */
+static int vxlan_stop(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (vxlan->gaddr)
+		vxlan_leave_group(dev);
+
+	del_timer_sync(&vxlan->age_timer);
+
+	vxlan_flush(vxlan);
+
+	return 0;
+}
+
+/* Merge per-cpu statistics */
+static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
+					       struct rtnl_link_stats64 *stats)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_stats tmp, sum = { 0 };
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		unsigned int start;
+		const struct vxlan_stats *stats
+			= per_cpu_ptr(vxlan->stats, cpu);
+
+		do {
+			start = u64_stats_fetch_begin_bh(&stats->syncp);
+			memcpy(&tmp, stats, sizeof(tmp));
+		} while (u64_stats_fetch_retry_bh(&stats->syncp, start));
+
+		sum.tx_bytes   += tmp.tx_bytes;
+		sum.tx_packets += tmp.tx_packets;
+		sum.rx_bytes   += tmp.rx_bytes;
+		sum.rx_packets += tmp.rx_packets;
+	}
+
+	stats->tx_bytes   = sum.tx_bytes;
+	stats->tx_packets = sum.tx_packets;
+	stats->rx_bytes   = sum.rx_bytes;
+	stats->rx_packets = sum.rx_packets;
+
+	stats->tx_dropped = dev->stats.tx_dropped;
+	stats->tx_errors  = dev->stats.tx_errors;
+	stats->tx_carrier_errors  = dev->stats.tx_carrier_errors;
+	stats->collisions  = dev->stats.collisions;
+
+	return stats;
+}
+
+/* Stub, nothing needs to be done. */
+static void vxlan_set_multicast_list(struct net_device *dev)
+{
+}
+
+static const struct net_device_ops vxlan_netdev_ops = {
+	.ndo_init		= vxlan_init,
+	.ndo_open		= vxlan_open,
+	.ndo_stop		= vxlan_stop,
+	.ndo_start_xmit		= vxlan_xmit,
+	.ndo_get_stats64	= vxlan_stats64,
+	.ndo_set_rx_mode	= vxlan_set_multicast_list,
+	.ndo_change_mtu		= eth_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_fdb_add		= vxlan_fdb_add,
+	.ndo_fdb_del		= vxlan_fdb_delete,
+	.ndo_fdb_dump		= vxlan_fdb_dump,
+};
+
+/* Info for udev, that this is a virtual tunnel endpoint */
+static struct device_type vxlan_type = {
+	.name = "vxlan",
+};
+
+static void vxlan_free(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	free_percpu(vxlan->stats);
+	free_netdev(dev);
+}
+
+/* Initialize the device structure. */
+static void vxlan_setup(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned h;
+
+	eth_hw_addr_random(dev);
+	ether_setup(dev);
+
+	dev->netdev_ops = &vxlan_netdev_ops;
+	dev->destructor = vxlan_free;
+	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
+
+	dev->tx_queue_len = 0;
+	dev->features	|= NETIF_F_LLTX;
+	dev->features	|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;
+
+	spin_lock_init(&vxlan->hash_lock);
+
+	init_timer_deferrable(&vxlan->age_timer);
+	vxlan->age_timer.function = vxlan_cleanup;
+	vxlan->age_timer.data = (unsigned long) vxlan;
+
+	vxlan->dev = dev;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+}
+
+static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
+	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
+	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
+	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
+};
+
+static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+			pr_debug("invalid link address (not ethernet)\n");
+			return -EINVAL;
+		}
+
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+			pr_debug("invalid all zero ethernet address\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	if (!data)
+		return -EINVAL;
+
+	if (data[IFLA_VXLAN_ID]) {
+		__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+		if (id >= VXLAN_VID_MASK)
+			return -ERANGE;
+	}
+
+	if (data[IFLA_VXLAN_GROUP]) {
+		__be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		if (!IN_MULTICAST(ntohl(gaddr))) {
+			pr_debug("group address is not IPv4 multicast\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	return 0;
+}
+
+static int vxlan_newlink(struct net *net, struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__u32 vni;
+	int err;
+
+	if (!data[IFLA_VXLAN_ID])
+		return -EINVAL;
+
+	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
+	if (vxlan_find_vni(net, vni)) {
+		pr_info("duplicate VNI %u\n", vni);
+		return -EEXIST;
+	}
+	vxlan->vni = vni;
+
+	if (data[IFLA_VXLAN_GROUP])
+		vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+
+	if (data[IFLA_VXLAN_LOCAL])
+		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+
+	if (data[IFLA_VXLAN_LINK])
+		vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]);
+
+	if (data[IFLA_VXLAN_TTL])
+		vxlan->ttl  = nla_get_u8(data[IFLA_VXLAN_TTL]);
+
+	if (data[IFLA_VXLAN_TOS])
+		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
+
+	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
+		vxlan->learn = true;
+
+	err = register_netdevice(dev);
+	if (!err)
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
+
+	return err;
+}
+
+static void vxlan_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	hlist_del_rcu(&vxlan->hlist);
+
+	unregister_netdevice_queue(dev, head);
+}
+
+static size_t vxlan_get_size(const struct net_device *dev)
+{
+
+	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
+		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
+		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
+		0;
+}
+
+static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
+		goto nla_put_failure;
+
+	if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
+		goto nla_put_failure;
+
+	if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
+		goto nla_put_failure;
+
+	if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
+	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
+	.kind		= "vxlan",
+	.maxtype	= IFLA_VXLAN_MAX,
+	.policy		= vxlan_policy,
+	.priv_size	= sizeof(struct vxlan_dev),
+	.setup		= vxlan_setup,
+	.validate	= vxlan_validate,
+	.newlink	= vxlan_newlink,
+	.dellink	= vxlan_dellink,
+	.get_size	= vxlan_get_size,
+	.fill_info	= vxlan_fill_info,
+};
+
+static __net_init int vxlan_init_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct sock *sk;
+	struct sockaddr_in addr;
+	int rc;
+	unsigned h;
+
+	/* Create UDP socket for encapsulation receive. */
+	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
+	if (rc < 0) {
+		pr_debug("UDP socket create failed\n");
+		return rc;
+	}
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = INADDR_ANY;
+	addr.sin_port = htons(vxlan_port);
+
+	rc = kernel_bind(vn->sock, (struct sockaddr *) &addr, sizeof(addr));
+	if (rc < 0) {
+		pr_debug("bind for port %u failed %d\n", vxlan_port, rc);
+		sock_release(vn->sock);
+		vn->sock = NULL;
+		return rc;
+	}
+
+	/* Disable multicast loopback */
+	sk = vn->sock->sk;
+	inet_sk(sk)->mc_loop = 0;
+
+	/* Mark socket as an encapsulation socket. */
+	udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP;
+	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
+	udp_encap_enable();
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vn->vni_list[h]);
+
+	return 0;
+}
+
+static __net_exit void vxlan_exit_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	if (vn->sock) {
+		sock_release(vn->sock);
+		vn->sock = NULL;
+	}
+}
+
+static struct pernet_operations vxlan_net_ops = {
+	.init = vxlan_init_net,
+	.exit = vxlan_exit_net,
+	.id   = &vxlan_net_id,
+	.size = sizeof(struct vxlan_net),
+};
+
+static int __init vxlan_init_module(void)
+{
+	int rc;
+
+	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
+
+	rc = register_pernet_device(&vxlan_net_ops);
+	if (rc)
+		goto out1;
+
+	rc = rtnl_link_register(&vxlan_link_ops);
+	if (rc)
+		goto out2;
+
+	return 0;
+
+out2:
+	unregister_pernet_device(&vxlan_net_ops);
+out1:
+	return rc;
+}
+module_init(vxlan_init_module);
+
+static void __exit vxlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&vxlan_link_ops);
+	unregister_pernet_device(&vxlan_net_ops);
+}
+module_exit(vxlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VXLAN_VERSION);
+MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
+MODULE_ALIAS_RTNL_LINK("vxlan");
--- a/include/linux/if_link.h	2012-09-24 10:56:57.080291529 -0700
+++ b/include/linux/if_link.h	2012-09-24 11:08:02.869416484 -0700
@@ -272,6 +272,20 @@ enum macvlan_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VXLAN section */
+enum {
+	IFLA_VXLAN_UNSPEC,
+	IFLA_VXLAN_ID,
+	IFLA_VXLAN_GROUP,
+	IFLA_VXLAN_LINK,
+	IFLA_VXLAN_LOCAL,
+	IFLA_VXLAN_TTL,
+	IFLA_VXLAN_TOS,
+	IFLA_VXLAN_LEARNING,
+	__IFLA_VXLAN_MAX
+};
+#define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/Documentation/networking/vxlan.txt	2012-09-24 11:08:02.869416484 -0700
@@ -0,0 +1,36 @@
+Virtual eXtensible Local Area Networking documentation
+======================================================
+
+The VXLAN protocol is a tunnelling protocol designed to solve the
+problem of the limited number of available VLANs (4096). With VXLAN,
+the identifier is expanded to 24 bits.
+
+It is a draft RFC standard that is implemented by Cisco Nexus,
+VMware and Brocade. The protocol runs over UDP using a single
+destination port (not yet standardized by IANA).
+This document describes the Linux kernel tunnel device;
+there is also an implementation of VXLAN for Open vSwitch.
+
+Unlike most tunnels, a VXLAN is a 1 to N network, not just point
+to point. A VXLAN device can either dynamically learn the IP address
+of the other end, in a manner similar to a learning bridge, or the
+forwarding entries can be configured statically (see the bridge fdb
+examples below).
+
+The management of vxlan is done in a similar fashion to its two
+closest neighbors, GRE and VLAN. Configuring VXLAN requires a
+version of iproute2 that matches the kernel release where VXLAN
+was first merged upstream.
+
+1. Create vxlan device
+  # ip li add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
+
+This creates a new device (vxlan0). The device uses the
+multicast group 239.1.1.1 over eth1 to handle packets for which
+there is no entry in the forwarding table.
+
+2. Delete vxlan device
+  # ip link delete vxlan0
+
+3. Show vxlan info
+  # ip -d link show vxlan0
+
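
As a sketch of how static forwarding entries would be managed through
the fdb netlink interface from patch 1/3: assuming an iproute2
bridge(8) tool new enough to pass the NDA_DST attribute, and with a
placeholder MAC address and IP, the commands would look roughly like

  # bridge fdb add to 00:17:42:8a:b4:05 dst 192.19.0.2 dev vxlan0
  # bridge fdb delete 00:17:42:8a:b4:05 dev vxlan0

The exact command syntax here is an assumption, not part of this patch.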

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv2 net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 19:46     ` [PATCHv2 " Stephen Hemminger
  2012-09-24 19:55       ` Eric Dumazet
@ 2012-09-24 20:09       ` Eric Dumazet
  2012-09-24 20:26         ` Stephen Hemminger
  1 sibling, 1 reply; 43+ messages in thread
From: Eric Dumazet @ 2012-09-24 20:09 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, Chris Wright, netdev

On Mon, 2012-09-24 at 12:46 -0700, Stephen Hemminger wrote:

> +/* Initialize the device structure. */
> +static void vxlan_setup(struct net_device *dev)
> +{
> +	struct vxlan_dev *vxlan = netdev_priv(dev);
> +	unsigned h;
> +
> +	eth_hw_addr_random(dev);
> +	ether_setup(dev);
> +
> +	dev->netdev_ops = &vxlan_netdev_ops;
> +	dev->destructor = vxlan_free;
> +	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
> +
> +	dev->tx_queue_len = 0;
> +	dev->features	|= NETIF_F_LLTX;
> +	dev->features	|= NETIF_F_NETNS_LOCAL;
> +	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;

I am not sure why you remove IFF_XMIT_DST_RELEASE here

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv3 net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 20:02         ` [PATCHv3 " Stephen Hemminger
@ 2012-09-24 20:24           ` John Fastabend
  2012-09-24 20:27             ` Stephen Hemminger
  0 siblings, 1 reply; 43+ messages in thread
From: John Fastabend @ 2012-09-24 20:24 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Eric Dumazet, David Miller, Chris Wright, netdev

[...]

On 9/24/2012 1:02 PM, Stephen Hemminger wrote:
> +
> +/* Transmit local packets over Vxlan
> + *
> + * Outer IP header inherits ECN and DF from inner header.
> + * Outer UDP destination is the VXLAN assigned port.
> + *           source port is based on hash of flow if available
> + *                       otherwise use a random value
> + */
> +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> +	struct vxlan_dev *vxlan = netdev_priv(dev);
> +	struct rtable *rt;
> +	const struct ethhdr *eth;
> +	const struct iphdr *old_iph;
> +	struct iphdr *iph;
> +	struct vxlanhdr *vxh;
> +	struct udphdr *uh;
> +	struct flowi4 fl4;
> +	struct vxlan_fdb *f;
> +	unsigned int pkt_len = skb->len;
> +	unsigned int mtu;
> +	u32 hash;
> +	__be32 dst;
> +	__be16 df = 0;
> +	__u8 tos, ttl;
> +	int err;
> +

[...]

> +	err = ip_local_out(skb);
> +	if (likely(net_xmit_eval(err) == 0)) {
> +		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
> +
> +		u64_stats_update_begin(&stats->syncp);
> +		stats->tx_packets++;
> +		stats->tx_bytes += pkt_len;

Should pkt_len include the outer headers?

> +		u64_stats_update_end(&stats->syncp);
> +	} else {
> +		dev->stats.tx_errors++;
> +		dev->stats.tx_aborted_errors++;
> +	}
> +	return NETDEV_TX_OK;
> +
> +drop:
> +	dev->stats.tx_dropped++;
> +	goto tx_free;
> +
> +tx_error:
> +	dev->stats.tx_errors++;
> +tx_free:
> +	dev_kfree_skb(skb);
> +	return NETDEV_TX_OK;
> +}

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv2 net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 20:09       ` [PATCHv2 " Eric Dumazet
@ 2012-09-24 20:26         ` Stephen Hemminger
  2012-09-24 20:41           ` Eric Dumazet
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 20:26 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Chris Wright, netdev

On Mon, 24 Sep 2012 22:09:50 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Mon, 2012-09-24 at 12:46 -0700, Stephen Hemminger wrote:
> 
> > +/* Initialize the device structure. */
> > +static void vxlan_setup(struct net_device *dev)
> > +{
> > +	struct vxlan_dev *vxlan = netdev_priv(dev);
> > +	unsigned h;
> > +
> > +	eth_hw_addr_random(dev);
> > +	ether_setup(dev);
> > +
> > +	dev->netdev_ops = &vxlan_netdev_ops;
> > +	dev->destructor = vxlan_free;
> > +	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
> > +
> > +	dev->tx_queue_len = 0;
> > +	dev->features	|= NETIF_F_LLTX;
> > +	dev->features	|= NETIF_F_NETNS_LOCAL;
> > +	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;
> 
> I am not sure why you remove IFF_XMIT_DST_RELEASE here

Copied from GRE. What is the impact either way?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv3 net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 20:24           ` John Fastabend
@ 2012-09-24 20:27             ` Stephen Hemminger
  2012-09-24 23:17               ` John Fastabend
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 20:27 UTC (permalink / raw)
  To: John Fastabend; +Cc: Eric Dumazet, David Miller, Chris Wright, netdev

On Mon, 24 Sep 2012 13:24:41 -0700
John Fastabend <john.r.fastabend@intel.com> wrote:

> [...]
> 
> On 9/24/2012 1:02 PM, Stephen Hemminger wrote:
> > +
> > +/* Transmit local packets over Vxlan
> > + *
> > + * Outer IP header inherits ECN and DF from inner header.
> > + * Outer UDP destination is the VXLAN assigned port.
> > + *           source port is based on hash of flow if available
> > + *                       otherwise use a random value
> > + */
> > +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
> > +{
> > +	struct vxlan_dev *vxlan = netdev_priv(dev);
> > +	struct rtable *rt;
> > +	const struct ethhdr *eth;
> > +	const struct iphdr *old_iph;
> > +	struct iphdr *iph;
> > +	struct vxlanhdr *vxh;
> > +	struct udphdr *uh;
> > +	struct flowi4 fl4;
> > +	struct vxlan_fdb *f;
> > +	unsigned int pkt_len = skb->len;
> > +	unsigned int mtu;
> > +	u32 hash;
> > +	__be32 dst;
> > +	__be16 df = 0;
> > +	__u8 tos, ttl;
> > +	int err;
> > +
> 
> [...]
> 
> > +	err = ip_local_out(skb);
> > +	if (likely(net_xmit_eval(err) == 0)) {
> > +		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
> > +
> > +		u64_stats_update_begin(&stats->syncp);
> > +		stats->tx_packets++;
> > +		stats->tx_bytes += pkt_len;
> 
> Should pkt_len include the outer headers?

It doesn't for GRE and related tunnels.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv2 net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 20:26         ` Stephen Hemminger
@ 2012-09-24 20:41           ` Eric Dumazet
  0 siblings, 0 replies; 43+ messages in thread
From: Eric Dumazet @ 2012-09-24 20:41 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, Chris Wright, netdev

On Mon, 2012-09-24 at 13:26 -0700, Stephen Hemminger wrote:
> On Mon, 24 Sep 2012 22:09:50 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 

> > 
> > I am not sure why you remove IFF_XMIT_DST_RELEASE here
> 
> Copied from GRE. What is the impact either way?

GRE needs skb_dst(skb) to be non-NULL if tiph->daddr == 0

but I am not sure your code needs this.

This is a performance issue for packets waiting in a Qdisc: we'll need
to force a dst reference (and suffer cache-line false sharing).
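
For context, the relevant core logic, paraphrased from memory of
net/core/dev.c of this era (not a verbatim quote):

	/* __dev_xmit_skb(): a packet that may wait in a qdisc must hold
	 * a real refcount on its dst, so any noref dst is pinned here.
	 */
	skb_dst_force(skb);
	rc = q->enqueue(skb, q) & NET_XMIT_MASK;

	/* dev_hard_start_xmit(): a device that does not need skb_dst()
	 * in its xmit routine keeps IFF_XMIT_DST_RELEASE set, letting
	 * the core drop the dst while it is still cache-hot.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);

So clearing the flag trades the early, cache-hot release for a
guaranteed skb_dst() inside ndo_start_xmit().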

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 18:43 ` [PATCH net-next 3/3] vxlan: virtual extensible lan Stephen Hemminger
  2012-09-24 19:33   ` Eric Dumazet
  2012-09-24 19:39   ` Eric Dumazet
@ 2012-09-24 20:58   ` Chris Wright
  2012-09-24 21:11     ` Stephen Hemminger
  2012-09-24 21:50     ` [PATCHv4 net-next] vxlan: virtual extensible lan Stephen Hemminger
  2 siblings, 2 replies; 43+ messages in thread
From: Chris Wright @ 2012-09-24 20:58 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, Chris Wright, netdev

* Stephen Hemminger (shemminger@vyatta.com) wrote:
> This is an implementation of Virtual eXtensible Local Area Network
> as described in draft RFC:
>   http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
> 
> The driver integrates Virtual Tunnel Endpoint (VTEP) functionality
> that learns the MAC-to-IP address mapping.
> 
> This implementation has not been tested for interoperation with
> other equipment.

I'm working on doing some interop testing.

> --- a/drivers/net/Kconfig	2012-09-24 10:56:57.080291529 -0700
> +++ b/drivers/net/Kconfig	2012-09-24 11:08:02.865416523 -0700
> @@ -149,6 +149,19 @@ config MACVTAP
>  	  To compile this driver as a module, choose M here: the module
>  	  will be called macvtap.
>  
> +config VXLAN
> +       tristate "Virtual eXtensible Local Area Network (VXLAN)"
> +       depends on EXPERIMENTAL
> +       ---help---
> +	  This allows one to create vxlan virtual interfaces that provide
> +	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
> +	  to tunnel virtual network infrastructure in virtualized environments.
> +	  For more information see:
> +	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
> +
> +	  To compile this driver as a module, choose M here: the module
> +	  will be called macvlan.
                         ^^^^^^^
Cut 'n paste error, s/macvlan/vxlan/

> +/* Add static entry (via netlink) */
> +static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
> +			 struct net_device *dev,
> +			 const unsigned char *addr, u16 flags)
> +{
> +	struct vxlan_dev *vxlan = netdev_priv(dev);
> +	__be32 ip;
> +	int err;
> +
> +	if (tb[NDA_DST] == NULL)
> +		return -EINVAL;
> +
> +	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
> +		return -EAFNOSUPPORT;
> +
> +	ip = nla_get_be32(tb[NDA_DST]);
> +
> +	spin_lock_bh(&vxlan->hash_lock);
> +	err = vxlan_fdb_create(vxlan, addr, ip, VXLAN_FDB_PERM);

Any reason to force permanent when created from userspace?

> +static bool vxlan_group_used(struct vxlan_net *vn,
> +			     const struct vxlan_dev *this)
> +{
> +	const struct vxlan_dev *vxlan;
> +	struct hlist_node *node;
> +	unsigned h;
> +
> +	for (h = 0; h < VNI_HASH_SIZE; ++h)
> +		hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {

is walking this chain only protected with rtnl?

> +/* Propagate ECN from outer IP header to tunneled packet */
> +static inline void vxlan_ecn_decap(const struct iphdr *iph, struct sk_buff *skb)
> +{
> +	if (INET_ECN_is_ce(iph->tos)) {
> +		if (skb->protocol == htons(ETH_P_IP))
> +			IP_ECN_set_ce(ip_hdr(skb));
> +		else if (skb->protocol == htons(ETH_P_IPV6))
> +			IP6_ECN_set_ce(ipv6_hdr(skb));
> +	}
> +}
<snip>
> +/* Propagate ECN bits out */
> +static inline u8 vxlan_ecn_encap(u8 tos,
> +				 const struct iphdr *iph,
> +				 const struct sk_buff *skb)
> +{
> +	u8 inner = vxlan_get_dsfield(iph, skb);
> +
> +	return INET_ECN_encapsulate(tos, inner);
> +}

The goal is to be RFC 6040 compliant, and it looks like some edge cases
aren't met; for example, we should drop on decap when the inner header is
not ECN-capable and the outer header has CE set.

<snip>
> +/* Callback from net/ipv4/udp.c to receive packets */
> +	/* Mark socket as an encapsulation socket. */
> +	udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP;

I don't think we need this particular encap_type value, just != 0

> +	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
> +	udp_encap_enable();

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 20:58   ` [PATCH " Chris Wright
@ 2012-09-24 21:11     ` Stephen Hemminger
  2012-09-24 21:22       ` Chris Wright
  2012-09-24 21:50     ` [PATCHv4 net-next] vxlan: virtual extensible lan Stephen Hemminger
  1 sibling, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 21:11 UTC (permalink / raw)
  To: Chris Wright; +Cc: David Miller, netdev

On Mon, 24 Sep 2012 13:58:22 -0700
Chris Wright <chrisw@redhat.com> wrote:

> * Stephen Hemminger (shemminger@vyatta.com) wrote:
> > This is an implementation of Virtual eXtensible Local Area Network
> > as described in draft RFC:
> >   http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
> > 
> > The driver integrates Virtual Tunnel Endpoint (VTEP) functionality
> > that learns the MAC-to-IP address mapping.
> > 
> > This implementation has not been tested for interoperation with
> > other equipment.
> 
> I'm working on doing some interop testing.
> 
> > --- a/drivers/net/Kconfig	2012-09-24 10:56:57.080291529 -0700
> > +++ b/drivers/net/Kconfig	2012-09-24 11:08:02.865416523 -0700
> > @@ -149,6 +149,19 @@ config MACVTAP
> >  	  To compile this driver as a module, choose M here: the module
> >  	  will be called macvtap.
> >  
> > +config VXLAN
> > +       tristate "Virtual eXtensible Local Area Network (VXLAN)"
> > +       depends on EXPERIMENTAL
> > +       ---help---
> > +	  This allows one to create vxlan virtual interfaces that provide
> > +	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
> > +	  to tunnel virtual network infrastructure in virtualized environments.
> > +	  For more information see:
> > +	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
> > +
> > +	  To compile this driver as a module, choose M here: the module
> > +	  will be called macvlan.
>                          ^^^^^^^
> Cut 'n paste error, s/macvlan/vxlan/
> 
> > +/* Add static entry (via netlink) */
> > +static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
> > +			 struct net_device *dev,
> > +			 const unsigned char *addr, u16 flags)
> > +{
> > +	struct vxlan_dev *vxlan = netdev_priv(dev);
> > +	__be32 ip;
> > +	int err;
> > +
> > +	if (tb[NDA_DST] == NULL)
> > +		return -EINVAL;
> > +
> > +	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
> > +		return -EAFNOSUPPORT;
> > +
> > +	ip = nla_get_be32(tb[NDA_DST]);
> > +
> > +	spin_lock_bh(&vxlan->hash_lock);
> > +	err = vxlan_fdb_create(vxlan, addr, ip, VXLAN_FDB_PERM);
> 
> Any reason to force permanent when created from userspace?

Should use neighbour flag (NUD_PERMANENT) instead.

> 
> > +static bool vxlan_group_used(struct vxlan_net *vn,
> > +			     const struct vxlan_dev *this)
> > +{
> > +	const struct vxlan_dev *vxlan;
> > +	struct hlist_node *node;
> > +	unsigned h;
> > +
> > +	for (h = 0; h < VNI_HASH_SIZE; ++h)
> > +		hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
> 
> is walking this chain only protected with rtnl?

Yes, that should be enough; it is only used when creating a new vxlan
device, to avoid joining the same group twice.
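
If we want that locking assumption to be checkable, a minimal sketch
(a suggestion only, not in the posted patch) is to assert the lock at
the top of the walk:

	static bool vxlan_group_used(struct vxlan_net *vn,
				     const struct vxlan_dev *this)
	{
		ASSERT_RTNL();	/* vni_list is walked without RCU */

		/* ... existing hash walk unchanged ... */
		return false;
	}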

> 
> > +/* Propagate ECN from outer IP header to tunneled packet */
> > +static inline void vxlan_ecn_decap(const struct iphdr *iph, struct sk_buff *skb)
> > +{
> > +	if (INET_ECN_is_ce(iph->tos)) {
> > +		if (skb->protocol == htons(ETH_P_IP))
> > +			IP_ECN_set_ce(ip_hdr(skb));
> > +		else if (skb->protocol == htons(ETH_P_IPV6))
> > +			IP6_ECN_set_ce(ipv6_hdr(skb));
> > +	}
> > +}
> <snip>
> > +/* Propagate ECN bits out */
> > +static inline u8 vxlan_ecn_encap(u8 tos,
> > +				 const struct iphdr *iph,
> > +				 const struct sk_buff *skb)
> > +{
> > +	u8 inner = vxlan_get_dsfield(iph, skb);
> > +
> > +	return INET_ECN_encapsulate(tos, inner);
> > +}
> 
> The goal is to be RFC 6040 compliant, and it looks like some edge cases
> aren't met; for example, we should drop on decap when the inner header is
> not ECN-capable and the outer header has CE set.

The code was taken from existing GRE in Linux.
Looks like both VXLAN and GRE need to handle that.


> 
> <snip>
> > +/* Callback from net/ipv4/udp.c to receive packets */
> > +	/* Mark socket as an encapsulation socket. */
> > +	udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP;
> 
> I don't think we need this particular encap_type value, just != 0

Is there any value in defining a new constant?


> 
> > +	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
> > +	udp_encap_enable();

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 21:11     ` Stephen Hemminger
@ 2012-09-24 21:22       ` Chris Wright
  2012-09-24 21:44         ` [RFC] gre: conform to RFC6040 ECN propagation Stephen Hemminger
  0 siblings, 1 reply; 43+ messages in thread
From: Chris Wright @ 2012-09-24 21:22 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Chris Wright, David Miller, netdev

* Stephen Hemminger (shemminger@vyatta.com) wrote:
> Chris Wright <chrisw@redhat.com> wrote:
> > > +/* Propagate ECN bits out */
> > > +static inline u8 vxlan_ecn_encap(u8 tos,
> > > +				 const struct iphdr *iph,
> > > +				 const struct sk_buff *skb)
> > > +{
> > > +	u8 inner = vxlan_get_dsfield(iph, skb);
> > > +
> > > +	return INET_ECN_encapsulate(tos, inner);
> > > +}
> > 
> > The goal is to be RFC 6040 compliant, and it looks like some edge cases
> > aren't met; for example, we should drop on decap when the inner header is
> > not ECN-capable and the outer header has CE set.
> 
> The code was taken from existing GRE in Linux.
> Looks like both VXLAN and GRE need to handle that.

Right.

> > > +/* Callback from net/ipv4/udp.c to receive packets */
> > > +	/* Mark socket as an encapsulation socket. */
> > > +	udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP;
> > 
> > I don't think we need this particular encap_type value, just != 0
> 
> Is there any value in defining a new constant?

No, it's more that propagating the L2TP constant starts to look like
cargo-cult programming.  IOW, it has no special meaning in this context.
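
For reference, the UDP receive path only tests for a nonzero
encap_type before calling the hook; paraphrased from memory of
udp_queue_rcv_skb() in net/ipv4/udp.c, not a verbatim quote:

	if (up->encap_type) {
		int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);

		encap_rcv = ACCESS_ONCE(up->encap_rcv);
		if (encap_rcv) {
			int ret = encap_rcv(sk, skb);

			if (ret <= 0)
				return -ret;	/* consumed by the tunnel */
		}
		/* ret > 0: fall through, treat as plain UDP */
	}

so any nonzero value behaves the same for this driver.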

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [RFC] gre: conform to RFC6040 ECN propagation
  2012-09-24 21:22       ` Chris Wright
@ 2012-09-24 21:44         ` Stephen Hemminger
  2012-09-24 22:25           ` Eric Dumazet
  2012-10-01 15:55           ` Ben Hutchings
  0 siblings, 2 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 21:44 UTC (permalink / raw)
  To: Chris Wright; +Cc: David Miller, netdev

Linux GRE was likely written before this RFC and therefore does not
conform to one of the rules in Section 4.2 (Default Tunnel Egress Behaviour).

The new code addresses:
 o If the inner ECN field is Not-ECT, the decapsulator MUST NOT
      propagate any other ECN codepoint onwards.  This is because the
      inner Not-ECT marking is set by transports that rely on dropped
      packets as an indication of congestion and would not understand or
      respond to any other ECN codepoint [RFC4774].  Specifically:

      *  If the inner ECN field is Not-ECT and the outer ECN field is
         CE, the decapsulator MUST drop the packet.

      *  If the inner ECN field is Not-ECT and the outer ECN field is
         Not-ECT, ECT(0), or ECT(1), the decapsulator MUST forward the
         outgoing packet with the ECN field cleared to Not-ECT.

This was caught by Chris Wright while reviewing VXLAN.
This code has not been tested with real ECN through a tunnel.
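
The patch below leans on the return value of IP_ECN_set_ce(), which
refuses to mark Not-ECT packets; that refusal is exactly the drop case
above. A simplified but equivalent sketch (the real helper does the
IPv4 checksum update by hand; here ipv4_change_dsfield() takes care of
it, so this is an illustration, not the include/net/inet_ecn.h code):

	static inline int ip_ecn_set_ce_sketch(struct iphdr *iph)
	{
		if (INET_ECN_is_not_ect(iph->tos))
			return 0;	/* cannot mark CE: caller must drop */
		if (!INET_ECN_is_ce(iph->tos))
			ipv4_change_dsfield(iph, ~INET_ECN_MASK,
					    INET_ECN_CE);
		return 1;		/* CE set (or already set) */
	}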

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv4/ip_gre.c	2012-09-21 08:45:55.948772761 -0700
+++ b/net/ipv4/ip_gre.c	2012-09-24 14:35:54.666185603 -0700
@@ -567,15 +567,16 @@ out:
 	rcu_read_unlock();
 }
 
-static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
+static int ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
 {
 	if (INET_ECN_is_ce(iph->tos)) {
 		if (skb->protocol == htons(ETH_P_IP)) {
-			IP_ECN_set_ce(ip_hdr(skb));
+			return IP_ECN_set_ce(ip_hdr(skb));
 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
-			IP6_ECN_set_ce(ipv6_hdr(skb));
+			return IP6_ECN_set_ce(ipv6_hdr(skb));
 		}
 	}
+	return 1;
 }
 
 static inline u8
@@ -703,17 +704,18 @@ static int ipgre_rcv(struct sk_buff *skb
 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 		}
 
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		skb_reset_network_header(skb);
+		if (!ipgre_ecn_decapsulate(iph, skb))
+			goto drop;
+
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
 		u64_stats_update_begin(&tstats->syncp);
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		__skb_tunnel_rx(skb, tunnel->dev);
-
-		skb_reset_network_header(skb);
-		ipgre_ecn_decapsulate(iph, skb);
-
 		netif_rx(skb);
 
 		rcu_read_unlock();

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCHv4 net-next] vxlan: virtual extensible lan
  2012-09-24 20:58   ` [PATCH " Chris Wright
  2012-09-24 21:11     ` Stephen Hemminger
@ 2012-09-24 21:50     ` Stephen Hemminger
  2012-09-25 21:55       ` Jesse Gross
  1 sibling, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 21:50 UTC (permalink / raw)
  To: Chris Wright; +Cc: David Miller, netdev

This is an implementation of Virtual eXtensible Local Area Network
as described in draft RFC:
  http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02

The driver integrates Virtual Tunnel Endpoint (VTEP) functionality
that learns the MAC-to-IP address mapping.

This implementation has not been tested for interoperation with
other equipment. It still needs more testing of static entry manipulation.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
v4 - fix ecn and set state of fdb entries
v3 - fix ordering of change versus migration message
v2 - fix use of ip header after pskb_may_pull

 Documentation/networking/vxlan.txt |   36 +
 drivers/net/Kconfig                |   13 
 drivers/net/Makefile               |    1 
 drivers/net/vxlan.c                | 1180 +++++++++++++++++++++++++++++++++++++
 include/linux/if_link.h            |   14 
 5 files changed, 1244 insertions(+)

--- a/drivers/net/Kconfig	2012-09-24 10:56:57.080291529 -0700
+++ b/drivers/net/Kconfig	2012-09-24 12:43:25.298239026 -0700
@@ -149,6 +149,19 @@ config MACVTAP
 	  To compile this driver as a module, choose M here: the module
 	  will be called macvtap.
 
+config VXLAN
+       tristate "Virtual eXtensible Local Area Network (VXLAN)"
+       depends on EXPERIMENTAL
+       ---help---
+	  This allows one to create vxlan virtual interfaces that provide
+	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
+	  to tunnel virtual network infrastructure in virtualized environments.
+	  For more information see:
+	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called vxlan.
+
 config NETCONSOLE
 	tristate "Network console logging support"
 	---help---
--- a/drivers/net/Makefile	2012-09-24 10:56:57.080291529 -0700
+++ b/drivers/net/Makefile	2012-09-24 11:08:02.865416523 -0700
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
 obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VXLAN) += vxlan.o
 
 #
 # Networking Drivers
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/net/vxlan.c	2012-09-24 14:33:00.687939813 -0700
@@ -0,0 +1,1186 @@
+/*
+ * VXLAN: Virtual eXtensible Local Area Network
+ *
+ * Copyright (c) 2012 Vyatta Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * TODO
+ *  - use IANA UDP port number (when defined)
+ *  - IPv6 (not in RFC)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/version.h>
+#include <linux/hash.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define VXLAN_VERSION	"0.0"
+
+#define VNI_HASH_BITS	10
+#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS	8
+#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
+
+#define VXLAN_N_VID	(1u << 24)
+#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
+
+#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */
+#define FDB_AGE_TIME	 (300 * HZ)	/* drop if not used in 5 min */
+
+#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+#define VXLAN_HEADROOM (sizeof(struct iphdr)		\
+			+ sizeof(struct udphdr)		\
+			+ sizeof(struct vxlanhdr))
+
+/* UDP port for VXLAN traffic. */
+static unsigned int vxlan_port __read_mostly = 8472;
+module_param_named(port, vxlan_port, uint, 0);
+MODULE_PARM_DESC(port, "Destination UDP port");
+
+/* per-net private data for this module */
+static unsigned int vxlan_net_id;
+struct vxlan_net {
+	struct socket	  *sock;	/* UDP encap socket */
+	struct hlist_head vni_list[VNI_HASH_SIZE];
+};
+
+/* Forwarding table entry */
+struct vxlan_fdb {
+	struct hlist_node hlist;	/* linked list of entries */
+	struct rcu_head	  rcu;
+	unsigned long	  updated;	/* jiffies */
+	unsigned long	  used;
+	__be32		  remote_ip;
+	u16		  state;	/* see ndm_state */
+	u8		  eth_addr[ETH_ALEN];
+};
+
+/* Per-cpu network traffic stats */
+struct vxlan_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+};
+
+/* Pseudo network device */
+struct vxlan_dev {
+	struct hlist_node hlist;
+	struct net_device *dev;
+	struct vxlan_stats __percpu *stats;
+	__u32		  vni;		/* virtual network id */
+	__be32	          gaddr;	/* multicast group */
+	__be32		  saddr;	/* source address */
+	unsigned int      link;		/* link to multicast over */
+	__u8		  tos;		/* TOS override */
+	__u8		  ttl;
+	bool		  learn;
+
+	struct timer_list age_timer;
+	spinlock_t	  hash_lock;
+	struct hlist_head fdb_head[FDB_HASH_SIZE];
+};
+
+/* salt for hash table */
+static u32 vxlan_salt __read_mostly;
+
+static inline struct hlist_head *vni_head(struct net *net, u32 id)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
+}
+
+/* Look up VNI in a per net namespace table */
+static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
+{
+	struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
+		if (vxlan->vni == id)
+			return vxlan;
+	}
+
+	return NULL;
+}
+
+/* Fill in neighbour message in skbuff. */
+static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
+			   const struct vxlan_fdb *fdb,
+			   u32 portid, u32 seq, int type, unsigned int flags)
+{
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	memset(ndm, 0, sizeof(*ndm));
+	ndm->ndm_family	= AF_BRIDGE;
+	ndm->ndm_state = fdb->state;
+
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
+		goto nla_put_failure;
+
+	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
+	ci.ndm_confirmed = 0;
+	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
+	ci.ndm_refcnt	 = 0;
+
+	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline size_t vxlan_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ndmsg))
+		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
+			     const struct vxlan_fdb *fdb, int type)
+{
+	struct net *net = dev_net(vxlan->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Hash Ethernet address */
+static u32 eth_hash(const unsigned char *addr)
+{
+	/* could be optimized for unaligned access */
+	u32 a = addr[5] << 8 | addr[4];
+	u32 b = addr[3] << 24 | addr[2] << 16 | addr[1] << 8 | addr[0];
+
+	return jhash_2words(a, b, vxlan_salt);
+}
+
+/* Hash chain to use given mac address */
+static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
+						const u8 *mac)
+{
+	return &vxlan->fdb_head[hash_32(eth_hash(mac), FDB_HASH_BITS)];
+}
+
+/* Look up Ethernet address in forwarding table */
+static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
+					const u8 *mac)
+
+{
+	struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
+	struct vxlan_fdb *f;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(f, node, head, hlist) {
+		if (compare_ether_addr(mac, f->eth_addr) == 0)
+			return f;
+	}
+
+	return NULL;
+}
+
+/* Add new entry to forwarding table -- assumes lock held */
+static int vxlan_fdb_create(struct vxlan_dev *vxlan,
+			    const u8 *mac, __be32 ip,
+			    __u16 state, __u16 flags)
+{
+	struct vxlan_fdb *f;
+
+	f = vxlan_find_mac(vxlan, mac);
+	if (f) {
+		if (flags & NLM_F_EXCL) {
+			netdev_dbg(vxlan->dev,
+				   "lost race to create %pM\n", mac);
+			return -EEXIST;
+		}
+	} else {
+		if (!(flags & NLM_F_CREATE))
+			return -ENOENT;
+
+		netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
+		f = kmalloc(sizeof(*f), GFP_ATOMIC);
+		if (!f)
+			return -ENOMEM;
+
+		f->remote_ip = ip;
+		memcpy(f->eth_addr, mac, ETH_ALEN);
+		hlist_add_head_rcu(&f->hlist,
+				   vxlan_fdb_head(vxlan, mac));
+		f->state = 0;
+	}
+
+	if (f->state != state) {
+		f->state = state;
+		f->updated = f->used = jiffies;
+		vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
+	}
+
+	return 0;
+}
+
+static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
+{
+	netdev_dbg(vxlan->dev,
+		    "delete %pM\n", f->eth_addr);
+
+	vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
+
+	hlist_del_rcu(&f->hlist);
+	kfree_rcu(f, rcu);
+}
+
+/* Add static entry (via netlink) */
+static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			 struct net_device *dev,
+			 const unsigned char *addr, u16 flags)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__be32 ip;
+	int err;
+
+	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
+		pr_info("RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
+		return -EINVAL;
+	}
+
+	if (tb[NDA_DST] == NULL)
+		return -EINVAL;
+
+	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
+		return -EAFNOSUPPORT;
+
+	ip = nla_get_be32(tb[NDA_DST]);
+
+	spin_lock_bh(&vxlan->hash_lock);
+	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags);
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Delete entry (via netlink) */
+static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err = -ENOENT;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	f = vxlan_find_mac(vxlan, addr);
+	if (f) {
+		vxlan_fdb_destroy(vxlan, f);
+		err = 0;
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Dump forwarding table */
+static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  struct net_device *dev, int idx)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned int h;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct vxlan_fdb *f;
+		struct hlist_node *n;
+
+		hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) {
+			if (idx < cb->args[0])
+				goto skip;
+
+			if (vxlan_fdb_info(skb, vxlan, f,
+					    NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq,
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) < 0)
+				break;
+skip:
+			++idx;
+		}
+	}
+
+	return idx;
+}
+
+/* Watch incoming packets to learn mapping between Ethernet address
+ * and Tunnel endpoint.
+ */
+static void vxlan_snoop(struct net_device *dev,
+			__be32 src_ip, const u8 *src_mac)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err;
+
+	f = vxlan_find_mac(vxlan, src_mac);
+	if (likely(f)) {
+		f->used = jiffies;
+		if (likely(f->remote_ip == src_ip))
+			return;
+
+		if (net_ratelimit())
+			netdev_info(dev,
+				    "%pM migrated from %pI4 to %pI4\n",
+				    src_mac, &f->remote_ip, &src_ip);
+
+		f->remote_ip = src_ip;
+		f->updated = jiffies;
+	} else {
+		/* learned new entry */
+		spin_lock(&vxlan->hash_lock);
+		err = vxlan_fdb_create(vxlan, src_mac, src_ip,
+				       NUD_REACHABLE,
+				       NLM_F_EXCL|NLM_F_CREATE);
+		spin_unlock(&vxlan->hash_lock);
+	}
+}
+
+
+/* See if multicast group is already in use by other ID */
+static bool vxlan_group_used(struct vxlan_net *vn,
+			     const struct vxlan_dev *this)
+{
+	const struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+	unsigned h;
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
+			if (vxlan == this)
+				continue;
+
+			if (!netif_running(vxlan->dev))
+				continue;
+
+			if (vxlan->gaddr == this->gaddr)
+				return true;
+		}
+
+	return false;
+}
+
+/* kernel equivalent to IP_ADD_MEMBERSHIP */
+static int vxlan_join_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+	int err;
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast join */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+
+/* kernel equivalent to IP_DROP_MEMBERSHIP */
+static int vxlan_leave_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	int err = 0;
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast leave */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_leave_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+/* Propagate ECN from outer IP header to tunneled packet */
+static inline int vxlan_ecn_decap(const struct iphdr *iph, struct sk_buff *skb)
+{
+	if (unlikely(INET_ECN_is_ce(iph->tos))) {
+		if (skb->protocol == htons(ETH_P_IP))
+			return IP_ECN_set_ce(ip_hdr(skb));
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			return IP6_ECN_set_ce(ipv6_hdr(skb));
+	}
+	return 1;
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct iphdr *oip;
+	struct vxlanhdr *vxh;
+	struct vxlan_dev *vxlan;
+	struct vxlan_stats *stats;
+	__u32 vni;
+
+	/* pop off outer UDP header */
+	__skb_pull(skb, sizeof(struct udphdr));
+
+	/* Need Vxlan and inner Ethernet header to be present */
+	if (!pskb_may_pull(skb,
+			   sizeof(struct vxlanhdr) + sizeof(struct ethhdr)))
+		goto error;
+
+	oip = ip_hdr(skb);
+
+	/* Drop packets with reserved bits set */
+	vxh = (struct vxlanhdr *) skb->data;
+	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+	    (vxh->vx_vni & htonl(0xff)))
+		goto error;
+
+	__skb_pull(skb, sizeof(struct vxlanhdr));
+
+	/* Is this VNI defined? */
+	vni = ntohl(vxh->vx_vni) >> 8;
+	vxlan = vxlan_find_vni(sock_net(sk), vni);
+	if (!vxlan)
+		goto drop;
+
+	/* Ignore packets if device is not up */
+	if (!netif_running(vxlan->dev))
+		goto drop;
+
+	/* Re-examine inner Ethernet packet */
+	skb->protocol = eth_type_trans(skb, vxlan->dev);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Ignore packet loops (and multicast echo) */
+	if (compare_ether_addr(eth_hdr(skb)->h_source,
+			       vxlan->dev->dev_addr) == 0)
+		goto drop;
+
+	/* Check for multicast group configuration errors */
+	if (IN_MULTICAST(ntohl(oip->daddr)) &&
+	    oip->daddr != vxlan->gaddr) {
+		if (net_ratelimit())
+			netdev_notice(vxlan->dev,
+				      "group address %pI4 does not match\n",
+				      &oip->daddr);
+		goto drop;
+	}
+
+	if (vxlan->learn)
+		vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
+
+	__skb_tunnel_rx(skb, vxlan->dev);
+	skb_reset_network_header(skb);
+	if (!vxlan_ecn_decap(oip, skb))
+		goto drop;
+
+	stats = this_cpu_ptr(vxlan->stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+
+	netif_rx(skb);
+
+	return 0;
+error:
+	/* Put UDP header back */
+	__skb_push(skb, sizeof(struct udphdr));
+
+	return 1;
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Extract dsfield from inner protocol */
+static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
+				   const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return iph->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ipv6_get_dsfield((const struct ipv6hdr *)iph);
+	else
+		return 0;
+}
+
+/* Propagate ECN bits out */
+static inline u8 vxlan_ecn_encap(u8 tos,
+				 const struct iphdr *iph,
+				 const struct sk_buff *skb)
+{
+	u8 inner = vxlan_get_dsfield(iph, skb);
+
+	return INET_ECN_encapsulate(tos, inner);
+}
+
+/* Transmit local packets over Vxlan
+ *
+ * Outer IP header inherits ECN and DF from inner header.
+ * Outer UDP destination is the VXLAN assigned port.
+ *           source port is based on hash of flow if available
+ *                       otherwise use a random value
+ */
+static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct rtable *rt;
+	const struct ethhdr *eth;
+	const struct iphdr *old_iph;
+	struct iphdr *iph;
+	struct vxlanhdr *vxh;
+	struct udphdr *uh;
+	struct flowi4 fl4;
+	struct vxlan_fdb *f;
+	unsigned int pkt_len = skb->len;
+	unsigned int mtu;
+	u32 hash;
+	__be32 dst;
+	__be16 df = 0;
+	__u8 tos, ttl;
+	int err;
+
+	/* Need space for new headers (invalidates iph ptr) */
+	if (skb_cow_head(skb, VXLAN_HEADROOM))
+		goto drop;
+
+	eth = (void *)skb->data;
+	old_iph = ip_hdr(skb);
+
+	if (!is_multicast_ether_addr(eth->h_dest) &&
+	    (f = vxlan_find_mac(vxlan, eth->h_dest)))
+		dst = f->remote_ip;
+	else if (vxlan->gaddr)
+		dst = vxlan->gaddr;
+	else
+		goto drop;
+
+	ttl = vxlan->ttl;
+	if (!ttl && IN_MULTICAST(ntohl(dst)))
+		ttl = 1;
+
+	tos = vxlan->tos;
+	if (tos == 1)
+		tos = vxlan_get_dsfield(old_iph, skb);
+
+	hash = skb_get_rxhash(skb);
+
+	rt = ip_route_output_gre(dev_net(dev), &fl4, dst,
+				 vxlan->saddr, vxlan->vni,
+				 RT_TOS(tos), vxlan->link);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &dst);
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+
+	if (rt->dst.dev == dev) {
+		netdev_dbg(dev, "circular route to %pI4\n", &dst);
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	mtu = dst_mtu(&rt->dst) - VXLAN_HEADROOM;
+	/* Do PMTU */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		df |= old_iph->frag_off & htons(IP_DF);
+		if (df && mtu < pkt_len) {
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+				  htonl(mtu));
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#endif
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = htonl(vxlan->vni << 8);
+
+	__skb_push(skb, sizeof(*uh));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = htons(vxlan_port);
+	uh->source = hash ? : random32();
+
+	uh->len = htons(skb->len);
+	uh->check = 0;
+
+	__skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph		= ip_hdr(skb);
+	iph->version	= 4;
+	iph->ihl	= sizeof(struct iphdr) >> 2;
+	iph->frag_off	= df;
+	iph->protocol	= IPPROTO_UDP;
+	iph->tos	= vxlan_ecn_encap(tos, old_iph, skb);
+	iph->daddr	= fl4.daddr;
+	iph->saddr	= fl4.saddr;
+	iph->ttl	= ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+	/* See __IPTUNNEL_XMIT */
+	skb->ip_summed = CHECKSUM_NONE;
+	ip_select_ident(iph, &rt->dst, NULL);
+
+	err = ip_local_out(skb);
+	if (likely(net_xmit_eval(err) == 0)) {
+		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += pkt_len;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		dev->stats.tx_errors++;
+		dev->stats.tx_aborted_errors++;
+	}
+	return NETDEV_TX_OK;
+
+drop:
+	dev->stats.tx_dropped++;
+	goto tx_free;
+
+tx_error:
+	dev->stats.tx_errors++;
+tx_free:
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Walk the forwarding table and purge stale entries */
+static void vxlan_cleanup(unsigned long arg)
+{
+	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
+	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
+	unsigned int h;
+
+	if (!netif_running(vxlan->dev))
+		return;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			unsigned long timeout = f->used + FDB_AGE_TIME;
+
+			if (f->state == NUD_PERMANENT)
+				continue;
+
+			if (time_before_eq(timeout, jiffies)) {
+				netdev_dbg(vxlan->dev,
+					   "garbage collect %pM\n",
+					   f->eth_addr);
+				f->state = NUD_STALE;
+				vxlan_fdb_destroy(vxlan, f);
+			} else if (time_before(timeout, next_timer))
+				next_timer = timeout;
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	mod_timer(&vxlan->age_timer, next_timer);
+}
+
+/* Setup stats when device is created */
+static int vxlan_init(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	vxlan->stats = alloc_percpu(struct vxlan_stats);
+	if (!vxlan->stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Start ageing timer and join group when device is brought up */
+static int vxlan_open(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	int err;
+
+	if (vxlan->gaddr) {
+		err = vxlan_join_group(dev);
+		if (err)
+			return err;
+	}
+
+	mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+	return 0;
+}
+
+/* Purge the forwarding table */
+static void vxlan_flush(struct vxlan_dev *vxlan)
+{
+	unsigned h;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			vxlan_fdb_destroy(vxlan, f);
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+}
+
+/* Cleanup timer and forwarding table on shutdown */
+static int vxlan_stop(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (vxlan->gaddr)
+		vxlan_leave_group(dev);
+
+	del_timer_sync(&vxlan->age_timer);
+
+	vxlan_flush(vxlan);
+
+	return 0;
+}
+
+/* Merge per-cpu statistics */
+static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
+					       struct rtnl_link_stats64 *stats)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_stats tmp, sum = { 0 };
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		unsigned int start;
+		const struct vxlan_stats *stats
+			= per_cpu_ptr(vxlan->stats, cpu);
+
+		do {
+			start = u64_stats_fetch_begin_bh(&stats->syncp);
+			memcpy(&tmp, stats, sizeof(tmp));
+		} while (u64_stats_fetch_retry_bh(&stats->syncp, start));
+
+		sum.tx_bytes   += tmp.tx_bytes;
+		sum.tx_packets += tmp.tx_packets;
+		sum.rx_bytes   += tmp.rx_bytes;
+		sum.rx_packets += tmp.rx_packets;
+	}
+
+	stats->tx_bytes   = sum.tx_bytes;
+	stats->tx_packets = sum.tx_packets;
+	stats->rx_bytes   = sum.rx_bytes;
+	stats->rx_packets = sum.rx_packets;
+
+	stats->tx_dropped = dev->stats.tx_dropped;
+	stats->tx_errors  = dev->stats.tx_errors;
+	stats->tx_carrier_errors  = dev->stats.tx_carrier_errors;
+	stats->collisions  = dev->stats.collisions;
+
+	return stats;
+}
+
+/* Stub, nothing needs to be done. */
+static void vxlan_set_multicast_list(struct net_device *dev)
+{
+}
+
+static const struct net_device_ops vxlan_netdev_ops = {
+	.ndo_init		= vxlan_init,
+	.ndo_open		= vxlan_open,
+	.ndo_stop		= vxlan_stop,
+	.ndo_start_xmit		= vxlan_xmit,
+	.ndo_get_stats64	= vxlan_stats64,
+	.ndo_set_rx_mode	= vxlan_set_multicast_list,
+	.ndo_change_mtu		= eth_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_fdb_add		= vxlan_fdb_add,
+	.ndo_fdb_del		= vxlan_fdb_delete,
+	.ndo_fdb_dump		= vxlan_fdb_dump,
+};
+
+/* Info for udev: this is a virtual tunnel endpoint */
+static struct device_type vxlan_type = {
+	.name = "vxlan",
+};
+
+static void vxlan_free(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	free_percpu(vxlan->stats);
+	free_netdev(dev);
+}
+
+/* Initialize the device structure. */
+static void vxlan_setup(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned h;
+
+	eth_hw_addr_random(dev);
+	ether_setup(dev);
+
+	dev->netdev_ops = &vxlan_netdev_ops;
+	dev->destructor = vxlan_free;
+	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
+
+	dev->tx_queue_len = 0;
+	dev->features	|= NETIF_F_LLTX;
+	dev->features	|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;
+
+	spin_lock_init(&vxlan->hash_lock);
+
+	init_timer_deferrable(&vxlan->age_timer);
+	vxlan->age_timer.function = vxlan_cleanup;
+	vxlan->age_timer.data = (unsigned long) vxlan;
+
+	vxlan->dev = dev;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+}
+
+static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
+	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
+	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
+	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
+};
+
+static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+			pr_debug("invalid link address (not ethernet)\n");
+			return -EINVAL;
+		}
+
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+			pr_debug("invalid all zero ethernet address\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	if (!data)
+		return -EINVAL;
+
+	if (data[IFLA_VXLAN_ID]) {
+		__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+		if (id >= VXLAN_VID_MASK)
+			return -ERANGE;
+	}
+
+	if (data[IFLA_VXLAN_GROUP]) {
+		__be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		if (!IN_MULTICAST(ntohl(gaddr))) {
+			pr_debug("group address is not IPv4 multicast\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	return 0;
+}
+
+static int vxlan_newlink(struct net *net, struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__u32 vni;
+	int err;
+
+	if (!data[IFLA_VXLAN_ID])
+		return -EINVAL;
+
+	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
+	if (vxlan_find_vni(net, vni)) {
+		pr_info("duplicate VNI %u\n", vni);
+		return -EEXIST;
+	}
+	vxlan->vni = vni;
+
+	if (data[IFLA_VXLAN_GROUP])
+		vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+
+	if (data[IFLA_VXLAN_LOCAL])
+		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+
+	if (data[IFLA_VXLAN_LINK])
+		vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]);
+
+	if (data[IFLA_VXLAN_TTL])
+		vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
+
+	if (data[IFLA_VXLAN_TOS])
+		vxlan->tos = nla_get_u8(data[IFLA_VXLAN_TOS]);
+
+	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
+		vxlan->learn = true;
+
+	err = register_netdevice(dev);
+	if (!err)
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
+
+	return err;
+}
+
+static void vxlan_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	hlist_del_rcu(&vxlan->hlist);
+
+	unregister_netdevice_queue(dev, head);
+}
+
+static size_t vxlan_get_size(const struct net_device *dev)
+{
+
+	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
+		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
+		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
+		0;
+}
+
+static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
+		goto nla_put_failure;
+
+	if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
+		goto nla_put_failure;
+
+	if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
+		goto nla_put_failure;
+
+	if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
+	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
+	.kind		= "vxlan",
+	.maxtype	= IFLA_VXLAN_MAX,
+	.policy		= vxlan_policy,
+	.priv_size	= sizeof(struct vxlan_dev),
+	.setup		= vxlan_setup,
+	.validate	= vxlan_validate,
+	.newlink	= vxlan_newlink,
+	.dellink	= vxlan_dellink,
+	.get_size	= vxlan_get_size,
+	.fill_info	= vxlan_fill_info,
+};
+
+static __net_init int vxlan_init_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct sock *sk;
+	struct sockaddr_in addr;
+	int rc;
+	unsigned h;
+
+	/* Create UDP socket for encapsulation receive. */
+	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
+	if (rc < 0) {
+		pr_debug("UDP socket create failed\n");
+		return rc;
+	}
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = INADDR_ANY;
+	addr.sin_port = htons(vxlan_port);
+
+	rc = kernel_bind(vn->sock, (struct sockaddr *) &addr, sizeof(addr));
+	if (rc < 0) {
+		pr_debug("bind for port %u failed %d\n", vxlan_port, rc);
+		sock_release(vn->sock);
+		vn->sock = NULL;
+		return rc;
+	}
+
+	/* Disable multicast loopback */
+	sk = vn->sock->sk;
+	inet_sk(sk)->mc_loop = 0;
+
+	/* Mark socket as an encapsulation socket. */
+	udp_sk(sk)->encap_type = 1;
+	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
+	udp_encap_enable();
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vn->vni_list[h]);
+
+	return 0;
+}
+
+static __net_exit void vxlan_exit_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	if (vn->sock) {
+		sock_release(vn->sock);
+		vn->sock = NULL;
+	}
+}
+
+static struct pernet_operations vxlan_net_ops = {
+	.init = vxlan_init_net,
+	.exit = vxlan_exit_net,
+	.id   = &vxlan_net_id,
+	.size = sizeof(struct vxlan_net),
+};
+
+static int __init vxlan_init_module(void)
+{
+	int rc;
+
+	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
+
+	rc = register_pernet_device(&vxlan_net_ops);
+	if (rc)
+		goto out1;
+
+	rc = rtnl_link_register(&vxlan_link_ops);
+	if (rc)
+		goto out2;
+
+	return 0;
+
+out2:
+	unregister_pernet_device(&vxlan_net_ops);
+out1:
+	return rc;
+}
+module_init(vxlan_init_module);
+
+static void __exit vxlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&vxlan_link_ops);
+	unregister_pernet_device(&vxlan_net_ops);
+}
+module_exit(vxlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VXLAN_VERSION);
+MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
+MODULE_ALIAS_RTNL_LINK("vxlan");
--- a/include/linux/if_link.h	2012-09-24 10:56:57.080291529 -0700
+++ b/include/linux/if_link.h	2012-09-24 11:08:02.869416484 -0700
@@ -272,6 +272,20 @@ enum macvlan_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VXLAN section */
+enum {
+	IFLA_VXLAN_UNSPEC,
+	IFLA_VXLAN_ID,
+	IFLA_VXLAN_GROUP,
+	IFLA_VXLAN_LINK,
+	IFLA_VXLAN_LOCAL,
+	IFLA_VXLAN_TTL,
+	IFLA_VXLAN_TOS,
+	IFLA_VXLAN_LEARNING,
+	__IFLA_VXLAN_MAX
+};
+#define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/Documentation/networking/vxlan.txt	2012-09-24 11:08:02.869416484 -0700
@@ -0,0 +1,36 @@
+Virtual eXtensible Local Area Networking documentation
+======================================================
+
+The VXLAN protocol is a tunnelling protocol designed to solve the
+problem of the limited number of available VLANs (4096). With
+VXLAN, the identifier is expanded to 24 bits.
+
+It is a draft RFC standard that is implemented by Cisco Nexus,
+VMware and Brocade. The protocol runs over UDP using a single
+destination port (not yet standardized by IANA).
+This document describes the Linux kernel tunnel device;
+there is also an implementation of VXLAN for Open vSwitch.
+
+Unlike most tunnels, a VXLAN is a 1 to N network, not just point
+to point. A VXLAN device can either dynamically learn the IP address
+of the other end, in a manner similar to a learning bridge, or the
+forwarding entries can be configured statically.
+
+The management of vxlan is done in a similar fashion to its
+two closest neighbors, GRE and VLAN. Configuring VXLAN requires
+the version of iproute2 that matches the kernel release
+where VXLAN was first merged upstream.
+
+1. Create vxlan device
+  # ip li add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
+
+This creates a new device (vxlan0). The device uses the
+multicast group 239.1.1.1 over eth1 to handle packets for which
+there is no entry in the forwarding table.
+
+2. Delete vxlan device
+  # ip link delete vxlan0
+
+3. Show vxlan info
+  # ip -d link show vxlan0
+

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC] gre: conform to RFC6040 ECN propagation
  2012-09-24 21:44         ` [RFC] gre: conform to RFC6040 ECN propagation Stephen Hemminger
@ 2012-09-24 22:25           ` Eric Dumazet
  2012-09-24 22:30             ` Stephen Hemminger
  2012-10-01 15:55           ` Ben Hutchings
  1 sibling, 1 reply; 43+ messages in thread
From: Eric Dumazet @ 2012-09-24 22:25 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Chris Wright, David Miller, netdev

On Mon, 2012-09-24 at 14:44 -0700, Stephen Hemminger wrote:
> Linux GRE was likely written before this RFC and therefore does not
> conform to one of the rules in Section 4.2 (Default Tunnel Egress Behaviour).
> 
> The new code addresses:
>  o If the inner ECN field is Not-ECT, the decapsulator MUST NOT
>       propagate any other ECN codepoint onwards.  This is because the
>       inner Not-ECT marking is set by transports that rely on dropped
>       packets as an indication of congestion and would not understand or
>       respond to any other ECN codepoint [RFC4774].  Specifically:
> 
>       *  If the inner ECN field is Not-ECT and the outer ECN field is
>          CE, the decapsulator MUST drop the packet.
> 
>       *  If the inner ECN field is Not-ECT and the outer ECN field is
>          Not-ECT, ECT(0), or ECT(1), the decapsulator MUST forward the
>          outgoing packet with the ECN field cleared to Not-ECT.
> 
> This was caught by Chris Wright while reviewing VXLAN.
> This code has not been tested with real ECN through a tunnel.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

It seems dangerous to me without any logging.

One could argue that the outer ECN field should not be CE if the inner
was Not-ECT. That would mean either:

1) the encapsulator set ECT(0) or ECT(1), and a congested hop then set CE, or
2) the encapsulator set CE itself,

in both cases while the inner field was Not-ECT (!!!)

If a router does such a thing, we should log a message to help
diagnostics.

By the way, other tunnels probably have the same issues.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC] gre: conform to RFC6040 ECN propagation
  2012-09-24 22:25           ` Eric Dumazet
@ 2012-09-24 22:30             ` Stephen Hemminger
  2012-09-25  5:17               ` Eric Dumazet
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-24 22:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Chris Wright, David Miller, netdev

On Tue, 25 Sep 2012 00:25:36 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Mon, 2012-09-24 at 14:44 -0700, Stephen Hemminger wrote:
> > Linux GRE was likely written before this RFC and therefore does not
> > conform to one of the rules in Section 4.2 (Default Tunnel Egress Behaviour).
> > 
> > The new code addresses:
> >  o If the inner ECN field is Not-ECT, the decapsulator MUST NOT
> >       propagate any other ECN codepoint onwards.  This is because the
> >       inner Not-ECT marking is set by transports that rely on dropped
> >       packets as an indication of congestion and would not understand or
> >       respond to any other ECN codepoint [RFC4774].  Specifically:
> > 
> >       *  If the inner ECN field is Not-ECT and the outer ECN field is
> >          CE, the decapsulator MUST drop the packet.
> > 
> >       *  If the inner ECN field is Not-ECT and the outer ECN field is
> >          Not-ECT, ECT(0), or ECT(1), the decapsulator MUST forward the
> >          outgoing packet with the ECN field cleared to Not-ECT.
> > 
> > This was caught by Chris Wright while reviewing VXLAN.
> > This code has not been tested with real ECN through a tunnel.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> It seems dangerous to me without any logging?
> 
> One could argue that the outer ECN field should not be CE if the inner
> was Not-ECT.
> 
> It means:
> 1) the encapsulator set ECT(0) or ECT(1), and a congested hop set CE
> 2) the encapsulator set CE
> 
> 1) or 2) while the inner was Not-ECT (!!!)
> 
> If a router does such a thing, we should log a message to help
> diagnostics.
> 
> By the way, other tunnels probably have the same issues.

Logging is a bad idea in this case since the tunnel might be from a remote
host/protocol and the log would be filled with crap.

The tunnels in general do need to have an rx_dropped counter, but it looks
like that isn't being done right either.
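
A per-device counter in the decapsulation error path would cover it;
a minimal sketch (the placement at the receive drop label is assumed,
not taken from any posted patch):

	drop:
		/* account the drop instead of logging it */
		tunnel->dev->stats.rx_dropped++;
		kfree_skb(skb);
		return 0;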

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv3 net-next 3/3] vxlan: virtual extensible lan
  2012-09-24 20:27             ` Stephen Hemminger
@ 2012-09-24 23:17               ` John Fastabend
  0 siblings, 0 replies; 43+ messages in thread
From: John Fastabend @ 2012-09-24 23:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Eric Dumazet, David Miller, Chris Wright, netdev


>>
>> [...]
>>
>>> +	err = ip_local_out(skb);
>>> +	if (likely(net_xmit_eval(err) == 0)) {
>>> +		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
>>> +
>>> +		u64_stats_update_begin(&stats->syncp);
>>> +		stats->tx_packets++;
>>> +		stats->tx_bytes += pkt_len;
>>
>> Should pkt_len include the outer headers?
>
> It doesn't for GRE and related tunnels.
>

OK, best to keep it the same as the other tunnels.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC] gre: conform to RFC6040 ECN propagation
  2012-09-24 22:30             ` Stephen Hemminger
@ 2012-09-25  5:17               ` Eric Dumazet
  0 siblings, 0 replies; 43+ messages in thread
From: Eric Dumazet @ 2012-09-25  5:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Chris Wright, David Miller, netdev

On Mon, 2012-09-24 at 15:30 -0700, Stephen Hemminger wrote:

> Logging is a bad idea in this case since the tunnel might be from a remote
> host/protocol and the log would be filled with crap.
> 
> The tunnels in general do need to have an rx_dropped counter, but it looks
> like that isn't being done right either.
> 

I never suggested logging _all_ messages.


Stephen, I personally was hit by some provider playing with TOS bits so
badly that I had to patch Linux to fix a minor problem. [1]

I would like to know why my tunnels are going to fail, and what I should
do to get a fallback, i.e. reverting your patches.


RFC 6040 states :

      In these cases,
      particularly the more dangerous ones, the decapsulator SHOULD log
      the event and MAY also raise an alarm.

      Just because the highlighted combinations are currently unused,
      does not mean that all the other combinations are always valid.
      Some are only valid if they have arrived from a particular type of
      legacy ingress, and dangerous otherwise.  Therefore, an
      implementation MAY allow an operator to configure logging and
      alarms for such additional header combinations known to be
      dangerous or CU for the particular configuration of tunnel
      endpoints deployed at run-time.

      Alarms SHOULD be rate-limited so that the anomalous combinations
      will not amplify into a flood of alarm messages.  It MUST be
      possible to suppress alarms or logging, e.g., if it becomes
      apparent that a combination that previously was not used has
      started to be used for legitimate purposes such as a new standards
      action.


[1]
commit 7a269ffad72f3604b8982fa09c387670e0d2ee14
Author: Eric Dumazet <eric.dumazet@gmail.com>
Date:   Thu Sep 22 20:02:19 2011 +0000

    tcp: ECN blackhole should not force quickack mode
    
    While playing with a new ADSL box at home, I discovered that ECN
    blackhole can trigger suboptimal quickack mode on linux : We send one
    ACK for each incoming data frame, without any delay and eventual
    piggyback.
    
    This is because TCP_ECN_check_ce() considers that if no ECT is seen on a
    segment, this is because this segment was a retransmit.
    
    Refine this heuristic and apply it only if we seen ECT in a previous
    segment, to detect ECN blackhole at IP level.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv4 net-next] vxlan: virtual extensible lan
  2012-09-24 21:50     ` [PATCHv4 net-next] vxlan: virtual extensible lan Stephen Hemminger
@ 2012-09-25 21:55       ` Jesse Gross
  2012-09-25 22:03         ` Stephen Hemminger
                           ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: Jesse Gross @ 2012-09-25 21:55 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Chris Wright, David Miller, netdev

On Mon, Sep 24, 2012 at 2:50 PM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
[...]
> +       /* Do PMTU */
> +       if (skb->protocol == htons(ETH_P_IP)) {
> +               df |= old_iph->frag_off & htons(IP_DF);
> +               if (df && mtu < pkt_len) {
> +                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> +                                 htonl(mtu));
> +                       ip_rt_put(rt);
> +                       goto tx_error;
> +               }
> +       }
> +#if IS_ENABLED(CONFIG_IPV6)
> +       else if (skb->protocol == htons(ETH_P_IPV6)) {
> +               if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
> +                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
> +                       ip_rt_put(rt);
> +                       goto tx_error;
> +               }
> +       }
> +#endif

Won't this black hole packets if we need to generate ICMP messages?
Since we're doing switching and not routing here icmp_send() doesn't
necessarily have a route to the relevant endpoint.  It looks like
Ethernet over GRE has this issue as well.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv4 net-next] vxlan: virtual extensible lan
  2012-09-25 21:55       ` Jesse Gross
@ 2012-09-25 22:03         ` Stephen Hemminger
  2012-09-25 22:09         ` [PATCHv5 " Stephen Hemminger
  2012-09-26  4:36         ` [PATCHv4 net-next] vxlan: virtual extensible lan Stephen Hemminger
  2 siblings, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-25 22:03 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Chris Wright, David Miller, netdev

On Tue, 25 Sep 2012 14:55:13 -0700
Jesse Gross <jesse@nicira.com> wrote:

> On Mon, Sep 24, 2012 at 2:50 PM, Stephen Hemminger
> <shemminger@vyatta.com> wrote:
> > +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
> [...]
> > +       /* Do PMTU */
> > +       if (skb->protocol == htons(ETH_P_IP)) {
> > +               df |= old_iph->frag_off & htons(IP_DF);
> > +               if (df && mtu < pkt_len) {
> > +                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> > +                                 htonl(mtu));
> > +                       ip_rt_put(rt);
> > +                       goto tx_error;
> > +               }
> > +       }
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +       else if (skb->protocol == htons(ETH_P_IPV6)) {
> > +               if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
> > +                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
> > +                       ip_rt_put(rt);
> > +                       goto tx_error;
> > +               }
> > +       }
> > +#endif
> 
> Won't this black hole packets if we need to generate ICMP messages?
> Since we're doing switching and not routing here icmp_send() doesn't
> necessarily have a route to the relevant endpoint.  It looks like
> Ethernet over GRE has this issue as well.

Good catch. The IP network space of the tunneled network will likely be
completely separate from the overlay network.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCHv5 net-next] vxlan: virtual extensible lan
  2012-09-25 21:55       ` Jesse Gross
  2012-09-25 22:03         ` Stephen Hemminger
@ 2012-09-25 22:09         ` Stephen Hemminger
  2012-09-27 22:47           ` David Miller
  2012-09-26  4:36         ` [PATCHv4 net-next] vxlan: virtual extensible lan Stephen Hemminger
  2 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-25 22:09 UTC (permalink / raw)
  To: Jesse Gross, Chris Wright, David Miller; +Cc: netdev

This is an implementation of Virtual eXtensible Local Area Network
as described in draft RFC:
  http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02

The driver integrates Virtual Tunnel Endpoint (VTEP) functionality
that learns MAC to IP address mapping.

This implementation has not been tested for interoperation with
other equipment.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
v5 - drop MTU discovery since network is overlaid
     use common code to do ECN decapsulation
v4 - fix ecn and set state of fdb entries
v3 - fix ordering of change versus migration message
v2 - fix use of ip header after pskb_may_pull

 Documentation/networking/vxlan.txt |   36 +
 drivers/net/Kconfig                |   13 
 drivers/net/Makefile               |    1 
 drivers/net/vxlan.c                | 1164 +++++++++++++++++++++++++++++++++++++
 include/linux/if_link.h            |   14 
 5 files changed, 1228 insertions(+)

--- a/drivers/net/Kconfig	2012-09-24 17:23:51.888577602 -0700
+++ b/drivers/net/Kconfig	2012-09-25 14:03:41.564362855 -0700
@@ -149,6 +149,19 @@ config MACVTAP
 	  To compile this driver as a module, choose M here: the module
 	  will be called macvtap.
 
+config VXLAN
+       tristate "Virtual eXtensible Local Area Network (VXLAN)"
+       depends on EXPERIMENTAL
+       ---help---
+	  This allows one to create vxlan virtual interfaces that provide
+	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
+	  to tunnel virtual network infrastructure in virtualized environments.
+	  For more information see:
+	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called vxlan.
+
 config NETCONSOLE
 	tristate "Network console logging support"
 	---help---
--- a/drivers/net/Makefile	2012-09-24 17:23:51.888577602 -0700
+++ b/drivers/net/Makefile	2012-09-25 14:03:41.564362855 -0700
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
 obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VXLAN) += vxlan.o
 
 #
 # Networking Drivers
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/net/vxlan.c	2012-09-25 15:08:03.546231656 -0700
@@ -0,0 +1,1164 @@
+/*
+ * VXLAN: Virtual eXtensible Local Area Network
+ *
+ * Copyright (c) 2012 Vyatta Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * TODO
+ *  - use IANA UDP port number (when defined)
+ *  - IPv6 (not in RFC)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/version.h>
+#include <linux/hash.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define VXLAN_VERSION	"0.0"
+
+#define VNI_HASH_BITS	10
+#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS	8
+#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
+
+#define VXLAN_N_VID	(1u << 24)
+#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
+
+#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */
+#define FDB_AGE_TIME	 (300 * HZ)	/* drop if not used in 5 min */
+
+#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+#define VXLAN_HEADROOM (sizeof(struct iphdr)		\
+			+ sizeof(struct udphdr)		\
+			+ sizeof(struct vxlanhdr))
+
+/* UDP port for VXLAN traffic. */
+static unsigned int vxlan_port __read_mostly = 8472;
+module_param_named(port, vxlan_port, uint, 0);
+MODULE_PARM_DESC(vxlan_port, "Destination UDP port");
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+/* per-net private data for this module */
+static unsigned int vxlan_net_id;
+struct vxlan_net {
+	struct socket	  *sock;	/* UDP encap socket */
+	struct hlist_head vni_list[VNI_HASH_SIZE];
+};
+
+/* Forwarding table entry */
+struct vxlan_fdb {
+	struct hlist_node hlist;	/* linked list of entries */
+	struct rcu_head	  rcu;
+	unsigned long	  updated;	/* jiffies */
+	unsigned long	  used;
+	__be32		  remote_ip;
+	u16		  state;	/* see ndm_state */
+	u8		  eth_addr[ETH_ALEN];
+};
+
+/* Per-cpu network traffic stats */
+struct vxlan_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+};
+
+/* Pseudo network device */
+struct vxlan_dev {
+	struct hlist_node hlist;
+	struct net_device *dev;
+	struct vxlan_stats __percpu *stats;
+	__u32		  vni;		/* virtual network id */
+	__be32	          gaddr;	/* multicast group */
+	__be32		  saddr;	/* source address */
+	unsigned int      link;		/* link to multicast over */
+	__u8		  tos;		/* TOS override */
+	__u8		  ttl;
+	bool		  learn;
+
+	struct timer_list age_timer;
+	spinlock_t	  hash_lock;
+	struct hlist_head fdb_head[FDB_HASH_SIZE];
+};
+
+/* salt for hash table */
+static u32 vxlan_salt __read_mostly;
+
+static inline struct hlist_head *vni_head(struct net *net, u32 id)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
+}
+
+/* Look up VNI in a per net namespace table */
+static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
+{
+	struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
+		if (vxlan->vni == id)
+			return vxlan;
+	}
+
+	return NULL;
+}
+
+/* Fill in neighbour message in skbuff. */
+static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
+			   const struct vxlan_fdb *fdb,
+			   u32 portid, u32 seq, int type, unsigned int flags)
+{
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	memset(ndm, 0, sizeof(*ndm));
+	ndm->ndm_family	= AF_BRIDGE;
+	ndm->ndm_state = fdb->state;
+
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
+		goto nla_put_failure;
+
+	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
+	ci.ndm_confirmed = 0;
+	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
+	ci.ndm_refcnt	 = 0;
+
+	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline size_t vxlan_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ndmsg))
+		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
+			     const struct vxlan_fdb *fdb, int type)
+{
+	struct net *net = dev_net(vxlan->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Hash Ethernet address */
+static u32 eth_hash(const unsigned char *addr)
+{
+	/* could be optimized for unaligned access */
+	u32 a = addr[5] << 8 | addr[4];
+	u32 b = addr[3] << 24 | addr[2] << 16 | addr[1] << 8 | addr[0];
+
+	return jhash_2words(a, b, vxlan_salt);
+}
+
+/* Hash chain to use given mac address */
+static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
+						const u8 *mac)
+{
+	return &vxlan->fdb_head[hash_32(eth_hash(mac), FDB_HASH_BITS)];
+}
+
+/* Look up Ethernet address in forwarding table */
+static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
+					const u8 *mac)
+
+{
+	struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
+	struct vxlan_fdb *f;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(f, node, head, hlist) {
+		if (compare_ether_addr(mac, f->eth_addr) == 0)
+			return f;
+	}
+
+	return NULL;
+}
+
+/* Add new entry to forwarding table -- assumes lock held */
+static int vxlan_fdb_create(struct vxlan_dev *vxlan,
+			    const u8 *mac, __be32 ip,
+			    __u16 state, __u16 flags)
+{
+	struct vxlan_fdb *f;
+
+	f = vxlan_find_mac(vxlan, mac);
+	if (f) {
+		if (flags & NLM_F_EXCL) {
+			netdev_dbg(vxlan->dev,
+				   "lost race to create %pM\n", mac);
+			return -EEXIST;
+		}
+	} else {
+		if (!(flags & NLM_F_CREATE))
+			return -ENOENT;
+
+		netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
+		f = kmalloc(sizeof(*f), GFP_ATOMIC);
+		if (!f)
+			return -ENOMEM;
+
+		f->remote_ip = ip;
+		memcpy(f->eth_addr, mac, ETH_ALEN);
+		hlist_add_head_rcu(&f->hlist,
+				   vxlan_fdb_head(vxlan, mac));
+		f->state = 0;
+	}
+
+	if (f->state != state) {
+		f->state = state;
+		f->updated = f->used = jiffies;
+		vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
+	}
+
+	return 0;
+}
+
+static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
+{
+	netdev_dbg(vxlan->dev,
+		    "delete %pM\n", f->eth_addr);
+
+	vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
+
+	hlist_del_rcu(&f->hlist);
+	kfree_rcu(f, rcu);
+}
+
+/* Add static entry (via netlink) */
+static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			 struct net_device *dev,
+			 const unsigned char *addr, u16 flags)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__be32 ip;
+	int err;
+
+	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
+		pr_info("RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
+		return -EINVAL;
+	}
+
+	if (tb[NDA_DST] == NULL)
+		return -EINVAL;
+
+	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
+		return -EAFNOSUPPORT;
+
+	ip = nla_get_be32(tb[NDA_DST]);
+
+	spin_lock_bh(&vxlan->hash_lock);
+	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags);
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Delete entry (via netlink) */
+static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err = -ENOENT;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	f = vxlan_find_mac(vxlan, addr);
+	if (f) {
+		vxlan_fdb_destroy(vxlan, f);
+		err = 0;
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Dump forwarding table */
+static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  struct net_device *dev, int idx)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned int h;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct vxlan_fdb *f;
+		struct hlist_node *n;
+
+		hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) {
+			if (idx < cb->args[0])
+				goto skip;
+
+			if (vxlan_fdb_info(skb, vxlan, f,
+					    NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq,
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) < 0)
+				break;
+skip:
+			++idx;
+		}
+	}
+
+	return idx;
+}
+
+/* Watch incoming packets to learn mapping between Ethernet address
+ * and Tunnel endpoint.
+ */
+static void vxlan_snoop(struct net_device *dev,
+			__be32 src_ip, const u8 *src_mac)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err;
+
+	f = vxlan_find_mac(vxlan, src_mac);
+	if (likely(f)) {
+		f->used = jiffies;
+		if (likely(f->remote_ip == src_ip))
+			return;
+
+		if (net_ratelimit())
+			netdev_info(dev,
+				    "%pM migrated from %pI4 to %pI4\n",
+				    src_mac, &f->remote_ip, &src_ip);
+
+		f->remote_ip = src_ip;
+		f->updated = jiffies;
+	} else {
+		/* learned new entry */
+		spin_lock(&vxlan->hash_lock);
+		err = vxlan_fdb_create(vxlan, src_mac, src_ip,
+				       NUD_REACHABLE,
+				       NLM_F_EXCL|NLM_F_CREATE);
+		spin_unlock(&vxlan->hash_lock);
+	}
+}
+
+
+/* See if multicast group is already in use by other ID */
+static bool vxlan_group_used(struct vxlan_net *vn,
+			     const struct vxlan_dev *this)
+{
+	const struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+	unsigned h;
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
+			if (vxlan == this)
+				continue;
+
+			if (!netif_running(vxlan->dev))
+				continue;
+
+			if (vxlan->gaddr == this->gaddr)
+				return true;
+		}
+
+	return false;
+}
+
+/* kernel equivalent to IP_ADD_MEMBERSHIP */
+static int vxlan_join_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+	int err;
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast join */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+
+/* kernel equivalent to IP_DROP_MEMBERSHIP */
+static int vxlan_leave_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	int err = 0;
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast leave */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_leave_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct iphdr *oip;
+	struct vxlanhdr *vxh;
+	struct vxlan_dev *vxlan;
+	struct vxlan_stats *stats;
+	__u32 vni;
+	int err;
+
+	/* pop off outer UDP header */
+	__skb_pull(skb, sizeof(struct udphdr));
+
+	/* Need Vxlan and inner Ethernet header to be present */
+	if (!pskb_may_pull(skb,
+			   sizeof(struct vxlanhdr) + sizeof(struct ethhdr)))
+		goto error;
+
+	oip = ip_hdr(skb);
+
+	/* Drop packets with reserved bits set */
+	vxh = (struct vxlanhdr *) skb->data;
+	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+	    (vxh->vx_vni & htonl(0xff)))
+		goto error;
+
+	__skb_pull(skb, sizeof(struct vxlanhdr));
+
+	/* Is this VNI defined? */
+	vni = ntohl(vxh->vx_vni) >> 8;
+	vxlan = vxlan_find_vni(sock_net(sk), vni);
+	if (!vxlan)
+		goto drop;
+
+	/* Ignore packets if device is not up */
+	if (!netif_running(vxlan->dev))
+		goto drop;
+
+	/* Re-examine inner Ethernet packet */
+	skb->protocol = eth_type_trans(skb, vxlan->dev);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Ignore packet loops (and multicast echo) */
+	if (compare_ether_addr(eth_hdr(skb)->h_source,
+			       vxlan->dev->dev_addr) == 0)
+		goto drop;
+
+	/* Check for multicast group configuration errors */
+	if (IN_MULTICAST(ntohl(oip->daddr)) &&
+	    oip->daddr != vxlan->gaddr) {
+		if (net_ratelimit())
+			netdev_notice(vxlan->dev,
+				      "group address %pI4 does not match\n",
+				      &oip->daddr);
+		goto drop;
+	}
+
+	if (vxlan->learn)
+		vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
+
+	__skb_tunnel_rx(skb, vxlan->dev);
+	skb_reset_network_header(skb);
+
+	err = IP_ECN_decapsulate(oip, skb);
+	if (unlikely(err)) {
+		if (log_ecn_error)
+			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+					     &oip->saddr, oip->tos);
+		if (err > 1) {
+			++vxlan->dev->stats.rx_frame_errors;
+			++vxlan->dev->stats.rx_errors;
+			goto drop;
+		}
+	}
+
+	stats = this_cpu_ptr(vxlan->stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+
+	netif_rx(skb);
+
+	return 0;
+error:
+	/* Put UDP header back */
+	__skb_push(skb, sizeof(struct udphdr));
+
+	return 1;
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Extract dsfield from inner protocol */
+static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
+				   const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return iph->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ipv6_get_dsfield((const struct ipv6hdr *)iph);
+	else
+		return 0;
+}
+
+/* Propagate ECN bits out */
+static inline u8 vxlan_ecn_encap(u8 tos,
+				 const struct iphdr *iph,
+				 const struct sk_buff *skb)
+{
+	u8 inner = vxlan_get_dsfield(iph, skb);
+
+	return INET_ECN_encapsulate(tos, inner);
+}
+
+/* Transmit local packets over Vxlan
+ *
+ * Outer IP header inherits ECN and DF from inner header.
+ * Outer UDP destination is the VXLAN assigned port.
+ *           source port is based on the hash of the flow if available,
+ *                       otherwise a random value.
+ */
+static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct rtable *rt;
+	const struct ethhdr *eth;
+	const struct iphdr *old_iph;
+	struct iphdr *iph;
+	struct vxlanhdr *vxh;
+	struct udphdr *uh;
+	struct flowi4 fl4;
+	struct vxlan_fdb *f;
+	unsigned int pkt_len = skb->len;
+	u32 hash;
+	__be32 dst;
+	__be16 df = 0;
+	__u8 tos, ttl;
+	int err;
+
+	/* Need space for new headers (invalidates iph ptr) */
+	if (skb_cow_head(skb, VXLAN_HEADROOM))
+		goto drop;
+
+	eth = (void *)skb->data;
+	old_iph = ip_hdr(skb);
+
+	if (!is_multicast_ether_addr(eth->h_dest) &&
+	    (f = vxlan_find_mac(vxlan, eth->h_dest)))
+		dst = f->remote_ip;
+	else if (vxlan->gaddr) {
+		dst = vxlan->gaddr;
+	} else
+		goto drop;
+
+	ttl = vxlan->ttl;
+	if (!ttl && IN_MULTICAST(ntohl(dst)))
+		ttl = 1;
+
+	tos = vxlan->tos;
+	if (tos == 1)
+		tos = vxlan_get_dsfield(old_iph, skb);
+
+	hash = skb_get_rxhash(skb);
+
+	rt = ip_route_output_gre(dev_net(dev), &fl4, dst,
+				 vxlan->saddr, vxlan->vni,
+				 RT_TOS(tos), vxlan->link);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &dst);
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+
+	if (rt->dst.dev == dev) {
+		netdev_dbg(dev, "circular route to %pI4\n", &dst);
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = htonl(vxlan->vni << 8);
+
+	__skb_push(skb, sizeof(*uh));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = htons(vxlan_port);
+	uh->source = hash ? : random32();
+
+	uh->len = htons(skb->len);
+	uh->check = 0;
+
+	__skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph		= ip_hdr(skb);
+	iph->version	= 4;
+	iph->ihl	= sizeof(struct iphdr) >> 2;
+	iph->frag_off	= df;
+	iph->protocol	= IPPROTO_UDP;
+	iph->tos	= vxlan_ecn_encap(tos, old_iph, skb);
+	iph->daddr	= fl4.daddr;
+	iph->saddr	= fl4.saddr;
+	iph->ttl	= ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+	/* See __IPTUNNEL_XMIT */
+	skb->ip_summed = CHECKSUM_NONE;
+	ip_select_ident(iph, &rt->dst, NULL);
+
+	err = ip_local_out(skb);
+	if (likely(net_xmit_eval(err) == 0)) {
+		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += pkt_len;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		dev->stats.tx_errors++;
+		dev->stats.tx_aborted_errors++;
+	}
+	return NETDEV_TX_OK;
+
+drop:
+	dev->stats.tx_dropped++;
+	goto tx_free;
+
+tx_error:
+	dev->stats.tx_errors++;
+tx_free:
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Walk the forwarding table and purge stale entries */
+static void vxlan_cleanup(unsigned long arg)
+{
+	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
+	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
+	unsigned int h;
+
+	if (!netif_running(vxlan->dev))
+		return;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			unsigned long timeout = f->used + FDB_AGE_TIME;
+
+			if (f->state == NUD_PERMANENT)
+				continue;
+
+			if (time_before_eq(timeout, jiffies)) {
+				netdev_dbg(vxlan->dev,
+					   "garbage collect %pM\n",
+					   f->eth_addr);
+				f->state = NUD_STALE;
+				vxlan_fdb_destroy(vxlan, f);
+			} else if (time_before(timeout, next_timer))
+				next_timer = timeout;
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	mod_timer(&vxlan->age_timer, next_timer);
+}
+
+/* Setup stats when device is created */
+static int vxlan_init(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	vxlan->stats = alloc_percpu(struct vxlan_stats);
+	if (!vxlan->stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Start ageing timer and join group when device is brought up */
+static int vxlan_open(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	int err;
+
+	if (vxlan->gaddr) {
+		err = vxlan_join_group(dev);
+		if (err)
+			return err;
+	}
+
+	mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+	return 0;
+}
+
+/* Purge the forwarding table */
+static void vxlan_flush(struct vxlan_dev *vxlan)
+{
+	unsigned h;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			vxlan_fdb_destroy(vxlan, f);
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+}
+
+/* Cleanup timer and forwarding table on shutdown */
+static int vxlan_stop(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (vxlan->gaddr)
+		vxlan_leave_group(dev);
+
+	del_timer_sync(&vxlan->age_timer);
+
+	vxlan_flush(vxlan);
+
+	return 0;
+}
+
+/* Merge per-cpu statistics */
+static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
+					       struct rtnl_link_stats64 *stats)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_stats tmp, sum = { 0 };
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		unsigned int start;
+		const struct vxlan_stats *stats
+			= per_cpu_ptr(vxlan->stats, cpu);
+
+		do {
+			start = u64_stats_fetch_begin_bh(&stats->syncp);
+			memcpy(&tmp, stats, sizeof(tmp));
+		} while (u64_stats_fetch_retry_bh(&stats->syncp, start));
+
+		sum.tx_bytes   += tmp.tx_bytes;
+		sum.tx_packets += tmp.tx_packets;
+		sum.rx_bytes   += tmp.rx_bytes;
+		sum.rx_packets += tmp.rx_packets;
+	}
+
+	stats->tx_bytes   = sum.tx_bytes;
+	stats->tx_packets = sum.tx_packets;
+	stats->rx_bytes   = sum.rx_bytes;
+	stats->rx_packets = sum.rx_packets;
+
+	stats->tx_dropped = dev->stats.tx_dropped;
+	stats->tx_errors  = dev->stats.tx_errors;
+	stats->tx_carrier_errors  = dev->stats.tx_carrier_errors;
+	stats->collisions  = dev->stats.collisions;
+
+	return stats;
+}
+
+/* Stub, nothing needs to be done. */
+static void vxlan_set_multicast_list(struct net_device *dev)
+{
+}
+
+static const struct net_device_ops vxlan_netdev_ops = {
+	.ndo_init		= vxlan_init,
+	.ndo_open		= vxlan_open,
+	.ndo_stop		= vxlan_stop,
+	.ndo_start_xmit		= vxlan_xmit,
+	.ndo_get_stats64	= vxlan_stats64,
+	.ndo_set_rx_mode	= vxlan_set_multicast_list,
+	.ndo_change_mtu		= eth_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_fdb_add		= vxlan_fdb_add,
+	.ndo_fdb_del		= vxlan_fdb_delete,
+	.ndo_fdb_dump		= vxlan_fdb_dump,
+};
+
+/* Info for udev, that this is a virtual tunnel endpoint */
+static struct device_type vxlan_type = {
+	.name = "vxlan",
+};
+
+static void vxlan_free(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	free_percpu(vxlan->stats);
+	free_netdev(dev);
+}
+
+/* Initialize the device structure. */
+static void vxlan_setup(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned h;
+
+	eth_hw_addr_random(dev);
+	ether_setup(dev);
+
+	dev->netdev_ops = &vxlan_netdev_ops;
+	dev->destructor = vxlan_free;
+	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
+
+	dev->tx_queue_len = 0;
+	dev->features	|= NETIF_F_LLTX;
+	dev->features	|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;
+
+	spin_lock_init(&vxlan->hash_lock);
+
+	init_timer_deferrable(&vxlan->age_timer);
+	vxlan->age_timer.function = vxlan_cleanup;
+	vxlan->age_timer.data = (unsigned long) vxlan;
+
+	vxlan->dev = dev;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+}
+
+static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
+	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
+	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
+	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
+};
+
+static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+			pr_debug("invalid link address (not ethernet)\n");
+			return -EINVAL;
+		}
+
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+			pr_debug("invalid all zero ethernet address\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	if (!data)
+		return -EINVAL;
+
+	if (data[IFLA_VXLAN_ID]) {
+		__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+		if (id >= VXLAN_VID_MASK)
+			return -ERANGE;
+	}
+
+	if (data[IFLA_VXLAN_GROUP]) {
+		__be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		if (!IN_MULTICAST(ntohl(gaddr))) {
+			pr_debug("group address is not IPv4 multicast\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	return 0;
+}
+
+static int vxlan_newlink(struct net *net, struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__u32 vni;
+	int err;
+
+	if (!data[IFLA_VXLAN_ID])
+		return -EINVAL;
+
+	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
+	if (vxlan_find_vni(net, vni)) {
+		pr_info("duplicate VNI %u\n", vni);
+		return -EEXIST;
+	}
+	vxlan->vni = vni;
+
+	if (data[IFLA_VXLAN_GROUP])
+		vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+
+	if (data[IFLA_VXLAN_LOCAL])
+		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+
+	if (data[IFLA_VXLAN_LINK])
+		vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]);
+
+	if (data[IFLA_VXLAN_TOS])
+		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
+
+	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
+		vxlan->learn = true;
+
+	err = register_netdevice(dev);
+	if (!err)
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
+
+	return err;
+}
+
+static void vxlan_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	hlist_del_rcu(&vxlan->hlist);
+
+	unregister_netdevice_queue(dev, head);
+}
+
+static size_t vxlan_get_size(const struct net_device *dev)
+{
+
+	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
+		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
+		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
+		0;
+}
+
+static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
+		goto nla_put_failure;
+
+	if (vxlan->gaddr && nla_put_u32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
+		goto nla_put_failure;
+
+	if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
+		goto nla_put_failure;
+
+	if (vxlan->saddr && nla_put_u32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
+	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
+	.kind		= "vxlan",
+	.maxtype	= IFLA_VXLAN_MAX,
+	.policy		= vxlan_policy,
+	.priv_size	= sizeof(struct vxlan_dev),
+	.setup		= vxlan_setup,
+	.validate	= vxlan_validate,
+	.newlink	= vxlan_newlink,
+	.dellink	= vxlan_dellink,
+	.get_size	= vxlan_get_size,
+	.fill_info	= vxlan_fill_info,
+};
+
+static __net_init int vxlan_init_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct sock *sk;
+	struct sockaddr_in addr;
+	int rc;
+	unsigned h;
+
+	/* Create UDP socket for encapsulation receive. */
+	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
+	if (rc < 0) {
+		pr_debug("UDP socket create failed\n");
+		return rc;
+	}
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = INADDR_ANY;
+	addr.sin_port = htons(vxlan_port);
+
+	rc = kernel_bind(vn->sock, (struct sockaddr *) &addr, sizeof(addr));
+	if (rc < 0) {
+		pr_debug("bind for port %u failed %d\n", vxlan_port, rc);
+		sock_release(vn->sock);
+		vn->sock = NULL;
+		return rc;
+	}
+
+	/* Disable multicast loopback */
+	sk = vn->sock->sk;
+	inet_sk(sk)->mc_loop = 0;
+
+	/* Mark socket as an encapsulation socket. */
+	udp_sk(sk)->encap_type = 1;
+	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
+	udp_encap_enable();
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vn->vni_list[h]);
+
+	return 0;
+}
+
+static __net_exit void vxlan_exit_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	if (vn->sock) {
+		sock_release(vn->sock);
+		vn->sock = NULL;
+	}
+}
+
+static struct pernet_operations vxlan_net_ops = {
+	.init = vxlan_init_net,
+	.exit = vxlan_exit_net,
+	.id   = &vxlan_net_id,
+	.size = sizeof(struct vxlan_net),
+};
+
+static int __init vxlan_init_module(void)
+{
+	int rc;
+
+	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
+
+	rc = register_pernet_device(&vxlan_net_ops);
+	if (rc)
+		goto out1;
+
+	rc = rtnl_link_register(&vxlan_link_ops);
+	if (rc)
+		goto out2;
+
+	return 0;
+
+out2:
+	unregister_pernet_device(&vxlan_net_ops);
+out1:
+	return rc;
+}
+module_init(vxlan_init_module);
+
+static void __exit vxlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&vxlan_link_ops);
+	unregister_pernet_device(&vxlan_net_ops);
+}
+module_exit(vxlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VXLAN_VERSION);
+MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
+MODULE_ALIAS_RTNL_LINK("vxlan");
--- a/include/linux/if_link.h	2012-09-24 17:23:51.888577602 -0700
+++ b/include/linux/if_link.h	2012-09-25 14:03:41.568362815 -0700
@@ -272,6 +272,20 @@ enum macvlan_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VXLAN section */
+enum {
+	IFLA_VXLAN_UNSPEC,
+	IFLA_VXLAN_ID,
+	IFLA_VXLAN_GROUP,
+	IFLA_VXLAN_LINK,
+	IFLA_VXLAN_LOCAL,
+	IFLA_VXLAN_TTL,
+	IFLA_VXLAN_TOS,
+	IFLA_VXLAN_LEARNING,
+	__IFLA_VXLAN_MAX
+};
+#define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/Documentation/networking/vxlan.txt	2012-09-25 14:03:41.568362815 -0700
@@ -0,0 +1,36 @@
+Virtual eXtensible Local Area Networking documentation
+======================================================
+
+The VXLAN protocol is a tunnelling protocol designed to solve the
+problem of the limited number of available VLANs (4096). With
+VXLAN, the identifier is expanded to 24 bits.
+
+It is a draft RFC standard that is implemented by Cisco Nexus,
+VMware and Brocade. The protocol runs over UDP using a single
+destination port (still not standardized by IANA).
+This document describes the Linux kernel tunnel device;
+there is also an implementation of VXLAN for Open vSwitch.
+
+Unlike most tunnels, a VXLAN is a 1 to N network, not just point
+to point. A VXLAN device can either dynamically learn the IP address
+of the other end, in a manner similar to a learning bridge, or the
+forwarding entries can be configured statically (see example 4 below).
+
+The management of vxlan is done in a similar fashion to its two
+closest neighbors, GRE and VLAN. Configuring VXLAN requires the
+version of iproute2 that matches the kernel release
+where VXLAN was first merged upstream.
+
+1. Create vxlan device
+  # ip li add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
+
+This creates a new device (vxlan0). The device uses the
+multicast group 239.1.1.1 over eth1 to handle packets for which
+no entry is in the forwarding table.
+
+2. Delete vxlan device
+  # ip link delete vxlan0
+
+3. Show vxlan info
+  # ip -d link show vxlan0
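+
+4. Create forwarding table entry
+
+Forwarding entries can also be configured statically; a sketch,
+assuming an iproute2 'bridge' tool that understands the vxlan dst
+argument (the MAC and IP here are examples):
+
+  # bridge fdb add to 00:17:42:8a:b4:05 dst 192.19.0.2 dev vxlan0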
+

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv4 net-next] vxlan: virtual extensible lan
  2012-09-25 21:55       ` Jesse Gross
  2012-09-25 22:03         ` Stephen Hemminger
  2012-09-25 22:09         ` [PATCHv5 " Stephen Hemminger
@ 2012-09-26  4:36         ` Stephen Hemminger
  2012-09-27 17:20           ` Jesse Gross
  2 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-26  4:36 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Chris Wright, David Miller, netdev

On Tue, 25 Sep 2012 14:55:13 -0700
Jesse Gross <jesse@nicira.com> wrote:

> On Mon, Sep 24, 2012 at 2:50 PM, Stephen Hemminger
> <shemminger@vyatta.com> wrote:
> > +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
> [...]
> > +       /* Do PMTU */
> > +       if (skb->protocol == htons(ETH_P_IP)) {
> > +               df |= old_iph->frag_off & htons(IP_DF);
> > +               if (df && mtu < pkt_len) {
> > +                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> > +                                 htonl(mtu));
> > +                       ip_rt_put(rt);
> > +                       goto tx_error;
> > +               }
> > +       }
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +       else if (skb->protocol == htons(ETH_P_IPV6)) {
> > +               if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
> > +                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
> > +                       ip_rt_put(rt);
> > +                       goto tx_error;
> > +               }
> > +       }
> > +#endif
> 
> Won't this black hole packets if we need to generate ICMP messages?
> Since we're doing switching and not routing here icmp_send() doesn't
> necessarily have a route to the relevant endpoint.  It looks like
> Ethernet over GRE has this issue as well.

It is an interesting question what the correct way is to handle packets
where the inner header is IPv6, or IPv4 with Don't Fragment set. As you
mention, sending an ICMP response won't work because the tunnel endpoint
is not part of that IP network.

The simple option is to fragment it in the tunnel; since the fragmentation
is not visible to the overlay network, that is okay. But for PMTU discovery
it might be better to just drop the packet and not send a fragmented payload.

Some backbone networks don't allow fragmentation at all (in a futile attempt
to block DoS attacks and protect fragile Windows hosts). Fragmentation
brings all sorts of evil problems, like the potential of corrupted reassembly
because of sequence wrap; the checksum in the inner packet will defend against
that, but tunnels are not supposed to rely on inner protocol data protection.

Or you can just do what Cisco and Microsoft do and tell everyone
to set a larger MTU on the backbone.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv4 net-next] vxlan: virtual extensible lan
  2012-09-26  4:36         ` [PATCHv4 net-next] vxlan: virtual extensible lan Stephen Hemminger
@ 2012-09-27 17:20           ` Jesse Gross
  0 siblings, 0 replies; 43+ messages in thread
From: Jesse Gross @ 2012-09-27 17:20 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Chris Wright, David Miller, netdev

On Tue, Sep 25, 2012 at 9:36 PM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> On Tue, 25 Sep 2012 14:55:13 -0700
> Jesse Gross <jesse@nicira.com> wrote:
>
>> On Mon, Sep 24, 2012 at 2:50 PM, Stephen Hemminger
>> <shemminger@vyatta.com> wrote:
>> > +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
>> [...]
>> > +       /* Do PMTU */
>> > +       if (skb->protocol == htons(ETH_P_IP)) {
>> > +               df |= old_iph->frag_off & htons(IP_DF);
>> > +               if (df && mtu < pkt_len) {
>> > +                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
>> > +                                 htonl(mtu));
>> > +                       ip_rt_put(rt);
>> > +                       goto tx_error;
>> > +               }
>> > +       }
>> > +#if IS_ENABLED(CONFIG_IPV6)
>> > +       else if (skb->protocol == htons(ETH_P_IPV6)) {
>> > +               if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
>> > +                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
>> > +                       ip_rt_put(rt);
>> > +                       goto tx_error;
>> > +               }
>> > +       }
>> > +#endif
>>
>> Won't this black hole packets if we need to generate ICMP messages?
>> Since we're doing switching and not routing here icmp_send() doesn't
>> necessarily have a route to the relevant endpoint.  It looks like
>> Ethernet over GRE has this issue as well.
>
> It is an interesting question what the correct way is to handle packets
> where the inner header is IPv6, or IPv4 with Don't Fragment set. As you
> mention, sending an ICMP response won't work because the tunnel endpoint
> is not part of that IP network.
>
> The simple option is to fragment it in the tunnel; since the fragmentation
> is not visible to the overlay network, that is okay. But for PMTU discovery
> it might be better to just drop the packet and not send a fragmented payload.
>
> Some backbone networks don't allow fragmentation at all (in a futile attempt
> to block DoS attacks and protect fragile Windows hosts). Fragmentation
> brings all sorts of evil problems, like the potential of corrupted reassembly
> because of sequence wrap; the checksum in the inner packet will defend against
> that, but tunnels are not supposed to rely on inner protocol data protection.
>
> Or you can just do what Cisco and Microsoft do and tell everyone
> to set a larger MTU on the backbone.

What I think people usually do in these situations is:
 1. Insist people set the MTU to take the tunnel into account.
 2. Use MSS clamping for TCP traffic.
 3. Either drop or fragment the tunnel packet.  In theory some IP
stacks will probe for a lower MTU if packets are dropping; in practice
things seem to just break.  If the backbone is going to drop
fragmented packets then I guess it doesn't make a difference, modulo
the potential for corruption that you mentioned.  Always dropping
seems worse (although it is the behavior of many hardware devices that
can't do fragmentation at all).

So I think what you have currently is correct.

A couple of other options:
 * In many cases it might be desirable to do fragmentation on the
inner rather than the outer packet, especially if there are middleboxes
looking inside the tunnel.  This assumes that the inner packet is IP
and doesn't have the DF bit set.  In theory, you could do it even if
the DF bit is set, since we can't do path MTU discovery anyway.
 * A few years ago I wrote an implementation of path MTU discovery in
OVS to handle this situation.  It's pretty effective but it relies on
guessing/faking some addresses.  I think we're going to pull it out
in favor of MSS clamping soon though.

I wouldn't implement either of these here, at least not at this time.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv5 net-next] vxlan: virtual extensible lan
  2012-09-25 22:09         ` [PATCHv5 " Stephen Hemminger
@ 2012-09-27 22:47           ` David Miller
  2012-09-27 23:00             ` Stephen Hemminger
  0 siblings, 1 reply; 43+ messages in thread
From: David Miller @ 2012-09-27 22:47 UTC (permalink / raw)
  To: shemminger; +Cc: jesse, chrisw, netdev

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 25 Sep 2012 15:09:57 -0700

> +/* Hash Ethernet address */
> +static u32 eth_hash(const unsigned char *addr)
> +{
> +	/* could be optimized for unaligned access */
> +	u32 a = addr[5] << 8 | addr[4];
> +	u32 b = addr[3] << 24 | addr[2] << 16 | addr[1] << 8 | addr[0];
> +
> +	return jhash_2words(a, b, vxlan_salt);
> +}
> +
> +/* Hash chain to use given mac address */
> +static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
> +						const u8 *mac)
> +{
> +	return &vxlan->fdb_head[hash_32(eth_hash(mac), FDB_HASH_BITS)];
> +}

This hashing is way overkill, and in fact is a misuse of Jenkins.

Jenkins can work fine with power-of-two hash sizes, so using hash_32()
on the Jenkins hash result is nothing short of silly.

I would go one step further and change this to:

	(a ^ b) * vxlan_salt

much like how we hash IP addresses in the ipv4 ARP tables.  That
function is a universal hash, and therefore provides whatever kind
of protection you're trying to obtain using the salt.

But I wonder if this matters at all; the administrator controls
the contents of this table, rather than external entities.
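
For comparison, the suggested universal hash, sketched against the
eth_hash() above (folding to FDB_HASH_BITS via the top bits is an
assumption; any random vxlan_salt works, odd values mix best):

	/* Sketch of the multiplicative alternative, reusing the a/b
	 * packing from eth_hash().  Illustrative only.
	 */
	static u32 eth_hash_mult(const unsigned char *addr)
	{
		u32 a = addr[5] << 8 | addr[4];
		u32 b = addr[3] << 24 | addr[2] << 16 | addr[1] << 8 | addr[0];

		return ((a ^ b) * vxlan_salt) >> (32 - FDB_HASH_BITS);
	}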

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv5 net-next] vxlan: virtual extensible lan
  2012-09-27 22:47           ` David Miller
@ 2012-09-27 23:00             ` Stephen Hemminger
  2012-09-27 23:12               ` David Miller
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-09-27 23:00 UTC (permalink / raw)
  To: David Miller; +Cc: jesse, chrisw, netdev

On Thu, 27 Sep 2012 18:47:40 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 25 Sep 2012 15:09:57 -0700
> 
> > +/* Hash Ethernet address */
> > +static u32 eth_hash(const unsigned char *addr)
> > +{
> > +	/* could be optimized for unaligned access */
> > +	u32 a = addr[5] << 8 | addr[4];
> > +	u32 b = addr[3] << 24 | addr[2] << 16 | addr[1] << 8 | addr[0];
> > +
> > +	return jhash_2words(a, b, vxlan_salt);
> > +}
> > +
> > +/* Hash chain to use given mac address */
> > +static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
> > +						const u8 *mac)
> > +{
> > +	return &vxlan->fdb_head[hash_32(eth_hash(mac), FDB_HASH_BITS)];
> > +}
> 
> This hashing is way overkill, and in fact is a mis-use of Jenkins.
> 
> Jenkins can work fine with power-of-two hash sizes, so using hash_32()
> on the Jenkins hash result is nothing short of silly.
> 
> I would go one step further and change this to:
> 
> 	(a ^ b) * vxlan_salt

Ok, but I doubt that it makes much difference.

> 
> much like how we hash IP addresses in the ipv4 ARP tables.  That
> function is a universal hash, and therefore provides whatever kind
> of protection you're trying to obtain using the salt.
> 
> But I wonder if this matters at all; the administrator controls
> the contents of this table, rather than external entities.

The table includes values learned from packets received. Like a bridge,
a malicious attacker who can forge MAC source addresses can overload one
chain by swamping the table with bogus values.  It probably needs a table limit.
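
A cap could be as small as a guard at the top of vxlan_fdb_create();
a sketch, where addrmax and addrcnt are hypothetical per-device fields
that are not in the posted patch:

	/* Sketch: refuse to learn new entries past a configured limit. */
	if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
		return -ENOSPC;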

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv5 net-next] vxlan: virtual extensible lan
  2012-09-27 23:00             ` Stephen Hemminger
@ 2012-09-27 23:12               ` David Miller
  2012-10-01 20:57                 ` [PATCHv6 " Stephen Hemminger
                                   ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: David Miller @ 2012-09-27 23:12 UTC (permalink / raw)
  To: shemminger; +Cc: jesse, chrisw, netdev

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Thu, 27 Sep 2012 16:00:54 -0700

> On Thu, 27 Sep 2012 18:47:40 -0400 (EDT)
> David Miller <davem@davemloft.net> wrote:
> 
>> But I wonder if this matters at all; the administrator controls
>> the contents of this table, rather than external entities.
> 
> The table includes values learned from packets received. Like a bridge,
> a malicious attacker who can forge MAC source addresses can overload one
> chain by swamping the table with bogus values.  It probably needs a table limit.

Ok.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC] gre: conform to RFC6040 ECN propagation
  2012-09-24 21:44         ` [RFC] gre: conform to RFC6040 ECN propagation Stephen Hemminger
  2012-09-24 22:25           ` Eric Dumazet
@ 2012-10-01 15:55           ` Ben Hutchings
  2012-10-01 15:56             ` Stephen Hemminger
  1 sibling, 1 reply; 43+ messages in thread
From: Ben Hutchings @ 2012-10-01 15:55 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Chris Wright, David Miller, netdev

On Mon, 2012-09-24 at 14:44 -0700, Stephen Hemminger wrote:
[...]
> --- a/net/ipv4/ip_gre.c	2012-09-21 08:45:55.948772761 -0700
> +++ b/net/ipv4/ip_gre.c	2012-09-24 14:35:54.666185603 -0700
[...]
> @@ -703,17 +704,18 @@ static int ipgre_rcv(struct sk_buff *skb
>  			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
>  		}
>  
> +		__skb_tunnel_rx(skb, tunnel->dev);
> +
> +		skb_reset_network_header(skb);
> +		if (!ipgre_ecn_decapsulate(iph, skb))
> +			goto drop;
> +
>  		tstats = this_cpu_ptr(tunnel->dev->tstats);
>  		u64_stats_update_begin(&tstats->syncp);
>  		tstats->rx_packets++;
>  		tstats->rx_bytes += skb->len;
>  		u64_stats_update_end(&tstats->syncp);

I don't know why you're moving this code above the stats update;
rx_packets/rx_bytes should include dropped packets.

Ben.

> -		__skb_tunnel_rx(skb, tunnel->dev);
> -
> -		skb_reset_network_header(skb);
> -		ipgre_ecn_decapsulate(iph, skb);
> -
>  		netif_rx(skb);
>  
>  		rcu_read_unlock();

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
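
The ordering Ben describes, sketched against the ipgre_rcv() hunk
quoted above (counting before the ECN check; the rx_frame_errors
choice for the drop is an assumption, not Ben's):

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* decapsulate after accounting, so dropped packets stay counted */
	if (!ipgre_ecn_decapsulate(iph, skb)) {
		tunnel->dev->stats.rx_frame_errors++;	/* assumed counter */
		goto drop;
	}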

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC] gre: conform to RFC6040 ECN propagation
  2012-10-01 15:55           ` Ben Hutchings
@ 2012-10-01 15:56             ` Stephen Hemminger
  2012-10-01 16:49               ` Ben Hutchings
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-10-01 15:56 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Chris Wright, David Miller, netdev

On Mon, 1 Oct 2012 16:55:19 +0100
Ben Hutchings <bhutchings@solarflare.com> wrote:

> On Mon, 2012-09-24 at 14:44 -0700, Stephen Hemminger wrote:
> [...]
> > --- a/net/ipv4/ip_gre.c	2012-09-21 08:45:55.948772761 -0700
> > +++ b/net/ipv4/ip_gre.c	2012-09-24 14:35:54.666185603 -0700
> [...]
> > @@ -703,17 +704,18 @@ static int ipgre_rcv(struct sk_buff *skb
> >  			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> >  		}
> >  
> > +		__skb_tunnel_rx(skb, tunnel->dev);
> > +
> > +		skb_reset_network_header(skb);
> > +		if (!ipgre_ecn_decapsulate(iph, skb))
> > +			goto drop;
> > +
> >  		tstats = this_cpu_ptr(tunnel->dev->tstats);
> >  		u64_stats_update_begin(&tstats->syncp);
> >  		tstats->rx_packets++;
> >  		tstats->rx_bytes += skb->len;
> >  		u64_stats_update_end(&tstats->syncp);
> 
> I don't know why you're moving this code above the stats update;
> rx_packets/rx_bytes should include dropped packets.
> 
> Ben.
> 
> > -		__skb_tunnel_rx(skb, tunnel->dev);
> > -
> > -		skb_reset_network_header(skb);
> > -		ipgre_ecn_decapsulate(iph, skb);
> > -
> >  		netif_rx(skb);
> >  
> >  		rcu_read_unlock();
> 

Is that true? I thought they were accounted for by rx_dropped.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC] gre: conform to RFC6040 ECN propagation
  2012-10-01 15:56             ` Stephen Hemminger
@ 2012-10-01 16:49               ` Ben Hutchings
  2012-10-01 17:13                 ` Eric Dumazet
  0 siblings, 1 reply; 43+ messages in thread
From: Ben Hutchings @ 2012-10-01 16:49 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Chris Wright, David Miller, netdev

On Mon, 2012-10-01 at 08:56 -0700, Stephen Hemminger wrote:
> On Mon, 1 Oct 2012 16:55:19 +0100
> Ben Hutchings <bhutchings@solarflare.com> wrote:
> 
> > On Mon, 2012-09-24 at 14:44 -0700, Stephen Hemminger wrote:
> > [...]
> > > --- a/net/ipv4/ip_gre.c	2012-09-21 08:45:55.948772761 -0700
> > > +++ b/net/ipv4/ip_gre.c	2012-09-24 14:35:54.666185603 -0700
> > [...]
> > > @@ -703,17 +704,18 @@ static int ipgre_rcv(struct sk_buff *skb
> > >  			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> > >  		}
> > >  
> > > +		__skb_tunnel_rx(skb, tunnel->dev);
> > > +
> > > +		skb_reset_network_header(skb);
> > > +		if (!ipgre_ecn_decapsulate(iph, skb))
> > > +			goto drop;
> > > +
> > >  		tstats = this_cpu_ptr(tunnel->dev->tstats);
> > >  		u64_stats_update_begin(&tstats->syncp);
> > >  		tstats->rx_packets++;
> > >  		tstats->rx_bytes += skb->len;
> > >  		u64_stats_update_end(&tstats->syncp);
> > 
> > I don't know why you're moving this code above the stats update;
> > rx_packets/rx_bytes should include dropped packets.
> > 
> > Ben.
> > 
> > > -		__skb_tunnel_rx(skb, tunnel->dev);
> > > -
> > > -		skb_reset_network_header(skb);
> > > -		ipgre_ecn_decapsulate(iph, skb);
> > > -
> > >  		netif_rx(skb);
> > >  
> > >  		rcu_read_unlock();
> > 
> 
> Is that true? I thought they were accounted for by rx_dropped.

I don't think rx_dropped is appropriate for counting invalid packets,
but maybe actual practice is already different.

As for whether packets counted in rx_dropped should also be counted in
rx_packets/rx_bytes, I really don't know.  The current comments on
rtnl_link_stats (inherited from net_device_stats) are totally inadequate
as a specification.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC] gre: conform to RFC6040 ECN propagation
  2012-10-01 16:49               ` Ben Hutchings
@ 2012-10-01 17:13                 ` Eric Dumazet
  2012-10-01 21:21                   ` Stephen Hemminger
  0 siblings, 1 reply; 43+ messages in thread
From: Eric Dumazet @ 2012-10-01 17:13 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Stephen Hemminger, Chris Wright, David Miller, netdev

On Mon, 2012-10-01 at 17:49 +0100, Ben Hutchings wrote:

> I don't think rx_dropped is appropriate for counting invalid packets,
> but maybe actual practice is already different.
> 
> As for whether packets counted in rx_dropped should also be counted in
> rx_packets/rx_bytes, I really don't know.  The current comments on
> rtnl_link_stats (inherited from net_device_stats) are totally inadequate
> as a specification.

rx_dropped is used by core network stack, not the devices themselves.

So a packet is first accounted in rx_bytes/rx_packets by the driver,
and if net/core/dev.c drops it, rx_dropped is incremented as well.
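
Schematically (a simplified sketch, not verbatim kernel code):

	/* driver RX path: counts every frame handed to the stack */
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	netif_rx(skb);

	/* net/core/dev.c, if it later discards the packet, e.g.
	 * on backlog overflow:
	 */
	atomic_long_inc(&skb->dev->rx_dropped);

So packets counted in rx_dropped are a subset of rx_packets/rx_bytes.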

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCHv6 net-next] vxlan: virtual extensible lan
  2012-09-27 23:12               ` David Miller
@ 2012-10-01 20:57                 ` Stephen Hemminger
  2012-10-01 22:07                   ` David Miller
       [not found]                 ` <20121001140206.2bbf9c41@nehalam.linuxnetplumber.net>
  2012-10-01 21:02                 ` [PATCH 1/2] iproute2: vxlan support Stephen Hemminger
  2 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-10-01 20:57 UTC (permalink / raw)
  To: David Miller; +Cc: jesse, chrisw, netdev

This is an implementation of Virtual eXtensible Local Area Network
as described in draft RFC:
  http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02

The driver implements Virtual Tunnel Endpoint (VTEP) functionality
that learns MAC-to-IP address mappings.

This implementation has only been tested with the user-mode TAP
based version for Linux, not against other vendors (yet).

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
v6 - simplify hash function
     allow configuring forwarding table limit and ageing timer
     account for VLAN in header
     adjust mtu based on lower device (if any)
     fix fields in fdb_show

v5 - drop MTU discovery since network is overlaid
     use common code to do ECN decapsulation
v4 - fix ecn and set state of fdb entries
v3 - fix ordering of change versus migration message
v2 - fix use of ip header after pskb_may_pull

 Documentation/networking/vxlan.txt |   47 +
 drivers/net/Kconfig                |   13 
 drivers/net/Makefile               |    1 
 drivers/net/vxlan.c                | 1217 +++++++++++++++++++++++++++++++++++++
 include/linux/if_link.h            |   16 
 5 files changed, 1294 insertions(+)

--- a/drivers/net/Kconfig	2012-10-01 08:04:59.822350516 -0700
+++ b/drivers/net/Kconfig	2012-10-01 08:27:16.625148235 -0700
@@ -149,6 +149,19 @@ config MACVTAP
 	  To compile this driver as a module, choose M here: the module
 	  will be called macvtap.
 
+config VXLAN
+       tristate "Virtual eXtensible Local Area Network (VXLAN)"
+       depends on EXPERIMENTAL
+       ---help---
+	  This allows one to create vxlan virtual interfaces that provide
+	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
+	  to tunnel virtual network infrastructure in virtualized environments.
+	  For more information see:
+	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called vxlan.
+
 config NETCONSOLE
 	tristate "Network console logging support"
 	---help---
--- a/drivers/net/Makefile	2012-10-01 08:04:59.822350516 -0700
+++ b/drivers/net/Makefile	2012-10-01 08:27:16.629148196 -0700
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
 obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VXLAN) += vxlan.o
 
 #
 # Networking Drivers
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/net/vxlan.c	2012-10-01 13:54:15.912814598 -0700
@@ -0,0 +1,1217 @@
+/*
+ * VXLAN: Virtual eXtensible Local Area Network
+ *
+ * Copyright (c) 2012 Vyatta Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * TODO
+ *  - use IANA UDP port number (when defined)
+ *  - IPv6 (not in RFC)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/version.h>
+#include <linux/hash.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define VXLAN_VERSION	"0.1"
+
+#define VNI_HASH_BITS	10
+#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS	8
+#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
+#define FDB_AGE_DEFAULT 300 /* 5 min */
+#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */
+
+#define VXLAN_N_VID	(1u << 24)
+#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
+/* VLAN + IP header + UDP + VXLAN */
+#define VXLAN_HEADROOM (4 + 20 + 8 + 8)
+
+#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+/* UDP port for VXLAN traffic. */
+static unsigned int vxlan_port __read_mostly = 8472;
+module_param_named(udp_port, vxlan_port, uint, 0444);
+MODULE_PARM_DESC(udp_port, "Destination UDP port");
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+/* per-net private data for this module */
+static unsigned int vxlan_net_id;
+struct vxlan_net {
+	struct socket	  *sock;	/* UDP encap socket */
+	struct hlist_head vni_list[VNI_HASH_SIZE];
+};
+
+/* Forwarding table entry */
+struct vxlan_fdb {
+	struct hlist_node hlist;	/* linked list of entries */
+	struct rcu_head	  rcu;
+	unsigned long	  updated;	/* jiffies */
+	unsigned long	  used;
+	__be32		  remote_ip;
+	u16		  state;	/* see ndm_state */
+	u8		  eth_addr[ETH_ALEN];
+};
+
+/* Per-cpu network traffic stats */
+struct vxlan_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+};
+
+/* Pseudo network device */
+struct vxlan_dev {
+	struct hlist_node hlist;
+	struct net_device *dev;
+	struct vxlan_stats __percpu *stats;
+	__u32		  vni;		/* virtual network id */
+	__be32	          gaddr;	/* multicast group */
+	__be32		  saddr;	/* source address */
+	unsigned int      link;		/* link to multicast over */
+	__u8		  tos;		/* TOS override */
+	__u8		  ttl;
+	bool		  learn;
+
+	unsigned long	  age_interval;
+	struct timer_list age_timer;
+	spinlock_t	  hash_lock;
+	unsigned int	  addrcnt;
+	unsigned int	  addrmax;
+	unsigned int	  addrexceeded;
+
+	struct hlist_head fdb_head[FDB_HASH_SIZE];
+};
+
+/* salt for hash table */
+static u32 vxlan_salt __read_mostly;
+
+static inline struct hlist_head *vni_head(struct net *net, u32 id)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
+}
+
+/* Look up VNI in a per net namespace table */
+static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
+{
+	struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
+		if (vxlan->vni == id)
+			return vxlan;
+	}
+
+	return NULL;
+}
+
+/* Fill in neighbour message in skbuff. */
+static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
+			   const struct vxlan_fdb *fdb,
+			   u32 portid, u32 seq, int type, unsigned int flags)
+{
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	memset(ndm, 0, sizeof(*ndm));
+	ndm->ndm_family	= AF_BRIDGE;
+	ndm->ndm_state = fdb->state;
+	ndm->ndm_ifindex = vxlan->dev->ifindex;
+	ndm->ndm_flags = NTF_SELF;
+	ndm->ndm_type = NDA_DST;
+
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
+		goto nla_put_failure;
+
+	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
+	ci.ndm_confirmed = 0;
+	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
+	ci.ndm_refcnt	 = 0;
+
+	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline size_t vxlan_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ndmsg))
+		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
+			     const struct vxlan_fdb *fdb, int type)
+{
+	struct net *net = dev_net(vxlan->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Hash Ethernet address */
+static u32 eth_hash(const unsigned char *addr)
+{
+	u64 value = get_unaligned((u64 *)addr);
+
+	/* only want 6 bytes */
+#ifdef __BIG_ENDIAN
+	value <<= 16;
+#else
+	value >>= 16;
+#endif
+	return hash_64(value, FDB_HASH_BITS);
+}
+
+/* Hash chain to use given mac address */
+static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
+						const u8 *mac)
+{
+	return &vxlan->fdb_head[eth_hash(mac)];
+}
+
+/* Look up Ethernet address in forwarding table */
+static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
+					const u8 *mac)
+
+{
+	struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
+	struct vxlan_fdb *f;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(f, node, head, hlist) {
+		if (compare_ether_addr(mac, f->eth_addr) == 0)
+			return f;
+	}
+
+	return NULL;
+}
+
+/* Add new entry to forwarding table -- assumes lock held */
+static int vxlan_fdb_create(struct vxlan_dev *vxlan,
+			    const u8 *mac, __be32 ip,
+			    __u16 state, __u16 flags)
+{
+	struct vxlan_fdb *f;
+	int notify = 0;
+
+	f = vxlan_find_mac(vxlan, mac);
+	if (f) {
+		if (flags & NLM_F_EXCL) {
+			netdev_dbg(vxlan->dev,
+				   "lost race to create %pM\n", mac);
+			return -EEXIST;
+		}
+		if (f->state != state) {
+			f->state = state;
+			f->updated = jiffies;
+			notify = 1;
+		}
+	} else {
+		if (!(flags & NLM_F_CREATE))
+			return -ENOENT;
+
+		if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
+			return -ENOSPC;
+
+		netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
+		f = kmalloc(sizeof(*f), GFP_ATOMIC);
+		if (!f)
+			return -ENOMEM;
+
+		notify = 1;
+		f->remote_ip = ip;
+		f->state = state;
+		f->updated = f->used = jiffies;
+		memcpy(f->eth_addr, mac, ETH_ALEN);
+
+		++vxlan->addrcnt;
+		hlist_add_head_rcu(&f->hlist,
+				   vxlan_fdb_head(vxlan, mac));
+	}
+
+	if (notify)
+		vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
+
+	return 0;
+}
+
+static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
+{
+	netdev_dbg(vxlan->dev,
+		    "delete %pM\n", f->eth_addr);
+
+	--vxlan->addrcnt;
+	vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
+
+	hlist_del_rcu(&f->hlist);
+	kfree_rcu(f, rcu);
+}
+
+/* Add static entry (via netlink) */
+static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			 struct net_device *dev,
+			 const unsigned char *addr, u16 flags)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__be32 ip;
+	int err;
+
+	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
+		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
+			ndm->ndm_state);
+		return -EINVAL;
+	}
+
+	if (tb[NDA_DST] == NULL)
+		return -EINVAL;
+
+	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
+		return -EAFNOSUPPORT;
+
+	ip = nla_get_be32(tb[NDA_DST]);
+
+	spin_lock_bh(&vxlan->hash_lock);
+	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags);
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Delete entry (via netlink) */
+static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err = -ENOENT;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	f = vxlan_find_mac(vxlan, addr);
+	if (f) {
+		vxlan_fdb_destroy(vxlan, f);
+		err = 0;
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Dump forwarding table */
+static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  struct net_device *dev, int idx)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned int h;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct vxlan_fdb *f;
+		struct hlist_node *n;
+		int err;
+
+		hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) {
+			if (idx < cb->args[0])
+				goto skip;
+
+			err = vxlan_fdb_info(skb, vxlan, f,
+					     NETLINK_CB(cb->skb).portid,
+					     cb->nlh->nlmsg_seq,
+					     RTM_NEWNEIGH,
+					     NLM_F_MULTI);
+			if (err < 0)
+				break;
+skip:
+			++idx;
+		}
+	}
+
+	return idx;
+}
+
+/* Watch incoming packets to learn mapping between Ethernet address
+ * and Tunnel endpoint.
+ */
+static void vxlan_snoop(struct net_device *dev,
+			__be32 src_ip, const u8 *src_mac)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err;
+
+	f = vxlan_find_mac(vxlan, src_mac);
+	if (likely(f)) {
+		f->used = jiffies;
+		if (likely(f->remote_ip == src_ip))
+			return;
+
+		if (net_ratelimit())
+			netdev_info(dev,
+				    "%pM migrated from %pI4 to %pI4\n",
+				    src_mac, &f->remote_ip, &src_ip);
+
+		f->remote_ip = src_ip;
+		f->updated = jiffies;
+	} else {
+		/* learned new entry */
+		spin_lock(&vxlan->hash_lock);
+		err = vxlan_fdb_create(vxlan, src_mac, src_ip,
+				       NUD_REACHABLE,
+				       NLM_F_EXCL|NLM_F_CREATE);
+		spin_unlock(&vxlan->hash_lock);
+	}
+}
+
+
+/* See if multicast group is already in use by other ID */
+static bool vxlan_group_used(struct vxlan_net *vn,
+			     const struct vxlan_dev *this)
+{
+	const struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+	unsigned h;
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
+			if (vxlan == this)
+				continue;
+
+			if (!netif_running(vxlan->dev))
+				continue;
+
+			if (vxlan->gaddr == this->gaddr)
+				return true;
+		}
+
+	return false;
+}
+
+/* kernel equivalent to IP_ADD_MEMBERSHIP */
+static int vxlan_join_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+	int err;
+
+	/* Already a member of group */
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast join */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+
+/* kernel equivalent to IP_DROP_MEMBERSHIP */
+static int vxlan_leave_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	int err = 0;
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+
+	/* Only leave group when last vxlan is done. */
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast leave */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_leave_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct iphdr *oip;
+	struct vxlanhdr *vxh;
+	struct vxlan_dev *vxlan;
+	struct vxlan_stats *stats;
+	__u32 vni;
+	int err;
+
+	/* pop off outer UDP header */
+	__skb_pull(skb, sizeof(struct udphdr));
+
+	/* Need Vxlan and inner Ethernet header to be present */
+	if (!pskb_may_pull(skb, sizeof(struct vxlanhdr)))
+		goto error;
+
+	/* Drop packets with reserved bits set */
+	vxh = (struct vxlanhdr *) skb->data;
+	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+	    (vxh->vx_vni & htonl(0xff))) {
+		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
+			   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
+		goto error;
+	}
+
+	__skb_pull(skb, sizeof(struct vxlanhdr));
+	skb_postpull_rcsum(skb, eth_hdr(skb), sizeof(struct vxlanhdr));
+
+	/* Is this VNI defined? */
+	vni = ntohl(vxh->vx_vni) >> 8;
+	vxlan = vxlan_find_vni(sock_net(sk), vni);
+	if (!vxlan) {
+		netdev_dbg(skb->dev, "unknown vni %d\n", vni);
+		goto drop;
+	}
+
+	if (!pskb_may_pull(skb, ETH_HLEN)) {
+		vxlan->dev->stats.rx_length_errors++;
+		vxlan->dev->stats.rx_errors++;
+		goto drop;
+	}
+
+	/* Re-examine inner Ethernet packet */
+	oip = ip_hdr(skb);
+	skb->protocol = eth_type_trans(skb, vxlan->dev);
+	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+	/* Ignore packet loops (and multicast echo) */
+	if (compare_ether_addr(eth_hdr(skb)->h_source,
+			       vxlan->dev->dev_addr) == 0)
+		goto drop;
+
+	if (vxlan->learn)
+		vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
+
+	__skb_tunnel_rx(skb, vxlan->dev);
+	skb_reset_network_header(skb);
+
+	err = IP_ECN_decapsulate(oip, skb);
+	if (unlikely(err)) {
+		if (log_ecn_error)
+			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+					     &oip->saddr, oip->tos);
+		if (err > 1) {
+			++vxlan->dev->stats.rx_frame_errors;
+			++vxlan->dev->stats.rx_errors;
+			goto drop;
+		}
+	}
+
+	stats = this_cpu_ptr(vxlan->stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+
+	netif_rx(skb);
+
+	return 0;
+error:
+	/* Put UDP header back */
+	__skb_push(skb, sizeof(struct udphdr));
+
+	return 1;
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Extract dsfield from inner protocol */
+static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
+				   const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return iph->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ipv6_get_dsfield((const struct ipv6hdr *)iph);
+	else
+		return 0;
+}
+
+/* Propagate ECN bits out */
+static inline u8 vxlan_ecn_encap(u8 tos,
+				 const struct iphdr *iph,
+				 const struct sk_buff *skb)
+{
+	u8 inner = vxlan_get_dsfield(iph, skb);
+
+	return INET_ECN_encapsulate(tos, inner);
+}
+
+/* Transmit local packets over Vxlan
+ *
+ * Outer IP header inherits ECN and DF from inner header.
+ * Outer UDP destination is the VXLAN assigned port.
+ *           source port is based on hash of flow if available
+ *                       otherwise use a random value
+ */
+static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct rtable *rt;
+	const struct ethhdr *eth;
+	const struct iphdr *old_iph;
+	struct iphdr *iph;
+	struct vxlanhdr *vxh;
+	struct udphdr *uh;
+	struct flowi4 fl4;
+	struct vxlan_fdb *f;
+	unsigned int pkt_len = skb->len;
+	u32 hash;
+	__be32 dst;
+	__be16 df = 0;
+	__u8 tos, ttl;
+	int err;
+
+	/* Need space for new headers (invalidates iph ptr) */
+	if (skb_cow_head(skb, VXLAN_HEADROOM))
+		goto drop;
+
+	eth = (void *)skb->data;
+	old_iph = ip_hdr(skb);
+
+	if (!is_multicast_ether_addr(eth->h_dest) &&
+	    (f = vxlan_find_mac(vxlan, eth->h_dest)))
+		dst = f->remote_ip;
+	else if (vxlan->gaddr) {
+		dst = vxlan->gaddr;
+	} else
+		goto drop;
+
+	ttl = vxlan->ttl;
+	if (!ttl && IN_MULTICAST(ntohl(dst)))
+		ttl = 1;
+
+	tos = vxlan->tos;
+	if (tos == 1)
+		tos = vxlan_get_dsfield(old_iph, skb);
+
+	hash = skb_get_rxhash(skb);
+
+	rt = ip_route_output_gre(dev_net(dev), &fl4, dst,
+				 vxlan->saddr, vxlan->vni,
+				 RT_TOS(tos), vxlan->link);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &dst);
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+
+	if (rt->dst.dev == dev) {
+		netdev_dbg(dev, "circular route to %pI4\n", &dst);
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = htonl(vxlan->vni << 8);
+
+	__skb_push(skb, sizeof(*uh));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = htons(vxlan_port);
+	uh->source = hash ? : random32();
+
+	uh->len = htons(skb->len);
+	uh->check = 0;
+
+	__skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph		= ip_hdr(skb);
+	iph->version	= 4;
+	iph->ihl	= sizeof(struct iphdr) >> 2;
+	iph->frag_off	= df;
+	iph->protocol	= IPPROTO_UDP;
+	iph->tos	= vxlan_ecn_encap(tos, old_iph, skb);
+	iph->daddr	= fl4.daddr;
+	iph->saddr	= fl4.saddr;
+	iph->ttl	= ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+	/* See __IPTUNNEL_XMIT */
+	skb->ip_summed = CHECKSUM_NONE;
+	ip_select_ident(iph, &rt->dst, NULL);
+
+	err = ip_local_out(skb);
+	if (likely(net_xmit_eval(err) == 0)) {
+		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += pkt_len;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		dev->stats.tx_errors++;
+		dev->stats.tx_aborted_errors++;
+	}
+	return NETDEV_TX_OK;
+
+drop:
+	dev->stats.tx_dropped++;
+	goto tx_free;
+
+tx_error:
+	dev->stats.tx_errors++;
+tx_free:
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Walk the forwarding table and purge stale entries */
+static void vxlan_cleanup(unsigned long arg)
+{
+	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
+	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
+	unsigned int h;
+
+	if (!netif_running(vxlan->dev))
+		return;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			unsigned long timeout;
+
+			if (f->state == NUD_PERMANENT)
+				continue;
+
+			timeout = f->used + vxlan->age_interval * HZ;
+			if (time_before_eq(timeout, jiffies)) {
+				netdev_dbg(vxlan->dev,
+					   "garbage collect %pM\n",
+					   f->eth_addr);
+				f->state = NUD_STALE;
+				vxlan_fdb_destroy(vxlan, f);
+			} else if (time_before(timeout, next_timer))
+				next_timer = timeout;
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	mod_timer(&vxlan->age_timer, next_timer);
+}
+
+/* Setup stats when device is created */
+static int vxlan_init(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	vxlan->stats = alloc_percpu(struct vxlan_stats);
+	if (!vxlan->stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Start ageing timer and join group when device is brought up */
+static int vxlan_open(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	int err;
+
+	if (vxlan->gaddr) {
+		err = vxlan_join_group(dev);
+		if (err)
+			return err;
+	}
+
+	if (vxlan->age_interval)
+		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+
+	return 0;
+}
+
+/* Purge the forwarding table */
+static void vxlan_flush(struct vxlan_dev *vxlan)
+{
+	unsigned h;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			vxlan_fdb_destroy(vxlan, f);
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+}
+
+/* Cleanup timer and forwarding table on shutdown */
+static int vxlan_stop(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (vxlan->gaddr)
+		vxlan_leave_group(dev);
+
+	del_timer_sync(&vxlan->age_timer);
+
+	vxlan_flush(vxlan);
+
+	return 0;
+}
+
+/* Merge per-cpu statistics */
+static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
+					       struct rtnl_link_stats64 *stats)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_stats tmp, sum = { 0 };
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		unsigned int start;
+		const struct vxlan_stats *stats
+			= per_cpu_ptr(vxlan->stats, cpu);
+
+		do {
+			start = u64_stats_fetch_begin_bh(&stats->syncp);
+			memcpy(&tmp, stats, sizeof(tmp));
+		} while (u64_stats_fetch_retry_bh(&stats->syncp, start));
+
+		sum.tx_bytes   += tmp.tx_bytes;
+		sum.tx_packets += tmp.tx_packets;
+		sum.rx_bytes   += tmp.rx_bytes;
+		sum.rx_packets += tmp.rx_packets;
+	}
+
+	stats->tx_bytes   = sum.tx_bytes;
+	stats->tx_packets = sum.tx_packets;
+	stats->rx_bytes   = sum.rx_bytes;
+	stats->rx_packets = sum.rx_packets;
+
+	stats->multicast = dev->stats.multicast;
+	stats->rx_length_errors = dev->stats.rx_length_errors;
+	stats->rx_frame_errors = dev->stats.rx_frame_errors;
+	stats->rx_errors = dev->stats.rx_errors;
+
+	stats->tx_dropped = dev->stats.tx_dropped;
+	stats->tx_carrier_errors  = dev->stats.tx_carrier_errors;
+	stats->tx_aborted_errors  = dev->stats.tx_aborted_errors;
+	stats->collisions  = dev->stats.collisions;
+	stats->tx_errors = dev->stats.tx_errors;
+
+	return stats;
+}
+
+/* Stub, nothing needs to be done. */
+static void vxlan_set_multicast_list(struct net_device *dev)
+{
+}
+
+static const struct net_device_ops vxlan_netdev_ops = {
+	.ndo_init		= vxlan_init,
+	.ndo_open		= vxlan_open,
+	.ndo_stop		= vxlan_stop,
+	.ndo_start_xmit		= vxlan_xmit,
+	.ndo_get_stats64	= vxlan_stats64,
+	.ndo_set_rx_mode	= vxlan_set_multicast_list,
+	.ndo_change_mtu		= eth_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_fdb_add		= vxlan_fdb_add,
+	.ndo_fdb_del		= vxlan_fdb_delete,
+	.ndo_fdb_dump		= vxlan_fdb_dump,
+};
+
+/* Info for udev, that this is a virtual tunnel endpoint */
+static struct device_type vxlan_type = {
+	.name = "vxlan",
+};
+
+static void vxlan_free(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	free_percpu(vxlan->stats);
+	free_netdev(dev);
+}
+
+/* Initialize the device structure. */
+static void vxlan_setup(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned h;
+
+	eth_hw_addr_random(dev);
+	ether_setup(dev);
+
+	dev->netdev_ops = &vxlan_netdev_ops;
+	dev->destructor = vxlan_free;
+	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
+
+	dev->tx_queue_len = 0;
+	dev->features	|= NETIF_F_LLTX;
+	dev->features	|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;
+
+	spin_lock_init(&vxlan->hash_lock);
+
+	init_timer_deferrable(&vxlan->age_timer);
+	vxlan->age_timer.function = vxlan_cleanup;
+	vxlan->age_timer.data = (unsigned long) vxlan;
+
+	vxlan->dev = dev;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+}
+
+static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
+	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
+	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
+	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
+	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
+};
+
+static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+			pr_debug("invalid link address (not ethernet)\n");
+			return -EINVAL;
+		}
+
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+			pr_debug("invalid all zero ethernet address\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	if (!data)
+		return -EINVAL;
+
+	if (data[IFLA_VXLAN_ID]) {
+		__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+		if (id >= VXLAN_VID_MASK)
+			return -ERANGE;
+	}
+
+	if (data[IFLA_VXLAN_GROUP]) {
+		__be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		if (!IN_MULTICAST(ntohl(gaddr))) {
+			pr_debug("group address is not IPv4 multicast\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+	return 0;
+}
+
+static int vxlan_newlink(struct net *net, struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__u32 vni;
+	int err;
+
+	if (!data[IFLA_VXLAN_ID])
+		return -EINVAL;
+
+	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
+	if (vxlan_find_vni(net, vni)) {
+		pr_info("duplicate VNI %u\n", vni);
+		return -EEXIST;
+	}
+	vxlan->vni = vni;
+
+	if (data[IFLA_VXLAN_GROUP])
+		vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+
+	if (data[IFLA_VXLAN_LOCAL])
+		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+
+	if (data[IFLA_VXLAN_LINK]) {
+		vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]);
+
+		if (!tb[IFLA_MTU]) {
+			struct net_device *lowerdev
+				= __dev_get_by_index(net, vxlan->link);
+
+			/* guard against a bogus ifindex */
+			if (!lowerdev)
+				return -ENODEV;
+			dev->mtu = lowerdev->mtu - VXLAN_HEADROOM;
+		}
+	}
+
+	if (data[IFLA_VXLAN_TOS])
+		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
+
+	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
+		vxlan->learn = true;
+
+	if (data[IFLA_VXLAN_AGEING])
+		vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
+	else
+		vxlan->age_interval = FDB_AGE_DEFAULT;
+
+	if (data[IFLA_VXLAN_LIMIT])
+		vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
+
+	err = register_netdevice(dev);
+	if (!err)
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
+
+	return err;
+}
+
+static void vxlan_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	hlist_del_rcu(&vxlan->hlist);
+
+	unregister_netdevice_queue(dev, head);
+}
+
+static size_t vxlan_get_size(const struct net_device *dev)
+{
+
+	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
+		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
+		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
+		0;
+}
+
+static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
+		goto nla_put_failure;
+
+	if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
+		goto nla_put_failure;
+
+	if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
+		goto nla_put_failure;
+
+	if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
+	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn) ||
+	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
+	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
+	.kind		= "vxlan",
+	.maxtype	= IFLA_VXLAN_MAX,
+	.policy		= vxlan_policy,
+	.priv_size	= sizeof(struct vxlan_dev),
+	.setup		= vxlan_setup,
+	.validate	= vxlan_validate,
+	.newlink	= vxlan_newlink,
+	.dellink	= vxlan_dellink,
+	.get_size	= vxlan_get_size,
+	.fill_info	= vxlan_fill_info,
+};
+
+static __net_init int vxlan_init_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct sock *sk;
+	struct sockaddr_in vxlan_addr = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_ANY),
+	};
+	int rc;
+	unsigned h;
+
+	/* Create UDP socket for encapsulation receive. */
+	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
+	if (rc < 0) {
+		pr_debug("UDP socket create failed\n");
+		return rc;
+	}
+
+	vxlan_addr.sin_port = htons(vxlan_port);
+
+	rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr,
+			 sizeof(vxlan_addr));
+	if (rc < 0) {
+		pr_debug("bind for UDP socket %pI4:%u (%d)\n",
+			 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
+		sock_release(vn->sock);
+		vn->sock = NULL;
+		return rc;
+	}
+
+	/* Disable multicast loopback */
+	sk = vn->sock->sk;
+	inet_sk(sk)->mc_loop = 0;
+
+	/* Mark socket as an encapsulation socket. */
+	udp_sk(sk)->encap_type = 1;
+	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
+	udp_encap_enable();
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vn->vni_list[h]);
+
+	return 0;
+}
+
+static __net_exit void vxlan_exit_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	if (vn->sock) {
+		sock_release(vn->sock);
+		vn->sock = NULL;
+	}
+}
+
+static struct pernet_operations vxlan_net_ops = {
+	.init = vxlan_init_net,
+	.exit = vxlan_exit_net,
+	.id   = &vxlan_net_id,
+	.size = sizeof(struct vxlan_net),
+};
+
+static int __init vxlan_init_module(void)
+{
+	int rc;
+
+	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
+
+	rc = register_pernet_device(&vxlan_net_ops);
+	if (rc)
+		goto out1;
+
+	rc = rtnl_link_register(&vxlan_link_ops);
+	if (rc)
+		goto out2;
+
+	return 0;
+
+out2:
+	unregister_pernet_device(&vxlan_net_ops);
+out1:
+	return rc;
+}
+module_init(vxlan_init_module);
+
+static void __exit vxlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&vxlan_link_ops);
+	unregister_pernet_device(&vxlan_net_ops);
+}
+module_exit(vxlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VXLAN_VERSION);
+MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
+MODULE_ALIAS_RTNL_LINK("vxlan");
--- a/include/linux/if_link.h	2012-10-01 08:04:59.822350516 -0700
+++ b/include/linux/if_link.h	2012-10-01 08:27:16.629148196 -0700
@@ -272,6 +272,22 @@ enum macvlan_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VXLAN section */
+enum {
+	IFLA_VXLAN_UNSPEC,
+	IFLA_VXLAN_ID,
+	IFLA_VXLAN_GROUP,
+	IFLA_VXLAN_LINK,
+	IFLA_VXLAN_LOCAL,
+	IFLA_VXLAN_TTL,
+	IFLA_VXLAN_TOS,
+	IFLA_VXLAN_LEARNING,
+	IFLA_VXLAN_AGEING,
+	IFLA_VXLAN_LIMIT,
+	__IFLA_VXLAN_MAX
+};
+#define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/Documentation/networking/vxlan.txt	2012-10-01 13:52:23.833932321 -0700
@@ -0,0 +1,47 @@
+Virtual eXtensible Local Area Networking documentation
+======================================================
+
+The VXLAN protocol is a tunnelling protocol designed to solve the
+problem of the limited number of available VLANs (4096).
+With VXLAN, the identifier is expanded to 24 bits.
+
+It is a draft RFC standard that is implemented by Cisco Nexus,
+VMware and Brocade. The protocol runs over UDP using a single
+destination port (not yet standardized by IANA).
+This document describes the Linux kernel tunnel device;
+there is also an implementation of VXLAN for Open vSwitch.
+
+Unlike most tunnels, a VXLAN is a 1 to N network, not just point
+to point. A VXLAN device can either dynamically learn the IP address
+of the other end, in a manner similar to a learning bridge, or the
+forwarding entries can be configured statically.
+
+The management of vxlan is done in a similar fashion to its
+two closest neighbors, GRE and VLAN. Configuring VXLAN requires
+the version of iproute2 that matches the kernel release
+where VXLAN was first merged upstream.
+
+1. Create vxlan device
+  # ip link add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
+
+This creates a new device (vxlan0). The device uses
+the multicast group 239.1.1.1 over eth1 to handle packets where
+no entry is in the forwarding table.
+
+2. Delete vxlan device
+  # ip link delete vxlan0
+
+3. Show vxlan info
+  # ip -d link show vxlan0
+
+It is possible to create, destroy and display the vxlan
+forwarding table entries using the new bridge command.
+
+1. Create forwarding table entry
+  # bridge fdb add to 00:17:42:8a:b4:05 dst 192.19.0.2 dev vxlan0
+
+2. Delete forwarding table entry
+  # bridge fdb delete 00:17:42:8a:b4:05 dev vxlan0
+
+3. Show forwarding table
+  # bridge fdb show dev vxlan0

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2/2] iproute2: manage VXLAN forwarding entries
       [not found]                 ` <20121001140206.2bbf9c41@nehalam.linuxnetplumber.net>
@ 2012-10-01 21:02                   ` Stephen Hemminger
  0 siblings, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-10-01 21:02 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, jesse, chrisw, netdev

Allow extending the bridge forwarding table to handle VXLAN as well.
Change the output format to be closer to 'ip neighbour'.
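
With this change a dump reads much like 'ip neighbour' output; an
illustrative example (made-up addresses):

  # bridge fdb show dev vxlan0
  00:17:42:8a:b4:05 dst 192.19.0.2 self permanent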
---
 bridge/br_common.h |    1 +
 bridge/bridge.c    |   18 ++++++++
 bridge/fdb.c       |  129 +++++++++++++++++++++++++++++++---------------------
 3 files changed, 97 insertions(+), 51 deletions(-)

diff --git a/bridge/br_common.h b/bridge/br_common.h
index ec1671d..718ecb9 100644
--- a/bridge/br_common.h
+++ b/bridge/br_common.h
@@ -7,6 +7,7 @@ extern int print_fdb(const struct sockaddr_nl *who,
 extern int do_fdb(int argc, char **argv);
 extern int do_monitor(int argc, char **argv);
 
+extern int preferred_family;
 extern int show_stats;
 extern int show_detail;
 extern int timestamp;
diff --git a/bridge/bridge.c b/bridge/bridge.c
index 4702340..e2c33b0 100644
--- a/bridge/bridge.c
+++ b/bridge/bridge.c
@@ -15,6 +15,7 @@
 #include "br_common.h"
 
 struct rtnl_handle rth = { .fd = -1 };
+int preferred_family = AF_UNSPEC;
 int resolve_hosts;
 int show_stats;
 int show_details;
@@ -86,6 +87,23 @@ main(int argc, char **argv)
 			++show_details;
 		} else if (matches(opt, "-timestamp") == 0) {
 			++timestamp;
+                } else if (matches(opt, "-family") == 0) {
+			argc--;
+			argv++;
+			if (argc <= 1)
+				usage();
+			if (strcmp(argv[1], "inet") == 0)
+				preferred_family = AF_INET;
+			else if (strcmp(argv[1], "inet6") == 0)
+				preferred_family = AF_INET6;
+			else if (strcmp(argv[1], "help") == 0)
+				usage();
+			else
+				invarg("invalid protocol family", argv[1]);
+		} else if (strcmp(opt, "-4") == 0) {
+			preferred_family = AF_INET;
+		} else if (strcmp(opt, "-6") == 0) {
+			preferred_family = AF_INET6;
 		} else {
 			fprintf(stderr, "Option \"%s\" is unknown, try \"bridge help\".\n", opt);
 			exit(-1);
diff --git a/bridge/fdb.c b/bridge/fdb.c
index eaefa81..6041acc 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -1,6 +1,8 @@
 /*
  * Get/set/delete fdb table with netlink
  *
+ * TODO: merge/replace this with ip neighbour
+ *
  * Authors:	Stephen Hemminger <shemminger@vyatta.com>
  */
 
@@ -20,13 +22,14 @@
 
 #include "libnetlink.h"
 #include "br_common.h"
+#include "rt_names.h"
 #include "utils.h"
 
 int filter_index;
 
 static void usage(void)
 {
-	fprintf(stderr, "Usage: bridge fdb { add | del } ADDR dev DEV {self|master}\n");
+	fprintf(stderr, "Usage: bridge fdb { add | del } ADDR dev DEV {self|master} [ temp ] [ dst IPADDR]\n");
 	fprintf(stderr, "       bridge fdb {show} [ dev DEV ]\n");
 	exit(-1);
 }
@@ -35,15 +38,15 @@ static const char *state_n2a(unsigned s)
 {
 	static char buf[32];
 
-	if (s & NUD_PERMANENT) 
-		return "local";
+	if (s & NUD_PERMANENT)
+		return "permanent";
 
 	if (s & NUD_NOARP)
 		return "static";
 
 	if (s & NUD_STALE)
 		return "stale";
-	
+
 	if (s & NUD_REACHABLE)
 		return "";
 
@@ -51,24 +54,19 @@ static const char *state_n2a(unsigned s)
 	return buf;
 }
 
-static char *fmt_time(char *b, size_t l, unsigned long tick)
-{
-	static int hz;
-	
-	if (hz == 0)
-		hz = __get_user_hz();
-
-	snprintf(b, l, "%lu.%02lu", tick / hz, ((tick % hz) * hz) / 100);
-	return b;
-}
-
 int print_fdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 {
+	FILE *fp = arg;
 	struct ndmsg *r = NLMSG_DATA(n);
 	int len = n->nlmsg_len;
 	struct rtattr * tb[NDA_MAX+1];
-	const __u8 *addr = NULL;
-	char b1[32];
+
+	if (n->nlmsg_type != RTM_NEWNEIGH && n->nlmsg_type != RTM_DELNEIGH) {
+		fprintf(stderr, "Not RTM_NEWNEIGH: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+
+		return 0;
+	}
 
 	len -= NLMSG_LENGTH(sizeof(*r));
 	if (len < 0) {
@@ -86,37 +84,49 @@ int print_fdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 		     n->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
 
 	if (n->nlmsg_type == RTM_DELNEIGH)
-		printf("Deleted ");
-
-	if (tb[NDA_LLADDR])
-		addr = RTA_DATA(tb[NDA_LLADDR]);
-	else {
-		fprintf(stderr, "missing lladdr\n");
-		return -1;
+		fprintf(fp, "Deleted ");
+
+	if (tb[NDA_LLADDR]) {
+		SPRINT_BUF(b1);
+		fprintf(fp, "%s ",
+			ll_addr_n2a(RTA_DATA(tb[NDA_LLADDR]),
+				    RTA_PAYLOAD(tb[NDA_LLADDR]),
+				    ll_index_to_type(r->ndm_ifindex),
+				    b1, sizeof(b1)));
 	}
-
-	printf("%s\t%.2x:%.2x:%.2x:%.2x:%.2x:%.2x\t%s %s",
-		ll_index_to_name(r->ndm_ifindex),
-		addr[0], addr[1], addr[2],
-		addr[3], addr[4], addr[5],
-		state_n2a(r->ndm_state),
-		(r->ndm_flags & NTF_SELF) ? "self" : "master");
-
+	
+	if (!filter_index && r->ndm_ifindex)
+		fprintf(fp, "dev %s ", ll_index_to_name(r->ndm_ifindex));
+
+	if (tb[NDA_DST]) {
+		SPRINT_BUF(abuf);
+		fprintf(fp, "dst %s ",
+			format_host(AF_INET,
+				    RTA_PAYLOAD(tb[NDA_DST]),
+				    RTA_DATA(tb[NDA_DST]),
+				    abuf, sizeof(abuf)));
+	}
+		
 	if (show_stats && tb[NDA_CACHEINFO]) {
 		struct nda_cacheinfo *ci = RTA_DATA(tb[NDA_CACHEINFO]);
+		int hz = get_user_hz();
 
-		printf("\t%8s", fmt_time(b1, sizeof(b1), ci->ndm_updated));
-		printf(" %8s", fmt_time(b1, sizeof(b1), ci->ndm_used));
+		fprintf(fp, " used %d/%d", ci->ndm_used/hz,
+		       ci->ndm_updated/hz);
 	}
-	printf("\n");
+	if (r->ndm_flags & NTF_SELF)
+		fprintf(fp, "self ");
+	if (r->ndm_flags & NTF_MASTER)
+		fprintf(fp, "master ");
 
+	fprintf(fp, "%s\n", state_n2a(r->ndm_state));
 	return 0;
 }
 
 static int fdb_show(int argc, char **argv)
 {
 	char *filter_dev = NULL;
-	
+
 	while (argc > 0) {
 		if (strcmp(*argv, "dev") == 0) {
 			NEXT_ARG();
@@ -128,8 +138,10 @@ static int fdb_show(int argc, char **argv)
 	}
 
 	if (filter_dev) {
-		if ((filter_index = if_nametoindex(filter_dev)) == 0) {
-			fprintf(stderr, "Cannot find device \"%s\"\n", filter_dev);
+		filter_index = if_nametoindex(filter_dev);
+		if (filter_index == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n",
+				filter_dev);
 			return -1;
 		}
 	}
@@ -138,11 +150,8 @@ static int fdb_show(int argc, char **argv)
 		perror("Cannot send dump request");
 		exit(1);
 	}
-	
-	printf("port\tmac addr\t\tflags%s\n",
-	       show_stats ? "\t updated     used" : "");
 
-	if (rtnl_dump_filter(&rth, print_fdb, NULL) < 0) {
+	if (rtnl_dump_filter(&rth, print_fdb, stdout) < 0) {
 		fprintf(stderr, "Dump terminated\n");
 		exit(1);
 	}
@@ -160,6 +169,8 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
 	char *addr = NULL;
 	char *d = NULL;
 	char abuf[ETH_ALEN];
+	int dst_ok = 0;
+	inet_prefix dst;
 
 	memset(&req, 0, sizeof(req));
 
@@ -173,21 +184,27 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
 		if (strcmp(*argv, "dev") == 0) {
 			NEXT_ARG();
 			d = *argv;
-		} else if (strcmp(*argv, "local") == 0) {
-			req.ndm.ndm_state = NUD_PERMANENT;
-		} else if (strcmp(*argv, "temp") == 0) {
-			req.ndm.ndm_state = NUD_REACHABLE;
+		} else if (strcmp(*argv, "dst") == 0) {
+			NEXT_ARG();
+			if (dst_ok)
+				duparg2("dst", *argv);
+			get_addr(&dst, *argv, preferred_family);
+			dst_ok = 1;
 		} else if (strcmp(*argv, "self") == 0) {
 			req.ndm.ndm_flags |= NTF_SELF;
-		} else if (strcmp(*argv, "master") == 0) {
+		} else if (matches(*argv, "master") == 0) {
 			req.ndm.ndm_flags |= NTF_MASTER;
+		} else if (matches(*argv, "local") == 0 ||
+			   matches(*argv, "permanent") == 0) {
+			req.ndm.ndm_state |= NUD_PERMANENT;
+		} else if (matches(*argv, "temp") == 0) {
+			req.ndm.ndm_state |= NUD_REACHABLE;
 		} else {
 			if (strcmp(*argv, "to") == 0) {
 				NEXT_ARG();
 			}
-			if (matches(*argv, "help") == 0) {
-				NEXT_ARG();
-			}
+			if (matches(*argv, "help") == 0)
+				usage();
 			if (addr)
 				duparg2("to", *argv);
 			addr = *argv;
@@ -200,7 +217,15 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
 		exit(-1);
 	}
 
-	if (sscanf(addr, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", 
+	/* Assume self */
+	if (!(req.ndm.ndm_flags&(NTF_SELF|NTF_MASTER)))
+		req.ndm.ndm_flags |= NTF_SELF;
+
+	/* Assume permanent */
+	if (!(req.ndm.ndm_state&(NUD_PERMANENT|NUD_REACHABLE)))
+		req.ndm.ndm_state |= NUD_PERMANENT;
+
+	if (sscanf(addr, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
 		   abuf, abuf+1, abuf+2,
 		   abuf+3, abuf+4, abuf+5) != 6) {
 		fprintf(stderr, "Invalid mac address %s\n", addr);
@@ -208,6 +233,8 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
 	}
 
 	addattr_l(&req.n, sizeof(req), NDA_LLADDR, abuf, ETH_ALEN);
+	if (dst_ok)
+		addattr_l(&req.n, sizeof(req), NDA_DST, &dst.data, dst.bytelen);
 
 	req.ndm.ndm_ifindex = ll_name_to_index(d);
 	if (req.ndm.ndm_ifindex == 0) {
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH 1/2] iproute2: vxlan support
  2012-09-27 23:12               ` David Miller
  2012-10-01 20:57                 ` [PATCHv6 " Stephen Hemminger
       [not found]                 ` <20121001140206.2bbf9c41@nehalam.linuxnetplumber.net>
@ 2012-10-01 21:02                 ` Stephen Hemminger
  2 siblings, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-10-01 21:02 UTC (permalink / raw)
  To: David Miller; +Cc: jesse, chrisw, netdev

Support managing VXLAN tunnels.
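
A rough usage sketch (option names as parsed by vxlan_parse_opt below;
the values are illustrative):

  # ip link add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth0 ttl 4
  # ip -d link show vxlan0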

---
 include/linux/if_link.h |   27 ++++++
 ip/Makefile             |    3 +-
 ip/iplink_vxlan.c       |  230 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 259 insertions(+), 1 deletion(-)
 create mode 100644 ip/iplink_vxlan.c

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 46f03db..e253a98 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -270,6 +270,22 @@ enum macvlan_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VXLAN section */
+enum {
+	IFLA_VXLAN_UNSPEC,
+	IFLA_VXLAN_ID,
+	IFLA_VXLAN_GROUP,
+	IFLA_VXLAN_LINK,
+	IFLA_VXLAN_LOCAL,
+	IFLA_VXLAN_TTL,
+	IFLA_VXLAN_TOS,
+	IFLA_VXLAN_LEARNING,
+	IFLA_VXLAN_AGEING,
+	IFLA_VXLAN_LIMIT,
+	__IFLA_VXLAN_MAX
+};
+#define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
@@ -384,4 +400,15 @@ struct ifla_port_vsi {
 	__u8 pad[3];
 };
 
+
+/* IPoIB section */
+
+enum {
+	IFLA_IPOIB_UNSPEC,
+	IFLA_IPOIB_PKEY,
+	__IFLA_IPOIB_MAX
+};
+
+#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1)
+
 #endif /* _LINUX_IF_LINK_H */
diff --git a/ip/Makefile b/ip/Makefile
index 6a518f8..3bc1516 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -3,7 +3,8 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     ipmaddr.o ipmonitor.o ipmroute.o ipprefix.o iptuntap.o \
     ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \
     iplink_vlan.o link_veth.o link_gre.o iplink_can.o \
-    iplink_macvlan.o iplink_macvtap.o ipl2tp.o link_vti.o
+    iplink_macvlan.o iplink_macvtap.o ipl2tp.o link_vti.o \
+    iplink_vxlan.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
new file mode 100644
index 0000000..f52eb18
--- /dev/null
+++ b/ip/iplink_vxlan.c
@@ -0,0 +1,230 @@
+/*
+ * iplink_vxlan.c	VXLAN device support
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Authors:     Stephen Hemminger <shemminger@vyatta.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/if.h>
+#include <linux/ip.h>
+#include <linux/if_link.h>
+#include <arpa/inet.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... vxlan id VNI [ group ADDR ] [ local ADDR ]\n");
+	fprintf(stderr, "                 [ ttl TTL ] [ tos TOS ] [ [no]learning ] [ dev PHYS_DEV ]\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where: VNI := 0-16777215\n");
+	fprintf(stderr, "       ADDR := { IP_ADDRESS | any }\n");
+	fprintf(stderr, "       TOS  := { NUMBER | inherit }\n");
+	fprintf(stderr, "       TTL  := { 1..255 | inherit }\n");
+}
+
+static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv,
+			  struct nlmsghdr *n)
+{
+	__u32 vni = 0;
+	int vni_set = 0;
+	__u32 saddr = 0;
+	__u32 gaddr = 0;
+	unsigned link = 0;
+	__u8 tos = 0;
+	__u8 ttl = 0;
+	__u8 learning = 1;
+	__u8 noage = 0;
+	__u32 age = 0;
+	__u32 maxaddr = 0;
+
+	while (argc > 0) {
+		if (!matches(*argv, "id") ||
+		    !matches(*argv, "vni")) {
+			NEXT_ARG();
+			if (get_u32(&vni, *argv, 0) ||
+			    vni >= 1u << 24)
+				invarg("invalid id", *argv);
+			vni_set = 1;
+		} else if (!matches(*argv, "group")) {
+			NEXT_ARG();
+			gaddr = get_addr32(*argv);
+
+			if (!IN_MULTICAST(ntohl(gaddr)))
+				invarg("invald group address", *argv);
+		} else if (!matches(*argv, "local")) {
+			NEXT_ARG();
+			if (strcmp(*argv, "any"))
+				saddr = get_addr32(*argv);
+			if (IN_MULTICAST(ntohl(saddr)))
+				invarg("invalid local address", *argv);
+		} else if (!matches(*argv, "dev")) {
+			NEXT_ARG();
+			link = if_nametoindex(*argv);
+			if (link == 0)
+				exit(-1);
+		} else if (!matches(*argv, "ttl") ||
+			   !matches(*argv, "hoplimit")) {
+			unsigned uval;
+
+			NEXT_ARG();
+			if (strcmp(*argv, "inherit") != 0) {
+				if (get_unsigned(&uval, *argv, 0))
+					invarg("invalid TTL\n", *argv);
+				if (uval > 255)
+					invarg("TTL must be <= 255\n", *argv);
+				ttl = uval;
+			}
+		} else if (!matches(*argv, "tos") ||
+			   !matches(*argv, "dsfield")) {
+			__u32 uval;
+
+			NEXT_ARG();
+			if (strcmp(*argv, "inherit") != 0) {
+				if (rtnl_dsfield_a2n(&uval, *argv))
+					invarg("bad TOS value", *argv);
+				tos = uval;
+			} else
+				tos = 1;
+		} else if (!matches(*argv, "ageing")) {
+			NEXT_ARG();
+			if (strcmp(*argv, "none") == 0)
+				noage = 1;
+			else if (get_u32(&age, *argv, 0))
+				invarg("ageing timer\n", *argv);
+		} else if (!matches(*argv, "maxaddress")) {
+			NEXT_ARG();
+			if (strcmp(*argv, "unlimited") == 0)
+				maxaddr = 0;
+			else if (get_u32(&maxaddr, *argv, 0))
+				invarg("max addresses\n", *argv);
+		} else if (!matches(*argv, "nolearning")) {
+			learning = 0;
+		} else if (!matches(*argv, "learning")) {
+			learning = 1;
+		} else if (matches(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "vxlan: what is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--, argv++;
+	}
+
+	if (!vni_set) {
+		fprintf(stderr, "vxlan: missing virtual network identifier\n");
+		return -1;
+	}
+	addattr32(n, 1024, IFLA_VXLAN_ID, vni);
+	addattr_l(n, 1024, IFLA_VXLAN_GROUP, &gaddr, 4);
+	addattr_l(n, 1024, IFLA_VXLAN_LOCAL, &saddr, 4);
+	if (link)
+		addattr32(n, 1024, IFLA_VXLAN_LINK, link);
+	addattr8(n, 1024, IFLA_VXLAN_TTL, ttl);
+	addattr8(n, 1024, IFLA_VXLAN_TOS, tos);
+	addattr8(n, 1024, IFLA_VXLAN_LEARNING, learning);
+	if (noage)
+		addattr32(n, 1024, IFLA_VXLAN_AGEING, 0);
+	else if (age)
+		addattr32(n, 1024, IFLA_VXLAN_AGEING, age);
+	if (maxaddr)
+		addattr32(n, 1024, IFLA_VXLAN_LIMIT, maxaddr);
+
+	return 0;
+}
+
+static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
+{
+	__u32 vni;
+	unsigned link;
+	char s1[1024];
+	char s2[64];
+
+	if (!tb)
+		return;
+
+	if (!tb[IFLA_VXLAN_ID] ||
+	    RTA_PAYLOAD(tb[IFLA_VXLAN_ID]) < sizeof(__u32))
+		return;
+
+	vni = rta_getattr_u32(tb[IFLA_VXLAN_ID]);
+	fprintf(f, "id %u ", vni);
+
+	if (tb[IFLA_VXLAN_GROUP]) {
+		__u32 addr = rta_getattr_u32(tb[IFLA_VXLAN_GROUP]);
+
+		if (addr)
+			fprintf(f, "group %s ",
+				format_host(AF_INET, 4, &addr, s1, sizeof(s1)));
+	}
+
+	if (tb[IFLA_VXLAN_LOCAL]) {
+		unsigned addr = rta_getattr_u32(tb[IFLA_VXLAN_LOCAL]);
+
+		if (addr)
+			fprintf(f, "local %s ", 
+				format_host(AF_INET, 4, &addr, s1, sizeof(s1)));
+	}
+
+	if (tb[IFLA_VXLAN_LINK] &&
+	    (link = rta_getattr_u32(tb[IFLA_VXLAN_LINK]))) {
+		const char *n = if_indextoname(link, s2);
+
+		if (n)
+			fprintf(f, "dev %s ", n);
+		else
+			fprintf(f, "dev %u ", link);
+	}
+
+	if (tb[IFLA_VXLAN_LEARNING] &&
+	    !rta_getattr_u8(tb[IFLA_VXLAN_LEARNING]))
+		fputs("nolearning ", f);
+
+	if (tb[IFLA_VXLAN_TOS]) {
+		__u8 tos = rta_getattr_u8(tb[IFLA_VXLAN_TOS]);
+
+		if (tos == 1)
+			fprintf(f, "tos inherit ");
+		else
+			fprintf(f, "tos %#x ", tos);
+	}
+
+	if (tb[IFLA_VXLAN_TTL]) {
+		__u8 ttl = rta_getattr_u8(tb[IFLA_VXLAN_TTL]);
+		if (ttl)
+			fprintf(f, "ttl %d ", ttl);
+	}
+
+	if (tb[IFLA_VXLAN_AGEING]) {
+		__u32 age = rta_getattr_u32(tb[IFLA_VXLAN_AGEING]);
+		if (age == 0)
+			fprintf(f, "ageing none ");
+		else
+			fprintf(f, "ageing %u ", age);
+	}
+	if (tb[IFLA_VXLAN_LIMIT]) {
+		__u32 maxaddr = rta_getattr_u32(tb[IFLA_VXLAN_LIMIT]);
+		if (maxaddr == 0)
+			fprintf(f, "maxaddr unlimited ");
+		else
+			fprintf(f, "maxaddr %u ", maxaddr);
+	}
+}
+
+struct link_util vxlan_link_util = {
+	.id		= "vxlan",
+	.maxattr	= IFLA_VXLAN_MAX,
+	.parse_opt	= vxlan_parse_opt,
+	.print_opt	= vxlan_print_opt,
+};
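
For reference, an invocation exercising the options parsed above might
look like this (the device names and addresses are made up for the
example; "id" is the only mandatory option):

    ip link add vxlan0 type vxlan id 42 group 239.1.1.1 \
            local 10.0.0.4 dev eth0 ttl 16 ageing 300 maxaddress 512
    ip link add vxlan1 type vxlan id 7 nolearning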
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [RFC] gre: conform to RFC6040 ECN propagation
  2012-10-01 17:13                 ` Eric Dumazet
@ 2012-10-01 21:21                   ` Stephen Hemminger
  0 siblings, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-10-01 21:21 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Ben Hutchings, Chris Wright, David Miller, netdev

On Mon, 01 Oct 2012 19:13:47 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Mon, 2012-10-01 at 17:49 +0100, Ben Hutchings wrote:
> 
> > I don't think rx_dropped is appropriate for counting invalid packets,
> > but maybe actual practice is already different.
> > 
> > As for whether packets counted in rx_dropped should also be counted in
> > rx_packets/rx_bytes, I really don't know.  The current comments on
> > rtnl_link_stats (inherited from net_device_stats) are totally inadequate
> > as a specification.
> 
> rx_dropped is used by core network stack, not the devices themselves.
> 
> So a packet is first accounted in rx_bytes/rx_packets by the driver,
> and if net/core/dev.c drops it, rx_dropped is incremented as well.

The tunnel drivers are consistent in accounting any packet dropped
because of a protocol problem in an rx_XXX_errors counter and
incrementing rx_errors.

I just made it treat ECN bit breakage like all the other protocol errors.
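
For illustration, a minimal sketch of that pattern, assuming a
representative tunnel driver (the function name and the choice of
rx_frame_errors are hypothetical; the fields are the standard ones from
struct net_device_stats):

	#include <linux/netdevice.h>

	/* Count a packet rejected for a protocol problem (e.g. broken
	 * ECN bits): bump a specific rx_*_errors counter plus
	 * rx_errors, and leave rx_dropped to the core stack.
	 */
	static void tunnel_count_rx_error(struct net_device *dev)
	{
		struct net_device_stats *stats = &dev->stats;

		stats->rx_frame_errors++;
		stats->rx_errors++;
	}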

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv6 net-next] vxlan: virtual extensible lan
  2012-10-01 20:57                 ` [PATCHv6 " Stephen Hemminger
@ 2012-10-01 22:07                   ` David Miller
  2012-10-01 22:23                     ` Stephen Hemminger
  2012-10-01 22:30                     ` Stephen Hemminger
  0 siblings, 2 replies; 43+ messages in thread
From: David Miller @ 2012-10-01 22:07 UTC (permalink / raw)
  To: shemminger; +Cc: jesse, chrisw, netdev

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 1 Oct 2012 13:57:19 -0700

> This is an implementation of Virtual eXtensible Local Area Network
> as described in draft RFC:
>   http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
> 
> The driver integrates a Virtual Tunnel Endpoint (VTEP) functionality
> that learns MAC to IP address mapping. 
> 
> This implementation has only been tested with the user-mode TAP
> based version for Linux, not against other vendors (yet).
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

It doesn't build.

And I'm not telling you what the build error is, you'll have to do an
allmodconfig build yourself to see it.

I want you to get into the habit of doing an allmodconfig build to
validate your changes because that's the very first thing I'm going to
do.
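
(For reference, that validation amounts to running, from the top of a
clean tree:

    make allmodconfig
    make

and fixing whatever the full-module build reports.)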

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv6 net-next] vxlan: virtual extensible lan
  2012-10-01 22:07                   ` David Miller
@ 2012-10-01 22:23                     ` Stephen Hemminger
  2012-10-01 22:30                     ` Stephen Hemminger
  1 sibling, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2012-10-01 22:23 UTC (permalink / raw)
  To: David Miller; +Cc: jesse, chrisw, netdev

On Mon, 01 Oct 2012 18:07:12 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Mon, 1 Oct 2012 13:57:19 -0700
> 
> > This is an implementation of Virtual eXtensible Local Area Network
> > as described in draft RFC:
> >   http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
> > 
> > The driver integrates a Virtual Tunnel Endpoint (VTEP) functionality
> > that learns MAC to IP address mapping. 
> > 
> > This implementation has only been tested with the user-mode TAP
> > based version for Linux, not against other vendors (yet).
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> It doesn't build.
> 
> And I'm not telling you what the build error is, you'll have to do an
> allmodconfig build yourself to see it.
> 
> I want you to get into the habit of doing an allmodconfig build to
> validate your changes because that's the very first thing I'm going to
> do.

Thanks, still waiting for build to finish.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv6 net-next] vxlan: virtual extensible lan
  2012-10-01 22:07                   ` David Miller
  2012-10-01 22:23                     ` Stephen Hemminger
@ 2012-10-01 22:30                     ` Stephen Hemminger
  2012-10-01 22:34                       ` David Miller
  1 sibling, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2012-10-01 22:30 UTC (permalink / raw)
  To: David Miller; +Cc: jesse, chrisw, netdev

On Mon, 01 Oct 2012 18:07:12 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Mon, 1 Oct 2012 13:57:19 -0700
> 
> > This is an implementation of Virtual eXtensible Local Area Network
> > as described in draft RFC:
> >   http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
> > 
> > The driver integrates a Virtual Tunnel Endpoint (VTEP) functionality
> > that learns MAC to IP address mapping. 
> > 
> > This implementation has only been tested with the user-mode TAP
> > based version for Linux, not against other vendors (yet).
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> It doesn't build.
> 
> And I'm not telling you what the build error is, you'll have to do an
> allmodconfig build yourself to see it.
> 
> I want you to get into the habit of doing an allmodconfig build to
> validate your changes because that's the very first thing I'm going to
> do.

Dave, did you remember to include the two precursor patches?

VXLAN was originally submitted as a three-part series, and only the
last patch ever changed.

 [PATCH net-next 1/3] netlink: add attributes to fdb interface
 [PATCH net-next 2/3] igmp: export symbol ip_mc_leave_group

'make allmodconfig' works for me (x86-64).

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCHv6 net-next] vxlan: virtual extensible lan
  2012-10-01 22:30                     ` Stephen Hemminger
@ 2012-10-01 22:34                       ` David Miller
  0 siblings, 0 replies; 43+ messages in thread
From: David Miller @ 2012-10-01 22:34 UTC (permalink / raw)
  To: shemminger; +Cc: jesse, chrisw, netdev

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 1 Oct 2012 15:30:14 -0700

> On Mon, 01 Oct 2012 18:07:12 -0400 (EDT)
> David Miller <davem@davemloft.net> wrote:
> 
>> From: Stephen Hemminger <shemminger@vyatta.com>
>> Date: Mon, 1 Oct 2012 13:57:19 -0700
>> 
>> > This is an implementation of Virtual eXtensible Local Area Network
>> > as described in draft RFC:
>> >   http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
>> > 
>> > The driver integrates a Virtual Tunnel Endpoint (VTEP) functionality
>> > that learns MAC to IP address mapping. 
>> > 
>> > This implementation has only been tested with the user-mode TAP
>> > based version for Linux, not against other vendors (yet).
>> > 
>> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>> 
>> It doesn't build.
>> 
>> And I'm not telling you what the build error is, you'll have to do an
>> allmodconfig build yourself to see it.
>> 
>> I want you to get into the habit of doing an allmodconfig build to
>> validate your changes because that's the very first thing I'm going to
>> do.
> 
> Dave, did you remember to include the two precursor patches?
> 
> VXLAN was originally submitted as a three-part series, and only the
> last patch ever changed.
> 
>  [PATCH net-next 1/3] netlink: add attributes to fdb interface
>  [PATCH net-next 2/3] igmp: export symbol ip_mc_leave_group
> 
> 'make allmodconfig' works for me (x86-64).

You MUST always resubmit the entire series when you resubmit one
patch.

Furthermore, you confused the situation even more by not including
the patch number information in your Subject line.

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2012-10-01 22:34 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-24 18:43 [PATCH net-next 0/3] VXLAN driver Stephen Hemminger
2012-09-24 18:43 ` [PATCH net-next 1/3] netlink: add attributes to fdb interface Stephen Hemminger
2012-09-24 18:43 ` [PATCH net-next 2/3] igmp: export symbol ip_mc_leave_group Stephen Hemminger
2012-09-24 18:43 ` [PATCH net-next 3/3] vxlan: virtual extensible lan Stephen Hemminger
2012-09-24 19:33   ` Eric Dumazet
2012-09-24 19:39   ` Eric Dumazet
2012-09-24 19:46     ` [PATCHv2 " Stephen Hemminger
2012-09-24 19:55       ` Eric Dumazet
2012-09-24 20:02         ` [PATCHv3 " Stephen Hemminger
2012-09-24 20:24           ` John Fastabend
2012-09-24 20:27             ` Stephen Hemminger
2012-09-24 23:17               ` John Fastabend
2012-09-24 20:09       ` [PATCHv2 " Eric Dumazet
2012-09-24 20:26         ` Stephen Hemminger
2012-09-24 20:41           ` Eric Dumazet
2012-09-24 20:58   ` [PATCH " Chris Wright
2012-09-24 21:11     ` Stephen Hemminger
2012-09-24 21:22       ` Chris Wright
2012-09-24 21:44         ` [RFC] gre: conform to RFC6040 ECN propagation Stephen Hemminger
2012-09-24 22:25           ` Eric Dumazet
2012-09-24 22:30             ` Stephen Hemminger
2012-09-25  5:17               ` Eric Dumazet
2012-10-01 15:55           ` Ben Hutchings
2012-10-01 15:56             ` Stephen Hemminger
2012-10-01 16:49               ` Ben Hutchings
2012-10-01 17:13                 ` Eric Dumazet
2012-10-01 21:21                   ` Stephen Hemminger
2012-09-24 21:50     ` [PATCHv4 net-next] vxlan: virtual extensible lan Stephen Hemminger
2012-09-25 21:55       ` Jesse Gross
2012-09-25 22:03         ` Stephen Hemminger
2012-09-25 22:09         ` [PATCHv5 " Stephen Hemminger
2012-09-27 22:47           ` David Miller
2012-09-27 23:00             ` Stephen Hemminger
2012-09-27 23:12               ` David Miller
2012-10-01 20:57                 ` [PATCHv6 " Stephen Hemminger
2012-10-01 22:07                   ` David Miller
2012-10-01 22:23                     ` Stephen Hemminger
2012-10-01 22:30                     ` Stephen Hemminger
2012-10-01 22:34                       ` David Miller
     [not found]                 ` <20121001140206.2bbf9c41@nehalam.linuxnetplumber.net>
2012-10-01 21:02                   ` [PATCH 2/2] iproute2: manage VXLAN forwarding entries Stephen Hemminger
2012-10-01 21:02                 ` [PATCH 1/2] iproute2: vxlan support Stephen Hemminger
2012-09-26  4:36         ` [PATCHv4 net-next] vxlan: virtual extensible lan Stephen Hemminger
2012-09-27 17:20           ` Jesse Gross
