netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [Patch net-next v1 1/4] vxlan: defer vxlan init as late as possible
@ 2013-03-31  5:43 Cong Wang
  2013-03-31  5:43 ` [Patch net-next v1 2/4] ipv6: export ipv6_sock_mc_join and ipv6_sock_mc_drop Cong Wang
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Cong Wang @ 2013-03-31  5:43 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, David S. Miller, Cong Wang

From: Cong Wang <amwang@redhat.com>

When vxlan is compiled as a builtin, its init code
runs before IPv6 init. This could cause problems
when we create an IPv6 socket in a later patch.

Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 drivers/net/vxlan.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 62a4438..cac4e4f 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1619,7 +1619,7 @@ out2:
 out1:
 	return rc;
 }
-module_init(vxlan_init_module);
+late_initcall(vxlan_init_module);
 
 static void __exit vxlan_cleanup_module(void)
 {
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [Patch net-next v1 2/4] ipv6: export ipv6_sock_mc_join and ipv6_sock_mc_drop
  2013-03-31  5:43 [Patch net-next v1 1/4] vxlan: defer vxlan init as late as possible Cong Wang
@ 2013-03-31  5:43 ` Cong Wang
  2013-03-31  5:43 ` [Patch net-next v1 3/4] vxlan: add ipv6 support Cong Wang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2013-03-31  5:43 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, David S. Miller, Cong Wang

From: Cong Wang <amwang@redhat.com>

They will be used by the vxlan module.

Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 net/ipv6/mcast.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index bfa6cc3..d03426d 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -200,6 +200,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 
 	return 0;
 }
+EXPORT_SYMBOL(ipv6_sock_mc_join);
 
 /*
  *	socket leave on multicast group
@@ -246,6 +247,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
 
 	return -EADDRNOTAVAIL;
 }
+EXPORT_SYMBOL(ipv6_sock_mc_drop);
 
 /* called with rcu_read_lock() */
 static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-03-31  5:43 [Patch net-next v1 1/4] vxlan: defer vxlan init as late as possible Cong Wang
  2013-03-31  5:43 ` [Patch net-next v1 2/4] ipv6: export ipv6_sock_mc_join and ipv6_sock_mc_drop Cong Wang
@ 2013-03-31  5:43 ` Cong Wang
  2013-04-01 15:19   ` David Stevens
  2013-04-01 20:14   ` Stephen Hemminger
  2013-03-31  5:43 ` [Patch net-next v1 4/4] ipv6: Add generic UDP Tunnel segmentation Cong Wang
  2013-03-31  6:17 ` [PATCH iproute2] vxlan: add ipv6 support Cong Wang
  3 siblings, 2 replies; 16+ messages in thread
From: Cong Wang @ 2013-03-31  5:43 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, David S. Miller, Cong Wang

From: Cong Wang <amwang@redhat.com>

This patch adds IPv6 support to the vxlan device, as the new version
of the RFC draft already mentions it:

   http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-03

(I am not sure whether I compute the checksum correctly, but at least it works.)

Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 drivers/net/vxlan.c          |  553 ++++++++++++++++++++++++++++++++---------
 include/uapi/linux/if_link.h |    2 +
 2 files changed, 433 insertions(+), 122 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index cac4e4f..8923593 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -9,7 +9,6 @@
  *
  * TODO
  *  - use IANA UDP port number (when defined)
- *  - IPv6 (not in RFC)
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -42,6 +41,11 @@
 #include <net/inet_ecn.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/ip6_tunnel.h>
+#endif
 
 #define VXLAN_VERSION	"0.1"
 
@@ -56,6 +60,7 @@
 #define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
 /* IP header + UDP + VXLAN + Ethernet header */
 #define VXLAN_HEADROOM (20 + 8 + 8 + 14)
+#define VXLAN6_HEADROOM (40 + 8 + 8 + 14)
 
 #define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
 
@@ -81,9 +86,19 @@ struct vxlan_net {
 	struct hlist_head vni_list[VNI_HASH_SIZE];
 };
 
+struct vxlan_ip {
+	union {
+		__be32  ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+		struct in6_addr ip6;
+#endif
+	};
+	__be16          proto;
+};
+
 struct vxlan_rdst {
 	struct rcu_head		 rcu;
-	__be32			 remote_ip;
+	struct vxlan_ip		 remote_ip;
 	__be16			 remote_port;
 	u32			 remote_vni;
 	u32			 remote_ifindex;
@@ -106,8 +121,8 @@ struct vxlan_dev {
 	struct hlist_node hlist;
 	struct net_device *dev;
 	__u32		  vni;		/* virtual network id */
-	__be32	          gaddr;	/* multicast group */
-	__be32		  saddr;	/* source address */
+	struct vxlan_ip	  gaddr;	/* multicast group */
+	struct vxlan_ip	  saddr;	/* source address */
 	unsigned int      link;		/* link to multicast over */
 	__u16		  port_min;	/* source port range */
 	__u16		  port_max;
@@ -130,6 +145,79 @@ struct vxlan_dev {
 #define VXLAN_F_L2MISS	0x08
 #define VXLAN_F_L3MISS	0x10
 
+#if IS_ENABLED(CONFIG_IPV6)
+static inline bool vxlan_ip_equal(const struct vxlan_ip *a, const struct vxlan_ip *b)
+{
+	if (a->proto != b->proto)
+		return false;
+	switch (a->proto) {
+	case htons(ETH_P_IP):
+		return a->ip4 == b->ip4;
+	case htons(ETH_P_IPV6):
+		return ipv6_addr_equal(&a->ip6, &b->ip6);
+	}
+	return false;
+}
+
+static inline bool vxlan_ip_any(const struct vxlan_ip *ipa)
+{
+	if (ipa->proto == htons(ETH_P_IP))
+		return ipa->ip4 == htonl(INADDR_ANY);
+	else
+		return ipv6_addr_any(&ipa->ip6);
+}
+
+static int vxlan_nla_get_addr(struct vxlan_ip *ip, struct nlattr *nla)
+{
+	if (nla_len(nla) == sizeof(__be32)) {
+		ip->ip4 = nla_get_be32(nla);
+		ip->proto = htons(ETH_P_IP);
+	} else if (nla_len(nla) == sizeof(struct in6_addr)) {
+		nla_memcpy(&ip->ip6, nla, sizeof(struct in6_addr));
+		ip->proto = htons(ETH_P_IPV6);
+	} else
+		return -EAFNOSUPPORT;
+	return 0;
+}
+
+static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, const struct vxlan_ip *ip)
+{
+	if (ip->proto == htons(ETH_P_IP))
+		return nla_put_be32(skb, attr, ip->ip4);
+	else if (ip->proto == htons(ETH_P_IPV6))
+		return nla_put(skb, attr, sizeof(struct in6_addr), &ip->ip6);
+	else
+		return -EAFNOSUPPORT;
+}
+#else
+static inline bool vxlan_ip_equal(const struct vxlan_ip *a, const struct vxlan_ip *b)
+{
+	return a->ip4 == b->ip4;
+}
+
+static inline bool vxlan_ip_any(const struct vxlan_ip *ipa)
+{
+	return ipa->ip4 == htonl(INADDR_ANY);
+}
+
+static int vxlan_nla_get_addr(struct vxlan_ip *ip, struct nlattr *nla)
+{
+	if (nla_len(nla) == sizeof(__be32)) {
+		ip->ip4 = nla_get_be32(nla);
+		ip->proto = htons(ETH_P_IP);
+	} else
+		return -EAFNOSUPPORT;
+}
+
+static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, const struct vxlan_ip *ip)
+{
+	if (ip->proto == htons(ETH_P_IP))
+		return nla_put_be32(skb, attr, ip->ip4);
+	else
+		return -EAFNOSUPPORT;
+}
+#endif
+
 /* salt for hash table */
 static u32 vxlan_salt __read_mostly;
 
@@ -176,7 +264,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 
 	if (type == RTM_GETNEIGH) {
 		ndm->ndm_family	= AF_INET;
-		send_ip = rdst->remote_ip != htonl(INADDR_ANY);
+		send_ip = !vxlan_ip_any(&rdst->remote_ip);
 		send_eth = !is_zero_ether_addr(fdb->eth_addr);
 	} else
 		ndm->ndm_family	= AF_BRIDGE;
@@ -188,7 +276,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
 		goto nla_put_failure;
 
-	if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip))
+	if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
 		goto nla_put_failure;
 
 	if (rdst->remote_port && rdst->remote_port != vxlan_port &&
@@ -220,7 +308,7 @@ static inline size_t vxlan_nlmsg_size(void)
 {
 	return NLMSG_ALIGN(sizeof(struct ndmsg))
 		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
-		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
 		+ nla_total_size(sizeof(__be32)) /* NDA_PORT */
 		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
 		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
@@ -253,14 +341,14 @@ errout:
 		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 }
 
-static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
+static void vxlan_ip_miss(struct net_device *dev, struct vxlan_ip *ipa)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb f;
 
 	memset(&f, 0, sizeof f);
 	f.state = NUD_STALE;
-	f.remote.remote_ip = ipa; /* goes to NDA_DST */
+	f.remote.remote_ip = *ipa; /* goes to NDA_DST */
 	f.remote.remote_vni = VXLAN_N_VID;
 
 	vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
@@ -316,13 +404,13 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
 
 /* Add/update destinations for multicast */
 static int vxlan_fdb_append(struct vxlan_fdb *f,
-			    __be32 ip, __u32 port, __u32 vni, __u32 ifindex)
+			    struct vxlan_ip *ip, __u32 port, __u32 vni, __u32 ifindex)
 {
 	struct vxlan_rdst *rd_prev, *rd;
 
 	rd_prev = NULL;
 	for (rd = &f->remote; rd; rd = rd->remote_next) {
-		if (rd->remote_ip == ip &&
+		if (vxlan_ip_equal(&rd->remote_ip, ip) &&
 		    rd->remote_port == port &&
 		    rd->remote_vni == vni &&
 		    rd->remote_ifindex == ifindex)
@@ -332,7 +420,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
 	if (rd == NULL)
 		return -ENOBUFS;
-	rd->remote_ip = ip;
+	rd->remote_ip = *ip;
 	rd->remote_port = port;
 	rd->remote_vni = vni;
 	rd->remote_ifindex = ifindex;
@@ -343,7 +431,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 
 /* Add new entry to forwarding table -- assumes lock held */
 static int vxlan_fdb_create(struct vxlan_dev *vxlan,
-			    const u8 *mac, __be32 ip,
+			    const u8 *mac, struct vxlan_ip *ip,
 			    __u16 state, __u16 flags,
 			    __u32 port, __u32 vni, __u32 ifindex)
 {
@@ -383,7 +471,7 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 			return -ENOMEM;
 
 		notify = 1;
-		f->remote.remote_ip = ip;
+		f->remote.remote_ip = *ip;
 		f->remote.remote_port = port;
 		f->remote.remote_vni = vni;
 		f->remote.remote_ifindex = ifindex;
@@ -435,7 +523,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct net *net = dev_net(vxlan->dev);
-	__be32 ip;
+	struct vxlan_ip ip;
 	u32 port, vni, ifindex;
 	int err;
 
@@ -448,10 +536,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	if (tb[NDA_DST] == NULL)
 		return -EINVAL;
 
-	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
-		return -EAFNOSUPPORT;
-
-	ip = nla_get_be32(tb[NDA_DST]);
+	err = vxlan_nla_get_addr(&ip, tb[NDA_DST]);
+	if (err)
+		return err;
 
 	if (tb[NDA_PORT]) {
 		if (nla_len(tb[NDA_PORT]) != sizeof(u32))
@@ -481,7 +568,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 		ifindex = 0;
 
 	spin_lock_bh(&vxlan->hash_lock);
-	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags, port,
+	err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags, port,
 		vni, ifindex);
 	spin_unlock_bh(&vxlan->hash_lock);
 
@@ -545,7 +632,7 @@ skip:
  * and Tunnel endpoint.
  */
 static void vxlan_snoop(struct net_device *dev,
-			__be32 src_ip, const u8 *src_mac)
+			struct vxlan_ip *src_ip, const u8 *src_mac)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
@@ -554,7 +641,7 @@ static void vxlan_snoop(struct net_device *dev,
 	f = vxlan_find_mac(vxlan, src_mac);
 	if (likely(f)) {
 		f->used = jiffies;
-		if (likely(f->remote.remote_ip == src_ip))
+		if (likely(vxlan_ip_equal(&f->remote.remote_ip, src_ip)))
 			return;
 
 		if (net_ratelimit())
@@ -562,7 +649,7 @@ static void vxlan_snoop(struct net_device *dev,
 				    "%pM migrated from %pI4 to %pI4\n",
 				    src_mac, &f->remote.remote_ip, &src_ip);
 
-		f->remote.remote_ip = src_ip;
+		f->remote.remote_ip = *src_ip;
 		f->updated = jiffies;
 	} else {
 		/* learned new entry */
@@ -591,7 +678,7 @@ static bool vxlan_group_used(struct vxlan_net *vn,
 			if (!netif_running(vxlan->dev))
 				continue;
 
-			if (vxlan->gaddr == this->gaddr)
+			if (vxlan_ip_equal(&vxlan->gaddr, &this->gaddr))
 				return true;
 		}
 
@@ -605,7 +692,7 @@ static int vxlan_join_group(struct net_device *dev)
 	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 	struct sock *sk = vn->sock->sk;
 	struct ip_mreqn mreq = {
-		.imr_multiaddr.s_addr	= vxlan->gaddr,
+		.imr_multiaddr.s_addr	= vxlan->gaddr.ip4,
 		.imr_ifindex		= vxlan->link,
 	};
 	int err;
@@ -616,8 +703,15 @@ static int vxlan_join_group(struct net_device *dev)
 
 	/* Need to drop RTNL to call multicast join */
 	rtnl_unlock();
-	lock_sock(sk);
-	err = ip_mc_join_group(sk, &mreq);
+	if (vxlan->gaddr.proto == htons(ETH_P_IP)) {
+		lock_sock(sk);
+		err = ip_mc_join_group(sk, &mreq);
+	} else {
+#if IS_ENABLED(CONFIG_IPV6)
+		lock_sock(sk);
+		err = ipv6_sock_mc_join(sk, vxlan->link, &vxlan->gaddr.ip6);
+#endif
+	}
 	release_sock(sk);
 	rtnl_lock();
 
@@ -633,7 +727,7 @@ static int vxlan_leave_group(struct net_device *dev)
 	int err = 0;
 	struct sock *sk = vn->sock->sk;
 	struct ip_mreqn mreq = {
-		.imr_multiaddr.s_addr	= vxlan->gaddr,
+		.imr_multiaddr.s_addr	= vxlan->gaddr.ip4,
 		.imr_ifindex		= vxlan->link,
 	};
 
@@ -643,8 +737,15 @@ static int vxlan_leave_group(struct net_device *dev)
 
 	/* Need to drop RTNL to call multicast leave */
 	rtnl_unlock();
-	lock_sock(sk);
-	err = ip_mc_leave_group(sk, &mreq);
+	if (vxlan->gaddr.proto == htons(ETH_P_IP)) {
+		lock_sock(sk);
+		err = ip_mc_leave_group(sk, &mreq);
+	} else {
+#if IS_ENABLED(CONFIG_IPV6)
+		lock_sock(sk);
+		err = ipv6_sock_mc_drop(sk, vxlan->link, &vxlan->gaddr.ip6);
+#endif
+	}
 	release_sock(sk);
 	rtnl_lock();
 
@@ -654,10 +755,12 @@ static int vxlan_leave_group(struct net_device *dev)
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
-	struct iphdr *oip;
+	struct iphdr *oip = NULL;
+	struct ipv6hdr *oip6 = NULL;
 	struct vxlanhdr *vxh;
 	struct vxlan_dev *vxlan;
 	struct pcpu_tstats *stats;
+	struct vxlan_ip src_ip;
 	__u32 vni;
 	int err;
 
@@ -696,7 +799,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	skb_reset_mac_header(skb);
 
 	/* Re-examine inner Ethernet packet */
-	oip = ip_hdr(skb);
+	if (skb->protocol == htons(ETH_P_IP))
+		oip = ip_hdr(skb);
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		oip6 = ipv6_hdr(skb);
+#endif
+
 	skb->protocol = eth_type_trans(skb, vxlan->dev);
 
 	/* Ignore packet loops (and multicast echo) */
@@ -704,8 +813,19 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 			       vxlan->dev->dev_addr) == 0)
 		goto drop;
 
-	if (vxlan->flags & VXLAN_F_LEARN)
-		vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
+	if (vxlan->flags & VXLAN_F_LEARN) {
+		if (oip) {
+			src_ip.ip4 = oip->saddr;
+			src_ip.proto = htons(ETH_P_IP);
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+		if (oip6) {
+			src_ip.ip6 = oip6->saddr;
+			src_ip.proto = ETH_P_IPV6;
+		}
+#endif
+		vxlan_snoop(skb->dev, &src_ip, eth_hdr(skb)->h_source);
+	}
 
 	__skb_tunnel_rx(skb, vxlan->dev);
 	skb_reset_network_header(skb);
@@ -721,15 +841,32 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 
 	skb->encapsulation = 0;
 
-	err = IP_ECN_decapsulate(oip, skb);
-	if (unlikely(err)) {
-		if (log_ecn_error)
-			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
-					     &oip->saddr, oip->tos);
-		if (err > 1) {
-			++vxlan->dev->stats.rx_frame_errors;
-			++vxlan->dev->stats.rx_errors;
-			goto drop;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (oip6) {
+		err = IP6_ECN_decapsulate(oip6, skb);
+		if (unlikely(err)) {
+			if (log_ecn_error)
+				net_info_ratelimited("non-ECT from %pI4\n",
+						     &oip6->saddr);
+			if (err > 1) {
+				++vxlan->dev->stats.rx_frame_errors;
+				++vxlan->dev->stats.rx_errors;
+				goto drop;
+			}
+		}
+	}
+#endif
+	if (oip) {
+		err = IP_ECN_decapsulate(oip, skb);
+		if (unlikely(err)) {
+			if (log_ecn_error)
+				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+						     &oip->saddr, oip->tos);
+			if (err > 1) {
+				++vxlan->dev->stats.rx_frame_errors;
+				++vxlan->dev->stats.rx_errors;
+				goto drop;
+			}
 		}
 	}
 
@@ -760,6 +897,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 	u8 *arpptr, *sha;
 	__be32 sip, tip;
 	struct neighbour *n;
+	struct vxlan_ip ipa;
 
 	if (dev->flags & IFF_NOARP)
 		goto out;
@@ -801,7 +939,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 		}
 
 		f = vxlan_find_mac(vxlan, n->ha);
-		if (f && f->remote.remote_ip == htonl(INADDR_ANY)) {
+		if (f && vxlan_ip_any(&f->remote.remote_ip)) {
 			/* bridge-local neighbor */
 			neigh_release(n);
 			goto out;
@@ -819,8 +957,11 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 
 		if (netif_rx_ni(reply) == NET_RX_DROP)
 			dev->stats.rx_dropped++;
-	} else if (vxlan->flags & VXLAN_F_L3MISS)
-		vxlan_ip_miss(dev, tip);
+	} else if (vxlan->flags & VXLAN_F_L3MISS) {
+		ipa.ip4 = tip;
+		ipa.proto = htons(ETH_P_IP);
+		vxlan_ip_miss(dev, &ipa);
+	}
 out:
 	consume_skb(skb);
 	return NETDEV_TX_OK;
@@ -842,6 +983,14 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 			return false;
 		pip = ip_hdr(skb);
 		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
+		if (!n && vxlan->flags & VXLAN_F_L3MISS) {
+			struct vxlan_ip ipa;
+			ipa.ip4 = pip->daddr;
+			ipa.proto = htons(ETH_P_IP);
+			vxlan_ip_miss(dev, &ipa);
+			return false;
+		}
+
 		break;
 	default:
 		return false;
@@ -858,8 +1007,8 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 		}
 		neigh_release(n);
 		return diff;
-	} else if (vxlan->flags & VXLAN_F_L3MISS)
-		vxlan_ip_miss(dev, pip->daddr);
+	}
+
 	return false;
 }
 
@@ -869,7 +1018,8 @@ static void vxlan_sock_free(struct sk_buff *skb)
 }
 
 /* On transmit, associate with the tunnel socket */
-static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
+static inline void vxlan_set_owner(struct net_device *dev,
+				   struct sk_buff *skb)
 {
 	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 	struct sock *sk = vn->sock->sk;
@@ -917,13 +1067,20 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct rtable *rt;
-	const struct iphdr *old_iph;
+	const struct iphdr *old_iph = NULL;
 	struct iphdr *iph;
+	struct ipv6hdr *ip6h;
 	struct vxlanhdr *vxh;
 	struct udphdr *uh;
 	struct flowi4 fl4;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct flowi6 fl6;
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct sock *sk = vn->sock->sk;
+#endif
 	unsigned int pkt_len = skb->len;
-	__be32 dst;
+	const struct vxlan_ip *dst;
+	struct dst_entry *ndst;
 	__u16 src_port, dst_port;
         u32 vni;
 	__be16 df = 0;
@@ -931,9 +1088,9 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 	dst_port = rdst->remote_port ? rdst->remote_port : vxlan_port;
 	vni = rdst->remote_vni;
-	dst = rdst->remote_ip;
+	dst = &rdst->remote_ip;
 
-	if (!dst) {
+	if (vxlan_ip_any(dst)) {
 		if (did_rsc) {
 			__skb_pull(skb, skb_network_offset(skb));
 			skb->ip_summed = CHECKSUM_NONE;
@@ -961,47 +1118,88 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		skb->encapsulation = 1;
 	}
 
-	/* Need space for new headers (invalidates iph ptr) */
-	if (skb_cow_head(skb, VXLAN_HEADROOM))
-		goto drop;
+	if (dst->proto == htons(ETH_P_IP)) {
+		/* Need space for new headers (invalidates iph ptr) */
+		if (skb_cow_head(skb, VXLAN_HEADROOM))
+			goto drop;
 
-	old_iph = ip_hdr(skb);
+		old_iph = ip_hdr(skb);
+		ttl = vxlan->ttl;
+		if (!ttl && IN_MULTICAST(ntohl(dst->ip4)))
+			ttl = 1;
 
-	ttl = vxlan->ttl;
-	if (!ttl && IN_MULTICAST(ntohl(dst)))
-		ttl = 1;
+		tos = vxlan->tos;
+		if (tos == 1)
+			tos = ip_tunnel_get_dsfield(old_iph, skb);
 
-	tos = vxlan->tos;
-	if (tos == 1)
-		tos = ip_tunnel_get_dsfield(old_iph, skb);
+		src_port = vxlan_src_port(vxlan, skb);
 
-	src_port = vxlan_src_port(vxlan, skb);
+		memset(&fl4, 0, sizeof(fl4));
+		fl4.flowi4_oif = rdst->remote_ifindex;
+		fl4.flowi4_tos = RT_TOS(tos);
+		fl4.daddr = dst->ip4;
+		fl4.saddr = vxlan->saddr.ip4;
 
-	memset(&fl4, 0, sizeof(fl4));
-	fl4.flowi4_oif = rdst->remote_ifindex;
-	fl4.flowi4_tos = RT_TOS(tos);
-	fl4.daddr = dst;
-	fl4.saddr = vxlan->saddr;
+		rt = ip_route_output_key(dev_net(dev), &fl4);
+		if (IS_ERR(rt)) {
+			netdev_dbg(dev, "no route to %pI4\n", &dst->ip4);
+			dev->stats.tx_carrier_errors++;
+			goto tx_error;
+		}
 
-	rt = ip_route_output_key(dev_net(dev), &fl4);
-	if (IS_ERR(rt)) {
-		netdev_dbg(dev, "no route to %pI4\n", &dst);
-		dev->stats.tx_carrier_errors++;
-		goto tx_error;
-	}
+		if (rt->dst.dev == dev) {
+			netdev_dbg(dev, "circular route to %pI4\n", &dst->ip4);
+			ip_rt_put(rt);
+			dev->stats.collisions++;
+			goto tx_error;
+		}
+		ndst = &rt->dst;
+		memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	} else {
+#if IS_ENABLED(CONFIG_IPV6)
+		const struct ipv6hdr *old_iph6;
+
+		/* Need space for new headers (invalidates iph ptr) */
+		if (skb_cow_head(skb, VXLAN6_HEADROOM))
+			goto drop;
 
-	if (rt->dst.dev == dev) {
-		netdev_dbg(dev, "circular route to %pI4\n", &dst);
-		ip_rt_put(rt);
-		dev->stats.collisions++;
-		goto tx_error;
+		old_iph6 = ipv6_hdr(skb);
+		ttl = vxlan->ttl;
+		if (!ttl && ipv6_addr_is_multicast(&dst->ip6))
+			ttl = 1;
+
+		tos = vxlan->tos;
+		if (tos == 1)
+			tos = ipv6_get_dsfield(old_iph6);
+
+		src_port = vxlan_src_port(vxlan, skb);
+
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_oif = vxlan->link;
+		fl6.flowi6_tos = RT_TOS(tos);
+		fl6.daddr = dst->ip6;
+		fl6.saddr = vxlan->saddr.ip6;
+		fl6.flowi6_proto = skb->protocol;
+
+		if (ip6_dst_lookup(sk, &ndst, &fl6)) {
+			netdev_dbg(dev, "no route to %pI6\n", &dst->ip6);
+			dev->stats.tx_carrier_errors++;
+			goto tx_error;
+		}
+
+		if (ndst->dev == dev) {
+			netdev_dbg(dev, "circular route to %pI6\n", &dst->ip6);
+			dst_release(ndst);
+			dev->stats.collisions++;
+			goto tx_error;
+		}
+#endif
 	}
 
-	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 			      IPSKB_REROUTED);
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
+	skb_dst_set(skb, ndst);
 
 	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
 	vxh->vx_flags = htonl(VXLAN_FLAGS);
@@ -1017,27 +1215,55 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	uh->len = htons(skb->len);
 	uh->check = 0;
 
-	__skb_push(skb, sizeof(*iph));
-	skb_reset_network_header(skb);
-	iph		= ip_hdr(skb);
-	iph->version	= 4;
-	iph->ihl	= sizeof(struct iphdr) >> 2;
-	iph->frag_off	= df;
-	iph->protocol	= IPPROTO_UDP;
-	iph->tos	= ip_tunnel_ecn_encap(tos, old_iph, skb);
-	iph->daddr	= dst;
-	iph->saddr	= fl4.saddr;
-	iph->ttl	= ttl ? : ip4_dst_hoplimit(&rt->dst);
-	tunnel_ip_select_ident(skb, old_iph, &rt->dst);
-
-	nf_reset(skb);
+	if (dst->proto == htons(ETH_P_IP)) {
+		__skb_push(skb, sizeof(*iph));
+		skb_reset_network_header(skb);
+		iph		= ip_hdr(skb);
+		iph->version	= 4;
+		iph->ihl	= sizeof(struct iphdr) >> 2;
+		iph->frag_off	= df;
+		iph->protocol	= IPPROTO_UDP;
+		iph->tos	= ip_tunnel_ecn_encap(tos, old_iph, skb);
+		iph->daddr	= dst->ip4;
+		iph->saddr	= fl4.saddr;
+		iph->ttl	= ttl ? : ip4_dst_hoplimit(ndst);
+		tunnel_ip_select_ident(skb, old_iph, ndst);
+	} else {
+#if IS_ENABLED(CONFIG_IPV6)
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			skb->csum_start = skb_transport_header(skb) - skb->head;
+			skb->csum_offset = offsetof(struct udphdr, check);
+		} else
+			uh->check = csum_ipv6_magic(&fl6.saddr, &fl6.daddr,
+						    skb->len, IPPROTO_UDP,
+						    csum_partial(uh, skb->len, 0));
+		__skb_push(skb, sizeof(*ip6h));
+		skb_reset_network_header(skb);
+		ip6h		  = ipv6_hdr(skb);
+		ip6h->version	  = 6;
+		ip6h->priority	  = 0;
+		ip6h->flow_lbl[0] = 0;
+		ip6h->flow_lbl[1] = 0;
+		ip6h->flow_lbl[2] = 0;
+		ip6h->payload_len = htons(skb->len);
+		ip6h->nexthdr     = IPPROTO_UDP;
+		ip6h->hop_limit   = ttl ? : ip6_dst_hoplimit(ndst);
+		ip6h->daddr	  = fl6.daddr;
+		ip6h->saddr	  = fl6.saddr;
+#endif
+	}
 
 	vxlan_set_owner(dev, skb);
 
 	if (handle_offloads(skb))
 		goto drop;
 
-	iptunnel_xmit(skb, dev);
+	if (dst->proto == htons(ETH_P_IP))
+		iptunnel_xmit(skb, dev);
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		ip6tunnel_xmit(skb, dev);
+#endif
 	return NETDEV_TX_OK;
 
 drop:
@@ -1084,7 +1310,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 		group.remote_next = 0;
 		rdst0 = &group;
 
-		if (group.remote_ip == htonl(INADDR_ANY) &&
+		if (vxlan_ip_any(&group.remote_ip) &&
 		    (vxlan->flags & VXLAN_F_L2MISS) &&
 		    !is_multicast_ether_addr(eth->h_dest))
 			vxlan_fdb_miss(vxlan, eth->h_dest);
@@ -1162,7 +1388,7 @@ static int vxlan_open(struct net_device *dev)
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	int err;
 
-	if (vxlan->gaddr) {
+	if (!vxlan_ip_any(&vxlan->gaddr)) {
 		err = vxlan_join_group(dev);
 		if (err)
 			return err;
@@ -1196,7 +1422,7 @@ static int vxlan_stop(struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 
-	if (vxlan->gaddr)
+	if (!vxlan_ip_any(&vxlan->gaddr))
 		vxlan_leave_group(dev);
 
 	del_timer_sync(&vxlan->age_timer);
@@ -1246,7 +1472,10 @@ static void vxlan_setup(struct net_device *dev)
 
 	eth_hw_addr_random(dev);
 	ether_setup(dev);
-	dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
+	if (vxlan->gaddr.proto == htons(ETH_P_IP))
+		dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
+	else
+		dev->hard_header_len = ETH_HLEN + VXLAN6_HEADROOM;
 
 	dev->netdev_ops = &vxlan_netdev_ops;
 	dev->destructor = vxlan_free;
@@ -1283,8 +1512,10 @@ static void vxlan_setup(struct net_device *dev)
 static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
 	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
 	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
 	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
 	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
@@ -1326,6 +1557,13 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
 			pr_debug("group address is not IPv4 multicast\n");
 			return -EADDRNOTAVAIL;
 		}
+	} else if (data[IFLA_VXLAN_GROUP6]) {
+		struct in6_addr gaddr;
+		nla_memcpy(&gaddr, data[IFLA_VXLAN_GROUP6], sizeof(struct in6_addr));
+		if (!ipv6_addr_is_multicast(&gaddr)) {
+			pr_debug("group address is not IPv6 multicast\n");
+			return -EADDRNOTAVAIL;
+		}
 	}
 
 	if (data[IFLA_VXLAN_PORT_RANGE]) {
@@ -1371,11 +1609,21 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 	}
 	vxlan->vni = vni;
 
-	if (data[IFLA_VXLAN_GROUP])
-		vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+	if (data[IFLA_VXLAN_GROUP]) {
+		vxlan->gaddr.ip4 = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		vxlan->gaddr.proto = htons(ETH_P_IP);
+	} else if (data[IFLA_VXLAN_GROUP6]) {
+		nla_memcpy(&vxlan->gaddr.ip6, data[IFLA_VXLAN_GROUP6], sizeof(struct in6_addr));
+		vxlan->gaddr.proto = htons(ETH_P_IPV6);
+	}
 
-	if (data[IFLA_VXLAN_LOCAL])
-		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+	if (data[IFLA_VXLAN_LOCAL]) {
+		vxlan->saddr.ip4 = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		vxlan->saddr.proto = htons(ETH_P_IP);
+	} else if (data[IFLA_VXLAN_LOCAL6]) {
+		nla_memcpy(&vxlan->saddr.ip6, data[IFLA_VXLAN_GROUP6], sizeof(struct in6_addr));
+		vxlan->saddr.proto = htons(ETH_P_IPV6);
+	}
 
 	if (data[IFLA_VXLAN_LINK] &&
 	    (vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
@@ -1453,9 +1701,9 @@ static size_t vxlan_get_size(const struct net_device *dev)
 {
 
 	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
-		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
-		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
@@ -1480,14 +1728,28 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
 		goto nla_put_failure;
 
-	if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
-		goto nla_put_failure;
+	if (!vxlan_ip_any(&vxlan->gaddr)) {
+		if (vxlan->gaddr.proto == htons(ETH_P_IP)) {
+			if (nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr.ip4))
+				goto nla_put_failure;
+		} else {
+			if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr), &vxlan->gaddr.ip6))
+				goto nla_put_failure;
+		}
+	}
 
 	if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
 		goto nla_put_failure;
 
-	if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
-		goto nla_put_failure;
+	if (!vxlan_ip_any(&vxlan->saddr)) {
+		if (vxlan->saddr.proto == htons(ETH_P_IP)) {
+			if (nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr.ip4))
+				goto nla_put_failure;
+		} else {
+			if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr), &vxlan->saddr.ip6))
+				goto nla_put_failure;
+		}
+	}
 
 	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
@@ -1526,38 +1788,82 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.fill_info	= vxlan_fill_info,
 };
 
-static __net_init int vxlan_init_net(struct net *net)
+/* Create UDP socket for encapsulation receive. AF_INET6 sockets
+ * could be used for both IPv4 and IPv6 communications.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static __net_init int create_sock(struct net *net, struct sock **sk)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct sockaddr_in6 vxlan_addr = {
+		.sin6_family = AF_INET6,
+		.sin6_port = htons(vxlan_port),
+	};
+	int rc;
+
+	rc = sock_create_kern(AF_INET6, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
+	if (rc < 0) {
+		pr_debug("UDP socket create failed\n");
+		return rc;
+	}
+	/* Put in proper namespace */
+	*sk = vn->sock->sk;
+	sk_change_net(*sk, net);
+
+	rc = kernel_bind(vn->sock, (struct sockaddr *)&vxlan_addr,
+			 sizeof(struct sockaddr_in6));
+	if (rc < 0) {
+		pr_debug("bind for UDP socket %pI6:%u (%d)\n",
+			 &vxlan_addr.sin6_addr, ntohs(vxlan_addr.sin6_port), rc);
+		sk_release_kernel(*sk);
+		vn->sock = NULL;
+		return rc;
+	}
+	return 0;
+}
+#else
+static __net_init int create_sock(struct net *net, struct sock **sk)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
-	struct sock *sk;
 	struct sockaddr_in vxlan_addr = {
 		.sin_family = AF_INET,
+		.sin_port = htons(vxlan_port),
 		.sin_addr.s_addr = htonl(INADDR_ANY),
 	};
 	int rc;
-	unsigned h;
 
-	/* Create UDP socket for encapsulation receive. */
 	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
 	if (rc < 0) {
 		pr_debug("UDP socket create failed\n");
 		return rc;
 	}
 	/* Put in proper namespace */
-	sk = vn->sock->sk;
-	sk_change_net(sk, net);
+	*sk = vn->sock->sk;
+	sk_change_net(*sk, net);
 
-	vxlan_addr.sin_port = htons(vxlan_port);
-
-	rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr,
-			 sizeof(vxlan_addr));
+	rc = kernel_bind(vn->sock, (struct sockaddr *)&vxlan_addr,
+			 sizeof(struct sockaddr_in));
 	if (rc < 0) {
 		pr_debug("bind for UDP socket %pI4:%u (%d)\n",
 			 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
-		sk_release_kernel(sk);
+		sk_release_kernel(*sk);
 		vn->sock = NULL;
 		return rc;
 	}
+	return 0;
+}
+#endif
+
+static __net_init int vxlan_init_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct sock *sk;
+	int rc;
+	unsigned h;
+
+	rc = create_sock(net, &sk);
+	if (rc < 0)
+		return rc;
 
 	/* Disable multicast loopback */
 	inet_sk(sk)->mc_loop = 0;
@@ -1566,6 +1872,9 @@ static __net_init int vxlan_init_net(struct net *net)
 	udp_sk(sk)->encap_type = 1;
 	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
 	udp_encap_enable();
+#if IS_ENABLED(CONFIG_IPV6)
+	udpv6_encap_enable();
+#endif
 
 	for (h = 0; h < VNI_HASH_SIZE; ++h)
 		INIT_HLIST_HEAD(&vn->vni_list[h]);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index c4edfe1..0eee00f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -308,6 +308,8 @@ enum {
 	IFLA_VXLAN_RSC,
 	IFLA_VXLAN_L2MISS,
 	IFLA_VXLAN_L3MISS,
+	IFLA_VXLAN_GROUP6,
+	IFLA_VXLAN_LOCAL6,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [Patch net-next v1 4/4] ipv6: Add generic UDP Tunnel segmentation
  2013-03-31  5:43 [Patch net-next v1 1/4] vxlan: defer vxlan init as late as possible Cong Wang
  2013-03-31  5:43 ` [Patch net-next v1 2/4] ipv6: export ipv6_sock_mc_join and ipv6_sock_mc_drop Cong Wang
  2013-03-31  5:43 ` [Patch net-next v1 3/4] vxlan: add ipv6 support Cong Wang
@ 2013-03-31  5:43 ` Cong Wang
  2013-03-31  6:17 ` [PATCH iproute2] vxlan: add ipv6 support Cong Wang
  3 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2013-03-31  5:43 UTC (permalink / raw)
  To: netdev
  Cc: Jesse Gross, Pravin B Shelar, Stephen Hemminger, David S. Miller,
	Cong Wang

From: Cong Wang <amwang@redhat.com>

Similar to commit 731362674580cb0c696cd1b1a03d8461a10cf90a
(tunneling: Add generic Tunnel segmentation)

This patch adds generic tunneling offloading support for IPv6-UDP based
tunnels.

This can be used by tunneling protocols like VXLAN.

Cc: Jesse Gross <jesse@nicira.com>
Cc: Pravin B Shelar <pshelar@nicira.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 net/ipv6/ip6_offload.c |    4 +-
 net/ipv6/udp_offload.c |  155 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 110 insertions(+), 49 deletions(-)

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 71b766e..f031ccf 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -91,6 +91,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	unsigned int unfrag_ip6hlen;
 	u8 *prevhdr;
 	int offset = 0;
+	bool tunnel;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
 		     ~(SKB_GSO_UDP |
@@ -105,6 +106,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
 		goto out;
 
+	tunnel = !!skb->encapsulation;
 	ipv6h = ipv6_hdr(skb);
 	__skb_pull(skb, sizeof(*ipv6h));
 	segs = ERR_PTR(-EPROTONOSUPPORT);
@@ -125,7 +127,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		ipv6h = ipv6_hdr(skb);
 		ipv6h->payload_len = htons(skb->len - skb->mac_len -
 					   sizeof(*ipv6h));
-		if (proto == IPPROTO_UDP) {
+		if (!tunnel && proto == IPPROTO_UDP) {
 			unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
 			fptr = (struct frag_hdr *)(skb_network_header(skb) +
 				unfrag_ip6hlen);
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 3bb3a89..bbde7ba 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -21,26 +21,81 @@ static int udp6_ufo_send_check(struct sk_buff *skb)
 	const struct ipv6hdr *ipv6h;
 	struct udphdr *uh;
 
-	/* UDP Tunnel offload on ipv6 is not yet supported. */
-	if (skb->encapsulation)
-		return -EINVAL;
-
 	if (!pskb_may_pull(skb, sizeof(*uh)))
 		return -EINVAL;
 
-	ipv6h = ipv6_hdr(skb);
-	uh = udp_hdr(skb);
+	if (likely(!skb->encapsulation)) {
+		ipv6h = ipv6_hdr(skb);
+		uh = udp_hdr(skb);
+
+		uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len,
+					     IPPROTO_UDP, 0);
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct udphdr, check);
+		skb->ip_summed = CHECKSUM_PARTIAL;
+	}
 
-	uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len,
-				     IPPROTO_UDP, 0);
-	skb->csum_start = skb_transport_header(skb) - skb->head;
-	skb->csum_offset = offsetof(struct udphdr, check);
-	skb->ip_summed = CHECKSUM_PARTIAL;
 	return 0;
 }
 
+static struct sk_buff *skb_udp6_tunnel_segment(struct sk_buff *skb,
+					       netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	int mac_len = skb->mac_len;
+	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+	int outer_hlen;
+	netdev_features_t enc_features;
+
+	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+		goto out;
+
+	skb->encapsulation = 0;
+	__skb_pull(skb, tnl_hlen);
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb_inner_network_offset(skb));
+	skb->mac_len = skb_inner_network_offset(skb);
+
+	/* segment inner packet. */
+	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+	segs = skb_mac_gso_segment(skb, enc_features);
+	if (!segs || IS_ERR(segs))
+		goto out;
+
+	outer_hlen = skb_tnl_header_len(skb);
+	skb = segs;
+	do {
+		struct udphdr *uh;
+		int udp_offset = outer_hlen - tnl_hlen;
+
+		skb->mac_len = mac_len;
+
+		skb_push(skb, outer_hlen);
+		skb_reset_mac_header(skb);
+		skb_set_network_header(skb, mac_len);
+		skb_set_transport_header(skb, udp_offset);
+		uh = udp_hdr(skb);
+		uh->len = htons(skb->len - udp_offset);
+
+		/* csum segment if tunnel sets skb with csum. */
+		if (unlikely(uh->check)) {
+			struct ipv6hdr *iph = ipv6_hdr(skb);
+
+			uh->check = csum_ipv6_magic(&iph->saddr, &iph->daddr,
+						       skb->len - udp_offset,
+						       IPPROTO_UDP, 0);
+			if (uh->check == 0)
+				uh->check = CSUM_MANGLED_0;
+
+		}
+		skb->ip_summed = CHECKSUM_NONE;
+	} while ((skb = skb->next));
+out:
+	return segs;
+}
+
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
-	netdev_features_t features)
+					 netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	unsigned int mss;
@@ -73,43 +128,47 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 		goto out;
 	}
 
-	/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
-	 * do checksum of UDP packets sent as multiple IP fragments.
-	 */
-	offset = skb_checksum_start_offset(skb);
-	csum = skb_checksum(skb, offset, skb->len - offset, 0);
-	offset += skb->csum_offset;
-	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
-	skb->ip_summed = CHECKSUM_NONE;
-
-	/* Check if there is enough headroom to insert fragment header. */
-	if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) &&
-	    pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC))
-		goto out;
+	if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
+		segs = skb_udp6_tunnel_segment(skb, features);
+	else {
+		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
+		 * do checksum of UDP packets sent as multiple IP fragments.
+		 */
+		offset = skb_checksum_start_offset(skb);
+		csum = skb_checksum(skb, offset, skb->len - offset, 0);
+		offset += skb->csum_offset;
+		*(__sum16 *)(skb->data + offset) = csum_fold(csum);
+		skb->ip_summed = CHECKSUM_NONE;
+
+		/* Check if there is enough headroom to insert fragment header. */
+		if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) &&
+		    pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC))
+			goto out;
 
-	/* Find the unfragmentable header and shift it left by frag_hdr_sz
-	 * bytes to insert fragment header.
-	 */
-	unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
-	nexthdr = *prevhdr;
-	*prevhdr = NEXTHDR_FRAGMENT;
-	unfrag_len = skb_network_header(skb) - skb_mac_header(skb) +
-		     unfrag_ip6hlen;
-	mac_start = skb_mac_header(skb);
-	memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len);
-
-	skb->mac_header -= frag_hdr_sz;
-	skb->network_header -= frag_hdr_sz;
-
-	fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
-	fptr->nexthdr = nexthdr;
-	fptr->reserved = 0;
-	ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb));
-
-	/* Fragment the skb. ipv6 header and the remaining fields of the
-	 * fragment header are updated in ipv6_gso_segment()
-	 */
-	segs = skb_segment(skb, features);
+		/* Find the unfragmentable header and shift it left by frag_hdr_sz
+		 * bytes to insert fragment header.
+		 */
+		unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+		nexthdr = *prevhdr;
+		*prevhdr = NEXTHDR_FRAGMENT;
+		unfrag_len = skb_network_header(skb) - skb_mac_header(skb) +
+			     unfrag_ip6hlen;
+		mac_start = skb_mac_header(skb);
+		memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len);
+
+		skb->mac_header -= frag_hdr_sz;
+		skb->network_header -= frag_hdr_sz;
+
+		fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
+		fptr->nexthdr = nexthdr;
+		fptr->reserved = 0;
+		ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb));
+
+		/* Fragment the skb. ipv6 header and the remaining fields of the
+		 * fragment header are updated in ipv6_gso_segment()
+		 */
+		segs = skb_segment(skb, features);
+	}
 
 out:
 	return segs;
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH iproute2] vxlan: add ipv6 support
  2013-03-31  5:43 [Patch net-next v1 1/4] vxlan: defer vxlan init as late as possible Cong Wang
                   ` (2 preceding siblings ...)
  2013-03-31  5:43 ` [Patch net-next v1 4/4] ipv6: Add generic UDP Tunnel segmentation Cong Wang
@ 2013-03-31  6:17 ` Cong Wang
  3 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2013-03-31  6:17 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Cong Wang

From: Cong Wang <amwang@redhat.com>

Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 include/linux/if_link.h |    2 ++
 ip/iplink_vxlan.c       |   45 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 40167af..f74b8cc 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -306,6 +306,8 @@ enum {
 	IFLA_VXLAN_RSC,
 	IFLA_VXLAN_L2MISS,
 	IFLA_VXLAN_L3MISS,
+	IFLA_VXLAN_GROUP6,
+	IFLA_VXLAN_LOCAL6,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 1025326..c10ec0f 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -42,6 +42,8 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv,
 	int vni_set = 0;
 	__u32 saddr = 0;
 	__u32 gaddr = 0;
+	struct in6_addr saddr6 = IN6ADDR_ANY_INIT;
+	struct in6_addr gaddr6 = IN6ADDR_ANY_INIT;
 	unsigned link = 0;
 	__u8 tos = 0;
 	__u8 ttl = 0;
@@ -65,15 +67,26 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv,
 			vni_set = 1;
 		} else if (!matches(*argv, "group")) {
 			NEXT_ARG();
-			gaddr = get_addr32(*argv);
-
-			if (!IN_MULTICAST(ntohl(gaddr)))
-				invarg("invald group address", *argv);
+			if (!inet_pton(AF_INET, *argv, &gaddr)) {
+				if (!inet_pton(AF_INET6, *argv, &gaddr6)) {
+					fprintf(stderr, "Invalid address \"%s\"\n", *argv);
+					return -1;
+				} else if (!IN6_IS_ADDR_MULTICAST(&gaddr6))
+					invarg("invald group address", *argv);
+			} else if (!IN_MULTICAST(ntohl(gaddr)))
+					invarg("invald group address", *argv);
 		} else if (!matches(*argv, "local")) {
 			NEXT_ARG();
-			if (strcmp(*argv, "any"))
-				saddr = get_addr32(*argv);
-			if (IN_MULTICAST(ntohl(saddr)))
+			if (strcmp(*argv, "any")) {
+				if (!inet_pton(AF_INET, *argv, &saddr)) {
+					if (!inet_pton(AF_INET6, *argv, &saddr6)) {
+						fprintf(stderr, "Invalid address \"%s\"\n", *argv);
+						return -1;
+					}
+				}
+			}
+
+			if (IN_MULTICAST(ntohl(saddr)) || IN6_IS_ADDR_MULTICAST(&saddr6))
 				invarg("invalid local address", *argv);
 		} else if (!matches(*argv, "dev")) {
 			NEXT_ARG();
@@ -163,8 +176,14 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv,
 	addattr32(n, 1024, IFLA_VXLAN_ID, vni);
 	if (gaddr)
 		addattr_l(n, 1024, IFLA_VXLAN_GROUP, &gaddr, 4);
+	else if (memcmp(&gaddr6, &in6addr_any, sizeof(gaddr6)) != 0)
+		addattr_l(n, 1024, IFLA_VXLAN_GROUP6, &gaddr6, sizeof(struct in6_addr));
+
 	if (saddr)
 		addattr_l(n, 1024, IFLA_VXLAN_LOCAL, &saddr, 4);
+	else if (memcmp(&saddr6, &in6addr_any, sizeof(saddr6)) != 0)
+		addattr_l(n, 1024, IFLA_VXLAN_LOCAL6, &saddr6, sizeof(struct in6_addr));
+
 	if (link)
 		addattr32(n, 1024, IFLA_VXLAN_LINK, link);
 	addattr8(n, 1024, IFLA_VXLAN_TTL, ttl);
@@ -211,6 +230,12 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 		if (addr)
 			fprintf(f, "group %s ",
 				format_host(AF_INET, 4, &addr, s1, sizeof(s1)));
+	} else if (tb[IFLA_VXLAN_GROUP6]) {
+		struct in6_addr addr;
+		memcpy(&addr, RTA_DATA(tb[IFLA_VXLAN_GROUP6]), sizeof(struct in6_addr));
+		if (memcmp(&addr, &in6addr_any, sizeof(addr)) != 0)
+			fprintf(f, "group %s ",
+				format_host(AF_INET6, sizeof(struct in6_addr), &addr, s1, sizeof(s1)));
 	}
 
 	if (tb[IFLA_VXLAN_LOCAL]) {
@@ -218,6 +243,12 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 		if (addr)
 			fprintf(f, "local %s ",
 				format_host(AF_INET, 4, &addr, s1, sizeof(s1)));
+	} else if (tb[IFLA_VXLAN_LOCAL6]) {
+		struct in6_addr addr;
+		memcpy(&addr, RTA_DATA(tb[IFLA_VXLAN_LOCAL6]), sizeof(struct in6_addr));
+		if (memcmp(&addr, &in6addr_any, sizeof(addr)) != 0)
+			fprintf(f, "local %s ",
+				format_host(AF_INET6, sizeof(struct in6_addr), &addr, s1, sizeof(s1)));
 	}
 
 	if (tb[IFLA_VXLAN_LINK] &&
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-03-31  5:43 ` [Patch net-next v1 3/4] vxlan: add ipv6 support Cong Wang
@ 2013-04-01 15:19   ` David Stevens
  2013-04-01 15:36     ` Stephen Hemminger
  2013-04-01 17:02     ` David Miller
  2013-04-01 20:14   ` Stephen Hemminger
  1 sibling, 2 replies; 16+ messages in thread
From: David Stevens @ 2013-04-01 15:19 UTC (permalink / raw)
  To: Cong Wang
  Cc: Cong Wang, David S. Miller, netdev, netdev-owner, Stephen Hemminger

netdev-owner@vger.kernel.org wrote on 03/31/2013 01:43:44 AM:
 
> +struct vxlan_ip {
> +   union {
> +      __be32  ip4;
> +#if IS_ENABLED(CONFIG_IPV6)
> +      struct in6_addr ip6;
> +#endif
> +   };
> +   __be16          proto;
> +};
> +

        This looks suspiciously like a sockaddr. sockaddr_storage is
much bigger than you need, but you could just make it a sockaddr_in6
and cast it to sockaddr_in when needed, make it sockaddr_in6 and use
V4_MAPPED addresses for v4, or make it a union of sockaddr_in and
sockaddr_in6, or have a buffer the size of sockaddr_in6 and use it
as a sockaddr to determine the family, then go from there.
        I think anything along those lines is better than a new variant
with the same functionality of sockaddr that isn't an overlay of a
sockaddr.

                                                        +-DLS

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-04-01 15:19   ` David Stevens
@ 2013-04-01 15:36     ` Stephen Hemminger
  2013-04-01 17:02     ` David Miller
  1 sibling, 0 replies; 16+ messages in thread
From: Stephen Hemminger @ 2013-04-01 15:36 UTC (permalink / raw)
  To: David Stevens; +Cc: Cong Wang, David S. Miller, netdev, netdev-owner

On Mon, 1 Apr 2013 11:19:10 -0400
David Stevens <dlstevens@us.ibm.com> wrote:

> netdev-owner@vger.kernel.org wrote on 03/31/2013 01:43:44 AM:
>  
> > +struct vxlan_ip {
> > +   union {
> > +      __be32  ip4;
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +      struct in6_addr ip6;
> > +#endif
> > +   };
> > +   __be16          proto;
> > +};
> > +
> 
>         This looks suspiciously like a sockaddr. sockaddr_storage is
> much bigger than you need, but you could just make it a sockaddr_in6
> and cast it to sockaddr_in when needed, make it sockaddr_in6 and use
> V4_MAPPED addresses for v4, or make it a union of sockaddr_in and
> sockaddr_in6, or have a buffer the size of sockaddr_in6 and use it
> as a sockaddr to determine the family, then go from there.
>         I think anything along those lines is better than a new variant
> with the same functionality of sockaddr that isn't an overlay of a
> sockaddr.
> 
>                                                         +-DLS
> 

That is exactly what I was thinking.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-04-01 15:19   ` David Stevens
  2013-04-01 15:36     ` Stephen Hemminger
@ 2013-04-01 17:02     ` David Miller
  2013-04-01 18:05       ` David Stevens
  1 sibling, 1 reply; 16+ messages in thread
From: David Miller @ 2013-04-01 17:02 UTC (permalink / raw)
  To: dlstevens; +Cc: amwang, netdev, netdev-owner, stephen

From: David Stevens <dlstevens@us.ibm.com>
Date: Mon, 1 Apr 2013 11:19:10 -0400

> netdev-owner@vger.kernel.org wrote on 03/31/2013 01:43:44 AM:
>  
>> +struct vxlan_ip {
>> +   union {
>> +      __be32  ip4;
>> +#if IS_ENABLED(CONFIG_IPV6)
>> +      struct in6_addr ip6;
>> +#endif
>> +   };
>> +   __be16          proto;
>> +};
>> +
> 
>         This looks suspiciously like a sockaddr. sockaddr_storage is
> much bigger than you need, but you could just make it a sockaddr_in6
> and cast it to sockaddr_in when needed, make it sockaddr_in6 and use
> V4_MAPPED addresses for v4, or make it a union of sockaddr_in and
> sockaddr_in6, or have a buffer the size of sockaddr_in6 and use it
> as a sockaddr to determine the family, then go from there.
>         I think anything along those lines is better than a new variant
> with the same functionality of sockaddr that isn't an overlay of a
> sockaddr.

People avoid using sockaddr because it gets defined both in the kernel
exported userland headers and the native libc ones with no easy
protection between the two to avoid getting a double definition error.

Once you start including kernel exported headers for things like these
network device specific interfaces, you potentially run into that issue.

Therefore I'd rather the subsystem define their own unique type both
to avoid this double definition problem and to allow easy extension
later.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-04-01 17:02     ` David Miller
@ 2013-04-01 18:05       ` David Stevens
  2013-04-01 18:15         ` David Miller
  2013-04-02  1:46         ` Cong Wang
  0 siblings, 2 replies; 16+ messages in thread
From: David Stevens @ 2013-04-01 18:05 UTC (permalink / raw)
  To: David Miller; +Cc: amwang, netdev, netdev-owner, stephen

David Miller <davem@davemloft.net> wrote on 04/01/2013 01:02:23 PM:

> 
> People avoid using sockaddr because it gets defined both in the kernel
> exported userland headers and the native libc ones with no easy
> protection between the two to avoid getting a double definition error.
> 
> Once you start including kernel exported headers for things like these
> network device specific interfaces, you potentially run into that issue.
> 
> Therefore I'd rather the subsystem define their own unique type both
> to avoid this double definition problem and to allow easy extention
> later.

        I guess, but in this case, I'm not saying it's like a sockaddr
with device-specific requirements. Rather, I'm saying it's exactly a
sockaddr -- it is either a sockaddr_in or a sockaddr_in6 and a family
field to say which.
        As is, any user/kernel include file conflicts (in the "ip"
command, presumably) are still present because he's using in6_addr,
another structure both in user and kernel space.
        The primary multiple include issue here would be in the "ip"
command, presumably, but sockaddrs in particular have to agree between
user and kernel space already and both appear with "ip".
        This patch also has issues due to NOT copying other fields in the
sockaddr_in6 structure (scope_id and port).

        Personally, I don't think it's too difficult to make correct code
using sockaddr/sockaddr_in/sockaddr_in6 here, but even with a new type,
the code within vxlan.c could (and I argue should) use something like:

       union {
               struct sockaddr_in       vip_un_sin;
               struct sockaddr_in6      vip_un_sin6;
               struct sockaddr          vip_un_sa;
       } vip_sun;

#define vip_sa  vip_sun.vip_un_sa
#define vip_sin vip_sun.vip_un_sin
#define vip_sin6        vip_sun.vip_un_sin6

and then code like:
switch (vip->vip_sa.sa_family) {
case AF_INET:
        vip->vip_sin.sin_addr.s_addr = blah blah
        break;
case AF_INET6:
        vip->vip_sin6.sin6_addr = blah blah
        break;
...
}

...or whatever appropriate to the context and family.


                                                        +-DLS

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-04-01 18:05       ` David Stevens
@ 2013-04-01 18:15         ` David Miller
  2013-04-01 20:03           ` David Stevens
  2013-04-02  1:46         ` Cong Wang
  1 sibling, 1 reply; 16+ messages in thread
From: David Miller @ 2013-04-01 18:15 UTC (permalink / raw)
  To: dlstevens; +Cc: amwang, netdev, netdev-owner, stephen

From: David Stevens <dlstevens@us.ibm.com>
Date: Mon, 1 Apr 2013 14:05:37 -0400

>         I guess, but in this case, I'm not saying it's like a sockaddr 
> with
> device-specific requirements. Rather, I'm saying it's exactly a sockaddr--
> it is either a sockaddr_in or a sockaddr_in6 and a family field to say
> which.

in6_addr, which is what sockaddr_in6 is composed of, is precisely the
problematic type that we want to avoid to elide the double definition
error.

http://www.spinics.net/linux/fedora/libvir/msg70921.html

It's really not safe to use at the moment.

We're not creating more instances of this mess, and that decision
is final.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-04-01 18:15         ` David Miller
@ 2013-04-01 20:03           ` David Stevens
  2013-04-01 20:05             ` David Miller
  0 siblings, 1 reply; 16+ messages in thread
From: David Stevens @ 2013-04-01 20:03 UTC (permalink / raw)
  To: David Miller; +Cc: amwang, netdev, netdev-owner, stephen

netdev-owner@vger.kernel.org wrote on 04/01/2013 02:15:48 PM:
Date: Mon, 1 Apr 2013 14:05:37 -0400
> 
> >         I guess, but in this case, I'm not saying it's like a sockaddr
> > with
> > device-specific requirements. Rather, I'm saying it's exactly a sockaddr--
> > it is either a sockaddr_in or a sockaddr_in6 and a family field to say
> > which.
> 
> in6_addr, which is what sockaddr_in6 is composed of, is precisely the
> problematic type that we want to avoid to elide the double definition
> error.
> 
> http://www.spinics.net/linux/fedora/libvir/msg70921.html
> 
> It's really not safe to use at the moment.
> 
> We're not creating more instances of this mess, and that decision
> is final.

Dave,

I think we're talking about two different things. I'm suggesting
we should use sockaddrs within the kernel module code as the type for
what is now a v4-only address but would be one of v4 or v6 after
adding v6 support.

Within the kernel module code, without changing any header files,
I'm suggesting we use the existing kernel sockaddr definitions and the
sa_family to know which address type we have, instead of defining
a new type local to vxlan.c with the same purpose and most of the
fields as a sockaddr for that very same kernel-only private data.

An additional comment is that those fields missing in the private type
and present in the sockaddr types are useful in cases that apply
to these destination addresses, too; "vxlan_ip" ought to have all the
fields a sockaddr_in6 or sockaddr_in does and if it did, then it
ought to use sockaddr/sockaddr_in/sockaddr_in6 for this kernel-only data.

My suggestion doesn't apply to anything at user level, or visible
to user level through an include file, kernel or glibc.

                                                        +-DLS

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-04-01 20:03           ` David Stevens
@ 2013-04-01 20:05             ` David Miller
  0 siblings, 0 replies; 16+ messages in thread
From: David Miller @ 2013-04-01 20:05 UTC (permalink / raw)
  To: dlstevens; +Cc: amwang, netdev, netdev-owner, stephen

From: David Stevens <dlstevens@us.ibm.com>
Date: Mon, 1 Apr 2013 16:03:38 -0400

> My suggestion doesn't apply to anything at user level, or visible
> to user level through an include file, kernel or glibc.

In that case I rescind my objection.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-03-31  5:43 ` [Patch net-next v1 3/4] vxlan: add ipv6 support Cong Wang
  2013-04-01 15:19   ` David Stevens
@ 2013-04-01 20:14   ` Stephen Hemminger
  2013-04-02  1:39     ` Cong Wang
  1 sibling, 1 reply; 16+ messages in thread
From: Stephen Hemminger @ 2013-04-01 20:14 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, David S. Miller

On Sun, 31 Mar 2013 13:43:44 +0800
Cong Wang <amwang@redhat.com> wrote:

>  	/* Need to drop RTNL to call multicast leave */
>  	rtnl_unlock();
> -	lock_sock(sk);
> -	err = ip_mc_leave_group(sk, &mreq);
> +	if (vxlan->gaddr.proto == htons(ETH_P_IP)) {
> +		lock_sock(sk);
> +		err = ip_mc_leave_group(sk, &mreq);
> +	} else {
> +#if IS_ENABLED(CONFIG_IPV6)
> +		lock_sock(sk);
> +		err = ipv6_sock_mc_drop(sk, vxlan->link, &vxlan->gaddr.ip6);
> +#endif
> +	}
>  	release_sock(sk);

Since both v4 and v6 need socket locked why not?

	rtnl_unlock();
	lock_sock(sk);
	if (vxlan->gaddr.proto == htons(ETH_P_IP)) 
		err = ip_mc_leave_group(sk, &mreq);
#if IS_ENABLED(CONFIG_IPV6)
	else
		err = ipv6_sock_mc_drop(sk, vxlan->link, &vxlan->gaddr.ip6);
#endif

  	release_sock(sk);

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-04-01 20:14   ` Stephen Hemminger
@ 2013-04-02  1:39     ` Cong Wang
  0 siblings, 0 replies; 16+ messages in thread
From: Cong Wang @ 2013-04-02  1:39 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David S. Miller

On Mon, 2013-04-01 at 13:14 -0700, Stephen Hemminger wrote:
> On Sun, 31 Mar 2013 13:43:44 +0800
> Cong Wang <amwang@redhat.com> wrote:
> 
> >  	/* Need to drop RTNL to call multicast leave */
> >  	rtnl_unlock();
> > -	lock_sock(sk);
> > -	err = ip_mc_leave_group(sk, &mreq);
> > +	if (vxlan->gaddr.proto == htons(ETH_P_IP)) {
> > +		lock_sock(sk);
> > +		err = ip_mc_leave_group(sk, &mreq);
> > +	} else {
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +		lock_sock(sk);
> > +		err = ipv6_sock_mc_drop(sk, vxlan->link, &vxlan->gaddr.ip6);
> > +#endif
> > +	}
> >  	release_sock(sk);
> 
> Since both v4 and v6 need socket locked why not?
> 
> 	rtnl_unlock();
> 	lock_sock(sk);
> 	if (vxlan->gaddr.proto == htons(ETH_P_IP)) 
> 		err = ip_mc_leave_group(sk, &mreq);
> if IS_ENABLED(CONFIG_IPV6)
> 	else
> 		err = ipv6_sock_mc_drop(sk, vxlan->link, &vxlan->gaddr.ip6);
> #endif
> 
>   	release_sock(sk);

Oh, of course, I missed that...

Thanks!

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-04-01 18:05       ` David Stevens
  2013-04-01 18:15         ` David Miller
@ 2013-04-02  1:46         ` Cong Wang
  2013-04-02 13:13           ` David Stevens
  1 sibling, 1 reply; 16+ messages in thread
From: Cong Wang @ 2013-04-02  1:46 UTC (permalink / raw)
  To: David Stevens; +Cc: David Miller, netdev, netdev-owner, stephen

On Mon, 2013-04-01 at 14:05 -0400, David Stevens wrote:
> David Miller <davem@davemloft.net> wrote on 04/01/2013 01:02:23 PM:
> 
> > 
> > People avoid using sockaddr because it gets defined both in the kernel
> > exported userland headers and the native libc ones with no easy
> > protection between the two to avoid getting a double definition error.
> > 
> > Once you start including kernel exported headers for things like these
> > network device specific interfaces, you potentially run into that issue.
> > 
> > Therefore I'd rather the subsystem define their own unique type both
> > to avoid this double definition problem and to allow easy extention
> > later.
> 
>         I guess, but in this case, I'm not saying it's like a sockaddr 
> with
> device-specific requirements. Rather, I'm saying it's exactly a sockaddr--
> it is either a sockaddr_in or a sockaddr_in6 and a family field to say
> which.
>         As is, any user/kernel include file conflicts (in the "ip" 
> command,
> presumably) are still present because he's using in6_addr, another 
> structure
> both in user and kernel space.
>         The primary multiple include issue here would be in the "ip" 
> command,
> presumably, but sockaddrs in particular have to agree between user and
> kernel space already and both appear with "ip".
>         This patch also has issues due to NOT copying other fields in the
> sockaddr_in6 structure (scope_id and port).
> 
>         Personally, I don't think it's too difficult to make correct code
> using sockaddr/sockaddr_in/sockaddr_in6 here, but even with a new type,
> the code within vxlan.c could (and I argue should) use something like:
> 
>        union {
>                struct sockaddr_in       vip_un_sin;
>                struct sockaddr_in6      vip_un_sin6;
>                struct sockaddr          vip_un_sa;
>        } vip_sun;
> 
> #define vip_sa  vip_sun.vip_un_sa
> #define vip_sin vip_sun.vip_un_sin
> #define vip_sin6        vip_sun.vip_un.sin6
> 
> and then code like:
> switch (vip->vip_sa.sa_family) {
> case AF_INET:
>         vip->vip_sin.sin_addr.s_addr = blah blah
>         break;
> case AF_INET6:
>         vip->vip_sin6.sin6_addr = blah blah
>         break;
> ...
> }
> 
> ...or whatever appropriate to the context and family.

Well, besides avoiding redefining another type, what else could we gain by
using sockaddr_in6?

Look, we would have "vip->vip_sin.sin_addr.s_addr" instead of
"ipa->ip4", much longer than the current one...

So why does defining a shorter and less complex struct matter?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Patch net-next v1 3/4] vxlan: add ipv6 support
  2013-04-02  1:46         ` Cong Wang
@ 2013-04-02 13:13           ` David Stevens
  0 siblings, 0 replies; 16+ messages in thread
From: David Stevens @ 2013-04-02 13:13 UTC (permalink / raw)
  To: Cong Wang; +Cc: David Miller, netdev, netdev-owner, stephen

Cong Wang <amwang@redhat.com> wrote on 04/01/2013 09:46:43 PM:

> Well, besides avoiding redefining another type, what else could we gain
> by using sockaddr_in6?

        The gain is code readability. If you have a type that's
already defined and used throughout the network code, someone
trying to understand your code won't need to look up the type
and details to figure it out and modify it.

> Look, we would have "vip->vip_sin.sin_addr.s_addr" instead of
> "ipa->ip4", much longer than the current one... 
> 
> So why does defining a shorter and less complex struct matter?
> 
> 

        Because you have entire sections of nearly duplicated
code that handle IPv4 alone or IPv6 alone. That's a maintenance
issue over time because future changes need to happen in 2
separate places and are more likely to be missed in one or
not replicated identically.
        Using sockaddrs, the field containing an address is
the same (e.g., "gaddr" or "dst") whether it is v4 or v6
because the structure self-identifies the address format
by the address family. So, you can have common code for
either address family -- "vxlan->gaddr = group;" or similar--
and you only have family-specific code where there is a
difference between the families.
        VXLAN is an in-kernel client and server and like its
user-space counterparts, you can write a v4-only implementation
and a v6-only implementation separately, or you can use common code
for both with generic addresses. Using common code is usually
easier to understand, modify and debug.

                                                        +-DLS

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2013-04-02 13:13 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-03-31  5:43 [Patch net-next v1 1/4] vxlan: defer vxlan init as late as possible Cong Wang
2013-03-31  5:43 ` [Patch net-next v1 2/4] ipv6: export ipv6_sock_mc_join and ipv6_sock_mc_drop Cong Wang
2013-03-31  5:43 ` [Patch net-next v1 3/4] vxlan: add ipv6 support Cong Wang
2013-04-01 15:19   ` David Stevens
2013-04-01 15:36     ` Stephen Hemminger
2013-04-01 17:02     ` David Miller
2013-04-01 18:05       ` David Stevens
2013-04-01 18:15         ` David Miller
2013-04-01 20:03           ` David Stevens
2013-04-01 20:05             ` David Miller
2013-04-02  1:46         ` Cong Wang
2013-04-02 13:13           ` David Stevens
2013-04-01 20:14   ` Stephen Hemminger
2013-04-02  1:39     ` Cong Wang
2013-03-31  5:43 ` [Patch net-next v1 4/4] ipv6: Add generic UDP Tunnel segmentation Cong Wang
2013-03-31  6:17 ` [PATCH iproute2] vxlan: add ipv6 support Cong Wang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).