All of lore.kernel.org
 help / color / mirror / Atom feed
From: Florian Westphal <fw@strlen.de>
To: <netdev@vger.kernel.org>
Cc: aconole@redhat.com, sbrivio@redhat.com, Florian Westphal <fw@strlen.de>
Subject: [PATCH net-next 2/3] vxlan: allow to disable path mtu learning on encap socket
Date: Sun, 12 Jul 2020 22:07:04 +0200	[thread overview]
Message-ID: <20200712200705.9796-3-fw@strlen.de> (raw)
In-Reply-To: <20200712200705.9796-1-fw@strlen.de>

While its already possible to configure VXLAN to never set the DF bit
on packets that it sends, it was not yet possible to tell kernel to
not update the encapsulation sockets path MTU.

This can be used to tell ip stack to always use the interface MTU
when VXLAN wants to send a packet.

When packets are routed, VXLAN use skbs existing dst entries to
propagate the MTU update to the overlay, but on a bridge this doesn't
work (no routing, no dst entry, and no ip forwarding takes place, so
nothing emits icmp packet w. mtu update to sender).

This is only useful when VXLAN is used as a bridge port and the
network is known to accept packets up to the link MTU to avoid bogus
pmtu icmp packets from stopping tunneled traffic.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 drivers/net/vxlan.c          | 65 +++++++++++++++++++++++++++++++-----
 include/net/vxlan.h          |  2 ++
 include/uapi/linux/if_link.h |  1 +
 3 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index a43c97b13924..ceb2940a2a62 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3316,6 +3316,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
 	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
 	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
+	[IFLA_VXLAN_PMTUDISC]	= { .type = NLA_U8 },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -3418,7 +3419,8 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
 };
 
 static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
-					__be16 port, u32 flags, int ifindex)
+					const struct vxlan_config *cfg,
+					int ifindex)
 {
 	struct socket *sock;
 	struct udp_port_cfg udp_conf;
@@ -3429,13 +3431,18 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
 	if (ipv6) {
 		udp_conf.family = AF_INET6;
 		udp_conf.use_udp6_rx_checksums =
-		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
+		    !(cfg->flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
 		udp_conf.ipv6_v6only = 1;
 	} else {
 		udp_conf.family = AF_INET;
 	}
 
-	udp_conf.local_udp_port = port;
+	if (cfg->pmtudisc) {
+		udp_conf.ip_pmtudisc = 1;
+		udp_conf.ip_pmtudiscv = cfg->pmtudiscv;
+	}
+
+	udp_conf.local_udp_port = cfg->dst_port;
 	udp_conf.bind_ifindex = ifindex;
 
 	/* Open UDP socket */
@@ -3448,7 +3455,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
 
 /* Create new listen socket if needed */
 static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
-					      __be16 port, u32 flags,
+					      const struct vxlan_config *cfg,
 					      int ifindex)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
@@ -3464,7 +3471,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 	for (h = 0; h < VNI_HASH_SIZE; ++h)
 		INIT_HLIST_HEAD(&vs->vni_list[h]);
 
-	sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
+	sock = vxlan_create_sock(net, ipv6, cfg, ifindex);
 	if (IS_ERR(sock)) {
 		kfree(vs);
 		return ERR_CAST(sock);
@@ -3472,10 +3479,10 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 
 	vs->sock = sock;
 	refcount_set(&vs->refcnt, 1);
-	vs->flags = (flags & VXLAN_F_RCV_FLAGS);
+	vs->flags = (cfg->flags & VXLAN_F_RCV_FLAGS);
 
 	spin_lock(&vn->sock_lock);
-	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
+	hlist_add_head_rcu(&vs->hlist, vs_head(net, cfg->dst_port));
 	udp_tunnel_notify_add_rx_port(sock,
 				      (vs->flags & VXLAN_F_GPE) ?
 				      UDP_TUNNEL_TYPE_VXLAN_GPE :
@@ -3521,8 +3528,7 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
 	}
 	if (!vs)
 		vs = vxlan_socket_create(vxlan->net, ipv6,
-					 vxlan->cfg.dst_port, vxlan->cfg.flags,
-					 l3mdev_index);
+					 &vxlan->cfg, l3mdev_index);
 	if (IS_ERR(vs))
 		return PTR_ERR(vs);
 #if IS_ENABLED(CONFIG_IPV6)
@@ -3984,6 +3990,21 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 
 	if (data[IFLA_VXLAN_LINK])
 		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
+	if (data[IFLA_VXLAN_PMTUDISC]) {
+		int pmtuv = nla_get_u8(data[IFLA_VXLAN_PMTUDISC]);
+
+		if (pmtuv < IP_PMTUDISC_DONT) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PMTUDISC], "PMTUDISC Value < 0");
+			return -EOPNOTSUPP;
+		}
+		if (pmtuv > IP_PMTUDISC_OMIT) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PMTUDISC], "PMTUDISC Value > IP_PMTUDISC_OMIT");
+			return -EOPNOTSUPP;
+		}
+
+		conf->pmtudisc = 1;
+		conf->pmtudiscv = pmtuv;
+	}
 
 	if (data[IFLA_VXLAN_TOS])
 		conf->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
@@ -4249,6 +4270,27 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 	netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev);
 	if (lowerdev && lowerdev != dst->remote_dev)
 		dst->remote_dev = lowerdev;
+
+	if (conf.pmtudisc && conf.pmtudiscv != vxlan->cfg.pmtudiscv) {
+		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
+#if IS_ENABLED(CONFIG_IPV6)
+		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
+#endif
+		struct socket *sock;
+
+		if (sock4) {
+			sock = sock4->sock;
+			ip_sock_set_mtu_discover(sock->sk, conf.pmtudiscv);
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+		if (sock6) {
+			sock = sock6->sock;
+			ip6_sock_set_mtu_discover(sock->sk, conf.pmtudiscv);
+			ip_sock_set_mtu_discover(sock->sk, conf.pmtudiscv);
+		}
+#endif
+	}
+
 	vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
 	return 0;
 }
@@ -4276,6 +4318,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PMTUDISC */
 		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
@@ -4374,6 +4417,10 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
 		goto nla_put_failure;
 
+	if (vxlan->cfg.pmtudisc &&
+	    nla_put_u8(skb, IFLA_VXLAN_PMTUDISC, vxlan->cfg.pmtudiscv))
+		goto nla_put_failure;
+
 	if (vxlan->cfg.flags & VXLAN_F_GBP &&
 	    nla_put_flag(skb, IFLA_VXLAN_GBP))
 		goto nla_put_failure;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 3a41627cbdfe..1414cfa2005f 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -220,6 +220,8 @@ struct vxlan_config {
 	unsigned long		age_interval;
 	unsigned int		addrmax;
 	bool			no_share;
+	u8			pmtudisc:1;
+	u8			pmtudiscv:3;
 	enum ifla_vxlan_df	df;
 };
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index cc185a007ade..f22cf508871c 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -548,6 +548,7 @@ enum {
 	IFLA_VXLAN_GPE,
 	IFLA_VXLAN_TTL_INHERIT,
 	IFLA_VXLAN_DF,
+	IFLA_VXLAN_PMTUDISC,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
2.26.2


  parent reply	other threads:[~2020-07-12 20:07 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-12 20:07 [PATCH net-next 0/3] vxlan, geneve: allow to turn off PMTU updates on encap socket Florian Westphal
2020-07-12 20:07 ` [PATCH net-next 1/3] udp_tunnel: allow to turn off path mtu discovery on encap sockets Florian Westphal
2020-07-12 22:38   ` Stefano Brivio
2020-07-13  8:04     ` Florian Westphal
2020-07-13 10:04       ` Stefano Brivio
2020-07-13 10:51         ` Numan Siddique
2020-07-14 20:38           ` Aaron Conole
2020-07-15 11:58             ` Stefano Brivio
2020-07-13 13:25       ` David Ahern
2020-07-13 14:02         ` Florian Westphal
2020-07-13 14:41           ` David Ahern
2020-07-13 14:59             ` Florian Westphal
2020-07-13 15:57               ` Stefano Brivio
2020-07-13 16:22                 ` Florian Westphal
2020-07-14 12:33                   ` Stefano Brivio
2020-07-14 12:33           ` Stefano Brivio
2020-07-15 12:42             ` Florian Westphal
2020-07-15 13:35               ` Stefano Brivio
2020-07-15 14:33                 ` Florian Westphal
2020-07-17 12:27                   ` Stefano Brivio
2020-07-17 15:04                     ` David Ahern
2020-07-17 18:43                       ` Florian Westphal
2020-07-18  6:56                       ` Stefano Brivio
2020-07-18 17:02                         ` David Ahern
2020-07-18 17:58                           ` Stefano Brivio
2020-07-18 18:04                             ` Stefano Brivio
2020-07-19 18:43                             ` David Ahern
2020-07-19 21:49                               ` Stefano Brivio
2020-07-20  3:19                                 ` David Ahern
2020-07-26 17:01                                   ` Stefano Brivio
2020-07-12 20:07 ` Florian Westphal [this message]
2020-07-16 19:33   ` [PATCH net-next 2/3] vxlan: allow to disable path mtu learning on encap socket Jakub Kicinski
2020-07-17 10:13     ` Florian Westphal
2020-07-12 20:07 ` [PATCH net-next 3/3] geneve: allow disabling of pmtu detection on encap sk Florian Westphal
2020-07-12 22:39 ` [PATCH net-next 0/3] vxlan, geneve: allow to turn off PMTU updates on encap socket Stefano Brivio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200712200705.9796-3-fw@strlen.de \
    --to=fw@strlen.de \
    --cc=aconole@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.