All of lore.kernel.org
 help / color / mirror / Atom feed
* [net-next PATCH 0/3] net: optimize ICMP-reply code path
@ 2017-01-09 15:03 Jesper Dangaard Brouer
  2017-01-09 15:04 ` [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack" Jesper Dangaard Brouer
                   ` (4 more replies)
  0 siblings, 5 replies; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2017-01-09 15:03 UTC (permalink / raw)
  To: netdev, Eric Dumazet, Jesper Dangaard Brouer; +Cc: xiyou.wangcong

This patchset is optimizing the ICMP-reply code path, for ICMP packets
that gets rate limited. A remote party can easily trigger this code
path by sending packets to port number with no listening service.

Generally the patchset moves the sysctl_icmp_msgs_per_sec ratelimit
checking to earlier in the code path and removes an allocation.


Use-case: The specific case I experienced this being a bottleneck is,
sending UDP packets to a port with no listener, which obviously result
in kernel replying with ICMP Destination Unreachable (type:3), Port
Unreachable (code:3), which cause the bottleneck.

 After Eric and Paolo optimized the UDP socket code, the kernels PPS
processing capabilities is lower for no-listen ports, than normal UDP
sockets.  This is bad for capacity planning when restarting a service.

UDP no-listen benchmark 8xCPUs using pktgen_sample04_many_flows.sh:
 Baseline: 6.6 Mpps
 Patch:   14.7 Mpps
Driver mlx5 at 50Gbit/s.

---

Jesper Dangaard Brouer (3):
      Revert "icmp: avoid allocating large struct on stack"
      net: reduce cycles spend on ICMP replies that gets rate limited
      net: for rate-limited ICMP replies save one atomic operation


 net/ipv4/icmp.c |  125 +++++++++++++++++++++++++++++++++----------------------
 net/ipv6/icmp.c |   68 +++++++++++++++++++++---------
 2 files changed, 123 insertions(+), 70 deletions(-)

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 15:03 [net-next PATCH 0/3] net: optimize ICMP-reply code path Jesper Dangaard Brouer
@ 2017-01-09 15:04 ` Jesper Dangaard Brouer
  2017-01-09 17:42   ` Cong Wang
  2017-01-09 17:42   ` Eric Dumazet
  2017-01-09 15:04 ` [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited Jesper Dangaard Brouer
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2017-01-09 15:04 UTC (permalink / raw)
  To: netdev, Eric Dumazet, Jesper Dangaard Brouer; +Cc: xiyou.wangcong

This reverts commit 9a99d4a50cb8 ("icmp: avoid allocating large struct
on stack"), because struct icmp_bxm no really a large struct, and
allocating and free of this small 112 bytes hurts performance.

Fixes: 9a99d4a50cb8 ("icmp: avoid allocating large struct on stack")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 net/ipv4/icmp.c |   40 +++++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0777ea949223..b4b9807329a7 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -571,7 +571,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 {
 	struct iphdr *iph;
 	int room;
-	struct icmp_bxm *icmp_param;
+	struct icmp_bxm icmp_param;
 	struct rtable *rt = skb_rtable(skb_in);
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
@@ -648,13 +648,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 	}
 
-	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
-	if (!icmp_param)
-		return;
-
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		goto out_free;
+		return;
 
 	/*
 	 *	Construct source address and options.
@@ -681,7 +677,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 					  iph->tos;
 	mark = IP4_REPLY_MARK(net, skb_in->mark);
 
-	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+	if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
 		goto out_unlock;
 
 
@@ -689,22 +685,22 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	 *	Prepare data for ICMP header.
 	 */
 
-	icmp_param->data.icmph.type	 = type;
-	icmp_param->data.icmph.code	 = code;
-	icmp_param->data.icmph.un.gateway = info;
-	icmp_param->data.icmph.checksum	 = 0;
-	icmp_param->skb	  = skb_in;
-	icmp_param->offset = skb_network_offset(skb_in);
+	icmp_param.data.icmph.type	 = type;
+	icmp_param.data.icmph.code	 = code;
+	icmp_param.data.icmph.un.gateway = info;
+	icmp_param.data.icmph.checksum	 = 0;
+	icmp_param.skb	  = skb_in;
+	icmp_param.offset = skb_network_offset(skb_in);
 	inet_sk(sk)->tos = tos;
 	sk->sk_mark = mark;
 	ipc.addr = iph->saddr;
-	ipc.opt = &icmp_param->replyopts.opt;
+	ipc.opt = &icmp_param.replyopts.opt;
 	ipc.tx_flags = 0;
 	ipc.ttl = 0;
 	ipc.tos = -1;
 
 	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
-			       type, code, icmp_param);
+			       type, code, &icmp_param);
 	if (IS_ERR(rt))
 		goto out_unlock;
 
@@ -716,21 +712,19 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	room = dst_mtu(&rt->dst);
 	if (room > 576)
 		room = 576;
-	room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
 	room -= sizeof(struct icmphdr);
 
-	icmp_param->data_len = skb_in->len - icmp_param->offset;
-	if (icmp_param->data_len > room)
-		icmp_param->data_len = room;
-	icmp_param->head_len = sizeof(struct icmphdr);
+	icmp_param.data_len = skb_in->len - icmp_param.offset;
+	if (icmp_param.data_len > room)
+		icmp_param.data_len = room;
+	icmp_param.head_len = sizeof(struct icmphdr);
 
-	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+	icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
-out_free:
-	kfree(icmp_param);
 out:;
 }
 EXPORT_SYMBOL(icmp_send);

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited
  2017-01-09 15:03 [net-next PATCH 0/3] net: optimize ICMP-reply code path Jesper Dangaard Brouer
  2017-01-09 15:04 ` [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack" Jesper Dangaard Brouer
@ 2017-01-09 15:04 ` Jesper Dangaard Brouer
  2017-01-09 17:44   ` Eric Dumazet
  2017-06-04  7:11   ` Florian Weimer
  2017-01-09 15:04 ` [net-next PATCH 3/3] net: for rate-limited ICMP replies save one atomic operation Jesper Dangaard Brouer
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2017-01-09 15:04 UTC (permalink / raw)
  To: netdev, Eric Dumazet, Jesper Dangaard Brouer; +Cc: xiyou.wangcong

This patch split the global and per (inet)peer ICMP-reply limiter
code, and moves the global limit check to earlier in the packet
processing path.  Thus, avoid spending cycles on ICMP replies that
gets limited/suppressed anyhow.

The global ICMP rate limiter icmp_global_allow() is a good solution,
it just happens too late in the process.  The kernel goes through the
full route lookup (return path) for the ICMP message, before taking
the rate limit decision of not sending the ICMP reply.

Details: The kernels global rate limiter for ICMP messages got added
in commit 4cdf507d5452 ("icmp: add a global rate limitation").  It is
a token bucket limiter with a global lock.  It brilliantly avoids
locking congestion by only updating when 20ms (HZ/50) were elapsed. It
can then avoids taking lock when credit is exhausted (when under
pressure) and time constraint for refill is not yet meet.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 net/ipv4/icmp.c |   71 +++++++++++++++++++++++++++++++++++++------------------
 net/ipv6/icmp.c |   49 ++++++++++++++++++++++++++------------
 2 files changed, 82 insertions(+), 38 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index b4b9807329a7..58d75ca58b83 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -282,6 +282,33 @@ bool icmp_global_allow(void)
 }
 EXPORT_SYMBOL(icmp_global_allow);
 
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+	if (type > NR_ICMP_TYPES)
+		return true;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		return true;
+
+	/* Limit if icmp type is enabled in ratemask. */
+	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+		return true;
+
+	return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+	if (icmpv4_mask_allow(net, type, code))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  *	Send an ICMP frame.
  */
@@ -290,34 +317,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 			       struct flowi4 *fl4, int type, int code)
 {
 	struct dst_entry *dst = &rt->dst;
+	struct inet_peer *peer;
 	bool rc = true;
+	int vif;
 
-	if (type > NR_ICMP_TYPES)
-		goto out;
-
-	/* Don't limit PMTU discovery. */
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+	if (icmpv4_mask_allow(net, type, code))
 		goto out;
 
 	/* No rate limit on loopback */
 	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
 		goto out;
 
-	/* Limit if icmp type is enabled in ratemask. */
-	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
-		goto out;
-
-	rc = false;
-	if (icmp_global_allow()) {
-		int vif = l3mdev_master_ifindex(dst->dev);
-		struct inet_peer *peer;
-
-		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
-		rc = inet_peer_xrlim_allow(peer,
-					   net->ipv4.sysctl_icmp_ratelimit);
-		if (peer)
-			inet_putpeer(peer);
-	}
+	vif = l3mdev_master_ifindex(dst->dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+	if (peer)
+		inet_putpeer(peer);
 out:
 	return rc;
 }
@@ -396,6 +411,8 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct inet_sock *inet;
 	__be32 daddr, saddr;
 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
+	int type = icmp_param->data.icmph.type;
+	int code = icmp_param->data.icmph.code;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
@@ -405,6 +422,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 		return;
 	inet = inet_sk(sk);
 
+	/* global icmp_msgs_per_sec */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_unlock;
+
 	icmp_param->data.icmph.checksum = 0;
 
 	inet->tos = ip_hdr(skb)->tos;
@@ -433,8 +454,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		goto out_unlock;
-	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
-			       icmp_param->data.icmph.code))
+	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
@@ -650,7 +670,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		return;
+		goto out;
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_unlock;
 
 	/*
 	 *	Construct source address and options.
@@ -704,6 +728,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	if (IS_ERR(rt))
 		goto out_unlock;
 
+	/* peer icmp_ratelimit */
 	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		goto ende;
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 3036f665e6c8..b26ae8b5c1ce 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -168,6 +168,30 @@ static bool is_ineligible(const struct sk_buff *skb)
 	return false;
 }
 
+static bool icmpv6_mask_allow(int type)
+{
+	/* Informational messages are not limited. */
+	if (type & ICMPV6_INFOMSG_MASK)
+		return true;
+
+	/* Do not limit pmtu discovery, it would break it. */
+	if (type == ICMPV6_PKT_TOOBIG)
+		return true;
+
+	return false;
+}
+
+static bool icmpv6_global_allow(int type)
+{
+	if (icmpv6_mask_allow(type))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  * Check the ICMP output rate limit
  */
@@ -178,12 +202,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	struct dst_entry *dst;
 	bool res = false;
 
-	/* Informational messages are not limited. */
-	if (type & ICMPV6_INFOMSG_MASK)
-		return true;
-
-	/* Do not limit pmtu discovery, it would break it. */
-	if (type == ICMPV6_PKT_TOOBIG)
+	if (icmpv6_mask_allow(type))
 		return true;
 
 	/*
@@ -200,20 +219,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	} else {
 		struct rt6_info *rt = (struct rt6_info *)dst;
 		int tmo = net->ipv6.sysctl.icmpv6_time;
+		struct inet_peer *peer;
 
 		/* Give more bandwidth to wider prefixes. */
 		if (rt->rt6i_dst.plen < 128)
 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
 
-		if (icmp_global_allow()) {
-			struct inet_peer *peer;
-
-			peer = inet_getpeer_v6(net->ipv6.peers,
-					       &fl6->daddr, 1);
-			res = inet_peer_xrlim_allow(peer, tmo);
-			if (peer)
-				inet_putpeer(peer);
-		}
+		peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
+		res = inet_peer_xrlim_allow(peer, tmo);
+		if (peer)
+			inet_putpeer(peer);
 	}
 	dst_release(dst);
 	return res;
@@ -493,6 +508,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
 		return;
+
+	if (!icmpv6_global_allow(type))
+		goto out;
+
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [net-next PATCH 3/3] net: for rate-limited ICMP replies save one atomic operation
  2017-01-09 15:03 [net-next PATCH 0/3] net: optimize ICMP-reply code path Jesper Dangaard Brouer
  2017-01-09 15:04 ` [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack" Jesper Dangaard Brouer
  2017-01-09 15:04 ` [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited Jesper Dangaard Brouer
@ 2017-01-09 15:04 ` Jesper Dangaard Brouer
  2017-01-09 17:44   ` Eric Dumazet
  2017-01-09 17:43 ` [net-next PATCH 0/3] net: optimize ICMP-reply code path Cong Wang
  2017-01-09 20:49 ` David Miller
  4 siblings, 1 reply; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2017-01-09 15:04 UTC (permalink / raw)
  To: netdev, Eric Dumazet, Jesper Dangaard Brouer; +Cc: xiyou.wangcong

It is possible to avoid the atomic operation in icmp{v6,}_xmit_lock,
by checking the sysctl_icmp_msgs_per_sec ratelimit before these calls,
as pointed out by Eric Dumazet, but the BH disabled state must be correct.

The icmp_global_allow() call states it must be called with BH
disabled.  This protection was given by the calls icmp_xmit_lock and
icmpv6_xmit_lock.  Thus, split out local_bh_disable/enable from these
functions and maintain it explicitly at callers.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 net/ipv4/icmp.c |   34 +++++++++++++++++++++-------------
 net/ipv6/icmp.c |   25 ++++++++++++++++---------
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 58d75ca58b83..fc310db2708b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
 	return *this_cpu_ptr(net->ipv4.icmp_sk);
 }
 
+/* Called with BH disabled */
 static inline struct sock *icmp_xmit_lock(struct net *net)
 {
 	struct sock *sk;
 
-	local_bh_disable();
-
 	sk = icmp_sk(net);
 
 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
 		/* This can happen if the output path signals a
 		 * dst_link_failure() for an outgoing ICMP packet.
 		 */
-		local_bh_enable();
 		return NULL;
 	}
 	return sk;
@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
 
 static inline void icmp_xmit_unlock(struct sock *sk)
 {
-	spin_unlock_bh(&sk->sk_lock.slock);
+	spin_unlock(&sk->sk_lock.slock);
 }
 
 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -417,14 +415,17 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
 
-	sk = icmp_xmit_lock(net);
-	if (!sk)
-		return;
-	inet = inet_sk(sk);
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
 
 	/* global icmp_msgs_per_sec */
 	if (!icmpv4_global_allow(net, type, code))
-		goto out_unlock;
+		goto out_bh_enable;
+
+	sk = icmp_xmit_lock(net);
+	if (!sk)
+		goto out_bh_enable;
+	inet = inet_sk(sk);
 
 	icmp_param->data.icmph.checksum = 0;
 
@@ -459,6 +460,8 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -668,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 	}
 
-	sk = icmp_xmit_lock(net);
-	if (!sk)
-		goto out;
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
 
 	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
 	if (!icmpv4_global_allow(net, type, code))
-		goto out_unlock;
+		goto out_bh_enable;
+
+	sk = icmp_xmit_lock(net);
+	if (!sk)
+		goto out_bh_enable;
 
 	/*
 	 *	Construct source address and options.
@@ -750,6 +756,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 out:;
 }
 EXPORT_SYMBOL(icmp_send);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index b26ae8b5c1ce..230b5aac9f03 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = {
 	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
 };
 
+/* Called with BH disabled */
 static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
 {
 	struct sock *sk;
 
-	local_bh_disable();
-
 	sk = icmpv6_sk(net);
 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
 		/* This can happen if the output path (f.e. SIT or
 		 * ip6ip6 tunnel) signals dst_link_failure() for an
 		 * outgoing ICMP6 packet.
 		 */
-		local_bh_enable();
 		return NULL;
 	}
 	return sk;
@@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
 
 static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
 {
-	spin_unlock_bh(&sk->sk_lock.slock);
+	spin_unlock(&sk->sk_lock.slock);
 }
 
 /*
@@ -489,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 		return;
 	}
 
+	/* Needed by both icmp_global_allow and icmpv6_xmit_lock */
+	local_bh_disable();
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv6_global_allow(type))
+		goto out_bh_enable;
+
 	mip6_addr_swap(skb);
 
 	memset(&fl6, 0, sizeof(fl6));
@@ -507,10 +512,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
-		return;
-
-	if (!icmpv6_global_allow(type))
-		goto out;
+		goto out_bh_enable;
 
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
@@ -571,6 +573,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	dst_release(dst);
 out:
 	icmpv6_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 /* Slightly more convenient version of icmp6_send.
@@ -684,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	fl6.flowi6_uid = sock_net_uid(net, NULL);
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
 
+	local_bh_disable();
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
-		return;
+		goto out_bh_enable;
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 
@@ -728,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	dst_release(dst);
 out:
 	icmpv6_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 15:04 ` [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack" Jesper Dangaard Brouer
@ 2017-01-09 17:42   ` Cong Wang
  2017-01-09 17:50     ` Eric Dumazet
  2017-01-09 17:42   ` Eric Dumazet
  1 sibling, 1 reply; 33+ messages in thread
From: Cong Wang @ 2017-01-09 17:42 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Linux Kernel Network Developers, Eric Dumazet

On Mon, Jan 9, 2017 at 7:04 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
> This reverts commit 9a99d4a50cb8 ("icmp: avoid allocating large struct
> on stack"), because struct icmp_bxm no really a large struct, and
> allocating and free of this small 112 bytes hurts performance.

The original commit fixes a warning for large stack usage, icmp_send()
is deep in the call stack.

Your optimization for a slow path makes no sense to me.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 15:04 ` [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack" Jesper Dangaard Brouer
  2017-01-09 17:42   ` Cong Wang
@ 2017-01-09 17:42   ` Eric Dumazet
  1 sibling, 0 replies; 33+ messages in thread
From: Eric Dumazet @ 2017-01-09 17:42 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev, xiyou.wangcong

On Mon, 2017-01-09 at 16:04 +0100, Jesper Dangaard Brouer wrote:
> This reverts commit 9a99d4a50cb8 ("icmp: avoid allocating large struct
> on stack"), because struct icmp_bxm no really a large struct, and
> allocating and free of this small 112 bytes hurts performance.
> 
> Fixes: 9a99d4a50cb8 ("icmp: avoid allocating large struct on stack")
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 0/3] net: optimize ICMP-reply code path
  2017-01-09 15:03 [net-next PATCH 0/3] net: optimize ICMP-reply code path Jesper Dangaard Brouer
                   ` (2 preceding siblings ...)
  2017-01-09 15:04 ` [net-next PATCH 3/3] net: for rate-limited ICMP replies save one atomic operation Jesper Dangaard Brouer
@ 2017-01-09 17:43 ` Cong Wang
  2017-01-09 17:56   ` Eric Dumazet
  2017-01-09 20:49 ` David Miller
  4 siblings, 1 reply; 33+ messages in thread
From: Cong Wang @ 2017-01-09 17:43 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Linux Kernel Network Developers, Eric Dumazet

On Mon, Jan 9, 2017 at 7:03 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> Use-case: The specific case I experienced this being a bottleneck is,
> sending UDP packets to a port with no listener, which obviously result
> in kernel replying with ICMP Destination Unreachable (type:3), Port
> Unreachable (code:3), which cause the bottleneck.

Why this is a case we should care about for performance?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited
  2017-01-09 15:04 ` [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited Jesper Dangaard Brouer
@ 2017-01-09 17:44   ` Eric Dumazet
  2017-01-11 17:15     ` Eric Dumazet
  2017-06-04  7:11   ` Florian Weimer
  1 sibling, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2017-01-09 17:44 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev, xiyou.wangcong

On Mon, 2017-01-09 at 16:04 +0100, Jesper Dangaard Brouer wrote:
> This patch split the global and per (inet)peer ICMP-reply limiter
> code, and moves the global limit check to earlier in the packet
> processing path.  Thus, avoid spending cycles on ICMP replies that
> gets limited/suppressed anyhow.
> 
> The global ICMP rate limiter icmp_global_allow() is a good solution,
> it just happens too late in the process.  The kernel goes through the
> full route lookup (return path) for the ICMP message, before taking
> the rate limit decision of not sending the ICMP reply.
> 
> Details: The kernels global rate limiter for ICMP messages got added
> in commit 4cdf507d5452 ("icmp: add a global rate limitation").  It is
> a token bucket limiter with a global lock.  It brilliantly avoids
> locking congestion by only updating when 20ms (HZ/50) were elapsed. It
> can then avoids taking lock when credit is exhausted (when under
> pressure) and time constraint for refill is not yet meet.
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---


Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 3/3] net: for rate-limited ICMP replies save one atomic operation
  2017-01-09 15:04 ` [net-next PATCH 3/3] net: for rate-limited ICMP replies save one atomic operation Jesper Dangaard Brouer
@ 2017-01-09 17:44   ` Eric Dumazet
  0 siblings, 0 replies; 33+ messages in thread
From: Eric Dumazet @ 2017-01-09 17:44 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev, xiyou.wangcong

On Mon, 2017-01-09 at 16:04 +0100, Jesper Dangaard Brouer wrote:
> It is possible to avoid the atomic operation in icmp{v6,}_xmit_lock,
> by checking the sysctl_icmp_msgs_per_sec ratelimit before these calls,
> as pointed out by Eric Dumazet, but the BH disabled state must be correct.
> 
> The icmp_global_allow() call states it must be called with BH
> disabled.  This protection was given by the calls icmp_xmit_lock and
> icmpv6_xmit_lock.  Thus, split out local_bh_disable/enable from these
> functions and maintain it explicitly at callers.
> 
> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 17:42   ` Cong Wang
@ 2017-01-09 17:50     ` Eric Dumazet
  2017-01-09 17:59       ` Cong Wang
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2017-01-09 17:50 UTC (permalink / raw)
  To: Cong Wang; +Cc: Jesper Dangaard Brouer, Linux Kernel Network Developers

On Mon, 2017-01-09 at 09:42 -0800, Cong Wang wrote:
> On Mon, Jan 9, 2017 at 7:04 AM, Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
> > This reverts commit 9a99d4a50cb8 ("icmp: avoid allocating large struct
> > on stack"), because struct icmp_bxm no really a large struct, and
> > allocating and free of this small 112 bytes hurts performance.
> 
> The original commit fixes a warning for large stack usage, icmp_send()
> is deep in the call stack.
> 
> Your optimization for a slow path makes no sense to me.

Do you have the stack trace of this event ?

Even Linus allowed vmalloc() kernel stacks, while it certainly was an
heresy 10 years ago.

I doubt it makes a difference trying to save 104 bytes of kernel stack.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 0/3] net: optimize ICMP-reply code path
  2017-01-09 17:43 ` [net-next PATCH 0/3] net: optimize ICMP-reply code path Cong Wang
@ 2017-01-09 17:56   ` Eric Dumazet
  0 siblings, 0 replies; 33+ messages in thread
From: Eric Dumazet @ 2017-01-09 17:56 UTC (permalink / raw)
  To: Cong Wang; +Cc: Jesper Dangaard Brouer, Linux Kernel Network Developers

On Mon, 2017-01-09 at 09:43 -0800, Cong Wang wrote:
> On Mon, Jan 9, 2017 at 7:03 AM, Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
> >
> > Use-case: The specific case I experienced this being a bottleneck is,
> > sending UDP packets to a port with no listener, which obviously result
> > in kernel replying with ICMP Destination Unreachable (type:3), Port
> > Unreachable (code:3), which cause the bottleneck.
> 
> Why this is a case we should care about for performance?

This is called provisioning.

If you have a server farm that was qualified to handle 100 Mpps,
you want to absorb these 100 Mpps, even if the UDP server is restarted
in a clean or catastrophic mode.

The catastrophic mode would be the case that Jesper described : No UDP
socket is bound and ready to receive packets.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 17:50     ` Eric Dumazet
@ 2017-01-09 17:59       ` Cong Wang
  2017-01-09 18:07         ` Eric Dumazet
  2017-01-09 18:47         ` David Miller
  0 siblings, 2 replies; 33+ messages in thread
From: Cong Wang @ 2017-01-09 17:59 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, Linux Kernel Network Developers

On Mon, Jan 9, 2017 at 9:50 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2017-01-09 at 09:42 -0800, Cong Wang wrote:
>> On Mon, Jan 9, 2017 at 7:04 AM, Jesper Dangaard Brouer
>> <brouer@redhat.com> wrote:
>> > This reverts commit 9a99d4a50cb8 ("icmp: avoid allocating large struct
>> > on stack"), because struct icmp_bxm no really a large struct, and
>> > allocating and free of this small 112 bytes hurts performance.
>>
>> The original commit fixes a warning for large stack usage, icmp_send()
>> is deep in the call stack.
>>
>> Your optimization for a slow path makes no sense to me.
>
> Do you have the stack trace of this event ?
>
> Even Linus allowed vmalloc() kernel stacks, while it certainly was an
> heresy 10 years ago.
>
> I doubt it makes a difference trying to save 104 bytes of kernel stack.

I think you should have known this, quote from Eric Dumazet
(hopefully the same one):

    On Fri, 2013-05-31 at 22:22 -0700, Eric Dumazet wrote:

    Strange, I posted a patch like that some days ago.

which is from: https://patchwork.ozlabs.org/patch/248051/

Facepalm...

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 17:59       ` Cong Wang
@ 2017-01-09 18:07         ` Eric Dumazet
  2017-01-09 18:52           ` David Miller
                             ` (2 more replies)
  2017-01-09 18:47         ` David Miller
  1 sibling, 3 replies; 33+ messages in thread
From: Eric Dumazet @ 2017-01-09 18:07 UTC (permalink / raw)
  To: Cong Wang; +Cc: Jesper Dangaard Brouer, Linux Kernel Network Developers

On Mon, 2017-01-09 at 09:59 -0800, Cong Wang wrote:
> On Mon, Jan 9, 2017 at 9:50 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Mon, 2017-01-09 at 09:42 -0800, Cong Wang wrote:
> >> On Mon, Jan 9, 2017 at 7:04 AM, Jesper Dangaard Brouer
> >> <brouer@redhat.com> wrote:
> >> > This reverts commit 9a99d4a50cb8 ("icmp: avoid allocating large struct
> >> > on stack"), because struct icmp_bxm no really a large struct, and
> >> > allocating and free of this small 112 bytes hurts performance.
> >>
> >> The original commit fixes a warning for large stack usage, icmp_send()
> >> is deep in the call stack.
> >>
> >> Your optimization for a slow path makes no sense to me.
> >
> > Do you have the stack trace of this event ?
> >
> > Even Linus allowed vmalloc() kernel stacks, while it certainly was an
> > heresy 10 years ago.
> >
> > I doubt it makes a difference trying to save 104 bytes of kernel stack.
> 
> I think you should have known this, quote from Eric Dumazet
> (hopefully the same one):
> 
>     On Fri, 2013-05-31 at 22:22 -0700, Eric Dumazet wrote:
> 
>     Strange, I posted a patch like that some days ago.
> 
> which is from: https://patchwork.ozlabs.org/patch/248051/
> 
> Facepalm...


We are in 2017.  Whatever was said in 2013 is irrelevant.

You really should come to netdev conferences so that you understand
goals and efforts, instead of living in your cave.

Then you can slap me in the face, since this is obviously your desire.

Then, we will drink a beer and relax.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 17:59       ` Cong Wang
  2017-01-09 18:07         ` Eric Dumazet
@ 2017-01-09 18:47         ` David Miller
  1 sibling, 0 replies; 33+ messages in thread
From: David Miller @ 2017-01-09 18:47 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: eric.dumazet, brouer, netdev

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 9 Jan 2017 09:59:34 -0800

> Facepalm...

This is completely unnecessary.  Discuss facts and real issues, rather
than make emotional retorts.

Thank you.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 18:07         ` Eric Dumazet
@ 2017-01-09 18:52           ` David Miller
  2017-01-09 20:53             ` Jesper Dangaard Brouer
  2017-01-10 18:06             ` Cong Wang
  2017-01-09 19:33           ` Joe Perches
  2017-01-10 18:01           ` Cong Wang
  2 siblings, 2 replies; 33+ messages in thread
From: David Miller @ 2017-01-09 18:52 UTC (permalink / raw)
  To: eric.dumazet; +Cc: xiyou.wangcong, brouer, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 09 Jan 2017 10:07:04 -0800

> You really should come to netdev conferences so that you understand
> goals and efforts, instead of living in your cave.

I completely agree with Eric.

Cong we have a very serious problem with you exactly because you make
quite vicious emotional statements targetted at other developers
merely when they say something you disagree with.

This is completely unacceptable behavior, and you must stop doing
this, now.

And I am absolutely positive that if you had met us all in person at
netdev you would not be treating us like this.

So please do us a _HUGE_ favor, and leave your cave, and come to
an upcoming netdev.

Thank you.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 18:07         ` Eric Dumazet
  2017-01-09 18:52           ` David Miller
@ 2017-01-09 19:33           ` Joe Perches
  2017-01-10 18:01           ` Cong Wang
  2 siblings, 0 replies; 33+ messages in thread
From: Joe Perches @ 2017-01-09 19:33 UTC (permalink / raw)
  To: Eric Dumazet, Cong Wang
  Cc: Jesper Dangaard Brouer, Linux Kernel Network Developers

On Mon, 2017-01-09 at 10:07 -0800, Eric Dumazet wrote:
> On Mon, 2017-01-09 at 09:59 -0800, Cong Wang wrote:
[]
> > Facepalm...
[]
> We are in 2017.  Whatever was said in 2013 is irrelevant.

Unless you morph into the PEOTUS, as a
general statement, this is untrue.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 0/3] net: optimize ICMP-reply code path
  2017-01-09 15:03 [net-next PATCH 0/3] net: optimize ICMP-reply code path Jesper Dangaard Brouer
                   ` (3 preceding siblings ...)
  2017-01-09 17:43 ` [net-next PATCH 0/3] net: optimize ICMP-reply code path Cong Wang
@ 2017-01-09 20:49 ` David Miller
  4 siblings, 0 replies; 33+ messages in thread
From: David Miller @ 2017-01-09 20:49 UTC (permalink / raw)
  To: brouer; +Cc: netdev, eric.dumazet, xiyou.wangcong

From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 09 Jan 2017 16:03:59 +0100

> This patchset is optimizing the ICMP-reply code path, for ICMP packets
> that gets rate limited. A remote party can easily trigger this code
> path by sending packets to port number with no listening service.
> 
> Generally the patchset moves the sysctl_icmp_msgs_per_sec ratelimit
> checking to earlier in the code path and removes an allocation.
> 
> 
> Use-case: The specific case I experienced this being a bottleneck is,
> sending UDP packets to a port with no listener, which obviously result
> in kernel replying with ICMP Destination Unreachable (type:3), Port
> Unreachable (code:3), which cause the bottleneck.
> 
>  After Eric and Paolo optimized the UDP socket code, the kernels PPS
> processing capabilities is lower for no-listen ports, than normal UDP
> sockets.  This is bad for capacity planning when restarting a service.
> 
> UDP no-listen benchmark 8xCPUs using pktgen_sample04_many_flows.sh:
>  Baseline: 6.6 Mpps
>  Patch:   14.7 Mpps
> Driver mlx5 at 50Gbit/s.

Series applied, thanks Jesper!

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 18:52           ` David Miller
@ 2017-01-09 20:53             ` Jesper Dangaard Brouer
  2017-01-10 18:06             ` Cong Wang
  1 sibling, 0 replies; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2017-01-09 20:53 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, xiyou.wangcong, netdev, brouer

On Mon, 09 Jan 2017 13:52:59 -0500 (EST)
David Miller <davem@davemloft.net> wrote:

> From: Eric Dumazet <eric.dumazet@gmail.com> 
> Date: Mon, 09 Jan 2017 10:07:04 -0800
> 
> > You really should come to netdev conferences so that you understand
> > goals and efforts, instead of living in your cave.  
> 
> I completely agree with Eric.
> 
> Cong we have a very serious problem with you exactly because you make
> quite vicious emotional statements targetted at other developers
> merely when they say something you disagree with.
> 
> This is completely unacceptable behavior, and you must stop doing
> this, now.

I agree, and it is even documented in:
 Documentation/process/code-of-conflict.rst

Quote: "As a reviewer of code, please strive to keep things civil and
focused on the technical issues involved. [...]"

https://www.kernel.org/doc/html/latest/process/code-of-conflict.html

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 18:07         ` Eric Dumazet
  2017-01-09 18:52           ` David Miller
  2017-01-09 19:33           ` Joe Perches
@ 2017-01-10 18:01           ` Cong Wang
  2 siblings, 0 replies; 33+ messages in thread
From: Cong Wang @ 2017-01-10 18:01 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, Linux Kernel Network Developers

On Mon, Jan 9, 2017 at 10:07 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> We are in 2017.  Whatever was said in 2013 is irrelevant.
>
> You really should come to netdev conferences so that you understand
> goals and efforts, instead of living in your cave.
>
> Then you can slap me in the face, since this is obviously your desire.
>
> Then, we will drink a beer and relax.

LOL...

Eric, you really need to focus on the technical part, which is you
asked for a question you already knew the answer 4 years ago,
and now you are going to be against yourself.

/me don't contribute anything expect helping you to remember
what you said.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-09 18:52           ` David Miller
  2017-01-09 20:53             ` Jesper Dangaard Brouer
@ 2017-01-10 18:06             ` Cong Wang
  2017-01-10 18:12               ` David Miller
  1 sibling, 1 reply; 33+ messages in thread
From: Cong Wang @ 2017-01-10 18:06 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Jesper Dangaard Brouer, Linux Kernel Network Developers

On Mon, Jan 9, 2017 at 10:52 AM, David Miller <davem@davemloft.net> wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 09 Jan 2017 10:07:04 -0800
>
>> You really should come to netdev conferences so that you understand
>> goals and efforts, instead of living in your cave.
>
> I completely agree with Eric.
>
> Cong we have a very serious problem with you exactly because you make
> quite vicious emotional statements targetted at other developers
> merely when they say something you disagree with.

What emotional? Pointing out Eris's words from 4 years ago is NOT
emotional, it is just a help.


>
> This is completely unacceptable behavior, and you must stop doing
> this, now.
>
> And I am absolutely positive that if you had met us all in person at
> netdev you would not be treating us like this.

This is not an invitation from any point of view, David.

>
> So please do us a _HUGE_ favor, and leave your cave, and come to
> an upcoming netdev.
>

Not everyone is as free to travel to Canada without a visa as you,
unfortunately.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-10 18:06             ` Cong Wang
@ 2017-01-10 18:12               ` David Miller
  2017-01-10 18:44                 ` Cong Wang
  2017-01-10 21:41                 ` Joe Perches
  0 siblings, 2 replies; 33+ messages in thread
From: David Miller @ 2017-01-10 18:12 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: eric.dumazet, brouer, netdev

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 10 Jan 2017 10:06:01 -0800

> On Mon, Jan 9, 2017 at 10:52 AM, David Miller <davem@davemloft.net> wrote:
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> Date: Mon, 09 Jan 2017 10:07:04 -0800
>>
>>> You really should come to netdev conferences so that you understand
>>> goals and efforts, instead of living in your cave.
>>
>> I completely agree with Eric.
>>
>> Cong we have a very serious problem with you exactly because you make
>> quite vicious emotional statements targetted at other developers
>> merely when they say something you disagree with.
> 
> What emotional? Pointing out Eris's words from 4 years ago is NOT
> emotional, it is just a help.

Saying "Facepalm" is emotional and has nothing to do with the
technical issues.

You can keep showing us how expertly you can deflect the real
issue we are discussion here, but that won't improve the situation
at all I am afraid.

> Not everyone is as free to travel to Canada without a visa as you,
> unfortunately.

We hold netdev in other countries all over the world, stop making
excuses.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-10 18:12               ` David Miller
@ 2017-01-10 18:44                 ` Cong Wang
  2017-01-10 18:48                   ` Cong Wang
                                     ` (2 more replies)
  2017-01-10 21:41                 ` Joe Perches
  1 sibling, 3 replies; 33+ messages in thread
From: Cong Wang @ 2017-01-10 18:44 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Jesper Dangaard Brouer, Linux Kernel Network Developers

On Tue, Jan 10, 2017 at 10:12 AM, David Miller <davem@davemloft.net> wrote:
> From: Cong Wang <xiyou.wangcong@gmail.com>
> Date: Tue, 10 Jan 2017 10:06:01 -0800
>
>> On Mon, Jan 9, 2017 at 10:52 AM, David Miller <davem@davemloft.net> wrote:
>>> From: Eric Dumazet <eric.dumazet@gmail.com>
>>> Date: Mon, 09 Jan 2017 10:07:04 -0800
>>>
>>>> You really should come to netdev conferences so that you understand
>>>> goals and efforts, instead of living in your cave.
>>>
>>> I completely agree with Eric.
>>>
>>> Cong we have a very serious problem with you exactly because you make
>>> quite vicious emotional statements targetted at other developers
>>> merely when they say something you disagree with.
>>
>> What emotional? Pointing out Eris's words from 4 years ago is NOT
>> emotional, it is just a help.
>
> Saying "Facepalm" is emotional and has nothing to do with the
> technical issues.

Facepalm is heavily used on Internet:
https://en.wikipedia.org/wiki/Facepalm#Internet_usage

it is how people including me react to disappointing. It is those
who only see facepalm make it irrelevant to the technical discussion
in this thread.

>
> You can keep showing us how expertly you can deflect the real
> issue we are discussion here, but that won't improve the situation
> at all I am afraid.

Of course, there are just too many people too lazy to do a google search:

https://lists.debian.org/debian-kernel/2013/05/msg00500.html


>
>> Not everyone is as free to travel to Canada without a visa as you,
>> unfortunately.
>
> We hold netdev in other countries all over the world, stop making
> excuses.

The only countries you hold netdev are Canada, Japan and Spain
(to my knowledge). If you check:

https://en.wikipedia.org/wiki/Visa_requirements_for_Chinese_citizens#Visa_requirements

It is very easy to find out if it is an excuse or a fact.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-10 18:44                 ` Cong Wang
@ 2017-01-10 18:48                   ` Cong Wang
  2017-01-10 18:54                   ` David Miller
  2017-01-10 20:08                   ` Jesper Dangaard Brouer
  2 siblings, 0 replies; 33+ messages in thread
From: Cong Wang @ 2017-01-10 18:48 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Jesper Dangaard Brouer, Linux Kernel Network Developers

On Tue, Jan 10, 2017 at 10:44 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> Of course, there are just too many people too lazy to do a google search:
>
> https://lists.debian.org/debian-kernel/2013/05/msg00500.html
>

One more:
https://communities.intel.com/thread/28935 (<-- search icmp_send)

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-10 18:44                 ` Cong Wang
  2017-01-10 18:48                   ` Cong Wang
@ 2017-01-10 18:54                   ` David Miller
  2017-01-12 22:46                     ` Cong Wang
  2017-01-10 20:08                   ` Jesper Dangaard Brouer
  2 siblings, 1 reply; 33+ messages in thread
From: David Miller @ 2017-01-10 18:54 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: eric.dumazet, brouer, netdev

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 10 Jan 2017 10:44:59 -0800

> On Tue, Jan 10, 2017 at 10:12 AM, David Miller <davem@davemloft.net> wrote:
> Facepalm is heavily used on Internet:
> https://en.wikipedia.org/wiki/Facepalm#Internet_usage
> 
> it is how people including me react to disappointing.

Everyone here, except you, believe that this is rude behavior.

Keep coming up with your own reasons why you are special and
can behave this way.

> The only countries you hold netdev are Canada, Japan and Spain
> (to my knowledge). If you check:
> 
> https://en.wikipedia.org/wiki/Visa_requirements_for_Chinese_citizens#Visa_requirements
> 
> It is very easy to find out if it is an excuse or a fact.

The conference explciitly offers VISA help for anyone who needs it.
Many people come to the conference on a VISA we helped them obtain.

It is not hard to do.  You have put in exactly zero effort in trying
to solve this problem.  If you had simply contacted the conference
organizers asking for help, you would have gotten all the help you
needed.

In fact we explicitly asked you to attend, and you turned us down.

So don't say that every effort hasn't been made to get you into the
conference.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-10 18:44                 ` Cong Wang
  2017-01-10 18:48                   ` Cong Wang
  2017-01-10 18:54                   ` David Miller
@ 2017-01-10 20:08                   ` Jesper Dangaard Brouer
  2017-01-10 21:48                     ` Eric Dumazet
  2 siblings, 1 reply; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2017-01-10 20:08 UTC (permalink / raw)
  To: Cong Wang
  Cc: David Miller, Eric Dumazet, Linux Kernel Network Developers, brouer


On Tue, 10 Jan 2017 10:44:59 -0800 Cong Wang <xiyou.wangcong@gmail.com> wrote:

> On Tue, Jan 10, 2017 at 10:12 AM, David Miller <davem@davemloft.net> wrote:
[...]
> > You can keep showing us how expertly you can deflect the real
> > issue we are discussion here, but that won't improve the situation
> > at all I am afraid.  
> 
> Of course, there are just too many people too lazy to do a google search:
> 
> https://lists.debian.org/debian-kernel/2013/05/msg00500.html

My analysis of the problem shown in above link is not related to using
all the stack space, but instead that skb->cb was not cleared.  This
can cause the ip_options_echo() call in icmp_send() to access garbage
as this is: __ip_options_echo(dopt, skb, &IPCB(skb)->opt).

Fixed by commit a622260254ee ("ip_tunnel: fix kernel panic with icmp_dest_unreach")
 https://git.kernel.org/torvalds/c/a622260254ee

Thus, it is (likely) the __ip_options_echo() call that violates stack
access, as it is passed in a pointer to the stack, and advance this
based on garbage "optlen".

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-10 18:12               ` David Miller
  2017-01-10 18:44                 ` Cong Wang
@ 2017-01-10 21:41                 ` Joe Perches
  1 sibling, 0 replies; 33+ messages in thread
From: Joe Perches @ 2017-01-10 21:41 UTC (permalink / raw)
  To: David Miller, xiyou.wangcong; +Cc: eric.dumazet, brouer, netdev

On Tue, 2017-01-10 at 13:12 -0500, David Miller wrote:
> From: Cong Wang <xiyou.wangcong@gmail.com>
> Date: Tue, 10 Jan 2017 10:06:01 -0800
> 
> > On Mon, Jan 9, 2017 at 10:52 AM, David Miller <davem@davemloft.net> wrote:
> >> From: Eric Dumazet <eric.dumazet@gmail.com>
> >> Date: Mon, 09 Jan 2017 10:07:04 -0800
> >>
> >>> You really should come to netdev conferences so that you understand
> >>> goals and efforts, instead of living in your cave.
> >>
> >> I completely agree with Eric.
> >>
> >> Cong we have a very serious problem with you exactly because you make
> >> quite vicious emotional statements targetted at other developers

"quite vicious" is overstatement here.

> >> merely when they say something you disagree with.
> > 
> > What emotional? Pointing out Eris's words from 4 years ago is NOT
> > emotional, it is just a help.

Not really.  Your "facepalm" didn't seem to be written as
humor but more as an expression of your exasperation.

> Saying "Facepalm" is emotional and has nothing to do with the
> technical issues.

Many words are emotional or conflictual.

Saying "stupid" is emotional and also has no technical content.

Stupid is used here all the time.  Happily, it's generally used
as a reference to self and not about others.

However it is still occasionally and unfortunately used when
referencing proposed code or other person's behaviors.

Thankfully, these types of words are being used less and less
in discussions here.

I agree with David and Eric that it's generally better to avoid
these word choices.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-10 20:08                   ` Jesper Dangaard Brouer
@ 2017-01-10 21:48                     ` Eric Dumazet
  2017-01-12 22:21                       ` Cong Wang
  0 siblings, 1 reply; 33+ messages in thread
From: Eric Dumazet @ 2017-01-10 21:48 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Cong Wang, David Miller, Linux Kernel Network Developers

On Tue, 2017-01-10 at 21:08 +0100, Jesper Dangaard Brouer wrote:
> On Tue, 10 Jan 2017 10:44:59 -0800 Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> > On Tue, Jan 10, 2017 at 10:12 AM, David Miller <davem@davemloft.net> wrote:
> [...]
> > > You can keep showing us how expertly you can deflect the real
> > > issue we are discussion here, but that won't improve the situation
> > > at all I am afraid.  
> > 
> > Of course, there are just too many people too lazy to do a google search:
> > 
> > https://lists.debian.org/debian-kernel/2013/05/msg00500.html
> 
> My analysis of the problem shown in above link is not related to using
> all the stack space, but instead that skb->cb was not cleared.  This
> can cause the ip_options_echo() call in icmp_send() to access garbage
> as this is: __ip_options_echo(dopt, skb, &IPCB(skb)->opt).
> 
> Fixed by commit a622260254ee ("ip_tunnel: fix kernel panic with icmp_dest_unreach")
>  https://git.kernel.org/torvalds/c/a622260254ee
> 
> Thus, it is (likely) the __ip_options_echo() call that violates stack
> access, as it is passed in a pointer to the stack, and advance this
> based on garbage "optlen".
> 

I totally agree.

This can not be stack being too small in current kernels.

> #0 [ffff88003fd03798] machine_kexec at ffffffff81027430
> #1 [ffff88003fd037e8] crash_kexec at ffffffff8107da80
> #2 [ffff88003fd038b8] panic at ffffffff81540026
> #3 [ffff88003fd03938] __stack_chk_fail at ffffffff81037f77
> #4 [ffff88003fd03948] icmp_send at ffffffff814d5fec
> #5 [ffff88003fd03b78] dev_hard_start_xmit at ffffffff8146e032
> #6 [ffff88003fd03bc8] sch_direct_xmit at ffffffff81487d66
> #7 [ffff88003fd03c08] __qdisc_run at ffffffff81487efd
> #8 [ffff88003fd03c48] dev_queue_xmit at ffffffff8146e5a7
> #9 [ffff88003fd03c88] ip_finish_output at ffffffff814ab596
> #10 [ffff88003fd03ce8] __netif_receive_skb at ffffffff8146ed13
> #11 [ffff88003fd03d88] napi_gro_receive at ffffffff8146fc50
> #12 [ffff88003fd03da8] e1000_clean_rx_irq at ffffffff813bc67b
> #13 [ffff88003fd03e48] e1000e_poll at ffffffff813c3a20
> #14 [ffff88003fd03e98] net_rx_action at ffffffff8146f796
> #15 [ffff88003fd03ee8] __do_softirq at ffffffff8103ebb9
> #16 [ffff88003fd03f38] call_softirq at ffffffff8154444c
> #17 [ffff88003fd03f50] do_softirq at ffffffff810047dd
> #18 [ffff88003fd03f80] do_IRQ at ffffffff81003f6c

Total stack used is about 3FFF - 3938, which is less than 2KB.

x86_64 is supposed to have at least 16 KB irq stacks.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited
  2017-01-09 17:44   ` Eric Dumazet
@ 2017-01-11 17:15     ` Eric Dumazet
  0 siblings, 0 replies; 33+ messages in thread
From: Eric Dumazet @ 2017-01-11 17:15 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev, xiyou.wangcong

On Mon, 2017-01-09 at 09:44 -0800, Eric Dumazet wrote:
> On Mon, 2017-01-09 at 16:04 +0100, Jesper Dangaard Brouer wrote:
> > This patch split the global and per (inet)peer ICMP-reply limiter
> > code, and moves the global limit check to earlier in the packet
> > processing path.  Thus, avoid spending cycles on ICMP replies that
> > gets limited/suppressed anyhow.
> > 
> > The global ICMP rate limiter icmp_global_allow() is a good solution,
> > it just happens too late in the process.  The kernel goes through the
> > full route lookup (return path) for the ICMP message, before taking
> > the rate limit decision of not sending the ICMP reply.
> > 
> > Details: The kernels global rate limiter for ICMP messages got added
> > in commit 4cdf507d5452 ("icmp: add a global rate limitation").  It is
> > a token bucket limiter with a global lock.  It brilliantly avoids
> > locking congestion by only updating when 20ms (HZ/50) were elapsed. It
> > can then avoids taking lock when credit is exhausted (when under
> > pressure) and time constraint for refill is not yet meet.
> > 
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > ---
> 
> 
> Acked-by: Eric Dumazet <edumazet@google.com>

Remaining problem :

A moderate load (1000 packets per second) of UDP packets from a rogue
source (not even spoofing source IP) to a closed port will consume all
the (global) budget, even if the per destination budget allows one ICMP
per second.

Meaning that single UDP message sent by other sources are not able to
get an ICMP in response.

This makes ICMP much less useful (unlikely to be sent by a host)

In my commit (4cdf507d5452 : icmp: add a global rate limitation) I gave
this hint :

<quote>
Note that if we really want to send millions of ICMP messages per
second, we might extend idea and infra added in commit 04ca6973f7c1a
("ip: make IP identifiers less predictable") :
add a token bucket in the ip_idents hash and no longer rely on inetpeer.
</quote>

The idea would be to use a hash table to quickly filter elephant flows,
preventing them from stealing all the global ICMP credits.

Or if you prefer, no longer use control variables stored in inetpeer.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-10 21:48                     ` Eric Dumazet
@ 2017-01-12 22:21                       ` Cong Wang
  0 siblings, 0 replies; 33+ messages in thread
From: Cong Wang @ 2017-01-12 22:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jesper Dangaard Brouer, David Miller, Linux Kernel Network Developers

On Tue, Jan 10, 2017 at 1:48 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2017-01-10 at 21:08 +0100, Jesper Dangaard Brouer wrote:
>> On Tue, 10 Jan 2017 10:44:59 -0800 Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>
>> > On Tue, Jan 10, 2017 at 10:12 AM, David Miller <davem@davemloft.net> wrote:
>> [...]
>> > > You can keep showing us how expertly you can deflect the real
>> > > issue we are discussion here, but that won't improve the situation
>> > > at all I am afraid.
>> >
>> > Of course, there are just too many people too lazy to do a google search:
>> >
>> > https://lists.debian.org/debian-kernel/2013/05/msg00500.html
>>
>> My analysis of the problem shown in above link is not related to using
>> all the stack space, but instead that skb->cb was not cleared.  This
>> can cause the ip_options_echo() call in icmp_send() to access garbage
>> as this is: __ip_options_echo(dopt, skb, &IPCB(skb)->opt).
>>
>> Fixed by commit a622260254ee ("ip_tunnel: fix kernel panic with icmp_dest_unreach")
>>  https://git.kernel.org/torvalds/c/a622260254ee
>>
>> Thus, it is (likely) the __ip_options_echo() call that violates stack
>> access, as it is passed in a pointer to the stack, and advance this
>> based on garbage "optlen".
>>
>
> I totally agree.

I can't agree, iptunnel or ipgre symbols are not in the above stack trace
at all. Although I do agree that the above stack usage is not aggressive,
especially when compared with the other I sent.

My vague memory told me the original problem I fixed is related to vxlan
but after trying to search all netdev archives in 2013 May/Jun, I still can't
find it, perhaps it was reported to LKML or somewhere else rather than
netdev. It was certainly a real problem.

Even though the irq stack is 16K, but it is too easy to stack netdevices
and stack qdisc's too, so for TX path I am not surprised at all if 16K could
be exhausted eventually. Yeah, it is hard to blame one of them in the call
chain, but 112 bytes _alone_ are aggressive for such a function deeply
in the call stack. That's my whole point.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"
  2017-01-10 18:54                   ` David Miller
@ 2017-01-12 22:46                     ` Cong Wang
  0 siblings, 0 replies; 33+ messages in thread
From: Cong Wang @ 2017-01-12 22:46 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Jesper Dangaard Brouer, Linux Kernel Network Developers

On Tue, Jan 10, 2017 at 10:54 AM, David Miller <davem@davemloft.net> wrote:
> From: Cong Wang <xiyou.wangcong@gmail.com>
> Date: Tue, 10 Jan 2017 10:44:59 -0800
>> The only countries you hold netdev are Canada, Japan and Spain
>> (to my knowledge). If you check:
>>
>> https://en.wikipedia.org/wiki/Visa_requirements_for_Chinese_citizens#Visa_requirements
>>
>> It is very easy to find out if it is an excuse or a fact.
>
> The conference explciitly offers VISA help for anyone who needs it.
> Many people come to the conference on a VISA we helped them obtain.

I never complain about visa sponsorship or money, this is the last
problem for me to consider.

>
> It is not hard to do.  You have put in exactly zero effort in trying
> to solve this problem.  If you had simply contacted the conference
> organizers asking for help, you would have gotten all the help you
> needed.

If obtaining a visa were as easy as obtaining a sponsorship, I would
go as many conferences as I know. But the fact is there are already
too many hassles to obtain even _one_ visa, not to mention I need
to obtain _two_ visas to go to Canada (or Japan, Spain) and return
to US. I'd be very happy to attend it if it were held in US, but this never
happens.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited
  2017-01-09 15:04 ` [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited Jesper Dangaard Brouer
  2017-01-09 17:44   ` Eric Dumazet
@ 2017-06-04  7:11   ` Florian Weimer
  2017-06-04 14:38     ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 33+ messages in thread
From: Florian Weimer @ 2017-06-04  7:11 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev, Eric Dumazet; +Cc: xiyou.wangcong

[-- Attachment #1: Type: text/plain, Size: 1845 bytes --]

On 01/09/2017 04:04 PM, Jesper Dangaard Brouer wrote:
> This patch split the global and per (inet)peer ICMP-reply limiter
> code, and moves the global limit check to earlier in the packet
> processing path.  Thus, avoid spending cycles on ICMP replies that
> gets limited/suppressed anyhow.
> 
> The global ICMP rate limiter icmp_global_allow() is a good solution,
> it just happens too late in the process.  The kernel goes through the
> full route lookup (return path) for the ICMP message, before taking
> the rate limit decision of not sending the ICMP reply.
> 
> Details: The kernels global rate limiter for ICMP messages got added
> in commit 4cdf507d5452 ("icmp: add a global rate limitation").  It is
> a token bucket limiter with a global lock.  It brilliantly avoids
> locking congestion by only updating when 20ms (HZ/50) were elapsed. It
> can then avoids taking lock when credit is exhausted (when under
> pressure) and time constraint for refill is not yet meet.

This patch removed the rate limit bypass for localhost.  As a result, it
is impossible to write deterministic UDP client tests tests which
exercise failover behavior in response to unreachable servers.

H.J. Lu noted that a glibc test started failing on kernel 4.11 and
identified the regression:

  https://sourceware.org/ml/libc-alpha/2017-06/msg00167.html

(I have more tests which are afflicted by this, but are not yet in glibc
upstream.)

This is particularly annoying because we already run such tests in a
network namespace for isolation, but the rate limit counter is global,
so that doesn't help here.

I'm attaching a self-contained test case.  It fails for me with:

localhost-icmp: iteration 50: no ICMP message (poll timeout)

On kernel 4.10, it passes and runs within just a few milliseconds.

Would you please fix this in some way?  Thanks.

Florian

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: localhost-icmp.c --]
[-- Type: text/x-csrc; name="localhost-icmp.c", Size: 1728 bytes --]

#include <arpa/inet.h>
#include <err.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <unistd.h>

/* How many UDP packets to send to a non-responding part.  */
enum { ITERATIONS = 1000 };

int
main (void)
{
  /* Pick a port number which is likely unused.  */
  unsigned short port;
  {
    int sock = socket (AF_INET, SOCK_DGRAM, 0);
    if (sock < 0)
      err (1, "socket");
    struct sockaddr_in sin = { .sin_family = AF_INET };
    if (bind (sock, (struct sockaddr *) &sin, sizeof (sin)) < 0)
      err (1, "bind");
    socklen_t sinlen = sizeof (sin);
    if (getsockname (sock, (struct sockaddr *) &sin, &sinlen))
      err (1, "getsockname");
    if (sinlen != sizeof (sin) || sin.sin_family != AF_INET)
      errx (1, "wrong address information for socket");
    if (close (sock) < 0)
      err (1, "close");
    port = sin.sin_port;
  }

  for (int i = 0; i < ITERATIONS; ++i)
    {
      int sock = socket (AF_INET, SOCK_DGRAM, 0);
      if (sock < 0)
        err (1, "socket");
      struct sockaddr_in sin =
        {
          .sin_family = AF_INET,
          .sin_addr = { ntohl (INADDR_LOOPBACK) },
          .sin_port = port,
        };
      if (connect (sock, (struct sockaddr *) &sin, sizeof (sin)) < 0)
        err (1, "connect");
      if (sendto (sock, "", 1, 0, NULL, 0) < 0)
        err (1, "sendto");
      struct pollfd fd = { .fd = sock, .events = POLLIN };
      int ret = poll (&fd, 1, 5000);
      if (ret < 0)
        err (1, "poll");
      if (ret == 0)
        errx (1, "iteration %d: no ICMP message (poll timeout)", i);
      if (close (sock) < 0)
        err (1, "close");
    }
}

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited
  2017-06-04  7:11   ` Florian Weimer
@ 2017-06-04 14:38     ` Jesper Dangaard Brouer
  2017-06-05 14:22       ` Florian Weimer
  0 siblings, 1 reply; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2017-06-04 14:38 UTC (permalink / raw)
  To: Florian Weimer; +Cc: netdev, Eric Dumazet, xiyou.wangcong, brouer

On Sun, 4 Jun 2017 09:11:53 +0200
Florian Weimer <fweimer@redhat.com> wrote:

> On 01/09/2017 04:04 PM, Jesper Dangaard Brouer wrote:
>
> > This patch split the global and per (inet)peer ICMP-reply limiter
> > code, and moves the global limit check to earlier in the packet
> > processing path.  Thus, avoid spending cycles on ICMP replies that
> > gets limited/suppressed anyhow.
> > 
> > The global ICMP rate limiter icmp_global_allow() is a good solution,
> > it just happens too late in the process.  The kernel goes through the
> > full route lookup (return path) for the ICMP message, before taking
> > the rate limit decision of not sending the ICMP reply.
> > 
> > Details: The kernels global rate limiter for ICMP messages got added
> > in commit 4cdf507d5452 ("icmp: add a global rate limitation").  It is
> > a token bucket limiter with a global lock.  It brilliantly avoids
> > locking congestion by only updating when 20ms (HZ/50) were elapsed. It
> > can then avoids taking lock when credit is exhausted (when under
> > pressure) and time constraint for refill is not yet meet.  
> 
> This patch removed the rate limit bypass for localhost.  As a result, it
> is impossible to write deterministic UDP client tests tests which
> exercise failover behavior in response to unreachable servers.

You cannot rely on ICMP responses delivery, too many systems (and
middleboxes) limit or drop ICMP. Before this patch, loopback dev was
explicitly excluded from being ICMP rate limited.  Thus, your localhost
test passed.

Is there a real use-case behind "failover behavior in response to
unreachable servers" (which would need to run on localhost)?


Adding back outgoing-dev loopback test will require a full
route-lookup, which is what the hole optimization gain[1] comes from.
[1] https://git.kernel.org/torvalds/c/9f2f27a9a518

I've tried to come-up with an alternative solution, see inlined patch
below...

 
> H.J. Lu noted that a glibc test started failing on kernel 4.11 and
> identified the regression:
> 
>   https://sourceware.org/ml/libc-alpha/2017-06/msg00167.html
> 
> (I have more tests which are afflicted by this, but are not yet in glibc
> upstream.)
> 
> This is particularly annoying because we already run such tests in a
> network namespace for isolation, but the rate limit counter is global,
> so that doesn't help here.
> 
> I'm attaching a self-contained test case.  It fails for me with:
> 
> localhost-icmp: iteration 50: no ICMP message (poll timeout)
> 
> On kernel 4.10, it passes and runs within just a few milliseconds.
> 
> Would you please fix this in some way?  Thanks.

I did a quick patch that fixes the problem... at least for your
reproducer test-vase.


[PATCH] net: don't global ICMP rate limit packets originating from loopback

From: Jesper Dangaard Brouer <brouer@redhat.com>

Florian Weimer seems to have a use-case that requires, that loopback
interfaces does not get ICMP ratelimited.  This was broken by commit
c0303efeab73 ("net: reduce cycles spend on ICMP replies that gets rate limited").

An ICMP response will usually be routed back-out the same incomming
interface.  Thus, take advantage of this and skip global ICMP ratelimit
when the incomming device is loopback.

This seems to fix the reproducer given by Florian.

Fixes: c0303efeab73 ("net: reduce cycles spend on ICMP replies that gets rate limited")
Reported-by: Florian Weimer <fweimer@redhat.com>
Reported-by: "H.J. Lu" <hjl.tools@gmail.com>
---
 net/ipv4/icmp.c |    9 +++++++--
 net/ipv6/icmp.c |    2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 43318b5f5647..6d781e470678 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -419,6 +419,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	local_bh_disable();
 
 	/* global icmp_msgs_per_sec */
+	// XXX This is not the problem case getting hit ... see icmp_send
 	if (!icmpv4_global_allow(net, type, code))
 		goto out_bh_enable;
 
@@ -657,8 +658,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	/* Needed by both icmp_global_allow and icmp_xmit_lock */
 	local_bh_disable();
 
-	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
-	if (!icmpv4_global_allow(net, type, code))
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit, but only
+	 * when incomming dev is not loopback.  If outgoing dev is not
+	 * loopback then it will be peer ratelimited (icmpv4_xrlim_allow)
+	 */
+	if (!(rt->dst.dev && (rt->dst.dev->flags&IFF_LOOPBACK)) &&
+	      !icmpv4_global_allow(net, type, code))
 		goto out_bh_enable;
 
 	sk = icmp_xmit_lock(net);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 230b5aac9f03..8d7b113958b1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -491,7 +491,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	local_bh_disable();
 
 	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
-	if (!icmpv6_global_allow(type))
+	if (!(skb->dev->flags&IFF_LOOPBACK) && !icmpv6_global_allow(type))
 		goto out_bh_enable;
 
 	mip6_addr_swap(skb);

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited
  2017-06-04 14:38     ` Jesper Dangaard Brouer
@ 2017-06-05 14:22       ` Florian Weimer
  0 siblings, 0 replies; 33+ messages in thread
From: Florian Weimer @ 2017-06-05 14:22 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev, Eric Dumazet, xiyou.wangcong

On 06/04/2017 04:38 PM, Jesper Dangaard Brouer wrote:
> On Sun, 4 Jun 2017 09:11:53 +0200
> Florian Weimer <fweimer@redhat.com> wrote:
> 
>> On 01/09/2017 04:04 PM, Jesper Dangaard Brouer wrote:
>>
>>> This patch split the global and per (inet)peer ICMP-reply limiter
>>> code, and moves the global limit check to earlier in the packet
>>> processing path.  Thus, avoid spending cycles on ICMP replies that
>>> gets limited/suppressed anyhow.
>>>
>>> The global ICMP rate limiter icmp_global_allow() is a good solution,
>>> it just happens too late in the process.  The kernel goes through the
>>> full route lookup (return path) for the ICMP message, before taking
>>> the rate limit decision of not sending the ICMP reply.
>>>
>>> Details: The kernels global rate limiter for ICMP messages got added
>>> in commit 4cdf507d5452 ("icmp: add a global rate limitation").  It is
>>> a token bucket limiter with a global lock.  It brilliantly avoids
>>> locking congestion by only updating when 20ms (HZ/50) were elapsed. It
>>> can then avoids taking lock when credit is exhausted (when under
>>> pressure) and time constraint for refill is not yet meet.  
>>
>> This patch removed the rate limit bypass for localhost.  As a result, it
>> is impossible to write deterministic UDP client tests tests which
>> exercise failover behavior in response to unreachable servers.
> 
> You cannot rely on ICMP responses delivery, too many systems (and
> middleboxes) limit or drop ICMP. Before this patch, loopback dev was
> explicitly excluded from being ICMP rate limited.  Thus, your localhost
> test passed.

Yes, I know that.  But there's a difference between failing a UDP query
immediately and waiting for the timeout to happen.  ICMP responses are
really helpful for that, even though they cannot be relied upon.

> Is there a real use-case behind "failover behavior in response to
> unreachable servers" (which would need to run on localhost)?

It's also relevant during boot when local UDP services are not running.
There, waiting for the timeout can delay the boot process.

It used to be relevant for switching to backup UDP-based servers (such
as name servers), but it seems the Linux kernel has not generated ICMP
messages at a sufficient rate to facilitate that long before the recent
changes.  And of course, it only covers a subset of the failure
scenarios (and arguably only a small subset of them).

In any case, we need a working way to test clients which have ICMP-based
failure detection, and we can't do that if the kernel sends them only
once in a while.

> Adding back outgoing-dev loopback test will require a full
> route-lookup, which is what the hole optimization gain[1] comes from.
> [1] https://git.kernel.org/torvalds/c/9f2f27a9a518
> 
> I've tried to come-up with an alternative solution, see inlined patch
> below...

Looking at the incoming interface doesn't seem unreasonable here.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2017-06-05 14:22 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-09 15:03 [net-next PATCH 0/3] net: optimize ICMP-reply code path Jesper Dangaard Brouer
2017-01-09 15:04 ` [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack" Jesper Dangaard Brouer
2017-01-09 17:42   ` Cong Wang
2017-01-09 17:50     ` Eric Dumazet
2017-01-09 17:59       ` Cong Wang
2017-01-09 18:07         ` Eric Dumazet
2017-01-09 18:52           ` David Miller
2017-01-09 20:53             ` Jesper Dangaard Brouer
2017-01-10 18:06             ` Cong Wang
2017-01-10 18:12               ` David Miller
2017-01-10 18:44                 ` Cong Wang
2017-01-10 18:48                   ` Cong Wang
2017-01-10 18:54                   ` David Miller
2017-01-12 22:46                     ` Cong Wang
2017-01-10 20:08                   ` Jesper Dangaard Brouer
2017-01-10 21:48                     ` Eric Dumazet
2017-01-12 22:21                       ` Cong Wang
2017-01-10 21:41                 ` Joe Perches
2017-01-09 19:33           ` Joe Perches
2017-01-10 18:01           ` Cong Wang
2017-01-09 18:47         ` David Miller
2017-01-09 17:42   ` Eric Dumazet
2017-01-09 15:04 ` [net-next PATCH 2/3] net: reduce cycles spend on ICMP replies that gets rate limited Jesper Dangaard Brouer
2017-01-09 17:44   ` Eric Dumazet
2017-01-11 17:15     ` Eric Dumazet
2017-06-04  7:11   ` Florian Weimer
2017-06-04 14:38     ` Jesper Dangaard Brouer
2017-06-05 14:22       ` Florian Weimer
2017-01-09 15:04 ` [net-next PATCH 3/3] net: for rate-limited ICMP replies save one atomic operation Jesper Dangaard Brouer
2017-01-09 17:44   ` Eric Dumazet
2017-01-09 17:43 ` [net-next PATCH 0/3] net: optimize ICMP-reply code path Cong Wang
2017-01-09 17:56   ` Eric Dumazet
2017-01-09 20:49 ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.