netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next] icmp: add a global rate limitation
@ 2014-09-19 14:38 Eric Dumazet
  2014-09-19 14:56 ` Joe Perches
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Eric Dumazet @ 2014-09-19 14:38 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

Current ICMP rate limiting uses inetpeer cache, which is an RBL tree
protected by a lock, meaning that hosts can be stuck hard if all cpus
want to check ICMP limits.

When say a DNS or NTP server process is restarted, inetpeer tree grows
quick and machine comes to its knees.

iptables can not help because the bottleneck happens before ICMP
messages are even cooked and sent.

This patch adds a new global limitation, using a token bucket filter,
controlled by two new sysctl :

icmp_msgs_per_sec - INTEGER
    Limit maximal number of ICMP packets sent per second from this host.
    Only messages whose type matches icmp_ratemask are
    controlled by this limit.
    Default: 1000

icmp_msgs_burst - INTEGER
    icmp_msgs_per_sec controls number of ICMP packets sent per second,
    while icmp_msgs_burst controls the burst size of these packets.
    Default: 50

Note that if we really want to send millions of ICMP messages per
second, we might extend idea and infra added in commit 04ca6973f7c1a
("ip: make IP identifiers less predictable") :
add a token bucket in the ip_idents hash and no longer rely on inetpeer.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 Documentation/networking/ip-sysctl.txt |   13 ++++
 include/net/ip.h                       |    4 +
 net/ipv4/icmp.c                        |   64 +++++++++++++++++++++--
 net/ipv4/sysctl_net_ipv4.c             |   16 +++++
 net/ipv6/icmp.c                        |   20 ++++---
 5 files changed, 105 insertions(+), 12 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1b5581a30d77..c7a81ace35d0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -769,8 +769,21 @@ icmp_ratelimit - INTEGER
 	icmp_ratemask (see below) to specific targets.
 	0 to disable any limiting,
 	otherwise the minimal space between responses in milliseconds.
+	Note that another sysctl, icmp_msgs_per_sec limits the number
+	of ICMP packets	sent on all targets.
 	Default: 1000
 
+icmp_msgs_per_sec - INTEGER
+	Limit maximal number of ICMP packets sent per second from this host.
+	Only messages whose type matches icmp_ratemask (see below) are
+	controlled by this limit.
+	Default: 1000
+
+icmp_msgs_burst - INTEGER
+	icmp_msgs_per_sec controls number of ICMP packets sent per second,
+	while icmp_msgs_burst controls the burst size of these packets.
+	Default: 50
+
 icmp_ratemask - INTEGER
 	Mask made of ICMP types for which rates are being limited.
 	Significant bits: IHGFEDCBA9876543210
diff --git a/include/net/ip.h b/include/net/ip.h
index 14bfc8e1bcf9..fcd9068fb8c3 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -548,6 +548,10 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
 void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
 		    u32 info);
 
+bool icmp_global_allow(void);
+extern int sysctl_icmp_msgs_per_sec;
+extern int sysctl_icmp_msgs_burst;
+
 #ifdef CONFIG_PROC_FS
 int ip_misc_proc_init(void);
 #endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index ea7d4afe8205..ba86c146a4ab 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk)
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 
+int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
+int sysctl_icmp_msgs_burst __read_mostly = 50;
+
+static struct {
+	spinlock_t	lock;
+	u32		credit;
+	u32		stamp;
+} icmp_global = {
+	.lock		= __SPIN_LOCK_UNLOCKED(icmp_global.lock),
+};
+
+/**
+ * icmp_global_allow - Are we allowed to send one more ICMP message ?
+ *
+ * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
+ * Returns false if we reached the limit and can not send another packet.
+ * Note: called with BH disabled
+ */
+bool icmp_global_allow(void)
+{
+	u32 credit, delta, incr = 0, now = (u32)jiffies;
+	bool rc = false;
+
+	/* Check if token bucket is empty and cannot be refilled
+	 * without taking the spinlock.
+	 */
+	if (!icmp_global.credit) {
+		delta = min_t(u32, now - icmp_global.stamp, HZ);
+		if (delta < HZ / 50)
+			return false;
+	}
+
+	spin_lock(&icmp_global.lock);
+	delta = min_t(u32, now - icmp_global.stamp, HZ);
+	if (delta >= HZ / 50) {
+		incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
+		if (incr)
+			icmp_global.stamp = now;
+	}
+	credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
+	if (credit) {
+		credit--;
+		rc = true;
+	}
+	icmp_global.credit = credit;
+	spin_unlock(&icmp_global.lock);
+	return rc;
+}
+EXPORT_SYMBOL(icmp_global_allow);
+
 /*
  *	Send an ICMP frame.
  */
 
-static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
-				      struct flowi4 *fl4, int type, int code)
+static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+			       struct flowi4 *fl4, int type, int code)
 {
 	struct dst_entry *dst = &rt->dst;
 	bool rc = true;
@@ -253,8 +303,14 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 		goto out;
 
 	/* Limit if icmp type is enabled in ratemask. */
-	if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
-		struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+		goto out;
+
+	rc = false;
+	if (icmp_global_allow()) {
+		struct inet_peer *peer;
+
+		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
 		rc = inet_peer_xrlim_allow(peer,
 					   net->ipv4.sysctl_icmp_ratelimit);
 		if (peer)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1599966f4639..8a25509c35b3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -731,6 +731,22 @@ static struct ctl_table ipv4_table[] = {
 		.extra2		= &one,
 	},
 	{
+		.procname	= "icmp_msgs_per_sec",
+		.data		= &sysctl_icmp_msgs_per_sec,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "icmp_msgs_burst",
+		.data		= &sysctl_icmp_msgs_burst,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 394bb824fe4b..141e1f3ab74e 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -170,11 +170,11 @@ static bool is_ineligible(const struct sk_buff *skb)
 /*
  * Check the ICMP output rate limit
  */
-static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
-				      struct flowi6 *fl6)
+static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
+			       struct flowi6 *fl6)
 {
-	struct dst_entry *dst;
 	struct net *net = sock_net(sk);
+	struct dst_entry *dst;
 	bool res = false;
 
 	/* Informational messages are not limited. */
@@ -199,16 +199,20 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	} else {
 		struct rt6_info *rt = (struct rt6_info *)dst;
 		int tmo = net->ipv6.sysctl.icmpv6_time;
-		struct inet_peer *peer;
 
 		/* Give more bandwidth to wider prefixes. */
 		if (rt->rt6i_dst.plen < 128)
 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
 
-		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
-		res = inet_peer_xrlim_allow(peer, tmo);
-		if (peer)
-			inet_putpeer(peer);
+		if (icmp_global_allow()) {
+			struct inet_peer *peer;
+
+			peer = inet_getpeer_v6(net->ipv6.peers,
+					       &rt->rt6i_dst.addr, 1);
+			res = inet_peer_xrlim_allow(peer, tmo);
+			if (peer)
+				inet_putpeer(peer);
+		}
 	}
 	dst_release(dst);
 	return res;

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH net-next] icmp: add a global rate limitation
  2014-09-19 14:38 [PATCH net-next] icmp: add a global rate limitation Eric Dumazet
@ 2014-09-19 14:56 ` Joe Perches
  2014-09-19 15:11   ` Eric Dumazet
  2014-09-22 20:09 ` David Miller
  2014-09-23 16:48 ` David Miller
  2 siblings, 1 reply; 6+ messages in thread
From: Joe Perches @ 2014-09-19 14:56 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Fri, 2014-09-19 at 07:38 -0700, Eric Dumazet wrote:
> Current ICMP rate limiting uses inetpeer cache, which is an RBL tree
> protected by a lock, meaning that hosts can be stuck hard if all cpus
> want to check ICMP limits.
> 
> When say a DNS or NTP server process is restarted, inetpeer tree grows
> quick and machine comes to its knees.
> 
> iptables can not help because the bottleneck happens before ICMP
> messages are even cooked and sent.
> 
> This patch adds a new global limitation, using a token bucket filter,
> controlled by two new sysctl :
> 
> icmp_msgs_per_sec - INTEGER
>     Limit maximal number of ICMP packets sent per second from this host.
>     Only messages whose type matches icmp_ratemask are
>     controlled by this limit.
>     Default: 1000
> 
> icmp_msgs_burst - INTEGER
>     icmp_msgs_per_sec controls number of ICMP packets sent per second,
>     while icmp_msgs_burst controls the burst size of these packets.
>     Default: 50

nice.

> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
[]
> @@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk)
>  	spin_unlock_bh(&sk->sk_lock.slock);
>  }
>  
> +int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
> +int sysctl_icmp_msgs_burst __read_mostly = 50;
> +
> +static struct {
> +	spinlock_t	lock;
> +	u32		credit;
> +	u32		stamp;
> +} icmp_global = {
> +	.lock		= __SPIN_LOCK_UNLOCKED(icmp_global.lock),
> +};

Is there any real benefit using a u32 stamp
instead of unsigned long?

The stamp comparisons are to jiffies and now
have slightly odd (u32) casts.

> +
> +/**
> + * icmp_global_allow - Are we allowed to send one more ICMP message ?
> + *
> + * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
> + * Returns false if we reached the limit and can not send another packet.
> + * Note: called with BH disabled
> + */
> +bool icmp_global_allow(void)
> +{
> +	u32 credit, delta, incr = 0, now = (u32)jiffies;

Doesn't casting jiffies costs a couple cycles
on a 64 bit machine?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH net-next] icmp: add a global rate limitation
  2014-09-19 14:56 ` Joe Perches
@ 2014-09-19 15:11   ` Eric Dumazet
  0 siblings, 0 replies; 6+ messages in thread
From: Eric Dumazet @ 2014-09-19 15:11 UTC (permalink / raw)
  To: Joe Perches; +Cc: David Miller, netdev

On Fri, 2014-09-19 at 07:56 -0700, Joe Perches wrote:
> On Fri, 2014-09-19 at 07:38 -0700, Eric Dumazet wrote:
> > Current ICMP rate limiting uses inetpeer cache, which is an RBL tree
> > protected by a lock, meaning that hosts can be stuck hard if all cpus
> > want to check ICMP limits.
> > 
> > When say a DNS or NTP server process is restarted, inetpeer tree grows
> > quick and machine comes to its knees.
> > 
> > iptables can not help because the bottleneck happens before ICMP
> > messages are even cooked and sent.
> > 
> > This patch adds a new global limitation, using a token bucket filter,
> > controlled by two new sysctl :
> > 
> > icmp_msgs_per_sec - INTEGER
> >     Limit maximal number of ICMP packets sent per second from this host.
> >     Only messages whose type matches icmp_ratemask are
> >     controlled by this limit.
> >     Default: 1000
> > 
> > icmp_msgs_burst - INTEGER
> >     icmp_msgs_per_sec controls number of ICMP packets sent per second,
> >     while icmp_msgs_burst controls the burst size of these packets.
> >     Default: 50
> 
> nice.
> 
> > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> []
> > @@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk)
> >  	spin_unlock_bh(&sk->sk_lock.slock);
> >  }
> >  
> > +int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
> > +int sysctl_icmp_msgs_burst __read_mostly = 50;
> > +
> > +static struct {
> > +	spinlock_t	lock;
> > +	u32		credit;
> > +	u32		stamp;
> > +} icmp_global = {
> > +	.lock		= __SPIN_LOCK_UNLOCKED(icmp_global.lock),
> > +};
> 
> Is there any real benefit using a u32 stamp
> instead of unsigned long?

This is because the computation and sysctl are 32bit wide.

We clamp the delta to 1 sec anyway, so there is no value having 64bit
wide stamp.

Note that I do not expect anybody trying to overflow the computations,
so I did not bother using u64 to perform the (x * y / HZ) operation.

> 
> The stamp comparisons are to jiffies and now
> have slightly odd (u32) casts.

Thats because I am planning to eventually use a common helper for the
inetpeer token bucket, or the hashed one I mention at the end of
changelog. Keeping u32 would avoid taking too much room.

> 
> > +
> > +/**
> > + * icmp_global_allow - Are we allowed to send one more ICMP message ?
> > + *
> > + * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
> > + * Returns false if we reached the limit and can not send another packet.
> > + * Note: called with BH disabled
> > + */
> > +bool icmp_global_allow(void)
> > +{
> > +	u32 credit, delta, incr = 0, now = (u32)jiffies;
> 
> Doesn't casting jiffies costs a couple cycles
> on a 64 bit machine?

No, thats a word access, instead of a qword.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH net-next] icmp: add a global rate limitation
  2014-09-19 14:38 [PATCH net-next] icmp: add a global rate limitation Eric Dumazet
  2014-09-19 14:56 ` Joe Perches
@ 2014-09-22 20:09 ` David Miller
  2014-09-22 20:25   ` Eric Dumazet
  2014-09-23 16:48 ` David Miller
  2 siblings, 1 reply; 6+ messages in thread
From: David Miller @ 2014-09-22 20:09 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 19 Sep 2014 07:38:40 -0700

> Current ICMP rate limiting uses inetpeer cache, which is an RBL tree
> protected by a lock, meaning that hosts can be stuck hard if all cpus
> want to check ICMP limits.

However, the replacement uses a single global spinlock for
synchronization.

 ...
> Note that if we really want to send millions of ICMP messages per
> second, we might extend idea and infra added in commit 04ca6973f7c1a
> ("ip: make IP identifiers less predictable") :
> add a token bucket in the ip_idents hash and no longer rely on inetpeer.

That would be preferred.

I don't really see how this patch makes things better.  The code goes
through a global spinlock unconditionally, and if it passes then it
looks up the inetpeer anyways.

I need more information to be convinced that this is an improvement
and that it actually solves the stated problem.  I'm sure you also
have some performance metrics to share, right? :-)

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH net-next] icmp: add a global rate limitation
  2014-09-22 20:09 ` David Miller
@ 2014-09-22 20:25   ` Eric Dumazet
  0 siblings, 0 replies; 6+ messages in thread
From: Eric Dumazet @ 2014-09-22 20:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, 2014-09-22 at 16:09 -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Fri, 19 Sep 2014 07:38:40 -0700
> 
> > Current ICMP rate limiting uses inetpeer cache, which is an RBL tree
> > protected by a lock, meaning that hosts can be stuck hard if all cpus
> > want to check ICMP limits.
> 
> However, the replacement uses a single global spinlock for
> synchronization.
> 
>  ...
> > Note that if we really want to send millions of ICMP messages per
> > second, we might extend idea and infra added in commit 04ca6973f7c1a
> > ("ip: make IP identifiers less predictable") :
> > add a token bucket in the ip_idents hash and no longer rely on inetpeer.
> 
> That would be preferred.


> 
> I don't really see how this patch makes things better.  The code goes
> through a global spinlock unconditionally, and if it passes then it
> looks up the inetpeer anyways.
> 

The inetpeer is only hit at most 1000 times per second, which is a
rather low rate.


> I need more information to be convinced that this is an improvement
> and that it actually solves the stated problem.  I'm sure you also
> have some performance metrics to share, right? :-)

It does not use the global spinlock in the stress situation.

I tested this patch I can tell you it works ;)

+       /* Check if token bucket is empty and cannot be refilled
+        * without taking the spinlock.
+        */
+       if (!icmp_global.credit) {
+               delta = min_t(u32, now - icmp_global.stamp, HZ);
+               if (delta < HZ / 50)
+                       return false;
+       }
+

So if we are under pressure (credit is exhausted), we exit early unless
20ms were elapsed, and in this case we'll refill the token bucket.

Thanks

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH net-next] icmp: add a global rate limitation
  2014-09-19 14:38 [PATCH net-next] icmp: add a global rate limitation Eric Dumazet
  2014-09-19 14:56 ` Joe Perches
  2014-09-22 20:09 ` David Miller
@ 2014-09-23 16:48 ` David Miller
  2 siblings, 0 replies; 6+ messages in thread
From: David Miller @ 2014-09-23 16:48 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 19 Sep 2014 07:38:40 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> Current ICMP rate limiting uses inetpeer cache, which is an RBL tree
> protected by a lock, meaning that hosts can be stuck hard if all cpus
> want to check ICMP limits.
> 
> When say a DNS or NTP server process is restarted, inetpeer tree grows
> quick and machine comes to its knees.
> 
> iptables can not help because the bottleneck happens before ICMP
> messages are even cooked and sent.
> 
> This patch adds a new global limitation, using a token bucket filter,
> controlled by two new sysctl :
> 
> icmp_msgs_per_sec - INTEGER
>     Limit maximal number of ICMP packets sent per second from this host.
>     Only messages whose type matches icmp_ratemask are
>     controlled by this limit.
>     Default: 1000
> 
> icmp_msgs_burst - INTEGER
>     icmp_msgs_per_sec controls number of ICMP packets sent per second,
>     while icmp_msgs_burst controls the burst size of these packets.
>     Default: 50
> 
> Note that if we really want to send millions of ICMP messages per
> second, we might extend idea and infra added in commit 04ca6973f7c1a
> ("ip: make IP identifiers less predictable") :
> add a token bucket in the ip_idents hash and no longer rely on inetpeer.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2014-09-23 16:48 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-19 14:38 [PATCH net-next] icmp: add a global rate limitation Eric Dumazet
2014-09-19 14:56 ` Joe Perches
2014-09-19 15:11   ` Eric Dumazet
2014-09-22 20:09 ` David Miller
2014-09-22 20:25   ` Eric Dumazet
2014-09-23 16:48 ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).