netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH] ipv4: fix fnhe dump record multiplication
@ 2022-01-20 23:50 Tomas Hlavacek
  2022-01-22 16:55 ` David Ahern
  0 siblings, 1 reply; 3+ messages in thread
From: Tomas Hlavacek @ 2022-01-20 23:50 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Hideaki YOSHIFUJI, David Ahern
  Cc: netdev, Tomas Hlavacek

Fix the multiplication of the FNHE records during dump: Check in
fnhe_dump_bucket() that the dumped record destination address falls
within the key (prefix, prefixlen) of the FIB leaf that is being dumped.

FNHE table records can be dumped multiple times to netlink on RTM_GETROUTE
command with NLM_F_DUMP flag - either to "ip route show cache" or to any
routing daemon. The multiplication is substantial under specific
conditions - it can produce over 120M netlink messages in one dump.
It happens if there is one shared struct fib_nh linked through
struct fib_info (->fib_nh) from many leafs in FIB over struct fib_alias.

This situation can be triggered by importing a full BGP table over GRE
tunnel. In this case there are ~800k routes that translates to ~120k leafs
in FIB that all ulimately links the same next-hop (the other end of
the GRE tunnel). The GRE tunnel creates one FNHE record for each
destination IP that is routed to the tunnel because of PMTU. In my case
I had around 1000 PMTU records after a few minutes in a lab connected to
the public internet so the FNHE dump produced 120M records that easily
stalled BIRD routing daemon as described here:
http://trubka.network.cz/pipermail/bird-users/2022-January/015897.html
(There is a work-around already committed to BIRD that removes unnecessary
dumps of FNHE.)

Signed-off-by: Tomas Hlavacek <tmshlvck@gmail.com>
---
 include/net/route.h |  3 ++-
 net/ipv4/fib_trie.c |  3 ++-
 net/ipv4/route.c    | 25 ++++++++++++++++++++++---
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 2e6c0e153e3a..066eab9c5d99 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -244,7 +244,8 @@ void rt_del_uncached_list(struct rtable *rt);
 
 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
 		       u32 table_id, struct fib_info *fi,
-		       int *fa_index, int fa_start, unsigned int flags);
+		       int *fa_index, int fa_start, unsigned int flags,
+		       __be32 prefix, unsigned char prefixlen);
 
 static inline void ip_rt_put(struct rtable *rt)
 {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 8060524f4256..7a42db70f46d 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2313,7 +2313,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 
 		if (filter->dump_exceptions) {
 			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
-						 &i_fa, s_fa, flags);
+						 &i_fa, s_fa, flags, xkey,
+						 (KEYLENGTH - fa->fa_slen));
 			if (err < 0)
 				goto stop;
 		}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0b4103b1e622..bc882c85228d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3049,10 +3049,25 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 	return -EMSGSIZE;
 }
 
+static int fnhe_daddr_check(__be32 daddr, struct net *net, u32 table_id,
+			    __be32 prefix, unsigned char prefixlen)
+{
+	struct flowi4 fl4 = { .daddr = daddr };
+	struct fib_table *tb = fib_get_table(net, table_id);
+	struct fib_result res;
+	int err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+
+	if (!err && res.prefix == prefix && res.prefixlen == prefixlen)
+		return 1;
+
+	return 0;
+}
+
 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
 			    struct netlink_callback *cb, u32 table_id,
 			    struct fnhe_hash_bucket *bucket, int genid,
-			    int *fa_index, int fa_start, unsigned int flags)
+			    int *fa_index, int fa_start, unsigned int flags,
+			    __be32 prefix, unsigned char prefixlen)
 {
 	int i;
 
@@ -3067,6 +3082,9 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
 			if (*fa_index < fa_start)
 				goto next;
 
+			if (!fnhe_daddr_check(fnhe->fnhe_daddr, net, table_id, prefix, prefixlen))
+				goto next;
+
 			if (fnhe->fnhe_genid != genid)
 				goto next;
 
@@ -3096,7 +3114,8 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
 
 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
 		       u32 table_id, struct fib_info *fi,
-		       int *fa_index, int fa_start, unsigned int flags)
+		       int *fa_index, int fa_start, unsigned int flags,
+		       __be32 prefix, unsigned char prefixlen)
 {
 	struct net *net = sock_net(cb->skb->sk);
 	int nhsel, genid = fnhe_genid(net);
@@ -3115,7 +3134,7 @@ int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
 		if (bucket)
 			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
 					       genid, fa_index, fa_start,
-					       flags);
+					       flags, prefix, prefixlen);
 		rcu_read_unlock();
 		if (err)
 			return err;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [RFC PATCH] ipv4: fix fnhe dump record multiplication
  2022-01-20 23:50 [RFC PATCH] ipv4: fix fnhe dump record multiplication Tomas Hlavacek
@ 2022-01-22 16:55 ` David Ahern
  2022-01-26  8:31   ` Ido Schimmel
  0 siblings, 1 reply; 3+ messages in thread
From: David Ahern @ 2022-01-22 16:55 UTC (permalink / raw)
  To: Tomas Hlavacek, David S . Miller, Jakub Kicinski,
	Hideaki YOSHIFUJI, David Ahern
  Cc: netdev, Stefano Brivio, Ido Schimmel

[ cc Stefano and Ido ]

On 1/20/22 4:50 PM, Tomas Hlavacek wrote:
> Fix the multiplication of the FNHE records during dump: Check in
> fnhe_dump_bucket() that the dumped record destination address falls
> within the key (prefix, prefixlen) of the FIB leaf that is being dumped.
> 
> FNHE table records can be dumped multiple times to netlink on RTM_GETROUTE
> command with NLM_F_DUMP flag - either to "ip route show cache" or to any
> routing daemon. The multiplication is substantial under specific
> conditions - it can produce over 120M netlink messages in one dump.
> It happens if there is one shared struct fib_nh linked through
> struct fib_info (->fib_nh) from many leafs in FIB over struct fib_alias.
> 
> This situation can be triggered by importing a full BGP table over GRE
> tunnel. In this case there are ~800k routes that translates to ~120k leafs
> in FIB that all ulimately links the same next-hop (the other end of
> the GRE tunnel). The GRE tunnel creates one FNHE record for each
> destination IP that is routed to the tunnel because of PMTU. In my case
> I had around 1000 PMTU records after a few minutes in a lab connected to
> the public internet so the FNHE dump produced 120M records that easily
> stalled BIRD routing daemon as described here:
> http://trubka.network.cz/pipermail/bird-users/2022-January/015897.html
> (There is a work-around already committed to BIRD that removes unnecessary
> dumps of FNHE.)
> 
> Signed-off-by: Tomas Hlavacek <tmshlvck@gmail.com>
> ---
>  include/net/route.h |  3 ++-
>  net/ipv4/fib_trie.c |  3 ++-
>  net/ipv4/route.c    | 25 ++++++++++++++++++++++---
>  3 files changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/include/net/route.h b/include/net/route.h
> index 2e6c0e153e3a..066eab9c5d99 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -244,7 +244,8 @@ void rt_del_uncached_list(struct rtable *rt);
>  
>  int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
>  		       u32 table_id, struct fib_info *fi,
> -		       int *fa_index, int fa_start, unsigned int flags);
> +		       int *fa_index, int fa_start, unsigned int flags,
> +		       __be32 prefix, unsigned char prefixlen);
>  
>  static inline void ip_rt_put(struct rtable *rt)
>  {
> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> index 8060524f4256..7a42db70f46d 100644
> --- a/net/ipv4/fib_trie.c
> +++ b/net/ipv4/fib_trie.c
> @@ -2313,7 +2313,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
>  
>  		if (filter->dump_exceptions) {
>  			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
> -						 &i_fa, s_fa, flags);
> +						 &i_fa, s_fa, flags, xkey,
> +						 (KEYLENGTH - fa->fa_slen));
>  			if (err < 0)
>  				goto stop;
>  		}
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 0b4103b1e622..bc882c85228d 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -3049,10 +3049,25 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
>  	return -EMSGSIZE;
>  }
>  
> +static int fnhe_daddr_check(__be32 daddr, struct net *net, u32 table_id,
> +			    __be32 prefix, unsigned char prefixlen)
> +{
> +	struct flowi4 fl4 = { .daddr = daddr };
> +	struct fib_table *tb = fib_get_table(net, table_id);
> +	struct fib_result res;
> +	int err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);

I get the fundamental problem you want to solve. I think a fib lookup on
each nexthop exception for each leaf is a heavyweight solution this is
going to add up to significant overhead.

The fundamental problem you are trying to solve is to not walk the
exceptions for a fib_info more than once. A fib_info can be used with
many fib_entries so perhaps the solution is to walk the fib_info structs
that exist in fib_info_hash outside of the trie walk.


> +
> +	if (!err && res.prefix == prefix && res.prefixlen == prefixlen)
> +		return 1;
> +
> +	return 0;
> +}
> +
>  static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
>  			    struct netlink_callback *cb, u32 table_id,
>  			    struct fnhe_hash_bucket *bucket, int genid,
> -			    int *fa_index, int fa_start, unsigned int flags)
> +			    int *fa_index, int fa_start, unsigned int flags,
> +			    __be32 prefix, unsigned char prefixlen)
>  {
>  	int i;
>  
> @@ -3067,6 +3082,9 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
>  			if (*fa_index < fa_start)
>  				goto next;
>  
> +			if (!fnhe_daddr_check(fnhe->fnhe_daddr, net, table_id, prefix, prefixlen))
> +				goto next;
> +
>  			if (fnhe->fnhe_genid != genid)
>  				goto next;
>  
> @@ -3096,7 +3114,8 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
>  
>  int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
>  		       u32 table_id, struct fib_info *fi,
> -		       int *fa_index, int fa_start, unsigned int flags)
> +		       int *fa_index, int fa_start, unsigned int flags,
> +		       __be32 prefix, unsigned char prefixlen)
>  {
>  	struct net *net = sock_net(cb->skb->sk);
>  	int nhsel, genid = fnhe_genid(net);
> @@ -3115,7 +3134,7 @@ int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
>  		if (bucket)
>  			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
>  					       genid, fa_index, fa_start,
> -					       flags);
> +					       flags, prefix, prefixlen);
>  		rcu_read_unlock();
>  		if (err)
>  			return err;


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC PATCH] ipv4: fix fnhe dump record multiplication
  2022-01-22 16:55 ` David Ahern
@ 2022-01-26  8:31   ` Ido Schimmel
  0 siblings, 0 replies; 3+ messages in thread
From: Ido Schimmel @ 2022-01-26  8:31 UTC (permalink / raw)
  To: David Ahern
  Cc: Tomas Hlavacek, David S . Miller, Jakub Kicinski,
	Hideaki YOSHIFUJI, David Ahern, netdev, Stefano Brivio

On Sat, Jan 22, 2022 at 09:55:18AM -0700, David Ahern wrote:
> [ cc Stefano and Ido ]
> 
> On 1/20/22 4:50 PM, Tomas Hlavacek wrote:
> > Fix the multiplication of the FNHE records during dump: Check in
> > fnhe_dump_bucket() that the dumped record destination address falls
> > within the key (prefix, prefixlen) of the FIB leaf that is being dumped.
> > 
> > FNHE table records can be dumped multiple times to netlink on RTM_GETROUTE
> > command with NLM_F_DUMP flag - either to "ip route show cache" or to any
> > routing daemon. The multiplication is substantial under specific
> > conditions - it can produce over 120M netlink messages in one dump.
> > It happens if there is one shared struct fib_nh linked through
> > struct fib_info (->fib_nh) from many leafs in FIB over struct fib_alias.
> > 
> > This situation can be triggered by importing a full BGP table over GRE
> > tunnel. In this case there are ~800k routes that translates to ~120k leafs
> > in FIB that all ulimately links the same next-hop (the other end of
> > the GRE tunnel). The GRE tunnel creates one FNHE record for each
> > destination IP that is routed to the tunnel because of PMTU. In my case
> > I had around 1000 PMTU records after a few minutes in a lab connected to
> > the public internet so the FNHE dump produced 120M records that easily
> > stalled BIRD routing daemon as described here:
> > http://trubka.network.cz/pipermail/bird-users/2022-January/015897.html
> > (There is a work-around already committed to BIRD that removes unnecessary
> > dumps of FNHE.)
> > 
> > Signed-off-by: Tomas Hlavacek <tmshlvck@gmail.com>
> > ---
> >  include/net/route.h |  3 ++-
> >  net/ipv4/fib_trie.c |  3 ++-
> >  net/ipv4/route.c    | 25 ++++++++++++++++++++++---
> >  3 files changed, 26 insertions(+), 5 deletions(-)
> > 
> > diff --git a/include/net/route.h b/include/net/route.h
> > index 2e6c0e153e3a..066eab9c5d99 100644
> > --- a/include/net/route.h
> > +++ b/include/net/route.h
> > @@ -244,7 +244,8 @@ void rt_del_uncached_list(struct rtable *rt);
> >  
> >  int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
> >  		       u32 table_id, struct fib_info *fi,
> > -		       int *fa_index, int fa_start, unsigned int flags);
> > +		       int *fa_index, int fa_start, unsigned int flags,
> > +		       __be32 prefix, unsigned char prefixlen);
> >  
> >  static inline void ip_rt_put(struct rtable *rt)
> >  {
> > diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> > index 8060524f4256..7a42db70f46d 100644
> > --- a/net/ipv4/fib_trie.c
> > +++ b/net/ipv4/fib_trie.c
> > @@ -2313,7 +2313,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
> >  
> >  		if (filter->dump_exceptions) {
> >  			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
> > -						 &i_fa, s_fa, flags);
> > +						 &i_fa, s_fa, flags, xkey,
> > +						 (KEYLENGTH - fa->fa_slen));
> >  			if (err < 0)
> >  				goto stop;
> >  		}
> > diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> > index 0b4103b1e622..bc882c85228d 100644
> > --- a/net/ipv4/route.c
> > +++ b/net/ipv4/route.c
> > @@ -3049,10 +3049,25 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
> >  	return -EMSGSIZE;
> >  }
> >  
> > +static int fnhe_daddr_check(__be32 daddr, struct net *net, u32 table_id,
> > +			    __be32 prefix, unsigned char prefixlen)
> > +{
> > +	struct flowi4 fl4 = { .daddr = daddr };
> > +	struct fib_table *tb = fib_get_table(net, table_id);
> > +	struct fib_result res;
> > +	int err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
> 
> I get the fundamental problem you want to solve. I think a fib lookup on
> each nexthop exception for each leaf is a heavyweight solution this is
> going to add up to significant overhead.

I agree

> 
> The fundamental problem you are trying to solve is to not walk the
> exceptions for a fib_info more than once. A fib_info can be used with
> many fib_entries so perhaps the solution is to walk the fib_info structs
> that exist in fib_info_hash outside of the trie walk.

Sounds like a good idea, though I'm not sure how difficult to implement.
If a dump needs to be restarted, then the netlink callback arguments
need to differentiate between the place in the FIB trie where the dump
was stopped and the FIB info hash table.

Also, isn't the problem also present in IPv6 when nexthop objects are
used? In the legacy model, IPv6 nexthops are not shared (which means
exceptions are not dumped multiple times), but with nexthop objects they
can be shared.

> 
> 
> > +
> > +	if (!err && res.prefix == prefix && res.prefixlen == prefixlen)
> > +		return 1;
> > +
> > +	return 0;
> > +}
> > +
> >  static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
> >  			    struct netlink_callback *cb, u32 table_id,
> >  			    struct fnhe_hash_bucket *bucket, int genid,
> > -			    int *fa_index, int fa_start, unsigned int flags)
> > +			    int *fa_index, int fa_start, unsigned int flags,
> > +			    __be32 prefix, unsigned char prefixlen)
> >  {
> >  	int i;
> >  
> > @@ -3067,6 +3082,9 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
> >  			if (*fa_index < fa_start)
> >  				goto next;
> >  
> > +			if (!fnhe_daddr_check(fnhe->fnhe_daddr, net, table_id, prefix, prefixlen))
> > +				goto next;
> > +
> >  			if (fnhe->fnhe_genid != genid)
> >  				goto next;
> >  
> > @@ -3096,7 +3114,8 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
> >  
> >  int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
> >  		       u32 table_id, struct fib_info *fi,
> > -		       int *fa_index, int fa_start, unsigned int flags)
> > +		       int *fa_index, int fa_start, unsigned int flags,
> > +		       __be32 prefix, unsigned char prefixlen)
> >  {
> >  	struct net *net = sock_net(cb->skb->sk);
> >  	int nhsel, genid = fnhe_genid(net);
> > @@ -3115,7 +3134,7 @@ int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
> >  		if (bucket)
> >  			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
> >  					       genid, fa_index, fa_start,
> > -					       flags);
> > +					       flags, prefix, prefixlen);
> >  		rcu_read_unlock();
> >  		if (err)
> >  			return err;
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2022-01-26  8:32 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-20 23:50 [RFC PATCH] ipv4: fix fnhe dump record multiplication Tomas Hlavacek
2022-01-22 16:55 ` David Ahern
2022-01-26  8:31   ` Ido Schimmel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).