* [PATCH net-next 1/6] ipv6: Remove external dependency on rt6i_dst and rt6i_src
2015-04-28 21:07 [PATCH net-next 0/6 v2] ipv6: Only create RTF_CACHE route after encountering pmtu exception Martin KaFai Lau
@ 2015-04-28 21:07 ` Martin KaFai Lau
2015-04-28 21:07 ` [PATCH net-next 2/6] ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST Martin KaFai Lau
` (4 subsequent siblings)
5 siblings, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2015-04-28 21:07 UTC (permalink / raw)
To: netdev; +Cc: Hannes Frederic Sowa, Steffen Klassert, David Miller, Kernel Team
This patch removes the assumptions that the returned rt is always
a RTF_CACHE entry with the rt6i_dst and rt6i_src containing the
destination and source address. The dst and src can be recovered from
the calling site.
We may consider renaming (rt6i_dst, rt6i_src) to
(rt6i_key_dst, rt6i_key_src) later.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
---
drivers/scsi/cxgbi/libcxgbi.c | 2 +-
include/net/ipv6.h | 3 ++-
net/ipv6/icmp.c | 2 +-
net/ipv6/ip6_output.c | 22 +++++++++++-----------
net/ipv6/ndisc.c | 2 +-
net/ipv6/output_core.c | 9 +++++----
net/ipv6/tcp_ipv6.c | 2 +-
net/netfilter/ipvs/ip_vs_xmit.c | 4 ++--
net/sctp/ipv6.c | 3 ++-
9 files changed, 26 insertions(+), 23 deletions(-)
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index eb58afc..45d3039 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -728,7 +728,7 @@ static struct cxgbi_sock *cxgbi_check_route6(struct sockaddr *dst_addr)
}
ndev = n->dev;
- if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
+ if (ipv6_addr_is_multicast(&daddr6->sin6_addr)) {
pr_info("multi-cast route %pI6 port %u, dev %s.\n",
daddr6->sin6_addr.s6_addr,
ntohs(daddr6->sin6_port), ndev->name);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index eec8ad3..a0890d6 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -670,7 +670,8 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
}
void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr,
- struct rt6_info *rt);
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr);
void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
int ip6_dst_hoplimit(struct dst_entry *dst);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 2c2b5d5..24b359d 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -207,7 +207,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
struct inet_peer *peer;
peer = inet_getpeer_v6(net->ipv6.peers,
- &rt->rt6i_dst.addr, 1);
+ &fl6->daddr, 1);
res = inet_peer_xrlim_allow(peer, tmo);
if (peer)
inet_putpeer(peer);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7fde1f2..b987fbf 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -459,7 +459,7 @@ int ip6_forward(struct sk_buff *skb)
else
target = &hdr->daddr;
- peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+ peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
/* Limit redirects both by destination (here)
and by source (inside ndisc_send_redirect)
@@ -549,6 +549,7 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
inet6_sk(skb->sk) : NULL;
struct ipv6hdr *tmp_hdr;
struct frag_hdr *fh;
+ struct frag_hdr tmp_fh;
unsigned int mtu, hlen, left, len;
int hroom, troom;
__be32 frag_id = 0;
@@ -584,6 +585,10 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
}
mtu -= hlen + sizeof(struct frag_hdr);
+ ipv6_select_ident(net, &tmp_fh, &ipv6_hdr(skb)->daddr,
+ &ipv6_hdr(skb)->saddr);
+ frag_id = tmp_fh.identification;
+
if (skb_has_frag_list(skb)) {
int first_len = skb_pagelen(skb);
struct sk_buff *frag2;
@@ -632,11 +637,10 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
skb_reset_network_header(skb);
memcpy(skb_network_header(skb), tmp_hdr, hlen);
- ipv6_select_ident(net, fh, rt);
fh->nexthdr = nexthdr;
fh->reserved = 0;
fh->frag_off = htons(IP6_MF);
- frag_id = fh->identification;
+ fh->identification = frag_id;
first_len = skb_pagelen(skb);
skb->data_len = first_len - skb_headlen(skb);
@@ -778,11 +782,7 @@ slow_path:
*/
fh->nexthdr = nexthdr;
fh->reserved = 0;
- if (!frag_id) {
- ipv6_select_ident(net, fh, rt);
- frag_id = fh->identification;
- } else
- fh->identification = frag_id;
+ fh->identification = frag_id;
/*
* Copy a block of the IP datagram.
@@ -1037,7 +1037,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
int transhdrlen, int mtu, unsigned int flags,
- struct rt6_info *rt)
+ const struct flowi6 *fl6)
{
struct sk_buff *skb;
@@ -1083,7 +1083,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
sizeof(struct frag_hdr)) & ~7;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
- ipv6_select_ident(sock_net(sk), &fhdr, rt);
+ ipv6_select_ident(sock_net(sk), &fhdr, &fl6->daddr, &fl6->saddr);
skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
append:
@@ -1307,7 +1307,7 @@ emsgsize:
(sk->sk_type == SOCK_DGRAM)) {
err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen,
- transhdrlen, mtu, flags, rt);
+ transhdrlen, mtu, flags, fl6);
if (err)
goto error;
return 0;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 96f153c..0a05b35 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1506,7 +1506,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
"Redirect: destination is not a neighbour\n");
goto release;
}
- peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+ peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1);
ret = inet_peer_xrlim_allow(peer, 1*HZ);
if (peer)
inet_putpeer(peer);
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 85892af..f37cfa9 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -10,7 +10,8 @@
#include <net/secure_seq.h>
static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
- struct in6_addr *dst, struct in6_addr *src)
+ const struct in6_addr *dst,
+ const struct in6_addr *src)
{
u32 hash, id;
@@ -61,15 +62,15 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr,
- struct rt6_info *rt)
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
static u32 ip6_idents_hashrnd __read_mostly;
u32 id;
net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
- id = __ipv6_select_ident(net, ip6_idents_hashrnd, &rt->rt6i_dst.addr,
- &rt->rt6i_src.addr);
+ id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr);
fhdr->identification = htonl(id);
}
EXPORT_SYMBOL(ipv6_select_ident);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b6575d6..042a645 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -262,7 +262,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
rt = (struct rt6_info *) dst;
if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp &&
- ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr))
+ ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr))
tcp_fetch_timewait_stamp(sk, dst);
icsk->icsk_ext_hdr_len = 0;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 19986ec..38f8627 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -781,7 +781,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* From world but DNAT to loopback address? */
if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to loopback address");
@@ -1346,7 +1346,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* From world but DNAT to loopback address? */
if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI6\n",
__func__, &cp->daddr.in6);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0e4198e..9fa13f6 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -332,7 +332,8 @@ out:
rt = (struct rt6_info *)dst;
t->dst = dst;
t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- pr_debug("rt6_dst:%pI6 rt6_src:%pI6\n", &rt->rt6i_dst.addr,
+ pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n",
+ &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
&fl6->saddr);
} else {
t->dst = NULL;
--
1.8.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH net-next 2/6] ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST
2015-04-28 21:07 [PATCH net-next 0/6 v2] ipv6: Only create RTF_CACHE route after encountering pmtu exception Martin KaFai Lau
2015-04-28 21:07 ` [PATCH net-next 1/6] ipv6: Remove external dependency on rt6i_dst and rt6i_src Martin KaFai Lau
@ 2015-04-28 21:07 ` Martin KaFai Lau
2015-04-29 8:28 ` Julian Anastasov
2015-04-28 21:07 ` [PATCH net-next 3/6] ipv6: Combine rt6_alloc_cow and rt6_alloc_clone Martin KaFai Lau
` (3 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Martin KaFai Lau @ 2015-04-28 21:07 UTC (permalink / raw)
To: netdev; +Cc: Hannes Frederic Sowa, Steffen Klassert, David Miller, Kernel Team
When creating a RTF_CACHE route, RTF_ANYCAST is set based on rt6i_dst.
Also, rt6i_gateway is always set to the nexthop while the nexthop
could be a gateway or the rt6i_dst.addr.
After removing the rt6i_dst and rt6i_src dependency in the last patch,
we also need to stop the caller from depending on rt6i_gateway and
RTF_ANYCAST.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
---
include/net/ip6_route.h | 14 +++++++++-----
net/bluetooth/6lowpan.c | 2 +-
net/ipv6/icmp.c | 4 ++--
net/ipv6/ip6_output.c | 5 +++--
net/ipv6/route.c | 6 +-----
net/netfilter/nf_conntrack_h323_main.c | 4 ++--
net/netfilter/xt_addrtype.c | 2 +-
7 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 5e19206..0e4d170 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -163,11 +163,14 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
return rt->rt6i_flags & RTF_LOCAL;
}
-static inline bool ipv6_anycast_destination(const struct sk_buff *skb)
+static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
+ const struct in6_addr *daddr)
{
- struct rt6_info *rt = (struct rt6_info *) skb_dst(skb);
+ struct rt6_info *rt = (struct rt6_info *)dst;
- return rt->rt6i_flags & RTF_ANYCAST;
+ return rt->rt6i_flags & RTF_ANYCAST ||
+ (rt->rt6i_dst.plen != 128 &&
+ ipv6_addr_equal(&rt->rt6i_dst.addr, daddr));
}
int ip6_fragment(struct sock *sk, struct sk_buff *skb,
@@ -194,9 +197,10 @@ static inline bool ip6_sk_ignore_df(const struct sock *sk)
inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT;
}
-static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt)
+static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
+ struct in6_addr *daddr)
{
- return &rt->rt6i_gateway;
+ return (rt->rt6i_flags & RTF_GATEWAY) ? &rt->rt6i_gateway : daddr;
}
#endif
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 1742b84..f3d6046 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -192,7 +192,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev,
if (ipv6_addr_any(nexthop))
return NULL;
} else {
- nexthop = rt6_nexthop(rt);
+ nexthop = rt6_nexthop(rt, daddr);
/* We need to remember the address because it is needed
* by bt_xmit() when sending the packet. In bt_xmit(), the
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 24b359d..713d743 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -337,7 +337,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
* We won't send icmp if the destination is known
* anycast.
*/
- if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) {
+ if (ipv6_anycast_destination(dst, &fl6->daddr)) {
net_dbg_ratelimited("icmp6_send: acast source\n");
dst_release(dst);
return ERR_PTR(-EINVAL);
@@ -564,7 +564,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
if (!ipv6_unicast_destination(skb) &&
!(net->ipv6.sysctl.anycast_src_echo_reply &&
- ipv6_anycast_destination(skb)))
+ ipv6_anycast_destination(skb_dst(skb), saddr)))
saddr = NULL;
memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b987fbf..e58e402 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -105,7 +105,7 @@ static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
}
rcu_read_lock_bh();
- nexthop = rt6_nexthop((struct rt6_info *)dst);
+ nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
@@ -913,7 +913,8 @@ static int ip6_dst_lookup_tail(struct sock *sk,
*/
rt = (struct rt6_info *) *dst;
rcu_read_lock_bh();
- n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
+ n = __ipv6_neigh_lookup_noref(rt->dst.dev,
+ rt6_nexthop(rt, &fl6->daddr));
err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
rcu_read_unlock_bh();
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3522711..8efde73 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1936,11 +1936,7 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
if (rt->rt6i_idev)
in6_dev_hold(rt->rt6i_idev);
rt->dst.lastuse = jiffies;
-
- if (ort->rt6i_flags & RTF_GATEWAY)
- rt->rt6i_gateway = ort->rt6i_gateway;
- else
- rt->rt6i_gateway = *dest;
+ rt->rt6i_gateway = ort->rt6i_gateway;
rt->rt6i_flags = ort->rt6i_flags;
rt6_set_from(rt, ort);
rt->rt6i_metric = 0;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 1d69f5b..9511af0 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -779,8 +779,8 @@ static int callforward_do_filter(struct net *net,
flowi6_to_flowi(&fl1), false)) {
if (!afinfo->route(net, (struct dst_entry **)&rt2,
flowi6_to_flowi(&fl2), false)) {
- if (ipv6_addr_equal(rt6_nexthop(rt1),
- rt6_nexthop(rt2)) &&
+ if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr),
+ rt6_nexthop(rt2, &fl2.daddr)) &&
rt1->dst.dev == rt2->dst.dev)
ret = 1;
dst_release(&rt2->dst);
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index fab6eea..5b4743c 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -73,7 +73,7 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
if (dev == NULL && rt->rt6i_flags & RTF_LOCAL)
ret |= XT_ADDRTYPE_LOCAL;
- if (rt->rt6i_flags & RTF_ANYCAST)
+ if (ipv6_anycast_destination((struct dst_entry *)rt, addr))
ret |= XT_ADDRTYPE_ANYCAST;
dst_release(&rt->dst);
--
1.8.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 2/6] ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST
2015-04-28 21:07 ` [PATCH net-next 2/6] ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST Martin KaFai Lau
@ 2015-04-29 8:28 ` Julian Anastasov
2015-04-29 9:02 ` Julian Anastasov
2015-04-29 21:19 ` Martin KaFai Lau
0 siblings, 2 replies; 12+ messages in thread
From: Julian Anastasov @ 2015-04-29 8:28 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: netdev, Hannes Frederic Sowa, Steffen Klassert, David Miller,
Kernel Team
Hello,
On Tue, 28 Apr 2015, Martin KaFai Lau wrote:
> -static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt)
> +static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
> + struct in6_addr *daddr)
> {
> - return &rt->rt6i_gateway;
> + return (rt->rt6i_flags & RTF_GATEWAY) ? &rt->rt6i_gateway : daddr;
> }
A similar change in IPv4 caused problems for places
that request a route to one nexthop but later transmit an
IP header that contains a different destination. Such examples
were:
- net/ipv4/raw.c when hdrincl is used
- xt_TEE.c: I see that it has IPv6 support
- IPVS: request route to real server but send packet with
daddr=virtual IP (Direct Route method)
For IPv4 link routes, callers can get a route
with rt_gateway=0 and rt_nexthop() can return daddr from
header.
We then solved it with FLOWI_FLAG_KNOWN_NH flag which
forces a cached route where rt_gateway is filled with requested
nexthop address:
commit c27c9322d015
commit 2ad5b9e4bd31
commit ad4d3ef8b7eb
Can you check if this series causes the same
problem for net/ipv6/raw.c (hdrincl=1), xt_TEE.c. IPVS code is
in __ip_vs_get_out_rt_v6() where the route is resolved
and then attached to skb with skb_dst_set_noref().
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 2/6] ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST
2015-04-29 8:28 ` Julian Anastasov
@ 2015-04-29 9:02 ` Julian Anastasov
2015-04-29 21:19 ` Martin KaFai Lau
1 sibling, 0 replies; 12+ messages in thread
From: Julian Anastasov @ 2015-04-29 9:02 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: netdev, Hannes Frederic Sowa, Steffen Klassert, David Miller,
Kernel Team
Hello,
On Wed, 29 Apr 2015, Julian Anastasov wrote:
> We then solved it with FLOWI_FLAG_KNOWN_NH flag which
> forces a cached route where rt_gateway is filled with requested
Sorry, I mean "non-cached", not "cached"...
> nexthop address:
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 2/6] ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST
2015-04-29 8:28 ` Julian Anastasov
2015-04-29 9:02 ` Julian Anastasov
@ 2015-04-29 21:19 ` Martin KaFai Lau
1 sibling, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2015-04-29 21:19 UTC (permalink / raw)
To: Julian Anastasov
Cc: netdev, Hannes Frederic Sowa, Steffen Klassert, David Miller,
Kernel Team
Hi,
On Wed, Apr 29, 2015 at 11:28:46AM +0300, Julian Anastasov wrote:
>
> Hello,
>
> On Tue, 28 Apr 2015, Martin KaFai Lau wrote:
>
> > -static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt)
> > +static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
> > + struct in6_addr *daddr)
> > {
> > - return &rt->rt6i_gateway;
> > + return (rt->rt6i_flags & RTF_GATEWAY) ? &rt->rt6i_gateway : daddr;
> > }
>
> Similar change in ipv4 caused problem for places
> that request route to one nexthop but later the transmitted
> IP header contains different destination. Such examples
> were:
>
> - net/ipv4/raw.c when hdrincl is used
> - xt_TEE.c: I see that it has IPv6 support
> - IPVS: request route to real server but send packet with
> daddr=virtual IP (Direct Route method)
>
> For IPv4 link routes, callers can get a route
> with rt_gateway=0 and rt_nexthop() can return daddr from
> header.
>
> We then solved it with FLOWI_FLAG_KNOWN_NH flag which
> forces a cached route where rt_gateway is filled with requested
> nexthop address:
>
> commit c27c9322d015
> commit 2ad5b9e4bd31
> commit ad4d3ef8b7eb
>
> Can you check if this series causes the same
> problem for net/ipv6/raw.c (hdrincl=1), xt_TEE.c. IPVS code is
> in __ip_vs_get_out_rt_v6() where the route is resolved
> and then attached to skb with skb_dst_set_noref().
Thanks for the pointers.
I have looked around and I can see your points. I think we can mostly mimic
the IPv4 approach here. After a quick thought, one hurdle is in the
dst_check() since the rt6 does not have the genid like rt4 does. It was removed
from rt6_info at some point to fix another perf bug, iirc. I will try some
approaches that do not re-introduce genid.
--Martin
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH net-next 3/6] ipv6: Combine rt6_alloc_cow and rt6_alloc_clone
2015-04-28 21:07 [PATCH net-next 0/6 v2] ipv6: Only create RTF_CACHE route after encountering pmtu exception Martin KaFai Lau
2015-04-28 21:07 ` [PATCH net-next 1/6] ipv6: Remove external dependency on rt6i_dst and rt6i_src Martin KaFai Lau
2015-04-28 21:07 ` [PATCH net-next 2/6] ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST Martin KaFai Lau
@ 2015-04-28 21:07 ` Martin KaFai Lau
2015-04-28 21:07 ` [PATCH net-next 4/6] ipv6: Only create RTF_CACHE routes after encountering pmtu exception Martin KaFai Lau
` (2 subsequent siblings)
5 siblings, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2015-04-28 21:07 UTC (permalink / raw)
To: netdev; +Cc: Hannes Frederic Sowa, Steffen Klassert, David Miller, Kernel Team
Prep work for creating RTF_CACHE routes on exception only. After this
patch, the same condition (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))
is checked twice. This redundancy will be removed in a later patch.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
---
net/ipv6/route.c | 45 ++++++++++++++++++++-------------------------
1 file changed, 20 insertions(+), 25 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8efde73..8bc83bc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -655,6 +655,11 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
return match ? match : net->ipv6.ip6_null_entry;
}
+static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
+{
+ return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+}
+
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
const struct in6_addr *gwaddr)
@@ -833,9 +838,9 @@ int ip6_ins_rt(struct rt6_info *rt)
return __ip6_ins_rt(rt, &info, &mxc);
}
-static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+static struct rt6_info *ip6_pmtu_rt_cache_alloc(struct rt6_info *ort,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct rt6_info *rt;
@@ -846,33 +851,24 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
rt = ip6_rt_copy(ort, daddr);
if (rt) {
- if (ort->rt6i_dst.plen != 128 &&
- ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
- rt->rt6i_flags |= RTF_ANYCAST;
-
rt->rt6i_flags |= RTF_CACHE;
+ if (!rt6_is_gw_or_nonexthop(ort)) {
+ if (ort->rt6i_dst.plen != 128 &&
+ ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
+ rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
- if (rt->rt6i_src.plen && saddr) {
- rt->rt6i_src.addr = *saddr;
- rt->rt6i_src.plen = 128;
- }
+ if (rt->rt6i_src.plen && saddr) {
+ rt->rt6i_src.addr = *saddr;
+ rt->rt6i_src.plen = 128;
+ }
#endif
+ }
}
return rt;
}
-static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
- const struct in6_addr *daddr)
-{
- struct rt6_info *rt = ip6_rt_copy(ort, daddr);
-
- if (rt)
- rt->rt6i_flags |= RTF_CACHE;
- return rt;
-}
-
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
struct flowi6 *fl6, int flags)
{
@@ -918,10 +914,9 @@ redo_rt6_select:
if (rt->rt6i_flags & RTF_CACHE)
goto out2;
- if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
- nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
- else if (!(rt->dst.flags & DST_HOST) || !(rt->dst.flags & RTF_LOCAL))
- nrt = rt6_alloc_clone(rt, &fl6->daddr);
+ if (!rt6_is_gw_or_nonexthop(rt) ||
+ !(rt->dst.flags & DST_HOST) || !(rt->dst.flags & RTF_LOCAL))
+ nrt = ip6_pmtu_rt_cache_alloc(rt, &fl6->daddr, &fl6->saddr);
else
goto out2;
--
1.8.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH net-next 4/6] ipv6: Only create RTF_CACHE routes after encountering pmtu exception
2015-04-28 21:07 [PATCH net-next 0/6 v2] ipv6: Only create RTF_CACHE route after encountering pmtu exception Martin KaFai Lau
` (2 preceding siblings ...)
2015-04-28 21:07 ` [PATCH net-next 3/6] ipv6: Combine rt6_alloc_cow and rt6_alloc_clone Martin KaFai Lau
@ 2015-04-28 21:07 ` Martin KaFai Lau
2015-04-29 11:39 ` Steffen Klassert
2015-04-28 21:07 ` [PATCH net-next 5/6] ipv6: Break up ip6_rt_copy() Martin KaFai Lau
2015-04-28 21:07 ` [PATCH net-next 6/6] ipv6: Create percpu rt6_info Martin KaFai Lau
5 siblings, 1 reply; 12+ messages in thread
From: Martin KaFai Lau @ 2015-04-28 21:07 UTC (permalink / raw)
To: netdev; +Cc: Hannes Frederic Sowa, Steffen Klassert, David Miller, Kernel Team
This patch creates RTF_CACHE routes only after encountering a pmtu
exception.
After ip6_rt_update_pmtu() has inserted the RTF_CACHE route into the fib6
tree, the rt->rt6i_node->fn_sernum is bumped, which will fail the
ip6_dst_check() and trigger a relookup.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
---
net/ipv6/route.c | 96 ++++++++++++++++++++++++++++----------------------------
1 file changed, 48 insertions(+), 48 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8bc83bc..09ab0f4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -873,16 +873,13 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
struct flowi6 *fl6, int flags)
{
struct fib6_node *fn, *saved_fn;
- struct rt6_info *rt, *nrt;
+ struct rt6_info *rt;
int strict = 0;
- int attempts = 3;
- int err;
strict |= flags & RT6_LOOKUP_F_IFACE;
if (net->ipv6.devconf_all->forwarding == 0)
strict |= RT6_LOOKUP_F_REACHABLE;
-redo_fib6_lookup_lock:
read_lock_bh(&table->tb6_lock);
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
@@ -901,46 +898,12 @@ redo_rt6_select:
strict &= ~RT6_LOOKUP_F_REACHABLE;
fn = saved_fn;
goto redo_rt6_select;
- } else {
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- goto out2;
}
}
dst_hold(&rt->dst);
read_unlock_bh(&table->tb6_lock);
- if (rt->rt6i_flags & RTF_CACHE)
- goto out2;
-
- if (!rt6_is_gw_or_nonexthop(rt) ||
- !(rt->dst.flags & DST_HOST) || !(rt->dst.flags & RTF_LOCAL))
- nrt = ip6_pmtu_rt_cache_alloc(rt, &fl6->daddr, &fl6->saddr);
- else
- goto out2;
-
- ip6_rt_put(rt);
- rt = nrt ? : net->ipv6.ip6_null_entry;
-
- dst_hold(&rt->dst);
- if (nrt) {
- err = ip6_ins_rt(nrt);
- if (!err)
- goto out2;
- }
-
- if (--attempts <= 0)
- goto out2;
-
- /*
- * Race condition! In the gap, when table->tb6_lock was
- * released someone could insert this route. Relookup.
- */
- ip6_rt_put(rt);
- goto redo_fib6_lookup_lock;
-
-out2:
rt6_dst_from_metrics_check(rt);
rt->dst.lastuse = jiffies;
rt->dst.__use++;
@@ -1113,22 +1076,59 @@ static void ip6_link_failure(struct sk_buff *skb)
}
}
-static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
+ const struct ipv6hdr *iph, u32 mtu)
{
struct rt6_info *rt6 = (struct rt6_info *)dst;
+ struct net *net;
+
+ if (rt6->rt6i_flags & RTF_LOCAL)
+ return;
dst_confirm(dst);
- if (mtu < dst_mtu(dst) && (rt6->rt6i_flags & RTF_CACHE)) {
- struct net *net = dev_net(dst->dev);
+ mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+ if (mtu >= dst_mtu(dst))
+ return;
- rt6->rt6i_flags |= RTF_MODIFIED;
- if (mtu < IPV6_MIN_MTU)
- mtu = IPV6_MIN_MTU;
+ if (!(rt6->rt6i_flags & RTF_CACHE)) {
+ const struct in6_addr *daddr, *saddr;
+ struct rt6_info *nrt6;
- rt6->rt6i_pmtu = mtu;
- rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+ if (iph) {
+ daddr = &iph->daddr;
+ saddr = &iph->saddr;
+ } else if (sk) {
+ daddr = &sk->sk_v6_daddr;
+ saddr = &inet6_sk(sk)->saddr;
+ } else {
+ return;
+ }
+ nrt6 = ip6_pmtu_rt_cache_alloc(rt6, daddr, saddr);
+ if (!nrt6)
+ return;
+ /* ip6_ins_rt(nrt6) will bump the rt6->rt6i_node->fn_sernum
+ * which will fail the next rt6_check() and invalidate the
+ * sk->sk_dst_cache.
+ */
+ if (ip6_ins_rt(nrt6)) {
+ dst_destroy(&nrt6->dst);
+ return;
+ }
+
+ rt6 = nrt6;
+ dst = &nrt6->dst;
}
+
+ net = dev_net(dst->dev);
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ rt6->rt6i_pmtu = mtu;
+ rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+}
+
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
@@ -1147,7 +1147,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
- ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
+ __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
--
1.8.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 4/6] ipv6: Only create RTF_CACHE routes after encountering pmtu exception
2015-04-28 21:07 ` [PATCH net-next 4/6] ipv6: Only create RTF_CACHE routes after encountering pmtu exception Martin KaFai Lau
@ 2015-04-29 11:39 ` Steffen Klassert
2015-04-29 18:31 ` Martin KaFai Lau
0 siblings, 1 reply; 12+ messages in thread
From: Steffen Klassert @ 2015-04-29 11:39 UTC (permalink / raw)
To: Martin KaFai Lau; +Cc: netdev, Hannes Frederic Sowa, David Miller, Kernel Team
On Tue, Apr 28, 2015 at 02:07:51PM -0700, Martin KaFai Lau wrote:
>
> -static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
> - struct sk_buff *skb, u32 mtu)
> +static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
> + const struct ipv6hdr *iph, u32 mtu)
> {
> struct rt6_info *rt6 = (struct rt6_info *)dst;
> + struct net *net;
> +
> + if (rt6->rt6i_flags & RTF_LOCAL)
> + return;
>
> dst_confirm(dst);
> - if (mtu < dst_mtu(dst) && (rt6->rt6i_flags & RTF_CACHE)) {
> - struct net *net = dev_net(dst->dev);
> + mtu = max_t(u32, mtu, IPV6_MIN_MTU);
> + if (mtu >= dst_mtu(dst))
> + return;
>
> - rt6->rt6i_flags |= RTF_MODIFIED;
> - if (mtu < IPV6_MIN_MTU)
> - mtu = IPV6_MIN_MTU;
> + if (!(rt6->rt6i_flags & RTF_CACHE)) {
> + const struct in6_addr *daddr, *saddr;
> + struct rt6_info *nrt6;
>
> - rt6->rt6i_pmtu = mtu;
> - rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
> + if (iph) {
> + daddr = &iph->daddr;
> + saddr = &iph->saddr;
> + } else if (sk) {
> + daddr = &sk->sk_v6_daddr;
> + saddr = &inet6_sk(sk)->saddr;
> + } else {
> + return;
> + }
> + nrt6 = ip6_pmtu_rt_cache_alloc(rt6, daddr, saddr);
> + if (!nrt6)
> + return;
> + /* ip6_ins_rt(nrt6) will bump the rt6->rt6i_node->fn_sernum
> + * which will fail the next rt6_check() and invalidate the
> + * sk->sk_dst_cache.
> + */
> + if (ip6_ins_rt(nrt6)) {
> + dst_destroy(&nrt6->dst);
fib6_add() does a dst_free() on error, so calling dst_destroy()
here might result in a use after free.
> + return;
> + }
> +
> + rt6 = nrt6;
> + dst = &nrt6->dst;
> }
> +
> + net = dev_net(dst->dev);
> + rt6->rt6i_flags |= RTF_MODIFIED;
> + rt6->rt6i_pmtu = mtu;
> + rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
The update of expires and the setting of rt6i_pmtu should
happen before the route is inserted with ip6_ins_rt().
This is because fib6_add_rt2node() tries to reuse old
expired routes if still in the fib tree; the necessary
information is copied from the new route before it
returns -EEXIST on the new route. If your new route
has no expires value set, fib6_add_rt2node() clears
expires of the old route before it reuses it.
Also rt6i_pmtu should be copied to the reused route in
fib6_add_rt2node(); this should be done already in your
first patchset. Otherwise we might use stale pmtu information.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next 4/6] ipv6: Only create RTF_CACHE routes after encountering pmtu exception
2015-04-29 11:39 ` Steffen Klassert
@ 2015-04-29 18:31 ` Martin KaFai Lau
0 siblings, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2015-04-29 18:31 UTC (permalink / raw)
To: Steffen Klassert; +Cc: netdev, Hannes Frederic Sowa, David Miller, Kernel Team
On Wed, Apr 29, 2015 at 01:39:18PM +0200, Steffen Klassert wrote:
> On Tue, Apr 28, 2015 at 02:07:51PM -0700, Martin KaFai Lau wrote:
> > + if (ip6_ins_rt(nrt6)) {
> > + dst_destroy(&nrt6->dst);
>
> fib6_add() does a dst_free() on error, so calling dst_destroy()
> here might result in a use after free.
Good catch.
>
>
> > + return;
> > + }
> > +
> > + rt6 = nrt6;
> > + dst = &nrt6->dst;
> > }
> > +
> > + net = dev_net(dst->dev);
> > + rt6->rt6i_flags |= RTF_MODIFIED;
> > + rt6->rt6i_pmtu = mtu;
> > + rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
>
> The update of expires and the setting of rt6i_pmtu should
> happen before the route is inserted with ip6_ins_rt().
>
> This is because fib6_add_rt2node() tries to reuse old
> expired routes if they are still in the fib tree; the necessary
> information is copied from the new route before it
> returns -EEXIST on the new route. If your new route
> has no expires value set, fib6_add_rt2node() clears
> expires of the old route before it reuses it.
>
> Also, rt6i_pmtu should be copied to the reused route in
> fib6_add_rt2node(); this should already be done in your
> first patchset. Otherwise we might use stale pmtu information.
Good catch.
A similar race may also happen in the current ip6_pol_route()
where it may clear the RTF_EXPIRES of the existing pmtu clone.
Hence, copying rt6i_pmtu (at fib6_add_rt2node()) in the last patchset will
not be right.
I will do the copying and early-set-expire in this patchset instead.
Thanks,
---Martin
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH net-next 5/6] ipv6: Break up ip6_rt_copy()
2015-04-28 21:07 [PATCH net-next 0/6 v2] ipv6: Only create RTF_CACHE route after encountering pmtu exception Martin KaFai Lau
` (3 preceding siblings ...)
2015-04-28 21:07 ` [PATCH net-next 4/6] ipv6: Only create RTF_CACHE routes after encountering pmtu exception Martin KaFai Lau
@ 2015-04-28 21:07 ` Martin KaFai Lau
2015-04-28 21:07 ` [PATCH net-next 6/6] ipv6: Create percpu rt6_info Martin KaFai Lau
5 siblings, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2015-04-28 21:07 UTC (permalink / raw)
To: netdev; +Cc: Hannes Frederic Sowa, Steffen Klassert, David Miller, Kernel Team
This patch breaks up ip6_rt_copy() into ip6_rt_copy_init() and
ip6_rt_cache_alloc().
In a later patch, we need to create a percpu rt6_info copy. Hence,
refactor the common rt6_info init code into ip6_rt_copy_init().
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
---
net/ipv6/route.c | 80 ++++++++++++++++++++++++++++++++------------------------
1 file changed, 46 insertions(+), 34 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 09ab0f4..cb3c585 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -72,8 +72,11 @@ enum rt6_nud_state {
RT6_NUD_SUCCEED = 1
};
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
- const struct in6_addr *dest);
+static void ip6_rt_copy_init(struct rt6_info *rt,
+ struct rt6_info *ort,
+ const struct in6_addr *dest);
+static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
+ const struct in6_addr *dest);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
@@ -848,11 +851,9 @@ static struct rt6_info *ip6_pmtu_rt_cache_alloc(struct rt6_info *ort,
* Clone the route.
*/
- rt = ip6_rt_copy(ort, daddr);
+ rt = ip6_rt_cache_alloc(ort, daddr);
if (rt) {
- rt->rt6i_flags |= RTF_CACHE;
-
if (!rt6_is_gw_or_nonexthop(ort)) {
if (ort->rt6i_dst.plen != 128 &&
ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
@@ -1865,7 +1866,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
NEIGH_UPDATE_F_ISROUTER))
);
- nrt = ip6_rt_copy(rt, &msg->dest);
+ nrt = ip6_rt_cache_alloc(rt, &msg->dest);
if (!nrt)
goto out;
@@ -1907,41 +1908,52 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
- const struct in6_addr *dest)
+static void ip6_rt_copy_init(struct rt6_info *rt,
+ struct rt6_info *ort,
+ const struct in6_addr *dest)
{
- struct net *net = dev_net(ort->dst.dev);
- struct rt6_info *rt;
-
- if (ort->rt6i_flags & RTF_CACHE)
- ort = (struct rt6_info *)ort->dst.from;
-
- rt = ip6_dst_alloc(net, ort->dst.dev, 0,
- ort->rt6i_table);
-
- if (rt) {
- rt->dst.input = ort->dst.input;
- rt->dst.output = ort->dst.output;
+ if (dest) {
rt->dst.flags |= DST_HOST;
-
rt->rt6i_dst.addr = *dest;
rt->rt6i_dst.plen = 128;
- rt->dst.error = ort->dst.error;
- rt->rt6i_idev = ort->rt6i_idev;
- if (rt->rt6i_idev)
- in6_dev_hold(rt->rt6i_idev);
- rt->dst.lastuse = jiffies;
- rt->rt6i_gateway = ort->rt6i_gateway;
- rt->rt6i_flags = ort->rt6i_flags;
- rt6_set_from(rt, ort);
- rt->rt6i_metric = 0;
+ } else {
+ rt->rt6i_dst = ort->rt6i_dst;
+ }
+ rt->dst.input = ort->dst.input;
+ rt->dst.output = ort->dst.output;
+ rt->dst.error = ort->dst.error;
+ rt->rt6i_idev = ort->rt6i_idev;
+ if (rt->rt6i_idev)
+ in6_dev_hold(rt->rt6i_idev);
+ rt->dst.lastuse = jiffies;
+ rt->rt6i_gateway = ort->rt6i_gateway;
+ rt->rt6i_flags = ort->rt6i_flags;
+ rt6_set_from(rt, ort);
+ rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
- memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+ rt->rt6i_src = ort->rt6i_src;
#endif
- memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
- rt->rt6i_table = ort->rt6i_table;
- }
+ rt->rt6i_prefsrc = ort->rt6i_prefsrc;
+ rt->rt6i_table = ort->rt6i_table;
+}
+
+static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
+ const struct in6_addr *dest)
+{
+ struct rt6_info *rt;
+
+ if (ort->rt6i_flags & RTF_CACHE)
+ ort = (struct rt6_info *)ort->dst.from;
+
+ rt = ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
+ 0, ort->rt6i_table);
+
+ if (!rt)
+ return NULL;
+ ip6_rt_copy_init(rt, ort, dest);
+ rt->rt6i_flags |= RTF_CACHE;
+ rt->rt6i_metric = 0;
return rt;
}
--
1.8.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH net-next 6/6] ipv6: Create percpu rt6_info
2015-04-28 21:07 [PATCH net-next 0/6 v2] ipv6: Only create RTF_CACHE route after encountering pmtu exception Martin KaFai Lau
` (4 preceding siblings ...)
2015-04-28 21:07 ` [PATCH net-next 5/6] ipv6: Break up ip6_rt_copy() Martin KaFai Lau
@ 2015-04-28 21:07 ` Martin KaFai Lau
5 siblings, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2015-04-28 21:07 UTC (permalink / raw)
To: netdev; +Cc: Hannes Frederic Sowa, Steffen Klassert, David Miller, Kernel Team
After the patch
'ipv6: Only create RTF_CACHE routes after encountering pmtu exception',
we need to compensate for the performance hit (bouncing dst->__refcnt).
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
---
include/net/ip6_fib.h | 8 +++
include/net/ip6_route.h | 2 +-
include/uapi/linux/ipv6_route.h | 1 +
net/ipv6/ip6_fib.c | 25 ++++++-
net/ipv6/ip6_tunnel.c | 2 +-
net/ipv6/route.c | 150 +++++++++++++++++++++++++++++++++++-----
net/ipv6/tcp_ipv6.c | 3 +-
net/ipv6/xfrm6_policy.c | 6 +-
net/netfilter/ipvs/ip_vs_xmit.c | 2 +-
net/sctp/ipv6.c | 2 +-
10 files changed, 171 insertions(+), 30 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index e000180..4b1fc9b 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -121,6 +121,7 @@ struct rt6_info {
struct rt6key rt6i_prefsrc;
struct inet6_dev *rt6i_idev;
+ struct rt6_info __rcu * __percpu *rt6i_pcpu;
u32 rt6i_metric;
u32 rt6i_pmtu;
@@ -159,6 +160,13 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
rt0->rt6i_flags |= RTF_EXPIRES;
}
+static inline u32 rt6_get_cookie(const struct rt6_info *rt)
+{
+ if (rt->rt6i_flags & RTF_PCPU)
+ rt = (struct rt6_info *)(rt->dst.from);
+ return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+}
+
static inline void ip6_rt_put(struct rt6_info *rt)
{
/* dst_release() accepts a NULL parameter.
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 0e4d170..397dd3a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -145,7 +145,7 @@ static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
#ifdef CONFIG_IPV6_SUBTREES
np->saddr_cache = saddr;
#endif
- np->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ np->dst_cookie = rt6_get_cookie(rt);
}
static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h
index 2be7bd1..f6598d1 100644
--- a/include/uapi/linux/ipv6_route.h
+++ b/include/uapi/linux/ipv6_route.h
@@ -34,6 +34,7 @@
#define RTF_PREF(pref) ((pref) << 27)
#define RTF_PREF_MASK 0x18000000
+#define RTF_PCPU 0x40000000
#define RTF_LOCAL 0x80000000
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 96dbfff..bf12be7 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -154,10 +154,33 @@ static void node_free(struct fib6_node *fn)
kmem_cache_free(fib6_node_kmem, fn);
}
+static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
+{
+ int cpu;
+
+ if (!non_pcpu_rt->rt6i_pcpu)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
+ pcpu_rt = rcu_dereference_protected(*ppcpu_rt,
+ lockdep_is_held(&non_pcpu_rt->rt6i_table->tb6_lock));
+ if (pcpu_rt) {
+ dst_free(&pcpu_rt->dst);
+ *ppcpu_rt = NULL;
+ }
+ }
+}
+
static void rt6_release(struct rt6_info *rt)
{
- if (atomic_dec_and_test(&rt->rt6i_ref))
+ if (atomic_dec_and_test(&rt->rt6i_ref)) {
+ rt6_free_pcpu(rt);
dst_free(&rt->dst);
+ }
}
static void fib6_link_table(struct net *net, struct fib6_table *tb)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 5cafd92..2e67b66 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -151,7 +151,7 @@ EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset);
void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *) dst;
- t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ t->dst_cookie = rt6_get_cookie(rt);
dst_release(t->dst_cache);
t->dst_cache = dst;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index cb3c585..29227a0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -108,11 +108,18 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *gwaddr, int ifindex);
#endif
+static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
+{
+ return dst_metrics_write_ptr(rt->dst.from);
+}
+
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
struct rt6_info *rt = (struct rt6_info *)dst;
- if (rt->rt6i_flags & RTF_CACHE)
+ if (rt->rt6i_flags & RTF_PCPU)
+ return rt6_pcpu_cow_metrics(rt);
+ else if (rt->rt6i_flags & RTF_CACHE)
return NULL;
else
return dst_cow_metrics_generic(dst, old);
@@ -252,10 +259,10 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
#endif
/* allocate dst with ip6_dst_ops */
-static inline struct rt6_info *ip6_dst_alloc(struct net *net,
- struct net_device *dev,
- int flags,
- struct fib6_table *table)
+static struct rt6_info *__ip6_dst_alloc(struct net *net,
+ struct net_device *dev,
+ int flags,
+ struct fib6_table *table)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
0, DST_OBSOLETE_FORCE_CHK, flags);
@@ -269,6 +276,34 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
return rt;
}
+static struct rt6_info *ip6_dst_alloc(struct net *net,
+ struct net_device *dev,
+ int flags,
+ struct fib6_table *table)
+{
+ struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
+
+ if (rt) {
+ rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
+ if (rt->rt6i_pcpu) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **p;
+
+ p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+ /* no one shares rt */
+ *p = NULL;
+ }
+ } else {
+ dst_destroy((struct dst_entry *)rt);
+ return NULL;
+ }
+ }
+
+ return rt;
+}
+
static void ip6_dst_destroy(struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *)dst;
@@ -277,6 +312,9 @@ static void ip6_dst_destroy(struct dst_entry *dst)
dst_destroy_metrics_generic(dst);
+ if (rt->rt6i_pcpu)
+ free_percpu(rt->rt6i_pcpu);
+
if (idev) {
rt->rt6i_idev = NULL;
in6_dev_put(idev);
@@ -870,11 +908,69 @@ static struct rt6_info *ip6_pmtu_rt_cache_alloc(struct rt6_info *ort,
return rt;
}
+static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
+{
+ struct rt6_info *pcpu_rt;
+
+ pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
+ rt->dst.dev, rt->dst.flags,
+ rt->rt6i_table);
+
+ if (!pcpu_rt)
+ return NULL;
+ ip6_rt_copy_init(pcpu_rt, rt, NULL);
+ pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
+ pcpu_rt->rt6i_flags |= RTF_PCPU;
+ return pcpu_rt;
+}
+
+static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
+{
+ struct rt6_info *pcpu_rt, *orig, *prev, **p;
+ struct net *net = dev_net(rt->dst.dev);
+
+ if (rt == net->ipv6.ip6_null_entry || rt->rt6i_flags & RTF_CACHE)
+ goto done;
+
+ rcu_read_lock();
+ p = raw_cpu_ptr(rt->rt6i_pcpu);
+ orig = rcu_dereference_check(*p,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ if (orig) {
+ rt6_dst_from_metrics_check(orig);
+ dst_hold(&orig->dst);
+ rcu_read_unlock();
+ return orig;
+ }
+ rcu_read_unlock();
+
+ pcpu_rt = ip6_rt_pcpu_alloc(rt);
+ if (!pcpu_rt) {
+ rt = net->ipv6.ip6_null_entry;
+ goto done;
+ }
+ dst_hold(&pcpu_rt->dst);
+
+ prev = cmpxchg(p, orig, pcpu_rt);
+ if (prev == orig) {
+ if (orig)
+ call_rcu(&orig->dst.rcu_head, dst_rcu_free);
+ } else {
+ pcpu_rt->dst.flags |= DST_NOCACHE;
+ }
+ return pcpu_rt;
+
+done:
+ rt6_dst_from_metrics_check(rt);
+ dst_hold(&rt->dst);
+ return rt;
+}
+
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
struct flowi6 *fl6, int flags)
{
struct fib6_node *fn, *saved_fn;
- struct rt6_info *rt;
+ struct rt6_info *rt, *pcpu_rt;
int strict = 0;
strict |= flags & RT6_LOOKUP_F_IFACE;
@@ -902,14 +998,13 @@ redo_rt6_select:
}
}
- dst_hold(&rt->dst);
+ pcpu_rt = rt6_get_pcpu_route(rt);
read_unlock_bh(&table->tb6_lock);
- rt6_dst_from_metrics_check(rt);
rt->dst.lastuse = jiffies;
rt->dst.__use++;
- return rt;
+ return pcpu_rt;
}
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
@@ -1020,6 +1115,26 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt)
dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}
+static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
+{
+ if (!rt->rt6i_node || rt->rt6i_node->fn_sernum != cookie)
+ return NULL;
+
+ if (rt6_check_expired(rt))
+ return NULL;
+
+ return &rt->dst;
+}
+
+static struct dst_entry *rt6_pcpu_check(struct rt6_info *rt, u32 cookie)
+{
+ if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ rt6_check((struct rt6_info *)(rt->dst.from), cookie))
+ return &rt->dst;
+ else
+ return NULL;
+}
+
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
struct rt6_info *rt;
@@ -1030,15 +1145,12 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
* DST_OBSOLETE_FORCE_CHK which forces validation calls down
* into this function always.
*/
- if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
- return NULL;
-
- if (rt6_check_expired(rt))
- return NULL;
-
rt6_dst_from_metrics_check(rt);
- return dst;
+ if (rt->rt6i_flags & RTF_PCPU)
+ return rt6_pcpu_check(rt, cookie);
+ else
+ return rt6_check(rt, cookie);
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
@@ -1943,11 +2055,11 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
{
struct rt6_info *rt;
- if (ort->rt6i_flags & RTF_CACHE)
+ if (ort->rt6i_flags & (RTF_PCPU | RTF_CACHE))
ort = (struct rt6_info *)ort->dst.from;
- rt = ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
- 0, ort->rt6i_table);
+ rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
+ 0, ort->rt6i_table);
if (!rt)
return NULL;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 042a645..b416305 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -99,8 +99,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
dst_hold(dst);
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
- if (rt->rt6i_node)
- inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
+ inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
}
}
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 6ae256b..ed0583c 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -76,8 +76,7 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst,
{
if (dst->ops->family == AF_INET6) {
struct rt6_info *rt = (struct rt6_info *)dst;
- if (rt->rt6i_node)
- path->path_cookie = rt->rt6i_node->fn_sernum;
+ path->path_cookie = rt6_get_cookie(rt);
}
path->u.rt6.rt6i_nfheader_len = nfheader_len;
@@ -105,8 +104,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
RTF_LOCAL);
xdst->u.rt6.rt6i_metric = rt->rt6i_metric;
xdst->u.rt6.rt6i_node = rt->rt6i_node;
- if (rt->rt6i_node)
- xdst->route_cookie = rt->rt6i_node->fn_sernum;
+ xdst->route_cookie = rt6_get_cookie(rt);
xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
xdst->u.rt6.rt6i_src = rt->rt6i_src;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 38f8627..5eff9f6 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -435,7 +435,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
goto err_unreach;
}
rt = (struct rt6_info *) dst;
- cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ cookie = rt6_get_cookie(rt);
__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 9fa13f6..d012834 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -331,7 +331,7 @@ out:
rt = (struct rt6_info *)dst;
t->dst = dst;
- t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ t->dst_cookie = rt6_get_cookie(rt);
pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n",
&rt->rt6i_dst.addr, rt->rt6i_dst.plen,
&fl6->saddr);
--
1.8.1
^ permalink raw reply related [flat|nested] 12+ messages in thread