* [PATCH net 0/3] tcp: provide correct skb->priority
@ 2019-09-24 15:01 Eric Dumazet
2019-09-24 15:01 ` [PATCH net 1/3] ipv6: add priority parameter to ip6_xmit() Eric Dumazet
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: Eric Dumazet @ 2019-09-24 15:01 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet
SO_PRIORITY socket option requests TCP egress packets
to contain a user provided value.
TCP manages to send most packets with the requested values,
notably for TCP_ESTABLISHED state, but fails to do so for
few packets.
These packets are control packets sent on behalf
of SYN_RECV or TIME_WAIT states.
Note that to test this with packetdrill, it is a bit
of a hassle, since packetdrill can not verify priority
of egress packets, other than indirect observations,
using for example sch_prio on its tunnel device.
The bad skb priorities cause problems for GCP,
as this field is one of the keys used in routing.
Eric Dumazet (3):
ipv6: add priority parameter to ip6_xmit()
ipv6: tcp: provide sk->sk_priority to ctl packets
tcp: honor SO_PRIORITY in TIME_WAIT state
include/net/inet_timewait_sock.h | 1 +
include/net/ipv6.h | 2 +-
net/dccp/ipv6.c | 5 +++--
net/ipv4/ip_output.c | 1 -
net/ipv4/tcp_ipv4.c | 4 ++++
net/ipv4/tcp_minisocks.c | 1 +
net/ipv6/inet6_connection_sock.c | 2 +-
net/ipv6/ip6_output.c | 4 ++--
net/ipv6/tcp_ipv6.c | 24 +++++++++++++++---------
net/sctp/ipv6.c | 2 +-
10 files changed, 29 insertions(+), 17 deletions(-)
--
2.23.0.351.gc4317032e6-goog
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH net 1/3] ipv6: add priority parameter to ip6_xmit()
2019-09-24 15:01 [PATCH net 0/3] tcp: provide correct skb->priority Eric Dumazet
@ 2019-09-24 15:01 ` Eric Dumazet
2019-09-24 15:01 ` [PATCH net 2/3] ipv6: tcp: provide sk->sk_priority to ctl packets Eric Dumazet
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Eric Dumazet @ 2019-09-24 15:01 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet
Currently, ip6_xmit() sets skb->priority based on sk->sk_priority
This is not desirable for TCP since TCP shares the same ctl socket
for a given netns. We want to be able to send RST or ACK packets
with a non zero skb->priority.
This patch has no functional change.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/ipv6.h | 2 +-
net/dccp/ipv6.c | 5 +++--
net/ipv6/inet6_connection_sock.c | 2 +-
net/ipv6/ip6_output.c | 4 ++--
net/ipv6/tcp_ipv6.c | 6 ++++--
net/sctp/ipv6.c | 2 +-
6 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8dfc65639aa4c4dc10277989cf4bfa03ec161354..009605c56f209040d10f4878353aafa66c1a845f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -981,7 +981,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
* upper-layer output functions
*/
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
- __u32 mark, struct ipv6_txoptions *opt, int tclass);
+ __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority);
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 1b7381ff787b388488a8458b6f52760f73a4f22d..25aab672fc9907801f63ff2bd374f6db17fe28d2 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -230,7 +230,8 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
+ err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass,
+ sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -284,7 +285,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(skb, dst);
- ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0);
+ ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
return;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 4da24aa6c696dff6867f876b20805f2dd207e8c5..0a0945a5b30da1647aaf1b9f8b46db69102e8ff8 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -133,7 +133,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
fl6.daddr = sk->sk_v6_daddr;
res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
- np->tclass);
+ np->tclass, sk->sk_priority);
rcu_read_unlock();
return res;
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 89a4c7c2e25d01cc2ef18ae9c407f8f3319590d9..edadee4a7e76105f737d705052db8f5bbc6c0152 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -193,7 +193,7 @@ bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
* which are using proper atomic operations or spinlocks.
*/
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
- __u32 mark, struct ipv6_txoptions *opt, int tclass)
+ __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
struct net *net = sock_net(sk);
const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -258,7 +258,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
hdr->daddr = *first_hop;
skb->protocol = htons(ETH_P_IPV6);
- skb->priority = sk->sk_priority;
+ skb->priority = priority;
skb->mark = mark;
mtu = dst_mtu(dst);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 87f44d3250ee6844777a000760326d6ad6831de6..806064c2886777ad37a1f0b8406aa8bee7945723 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -512,7 +512,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass);
+ err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
+ sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -907,7 +908,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
- ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass);
+ ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass,
+ 0);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e5f2fc726a983579dddbd201ea5ee5eb371a50ac..dd860fea014843b350fd01108b78b538cb09ecbb 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -215,7 +215,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
rcu_read_lock();
res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
- tclass);
+ tclass, sk->sk_priority);
rcu_read_unlock();
return res;
}
--
2.23.0.351.gc4317032e6-goog
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH net 2/3] ipv6: tcp: provide sk->sk_priority to ctl packets
2019-09-24 15:01 [PATCH net 0/3] tcp: provide correct skb->priority Eric Dumazet
2019-09-24 15:01 ` [PATCH net 1/3] ipv6: add priority parameter to ip6_xmit() Eric Dumazet
@ 2019-09-24 15:01 ` Eric Dumazet
2019-09-24 15:01 ` [PATCH net 3/3] tcp: honor SO_PRIORITY in TIME_WAIT state Eric Dumazet
2019-09-27 10:05 ` [PATCH net 0/3] tcp: provide correct skb->priority David Miller
3 siblings, 0 replies; 5+ messages in thread
From: Eric Dumazet @ 2019-09-24 15:01 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet
We can populate skb->priority for some ctl packets
instead of always using zero.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv6/tcp_ipv6.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 806064c2886777ad37a1f0b8406aa8bee7945723..5f557bf27da2ba6bcc74034a53a3f76a99fdf9f4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -804,7 +804,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
- u8 tclass, __be32 label)
+ u8 tclass, __be32 label, u32 priority)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcphdr *t1;
@@ -909,7 +909,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass,
- 0);
+ priority);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -932,6 +932,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
struct sock *sk1 = NULL;
#endif
__be32 label = 0;
+ u32 priority = 0;
struct net *net;
int oif = 0;
@@ -992,6 +993,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
trace_tcp_send_reset(sk, skb);
if (np->repflow)
label = ip6_flowlabel(ipv6h);
+ priority = sk->sk_priority;
}
if (sk->sk_state == TCP_TIME_WAIT)
label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
@@ -1001,7 +1003,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
}
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
- label);
+ label, priority);
#ifdef CONFIG_TCP_MD5SIG
out:
@@ -1012,10 +1014,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
- __be32 label)
+ __be32 label, u32 priority)
{
tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
- tclass, label);
+ tclass, label, priority);
}
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
@@ -1027,7 +1029,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
- tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
+ tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), 0);
inet_twsk_put(tw);
}
@@ -1050,7 +1052,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
- 0, 0);
+ 0, 0, sk->sk_priority);
}
--
2.23.0.351.gc4317032e6-goog
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH net 3/3] tcp: honor SO_PRIORITY in TIME_WAIT state
2019-09-24 15:01 [PATCH net 0/3] tcp: provide correct skb->priority Eric Dumazet
2019-09-24 15:01 ` [PATCH net 1/3] ipv6: add priority parameter to ip6_xmit() Eric Dumazet
2019-09-24 15:01 ` [PATCH net 2/3] ipv6: tcp: provide sk->sk_priority to ctl packets Eric Dumazet
@ 2019-09-24 15:01 ` Eric Dumazet
2019-09-27 10:05 ` [PATCH net 0/3] tcp: provide correct skb->priority David Miller
3 siblings, 0 replies; 5+ messages in thread
From: Eric Dumazet @ 2019-09-24 15:01 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet
ctl packets sent on behalf of TIME_WAIT sockets currently
have a zero skb->priority, which can cause various problems.
In this patch we :
- add a tw_priority field in struct inet_timewait_sock.
- populate it from sk->sk_priority when a TIME_WAIT is created.
- For IPv4, change ip_send_unicast_reply() and its two
callers to propagate tw_priority correctly.
ip_send_unicast_reply() no longer changes sk->sk_priority.
- For IPv6, make sure TIME_WAIT sockets pass their tw_priority
field to tcp_v6_send_response() and tcp_v6_send_ack().
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/inet_timewait_sock.h | 1 +
net/ipv4/ip_output.c | 1 -
net/ipv4/tcp_ipv4.c | 4 ++++
net/ipv4/tcp_minisocks.c | 1 +
net/ipv6/tcp_ipv6.c | 6 ++++--
5 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index aef38c140014600dbf88b1d664bad1b0adf63668..dfd919b3119e8efcbc436a67e3e6fbd02091db10 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -71,6 +71,7 @@ struct inet_timewait_sock {
tw_pad : 2, /* 2 bits hole */
tw_tos : 8;
u32 tw_txhash;
+ u32 tw_priority;
struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb;
};
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a77c3a4c24de40ff6bf3fa9da9a018457139e2b5..28fca408812c5576fc4ea957c1c4dec97ec8faf3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1694,7 +1694,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
inet_sk(sk)->tos = arg->tos;
- sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
sk->sk_sndbuf = sysctl_wmem_default;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fd394ad179a008085b4e87215290f243ea1993b6..2ee45e3755e92e60b5e1810e2f68205221b8308d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -771,6 +771,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
transmit_time = tcp_transmit_time(sk);
}
ip_send_unicast_reply(ctl_sk,
@@ -866,6 +868,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
transmit_time = tcp_transmit_time(sk);
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 8bcaf2586b6892b52fc3b25545017ec21afb0bde..bb140a5db8c066e57f1018fd47bccd4628def642 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -266,6 +266,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_transparent = inet->transparent;
tw->tw_mark = sk->sk_mark;
+ tw->tw_priority = sk->sk_priority;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5f557bf27da2ba6bcc74034a53a3f76a99fdf9f4..e3d9f4559c99f252eba448845cce434bc53f3fd8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -995,8 +995,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
label = ip6_flowlabel(ipv6h);
priority = sk->sk_priority;
}
- if (sk->sk_state == TCP_TIME_WAIT)
+ if (sk->sk_state == TCP_TIME_WAIT) {
label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
+ priority = inet_twsk(sk)->tw_priority;
+ }
} else {
if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
label = ip6_flowlabel(ipv6h);
@@ -1029,7 +1031,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
- tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), 0);
+ tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority);
inet_twsk_put(tw);
}
--
2.23.0.351.gc4317032e6-goog
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH net 0/3] tcp: provide correct skb->priority
2019-09-24 15:01 [PATCH net 0/3] tcp: provide correct skb->priority Eric Dumazet
` (2 preceding siblings ...)
2019-09-24 15:01 ` [PATCH net 3/3] tcp: honor SO_PRIORITY in TIME_WAIT state Eric Dumazet
@ 2019-09-27 10:05 ` David Miller
3 siblings, 0 replies; 5+ messages in thread
From: David Miller @ 2019-09-27 10:05 UTC (permalink / raw)
To: edumazet; +Cc: netdev, eric.dumazet
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 24 Sep 2019 08:01:13 -0700
> SO_PRIORITY socket option requests TCP egress packets
> to contain a user provided value.
>
> TCP manages to send most packets with the requested values,
> notably for TCP_ESTABLISHED state, but fails to do so for
> few packets.
>
> These packets are control packets sent on behalf
> of SYN_RECV or TIME_WAIT states.
>
> Note that to test this with packetdrill, it is a bit
> of a hassle, since packetdrill can not verify priority
> of egress packets, other than indirect observations,
> using for example sch_prio on its tunnel device.
>
> The bad skb priorities cause problems for GCP,
> as this field is one of the keys used in routing.
Series applied, thanks Eric.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2019-09-27 10:05 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-24 15:01 [PATCH net 0/3] tcp: provide correct skb->priority Eric Dumazet
2019-09-24 15:01 ` [PATCH net 1/3] ipv6: add priority parameter to ip6_xmit() Eric Dumazet
2019-09-24 15:01 ` [PATCH net 2/3] ipv6: tcp: provide sk->sk_priority to ctl packets Eric Dumazet
2019-09-24 15:01 ` [PATCH net 3/3] tcp: honor SO_PRIORITY in TIME_WAIT state Eric Dumazet
2019-09-27 10:05 ` [PATCH net 0/3] tcp: provide correct skb->priority David Miller
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).