* [PATCH net-next] tcp: switch pacing timer to softirq based hrtimer
@ 2018-05-10 19:49 Eric Dumazet
  2018-05-10 20:55 ` Eric Dumazet
From: Eric Dumazet @ 2018-05-10 19:49 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet

linux-4.16 got support for softirq based hrtimers.
TCP can switch its pacing hrtimer to this variant, since this
avoids going through a tasklet and some atomic operations.
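
For orientation, the snippet below condenses the resulting pacing-timer
flow from the diff that follows (illustration only, not additional code;
the diff is authoritative):

/* tcp_init_xmit_timers(): a _SOFT mode makes the hrtimer callback run
 * from the HRTIMER softirq, so the tasklet bounce is no longer needed.
 */
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
	     HRTIMER_MODE_ABS_PINNED_SOFT);
tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;

/* tcp_internal_pacing(): arming the timer now takes a socket reference,
 * which the callback releases.
 */
hrtimer_start(&tcp_sk(sk)->pacing_timer,
	      ktime_add_ns(ktime_get(), len_ns),
	      HRTIMER_MODE_ABS_PINNED_SOFT);
sock_hold(sk);

/* tcp_pace_kick(): runs in softirq context, so it can call into the
 * TCP stack directly; if the socket is owned by the user,
 * tcp_tsq_handler() defers the write to tcp_release_cb() via
 * TCP_TSQ_DEFERRED instead.
 */
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{
	struct tcp_sock *tp = container_of(timer, struct tcp_sock,
					   pacing_timer);
	struct sock *sk = (struct sock *)tp;

	tcp_tsq_handler(sk);
	sock_put(sk);
	return HRTIMER_NORESTART;
}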

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_output.c | 69 ++++++++++++++++---------------------------
 net/ipv4/tcp_timer.c  |  2 +-
 2 files changed, 26 insertions(+), 45 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d07c0dcc99aaa55c4da963599c8286c8baa1f783..0d8f950a9006598c70dbf51e281a3fe32dfaa234 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -772,7 +772,7 @@ struct tsq_tasklet {
 };
 static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
 
-static void tcp_tsq_handler(struct sock *sk)
+static void tcp_tsq_write(struct sock *sk)
 {
 	if ((1 << sk->sk_state) &
 	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
@@ -789,6 +789,16 @@ static void tcp_tsq_handler(struct sock *sk)
 			       0, GFP_ATOMIC);
 	}
 }
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk))
+		tcp_tsq_write(sk);
+	else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+		sock_hold(sk);
+	bh_unlock_sock(sk);
+}
 /*
  * One tasklet per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
@@ -816,16 +826,7 @@ static void tcp_tasklet_func(unsigned long data)
 		smp_mb__before_atomic();
 		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
 
-		if (!sk->sk_lock.owned &&
-		    test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
-			bh_lock_sock(sk);
-			if (!sock_owned_by_user(sk)) {
-				clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
-				tcp_tsq_handler(sk);
-			}
-			bh_unlock_sock(sk);
-		}
-
+		tcp_tsq_handler(sk);
 		sk_free(sk);
 	}
 }
@@ -853,9 +854,10 @@ void tcp_release_cb(struct sock *sk)
 		nflags = flags & ~TCP_DEFERRED_ALL;
 	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
 
-	if (flags & TCPF_TSQ_DEFERRED)
-		tcp_tsq_handler(sk);
-
+	if (flags & TCPF_TSQ_DEFERRED) {
+		tcp_tsq_write(sk);
+		__sock_put(sk);
+	}
 	/* Here begins the tricky part :
 	 * We are called from release_sock() with :
 	 * 1) BH disabled
@@ -929,7 +931,7 @@ void tcp_wfree(struct sk_buff *skb)
 		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
 			goto out;
 
-		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
 		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
 		if (nval != oval)
 			continue;
@@ -948,37 +950,17 @@ void tcp_wfree(struct sk_buff *skb)
 	sk_free(sk);
 }
 
-/* Note: Called under hard irq.
- * We can not call TCP stack right away.
+/* Note: Called under soft irq.
+ * We can call TCP stack right away, unless socket is owned by user.
  */
 enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
 {
 	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
 	struct sock *sk = (struct sock *)tp;
-	unsigned long nval, oval;
 
-	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
-		struct tsq_tasklet *tsq;
-		bool empty;
+	tcp_tsq_handler(sk);
+	sock_put(sk);
 
-		if (oval & TSQF_QUEUED)
-			break;
-
-		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
-		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
-		if (nval != oval)
-			continue;
-
-		if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
-			break;
-		/* queue this socket to tasklet queue */
-		tsq = this_cpu_ptr(&tsq_tasklet);
-		empty = list_empty(&tsq->head);
-		list_add(&tp->tsq_node, &tsq->head);
-		if (empty)
-			tasklet_schedule(&tsq->tasklet);
-		break;
-	}
 	return HRTIMER_NORESTART;
 }
 
@@ -1011,7 +993,8 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 	do_div(len_ns, rate);
 	hrtimer_start(&tcp_sk(sk)->pacing_timer,
 		      ktime_add_ns(ktime_get(), len_ns),
-		      HRTIMER_MODE_ABS_PINNED);
+		      HRTIMER_MODE_ABS_PINNED_SOFT);
+	sock_hold(sk);
 }
 
 static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
@@ -1078,7 +1061,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	/* if no packet is in qdisc/device queue, then allow XPS to select
 	 * another queue. We can be called from tcp_tsq_handler()
-	 * which holds one reference to sk_wmem_alloc.
+	 * which holds one reference to sk.
 	 *
 	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
 	 * One way to get this would be to set skb->truesize = 2 on them.
@@ -2185,7 +2168,7 @@ static int tcp_mtu_probe(struct sock *sk)
 static bool tcp_pacing_check(const struct sock *sk)
 {
 	return tcp_needs_internal_pacing(sk) &&
-	       hrtimer_active(&tcp_sk(sk)->pacing_timer);
+	       hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
 }
 
 /* TCP Small Queues :
@@ -2365,8 +2348,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 					  skb, limit, mss_now, gfp)))
 			break;
 
-		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
-			clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
 		if (tcp_small_queue_check(sk, skb, 0))
 			break;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f7d944855f8ebd0a312fe73a53a56ab8d451ee44..92bdf64fffae3a5be291ca419eb21276b4c8cbae 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -713,6 +713,6 @@ void tcp_init_xmit_timers(struct sock *sk)
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
 	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_ABS_PINNED);
+		     HRTIMER_MODE_ABS_PINNED_SOFT);
 	tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
 }
-- 
2.17.0.441.gb46fe60e1d-goog


* Re: [PATCH net-next] tcp: switch pacing timer to softirq based hrtimer
  2018-05-10 19:49 [PATCH net-next] tcp: switch pacing timer to softirq based hrtimer Eric Dumazet
@ 2018-05-10 20:55 ` Eric Dumazet
  2018-05-10 22:00   ` David Miller
From: Eric Dumazet @ 2018-05-10 20:55 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller; +Cc: netdev



On 05/10/2018 12:49 PM, Eric Dumazet wrote:
> linux-4.16 got support for softirq based hrtimers.
> TCP can switch its pacing hrtimer to this variant, since this
> avoids going through a tasklet and some atomic operations.
> 

I need to send a V2, adding a test of the hrtimer_cancel() return value
in tcp_clear_xmit_timers(), so that the socket reference taken when the
timer was armed is released if the timer was still pending.
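
Something along these lines in include/net/tcp.h (a rough, untested
sketch of that idea; the actual V2 may differ):

static inline void tcp_clear_xmit_timers(struct sock *sk)
{
	/* If the pacing timer was still armed, drop the socket reference
	 * taken when it was started, since tcp_pace_kick() will no
	 * longer run to release it.
	 */
	if (hrtimer_cancel(&tcp_sk(sk)->pacing_timer))
		__sock_put(sk);

	inet_csk_clear_xmit_timers(sk);
}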


* Re: [PATCH net-next] tcp: switch pacing timer to softirq based hrtimer
  2018-05-10 20:55 ` Eric Dumazet
@ 2018-05-10 22:00   ` David Miller
From: David Miller @ 2018-05-10 22:00 UTC (permalink / raw)
  To: eric.dumazet; +Cc: edumazet, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 10 May 2018 13:55:00 -0700

> 
> 
> On 05/10/2018 12:49 PM, Eric Dumazet wrote:
>> linux-4.16 got support for softirq based hrtimers.
>> TCP can switch its pacing hrtimer to this variant, since this
>> avoids going through a tasklet and some atomic operations.
>> 
> 
> I need to send a V2, adding a test of hrtimer_cancel() return value
> in tcp_clear_xmit_timers() to eventually release the socket reference.

Ok.
