All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] fs: eventpoll: add empty event
@ 2021-09-27 20:29 Johannes Lundberg
  2021-09-27 20:47 ` Eric Dumazet
  0 siblings, 1 reply; 4+ messages in thread
From: Johannes Lundberg @ 2021-09-27 20:29 UTC (permalink / raw)
  To: linux-kernel
  Cc: Johannes Lundberg, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Hideaki YOSHIFUJI, David Ahern, Paolo Abeni, Florian Westphal,
	Alexander Aring, Tonghao Zhang, Yangbo Lu, Thomas Gleixner,
	netdev

The EPOLLEMPTY event will trigger when the TCP write buffer becomes
empty, i.e., when all outgoing data have been ACKed.

The need for this functionality comes from a business requirement
of measuring with higher precision how much time is spent
transmitting data to a client. For reference, similar functionality
was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.

Signed-off-by: Johannes Lundberg <jlundberg@llnw.com>
---
 include/net/sock.h             | 11 +++++++++++
 include/uapi/linux/eventpoll.h |  1 +
 net/core/sock.c                |  5 +++++
 net/core/stream.c              | 14 ++++++++++++++
 net/ipv4/tcp.c                 |  5 +++++
 5 files changed, 36 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index c005c3c750e8..9047a9e225a9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -516,6 +516,7 @@ struct sock {
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk);
 	void			(*sk_write_space)(struct sock *sk);
+	void			(*sk_empty)(struct sock *sk);
 	void			(*sk_error_report)(struct sock *sk);
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
@@ -965,6 +966,7 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val)
 	WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
 }
 
+void sk_stream_empty(struct sock *sk);
 void sk_stream_write_space(struct sock *sk);
 
 /* OOB backlog add */
@@ -1288,6 +1290,11 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 
 INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
 
+static inline bool sk_stream_is_empty(const struct sock *sk)
+{
+	return (sk->sk_wmem_queued == 0);
+}
+
 static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
 {
 	if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1559,6 +1566,10 @@ DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sk_wmem_queued_add(sk, -skb->truesize);
+
+	if (sk_stream_is_empty(sk))
+		sk->sk_empty(sk);
+
 	sk_mem_uncharge(sk, skb->truesize);
 	if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
 	    !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..aab9f1f624d0 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -39,6 +39,7 @@
 #define EPOLLWRNORM	(__force __poll_t)0x00000100
 #define EPOLLWRBAND	(__force __poll_t)0x00000200
 #define EPOLLMSG	(__force __poll_t)0x00000400
+#define EPOLLEMPTY	(__force __poll_t)0x00000800
 #define EPOLLRDHUP	(__force __poll_t)0x00002000
 
 /* Set exclusive wakeup mode for the target file descriptor */
diff --git a/net/core/sock.c b/net/core/sock.c
index 512e629f9780..f917791d8149 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3062,6 +3062,10 @@ static void sock_def_write_space(struct sock *sk)
 	rcu_read_unlock();
 }
 
+static void sock_def_empty(struct sock *sk)
+{
+}
+
 static void sock_def_destruct(struct sock *sk)
 {
 }
@@ -3136,6 +3140,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_state_change	=	sock_def_wakeup;
 	sk->sk_data_ready	=	sock_def_readable;
 	sk->sk_write_space	=	sock_def_write_space;
+	sk->sk_empty		=	sock_def_empty;
 	sk->sk_error_report	=	sock_def_error_report;
 	sk->sk_destruct		=	sock_def_destruct;
 
diff --git a/net/core/stream.c b/net/core/stream.c
index 4f1d4aa5fb38..c7e4135542a2 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -21,6 +21,20 @@
 #include <linux/wait.h>
 #include <net/sock.h>
 
+void sk_stream_empty(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+	struct socket_wq *wq;
+
+	if (sk_stream_is_empty(sk) && sock) {
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (skwq_has_sleeper(wq))
+			wake_up_interruptible_poll(&wq->wait, EPOLLEMPTY);
+		rcu_read_unlock();
+	}
+}
+
 /**
  * sk_stream_write_space - stream socket write_space callback.
  * @sk: socket
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..550bae79af06 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -453,6 +453,8 @@ void tcp_init_sock(struct sock *sk)
 	tp->tsoffset = 0;
 	tp->rack.reo_wnd_steps = 1;
 
+	sk->sk_empty = sk_stream_empty;
+
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
@@ -561,6 +563,9 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		    tp->urg_data)
 			target++;
 
+		if (sk_stream_is_empty(sk))
+			mask |= EPOLLEMPTY;
+
 		if (tcp_stream_is_readable(sk, target))
 			mask |= EPOLLIN | EPOLLRDNORM;
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] fs: eventpoll: add empty event
  2021-09-27 20:29 [PATCH] fs: eventpoll: add empty event Johannes Lundberg
@ 2021-09-27 20:47 ` Eric Dumazet
  2021-09-27 21:17   ` Johannes Lundberg
  0 siblings, 1 reply; 4+ messages in thread
From: Eric Dumazet @ 2021-09-27 20:47 UTC (permalink / raw)
  To: Johannes Lundberg
  Cc: LKML, David S. Miller, Jakub Kicinski, Hideaki YOSHIFUJI,
	David Ahern, Paolo Abeni, Florian Westphal, Alexander Aring,
	Tonghao Zhang, Yangbo Lu, Thomas Gleixner, netdev,
	Willem de Bruijn

On Mon, Sep 27, 2021 at 1:30 PM Johannes Lundberg <jlundberg@llnw.com> wrote:
>
> The EPOLLEMPTY event will trigger when the TCP write buffer becomes
> empty, i.e., when all outgoing data have been ACKed.
>
> The need for this functionality comes from a business requirement
> of measuring with higher precision how much time is spent
> transmitting data to a client. For reference, similar functionality
> was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.


Adding yet another indirect call [1] in TCP fast path, for something
(measuring with higher precision..)
which is already implemented differently in TCP stack [2] is not desirable.

Our timestamping infrastructure should be ported to FreeBSD instead :)

[1] CONFIG_RETPOLINE=y

[2] Refs :
   commit e1c8a607b28190cd09a271508aa3025d3c2f312e
      net-timestamp: ACK timestamp for bytestreams
    tools/testing/selftests/net/txtimestamp.c

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] fs: eventpoll: add empty event
  2021-09-27 20:47 ` Eric Dumazet
@ 2021-09-27 21:17   ` Johannes Lundberg
  2021-09-27 21:33     ` Eric Dumazet
  0 siblings, 1 reply; 4+ messages in thread
From: Johannes Lundberg @ 2021-09-27 21:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: LKML, David S. Miller, Jakub Kicinski, Hideaki YOSHIFUJI,
	David Ahern, Paolo Abeni, Florian Westphal, Alexander Aring,
	Tonghao Zhang, Yangbo Lu, Thomas Gleixner, netdev,
	Willem de Bruijn


On 9/27/21 1:47 PM, Eric Dumazet wrote:
> On Mon, Sep 27, 2021 at 1:30 PM Johannes Lundberg <jlundberg@llnw.com> wrote:
>> The EPOLLEMPTY event will trigger when the TCP write buffer becomes
>> empty, i.e., when all outgoing data have been ACKed.
>>
>> The need for this functionality comes from a business requirement
>> of measuring with higher precision how much time is spent
>> transmitting data to a client. For reference, similar functionality
>> was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.
>
> Adding yet another indirect call [1] in TCP fast path, for something
> (measuring with higher precision..)
> which is already implemented differently in TCP stack [2] is not desirable.
>
> Our timestamping infrastructure should be ported to FreeBSD instead :)
>
> [1] CONFIG_RETPOLINE=y
>
> [2] Refs :
>     commit e1c8a607b28190cd09a271508aa3025d3c2f312e
>        net-timestamp: ACK timestamp for bytestreams
>      tools/testing/selftests/net/txtimestamp.c

Hi Eric

Thanks for the feedback! If there's a way to achieve the same thing with 
current Linux I'm all for it. I'll look into how to use timestamps for this.


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] fs: eventpoll: add empty event
  2021-09-27 21:17   ` Johannes Lundberg
@ 2021-09-27 21:33     ` Eric Dumazet
  0 siblings, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2021-09-27 21:33 UTC (permalink / raw)
  To: Johannes Lundberg, Eric Dumazet
  Cc: LKML, David S. Miller, Jakub Kicinski, Hideaki YOSHIFUJI,
	David Ahern, Paolo Abeni, Florian Westphal, Alexander Aring,
	Tonghao Zhang, Yangbo Lu, Thomas Gleixner, netdev,
	Willem de Bruijn



On 9/27/21 2:17 PM, Johannes Lundberg wrote:
> 
> On 9/27/21 1:47 PM, Eric Dumazet wrote:
>> On Mon, Sep 27, 2021 at 1:30 PM Johannes Lundberg <jlundberg@llnw.com> wrote:
>>> The EPOLLEMPTY event will trigger when the TCP write buffer becomes
>>> empty, i.e., when all outgoing data have been ACKed.
>>>
>>> The need for this functionality comes from a business requirement
>>> of measuring with higher precision how much time is spent
>>> transmitting data to a client. For reference, similar functionality
>>> was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.
>>
>> Adding yet another indirect call [1] in TCP fast path, for something
>> (measuring with higher precision..)
>> which is already implemented differently in TCP stack [2] is not desirable.
>>
>> Our timestamping infrastructure should be ported to FreeBSD instead :)
>>
>> [1] CONFIG_RETPOLINE=y
>>
>> [2] Refs :
>>     commit e1c8a607b28190cd09a271508aa3025d3c2f312e
>>        net-timestamp: ACK timestamp for bytestreams
>>      tools/testing/selftests/net/txtimestamp.c
> 
> Hi Eric
> 
> Thanks for the feedback! If there's a way to achieve the same thing with current Linux I'm all for it. I'll look into how to use timestamps for this.
> 

You are welcome !

Note that timestamping allows to trigger many events, even if write queue is not empty.

This is particularly useful when an application does not want a write queue to be drained,
since this would add transmit stalls.

Also, since the events are time stamped exactly when the relevant ACK are processed,
they are more accurate than something based on epoll, since I guess you would
get timestamps after a thread wakeup.


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-09-27 21:34 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-27 20:29 [PATCH] fs: eventpoll: add empty event Johannes Lundberg
2021-09-27 20:47 ` Eric Dumazet
2021-09-27 21:17   ` Johannes Lundberg
2021-09-27 21:33     ` Eric Dumazet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.