All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] net: udp: add socket option to report RX queue level
@ 2017-03-17 21:13 Chris Kuiper
  2017-03-17 22:01 ` Eric Dumazet
  0 siblings, 1 reply; 5+ messages in thread
From: Chris Kuiper @ 2017-03-17 21:13 UTC (permalink / raw)
  To: netdev; +Cc: pgynther, Chris Kuiper

This adds a new socket option "SO_RXQ_ALLOC" that enables providing
the RX queue buffer allocation as ancillary data from the recvmsg()
system call. The value reported is a byte number and together with
the RX queue size (obtained via getsockopt(SO_RCVBUF) can be used to
calculate a percentage value on how full the socket buffer is.
---
 arch/alpha/include/uapi/asm/socket.h   |  2 ++
 arch/avr32/include/uapi/asm/socket.h   |  2 ++
 arch/frv/include/uapi/asm/socket.h     |  3 ++-
 arch/ia64/include/uapi/asm/socket.h    |  2 ++
 arch/m32r/include/uapi/asm/socket.h    |  2 ++
 arch/mips/include/uapi/asm/socket.h    |  2 ++
 arch/mn10300/include/uapi/asm/socket.h |  2 ++
 arch/parisc/include/uapi/asm/socket.h  |  2 ++
 arch/powerpc/include/uapi/asm/socket.h |  2 ++
 arch/s390/include/uapi/asm/socket.h    |  2 ++
 arch/sparc/include/uapi/asm/socket.h   |  2 ++
 arch/xtensa/include/uapi/asm/socket.h  |  2 ++
 include/net/sock.h                     |  9 +++++++++
 include/uapi/asm-generic/socket.h      |  2 ++
 net/core/sock.c                        |  8 ++++++++
 net/ipv4/udp.c                         |  1 +
 net/ipv6/udp.c                         |  1 +
 net/socket.c                           | 12 ++++++++++++
 18 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index afc901b7a6f6..f6e945dccdc7 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 5a650426f357..b1023265a58e 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index 81e03530ed39..136b6eb01801 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -92,5 +92,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
-#endif /* _ASM_SOCKET_H */
+#define SO_RXQ_ALLOC		55
 
+#endif /* _ASM_SOCKET_H */
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 57feb0c1f7d7..b0b39ef019cf 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -101,4 +101,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 5853f8e92c20..f70966ea464e 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 566ecdcb5b4b..03932e896273 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -110,4 +110,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 0e12527c4b0e..d7b8dc2f454b 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 7a109b73ddf7..9062b32b828d 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -91,4 +91,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	0x402F
 
+#define SO_RXQ_ALLOC		0x4030
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index 44583a52f882..6971a55b3b2f 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index b24a64cbfeb1..30c7b63e3ed6 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -98,4 +98,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index a25dc32f5d6a..f1d1fb318be0 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -88,6 +88,8 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	0x0038
 
+#define SO_RXQ_ALLOC		0x0039
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index 9fdbe1fe0473..e9637dd7c5a6 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -103,4 +103,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 08142be8938e..6297d65078bf 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -758,6 +758,7 @@ enum sock_flags {
 	SOCK_TIMESTAMPING_RX_SOFTWARE,  /* %SOF_TIMESTAMPING_RX_SOFTWARE */
 	SOCK_FASYNC, /* fasync() active */
 	SOCK_RXQ_OVFL,
+	SOCK_RXQ_ALLOC,
 	SOCK_ZEROCOPY, /* buffers from userspace */
 	SOCK_WIFI_STATUS, /* push wifi status to userspace */
 	SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS.
@@ -2253,6 +2254,14 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
 		sk->sk_stamp = skb->tstamp;
 }
 
+void __sock_recv_alloc(struct msghdr *msg, struct sock *sk);
+
+static inline void sock_recv_alloc(struct msghdr *msg, struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_RXQ_ALLOC))
+		__sock_recv_alloc(msg, sk);
+}
+
 void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);
 
 /**
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 2c748ddad5f8..b0f569ceca03 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -94,4 +94,6 @@
 
 #define SCM_TIMESTAMPING_OPT_STATS	54
 
+#define SO_RXQ_ALLOC		55
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index a83731c36761..486d462349fb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1006,6 +1006,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 		break;
 
+	case SO_RXQ_ALLOC:
+		sock_valbool_flag(sk, SOCK_RXQ_ALLOC, valbool);
+		break;
+
 	case SO_WIFI_STATUS:
 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 		break;
@@ -1263,6 +1267,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
 		break;
 
+	case SO_RXQ_ALLOC:
+		v.val = sock_flag(sk, SOCK_RXQ_ALLOC);
+		break;
+
 	case SO_WIFI_STATUS:
 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
 		break;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ea6e4cff9faf..0adf2d3840f1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1470,6 +1470,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 			      UDP_MIB_INDATAGRAMS, is_udplite);
 
 	sock_recv_ts_and_drops(msg, sk, skb);
+	sock_recv_alloc(msg, sk);
 
 	/* Copy the address. */
 	if (sin) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 08a188ffe070..c95bbd65444f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -413,6 +413,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	}
 
 	sock_recv_ts_and_drops(msg, sk, skb);
+	sock_recv_alloc(msg, sk);
 
 	/* Copy the address. */
 	if (msg->msg_name) {
diff --git a/net/socket.c b/net/socket.c
index e034fe4164be..13ee85e4c634 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -734,6 +734,18 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
 
+void __sock_recv_alloc(struct msghdr *msg, struct sock *sk)
+{
+	__u32 rmem_alloc;
+
+	if (sock_flag(sk, SOCK_RXQ_ALLOC)) {
+		rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
+		put_cmsg(msg, SOL_SOCKET, SO_RXQ_ALLOC,
+				sizeof(rmem_alloc), &rmem_alloc);
+	}
+}
+EXPORT_SYMBOL_GPL(__sock_recv_alloc);
+
 static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 				     int flags)
 {
-- 
2.12.0.367.g23dc2f6d3c-goog

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] net: udp: add socket option to report RX queue level
  2017-03-17 21:13 [PATCH] net: udp: add socket option to report RX queue level Chris Kuiper
@ 2017-03-17 22:01 ` Eric Dumazet
  2017-03-18  4:01   ` Josh Hunt
  2017-03-28  1:08   ` Chris Kuiper
  0 siblings, 2 replies; 5+ messages in thread
From: Eric Dumazet @ 2017-03-17 22:01 UTC (permalink / raw)
  To: Chris Kuiper, Josh Hunt; +Cc: netdev, pgynther

On Fri, 2017-03-17 at 14:13 -0700, Chris Kuiper wrote:
> This adds a new socket option "SO_RXQ_ALLOC" that enables providing
> the RX queue buffer allocation as ancillary data from the recvmsg()
> system call. The value reported is a byte number and together with
> the RX queue size (obtained via getsockopt(SO_RCVBUF) can be used to
> calculate a percentage value on how full the socket buffer is.
> ---

Seems a lot of overhead, and only UDP would be supported.

I very much prefer Josh Hunt proposal
( https://patchwork.ozlabs.org/patch/738250/ )

Ie using a separate getsockopt() call instead of adding code to UDP fast
path ?

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] net: udp: add socket option to report RX queue level
  2017-03-17 22:01 ` Eric Dumazet
@ 2017-03-18  4:01   ` Josh Hunt
  2017-03-28  1:08   ` Chris Kuiper
  1 sibling, 0 replies; 5+ messages in thread
From: Josh Hunt @ 2017-03-18  4:01 UTC (permalink / raw)
  To: Eric Dumazet, Chris Kuiper; +Cc: netdev, pgynther

On 03/17/2017 05:01 PM, Eric Dumazet wrote:
> On Fri, 2017-03-17 at 14:13 -0700, Chris Kuiper wrote:
>> This adds a new socket option "SO_RXQ_ALLOC" that enables providing
>> the RX queue buffer allocation as ancillary data from the recvmsg()
>> system call. The value reported is a byte number and together with
>> the RX queue size (obtained via getsockopt(SO_RCVBUF) can be used to
>> calculate a percentage value on how full the socket buffer is.
>> ---
>
> Seems a lot of overhead, and only UDP would be supported.
>
> I very much prefer Josh Hunt proposal
> ( https://patchwork.ozlabs.org/patch/738250/ )
>
> Ie using a separate getsockopt() call instead of adding code to UDP fast
> path ?
>

Yes, let me know and I can send a patch adding a sockopt to return all 
of the meminfo vars.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] net: udp: add socket option to report RX queue level
  2017-03-17 22:01 ` Eric Dumazet
  2017-03-18  4:01   ` Josh Hunt
@ 2017-03-28  1:08   ` Chris Kuiper
  2017-03-31 16:13     ` Eric Dumazet
  1 sibling, 1 reply; 5+ messages in thread
From: Chris Kuiper @ 2017-03-28  1:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Josh Hunt, netdev, Petri Gynther

Sorry, I have been transferring jobs and had no time to look at this.

Josh Hunt's change seems to solve a different problem. I was looking
for something that works the same way as SO_RXQ_OVERFL, providing
information as ancillary data to the recvmsg() call. The problem with
SO_RXQ_OVERFL alone is that it tells you when things have already gone
wrong (you dropped data), so the new option SO_RX_ALLOC acts as a
leading indicator to check if you are getting close to hitting such
problem.

Regarding only UDP being supported, it is only meaningful for UDP. TCP
doesn't drop data and if its buffer gets full it just stops the sender
from sending more. The buffer level in that case doesn't even tell you
the whole picture, since it doesn't include any information on how
much additional buffering is done at the sender side.

In terms of "a lot overhead", logically the overhead of adding
additional getsockopt() calls after each recvmsg() is significantly
larger than just getting the information as part of recvmsg(). If you
don't need it, then don't enable this option. Admitted you can reduce
the frequency of calling getsockopt() relative to recvmsg(), but that
also increases your risk of missing the point where data is dropped.

-Chris


On Fri, Mar 17, 2017 at 3:01 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2017-03-17 at 14:13 -0700, Chris Kuiper wrote:
>> This adds a new socket option "SO_RXQ_ALLOC" that enables providing
>> the RX queue buffer allocation as ancillary data from the recvmsg()
>> system call. The value reported is a byte number and together with
>> the RX queue size (obtained via getsockopt(SO_RCVBUF) can be used to
>> calculate a percentage value on how full the socket buffer is.
>> ---
>
> Seems a lot of overhead, and only UDP would be supported.
>
> I very much prefer Josh Hunt proposal
> ( https://patchwork.ozlabs.org/patch/738250/ )
>
> Ie using a separate getsockopt() call instead of adding code to UDP fast
> path ?
>
>
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] net: udp: add socket option to report RX queue level
  2017-03-28  1:08   ` Chris Kuiper
@ 2017-03-31 16:13     ` Eric Dumazet
  0 siblings, 0 replies; 5+ messages in thread
From: Eric Dumazet @ 2017-03-31 16:13 UTC (permalink / raw)
  To: Chris Kuiper; +Cc: Josh Hunt, netdev, Petri Gynther

Please do not top post on netdev

On Mon, 2017-03-27 at 18:08 -0700, Chris Kuiper wrote:
> Sorry, I have been transferring jobs and had no time to look at this.
> 
> Josh Hunt's change seems to solve a different problem. I was looking
> for something that works the same way as SO_RXQ_OVERFL, providing
> information as ancillary data to the recvmsg() call. The problem with
> SO_RXQ_OVERFL alone is that it tells you when things have already gone
> wrong (you dropped data), so the new option SO_RX_ALLOC acts as a
> leading indicator to check if you are getting close to hitting such
> problem.

SO_RXQ_OVERFL gives a very precise info for every skb that was queued.

This is a different indicator, because it can tell you where is the
discontinuity point at the time skb were queued, not at the time they
are dequeued.

Just tune SO_RCVBUF to not even have to care about this.

By the time you sample the queue occupancy, the information might be
completely stale and queue already overflowed.

There is very little point having a super system call gathering all kind
of (stale) info

> 
> Regarding only UDP being supported, it is only meaningful for UDP. TCP
> doesn't drop data and if its buffer gets full it just stops the sender
> from sending more. The buffer level in that case doesn't even tell you
> the whole picture, since it doesn't include any information on how
> much additional buffering is done at the sender side.
> 

We have more protocols than UDP and TCP in linux kernel.

> In terms of "a lot overhead", logically the overhead of adding
> additional getsockopt() calls after each recvmsg() is significantly
> larger than just getting the information as part of recvmsg(). If you
> don't need it, then don't enable this option. Admitted you can reduce
> the frequency of calling getsockopt() relative to recvmsg(), but that
> also increases your risk of missing the point where data is dropped.

Your proposal adds overhead for all UDP recvmsg() calls, while most of
them absolutely not care about overruns. There is little you can do if
you are under attack or if your SO_RCVBUF is too small for the workload.

Some people work hard to reach 2 Millions UDP recvmsg() calls per second
on a single UDP socket, so everything added in fast path will be
scrutinized.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-03-31 16:13 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-17 21:13 [PATCH] net: udp: add socket option to report RX queue level Chris Kuiper
2017-03-17 22:01 ` Eric Dumazet
2017-03-18  4:01   ` Josh Hunt
2017-03-28  1:08   ` Chris Kuiper
2017-03-31 16:13     ` Eric Dumazet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.