netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v6] net: Option to retrieve the pending data from send queue of UDP socket
@ 2020-05-02  8:28 Lese Doru Calin
  2020-05-03 19:11 ` Willem de Bruijn
  0 siblings, 1 reply; 2+ messages in thread
From: Lese Doru Calin @ 2020-05-02  8:28 UTC (permalink / raw)
  To: David Miller, netdev
  Cc: Paolo Abeni, Alexey Kuznetsov, Hideaki YOSHIFUJI, Eric Dumazet

In this year's edition of GSoC, there is a project idea for CRIU to add support
for checkpoint/restore of cork-ed UDP sockets. But to add it, the kernel API needs
to be extended.
This is what this patch does. It adds a new command, called SIOUDPPENDGET, to the 
ioctl syscall regarding UDP sockets, which stores the pending data from the write
queue and the destination address in a struct msghdr. The arg for ioctl needs to 
be a pointer to a user space struct msghdr. The syscall returns the number of writed
bytes, if successful, or error. To retrive the data requires the CAP_NET_ADMIN
capability.

Signed-off-by: Lese Doru Calin <lesedorucalin01@gmail.com>
---
 include/linux/socket.h       |   2 +
 include/uapi/linux/sockios.h |   3 +
 net/ipv4/udp.c               | 145 +++++++++++++++++++++++++++++++----
 net/socket.c                 |   4 +-
 4 files changed, 139 insertions(+), 15 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 54338fac45cb..632ba0ea6709 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -351,6 +351,8 @@ struct ucred {
 #define IPX_TYPE	1
 
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr);
+extern int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
+			     void __user *uaddr, int __user *ulen);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
 struct timespec64;
diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h
index 7d1bccbbef78..3639fa906604 100644
--- a/include/uapi/linux/sockios.h
+++ b/include/uapi/linux/sockios.h
@@ -153,6 +153,9 @@
 #define SIOCSHWTSTAMP	0x89b0		/* set and get config		*/
 #define SIOCGHWTSTAMP	0x89b1		/* get config			*/
 
+/* UDP socket calls*/
+#define SIOUDPPENDGET 0x89C0	/* get the pending data from write queue */
+
 /* Device private ioctl calls */
 
 /*
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 32564b350823..f729a5e7f90b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1620,6 +1620,133 @@ static int first_packet_length(struct sock *sk)
 	return res;
 }
 
+static void udp_set_source_addr(struct sock *sk, struct msghdr *msg,
+				int *addr_len, u32 addr, u16 port)
+{
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_port = port;
+		sin->sin_addr.s_addr = addr;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+		*addr_len = sizeof(*sin);
+
+		if (cgroup_bpf_enabled)
+			BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+					 (struct sockaddr *)sin);
+	}
+}
+
+static int udp_peek_sndq(struct sock *sk, struct msghdr *msg, int off, int len)
+{
+	int copy, copied = 0, err = 0;
+	struct sk_buff *skb;
+
+	skb_queue_walk(&sk->sk_write_queue, skb) {
+		copy = len - copied;
+		if (copy > skb->len - off)
+			copy = skb->len - off;
+
+		err = skb_copy_datagram_msg(skb, off, msg, copy);
+		if (err)
+			break;
+
+		copied += copy;
+		if (len <= copied)
+			break;
+	}
+	return err ?: copied;
+}
+
+static int udp_get_pending_write_queue(struct sock *sk, struct msghdr *msg,
+				       int *addr_len)
+{
+	int err = 0, off = sizeof(struct udphdr);
+	struct inet_sock *inet = inet_sk(sk);
+	struct udp_sock *up = udp_sk(sk);
+	struct flowi4 *fl4;
+	struct flowi6 *fl6;
+
+	switch (up->pending) {
+	case 0:
+		return -ENODATA;
+	case AF_INET:
+		off += sizeof(struct iphdr);
+		fl4 = &inet->cork.fl.u.ip4;
+		udp_set_source_addr(sk, msg, addr_len,
+				    fl4->daddr, fl4->fl4_dport);
+		break;
+	case AF_INET6:
+		off += sizeof(struct ipv6hdr);
+		if (msg->msg_name) {
+			DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6,
+					 msg->msg_name);
+
+			fl6 = &inet->cork.fl.u.ip6;
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_port = fl6->fl6_dport;
+			sin6->sin6_flowinfo = 0;
+			sin6->sin6_addr = fl6->daddr;
+			sin6->sin6_scope_id = fl6->flowi6_oif;
+			*addr_len = sizeof(*sin6);
+
+			if (cgroup_bpf_enabled)
+				BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+						(struct sockaddr *)sin6);
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	lock_sock(sk);
+	if (unlikely(!up->pending)) {
+		release_sock(sk);
+		return -EINVAL;
+	}
+	err = udp_peek_sndq(sk, msg, off, msg_data_left(msg));
+	release_sock(sk);
+	return err;
+}
+
+static int prep_msghdr_recv_pending(struct sock *sk, void __user *argp)
+{
+	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
+	struct user_msghdr __user *msg;
+	struct sockaddr __user *uaddr;
+	struct sockaddr_storage addr;
+	struct msghdr msg_sys;
+	int __user *uaddr_len;
+	int err = 0, len = 0;
+
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!argp)
+		return -EINVAL;
+
+	msg = (struct user_msghdr __user *)argp;
+	err = recvmsg_copy_msghdr(&msg_sys, msg, 0, &uaddr, &iov);
+	if (err < 0)
+		return err;
+
+	uaddr_len = &msg->msg_namelen;
+	msg_sys.msg_name = &addr;
+	msg_sys.msg_flags = 0;
+
+	err = udp_get_pending_write_queue(sk, &msg_sys, &len);
+	msg_sys.msg_namelen = len;
+	len = err;
+
+	if (uaddr && err >= 0)
+		err = move_addr_to_user(&addr, msg_sys.msg_namelen,
+					uaddr, uaddr_len);
+
+	kfree(iov);
+	return err < 0 ? err : len;
+}
+
 /*
  *	IOCTL requests applicable to the UDP protocol
  */
@@ -1641,6 +1768,9 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 		return put_user(amount, (int __user *)arg);
 	}
 
+	case SIOUDPPENDGET:
+		return prep_msghdr_recv_pending(sk, (void __user *)arg);
+
 	default:
 		return -ENOIOCTLCMD;
 	}
@@ -1729,7 +1859,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		int flags, int *addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
 	struct sk_buff *skb;
 	unsigned int ulen, copied;
 	int off, err, peeking = flags & MSG_PEEK;
@@ -1794,18 +1923,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 
 	sock_recv_ts_and_drops(msg, sk, skb);
 
-	/* Copy the address. */
-	if (sin) {
-		sin->sin_family = AF_INET;
-		sin->sin_port = udp_hdr(skb)->source;
-		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
-		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
-		*addr_len = sizeof(*sin);
-
-		if (cgroup_bpf_enabled)
-			BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
-							(struct sockaddr *)sin);
-	}
+	udp_set_source_addr(sk, msg, addr_len, ip_hdr(skb)->saddr,
+			    udp_hdr(skb)->source);
 
 	if (udp_sk(sk)->gro_enabled)
 		udp_cmsg_recv(msg, sk, skb);
diff --git a/net/socket.c b/net/socket.c
index 2dd739fba866..bd25d528c9a0 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -217,8 +217,8 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *k
  *	specified. Zero is returned for a success.
  */
 
-static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
-			     void __user *uaddr, int __user *ulen)
+int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
+		      void __user *uaddr, int __user *ulen)
 {
 	int err;
 	int len;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v6] net: Option to retrieve the pending data from send queue of UDP socket
  2020-05-02  8:28 [PATCH v6] net: Option to retrieve the pending data from send queue of UDP socket Lese Doru Calin
@ 2020-05-03 19:11 ` Willem de Bruijn
  0 siblings, 0 replies; 2+ messages in thread
From: Willem de Bruijn @ 2020-05-03 19:11 UTC (permalink / raw)
  To: Lese Doru Calin
  Cc: David Miller, Network Development, Paolo Abeni, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Eric Dumazet

On Sat, May 2, 2020 at 4:29 AM Lese Doru Calin
<lesedorucalin01@gmail.com> wrote:
>
> In this year's edition of GSoC, there is a project idea for CRIU to add support
> for checkpoint/restore of cork-ed UDP sockets. But to add it, the kernel API needs
> to be extended.
> This is what this patch does. It adds a new command, called SIOUDPPENDGET, to the
> ioctl syscall regarding UDP sockets, which stores the pending data from the write
> queue and the destination address in a struct msghdr. The arg for ioctl needs to
> be a pointer to a user space struct msghdr. The syscall returns the number of writed
> bytes, if successful, or error. To retrive the data requires the CAP_NET_ADMIN
> capability.
>
> Signed-off-by: Lese Doru Calin <lesedorucalin01@gmail.com>

A few concerns:

- why call the BPF recvmsg hook from this ioctl function?

- The patch saves msg_name, but not other relevant state stored in
inet_cork, such as state passed through cmsg. Without that, this might
introduce more subtle bugs than not checkpointing at all.

- Duplicating usercopy code from net/socket.c is fragile and adds
maintenance burden. If anything such refactoring should stay inside
that file. To some extent the same applies to udp_peek_sndq and
net/core/datagram.c.

Less important

- a getsockopt is generally preferred over extending ioctl.

Overall, I'm not sure that this is the right approach. It is quite a
bit of code, for a mostly hypothetical omission to CRIU?

Since these are unreliable datagrams, it is arguably sufficient to
just drop a datagram if checkpoint/restore happened on a corked
socket. That might be simpler.

As long as CRIU is behind a static branch and thus adds no cycles in
the common hot path, I personally don't mind having it in the normal
send/recv path as much -- if that allows it to reuse existing code,
e.g., for copy to user. The MSG_ERRQUEUE path in particular is already
a slow path to loop packets from the send patch back to the sending
process.


> ---
>  include/linux/socket.h       |   2 +
>  include/uapi/linux/sockios.h |   3 +
>  net/ipv4/udp.c               | 145 +++++++++++++++++++++++++++++++----
>  net/socket.c                 |   4 +-
>  4 files changed, 139 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/socket.h b/include/linux/socket.h
> index 54338fac45cb..632ba0ea6709 100644
> --- a/include/linux/socket.h
> +++ b/include/linux/socket.h
> @@ -351,6 +351,8 @@ struct ucred {
>  #define IPX_TYPE       1
>
>  extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr);
> +extern int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
> +                            void __user *uaddr, int __user *ulen);
>  extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
>
>  struct timespec64;
> diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h
> index 7d1bccbbef78..3639fa906604 100644
> --- a/include/uapi/linux/sockios.h
> +++ b/include/uapi/linux/sockios.h
> @@ -153,6 +153,9 @@
>  #define SIOCSHWTSTAMP  0x89b0          /* set and get config           */
>  #define SIOCGHWTSTAMP  0x89b1          /* get config                   */
>
> +/* UDP socket calls*/
> +#define SIOUDPPENDGET 0x89C0   /* get the pending data from write queue */
> +
>  /* Device private ioctl calls */
>
>  /*
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 32564b350823..f729a5e7f90b 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1620,6 +1620,133 @@ static int first_packet_length(struct sock *sk)
>         return res;
>  }
>
> +static void udp_set_source_addr(struct sock *sk, struct msghdr *msg,
> +                               int *addr_len, u32 addr, u16 port)
> +{
> +       DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
> +
> +       if (sin) {
> +               sin->sin_family = AF_INET;
> +               sin->sin_port = port;
> +               sin->sin_addr.s_addr = addr;
> +               memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
> +               *addr_len = sizeof(*sin);
> +
> +               if (cgroup_bpf_enabled)
> +                       BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
> +                                        (struct sockaddr *)sin);
> +       }
> +}
> +
> +static int udp_peek_sndq(struct sock *sk, struct msghdr *msg, int off, int len)
> +{
> +       int copy, copied = 0, err = 0;
> +       struct sk_buff *skb;
> +
> +       skb_queue_walk(&sk->sk_write_queue, skb) {
> +               copy = len - copied;
> +               if (copy > skb->len - off)
> +                       copy = skb->len - off;
> +
> +               err = skb_copy_datagram_msg(skb, off, msg, copy);
> +               if (err)
> +                       break;
> +
> +               copied += copy;
> +               if (len <= copied)
> +                       break;
> +       }
> +       return err ?: copied;
> +}
> +
> +static int udp_get_pending_write_queue(struct sock *sk, struct msghdr *msg,
> +                                      int *addr_len)
> +{
> +       int err = 0, off = sizeof(struct udphdr);
> +       struct inet_sock *inet = inet_sk(sk);
> +       struct udp_sock *up = udp_sk(sk);
> +       struct flowi4 *fl4;
> +       struct flowi6 *fl6;
> +
> +       switch (up->pending) {
> +       case 0:
> +               return -ENODATA;
> +       case AF_INET:
> +               off += sizeof(struct iphdr);
> +               fl4 = &inet->cork.fl.u.ip4;
> +               udp_set_source_addr(sk, msg, addr_len,
> +                                   fl4->daddr, fl4->fl4_dport);
> +               break;
> +       case AF_INET6:
> +               off += sizeof(struct ipv6hdr);
> +               if (msg->msg_name) {
> +                       DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6,
> +                                        msg->msg_name);
> +
> +                       fl6 = &inet->cork.fl.u.ip6;
> +                       sin6->sin6_family = AF_INET6;
> +                       sin6->sin6_port = fl6->fl6_dport;
> +                       sin6->sin6_flowinfo = 0;
> +                       sin6->sin6_addr = fl6->daddr;
> +                       sin6->sin6_scope_id = fl6->flowi6_oif;
> +                       *addr_len = sizeof(*sin6);
> +
> +                       if (cgroup_bpf_enabled)
> +                               BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
> +                                               (struct sockaddr *)sin6);
> +               }
> +               break;
> +       default:
> +               return -EINVAL;
> +       }
> +
> +       lock_sock(sk);
> +       if (unlikely(!up->pending)) {
> +               release_sock(sk);
> +               return -EINVAL;
> +       }
> +       err = udp_peek_sndq(sk, msg, off, msg_data_left(msg));
> +       release_sock(sk);
> +       return err;
> +}
> +
> +static int prep_msghdr_recv_pending(struct sock *sk, void __user *argp)
> +{
> +       struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
> +       struct user_msghdr __user *msg;
> +       struct sockaddr __user *uaddr;
> +       struct sockaddr_storage addr;
> +       struct msghdr msg_sys;
> +       int __user *uaddr_len;
> +       int err = 0, len = 0;
> +
> +       if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
> +               return -EPERM;
> +
> +       if (!argp)
> +               return -EINVAL;
> +
> +       msg = (struct user_msghdr __user *)argp;
> +       err = recvmsg_copy_msghdr(&msg_sys, msg, 0, &uaddr, &iov);
> +       if (err < 0)
> +               return err;
> +
> +       uaddr_len = &msg->msg_namelen;
> +       msg_sys.msg_name = &addr;
> +       msg_sys.msg_flags = 0;
> +
> +       err = udp_get_pending_write_queue(sk, &msg_sys, &len);
> +       msg_sys.msg_namelen = len;
> +       len = err;
> +
> +       if (uaddr && err >= 0)
> +               err = move_addr_to_user(&addr, msg_sys.msg_namelen,
> +                                       uaddr, uaddr_len);
> +
> +       kfree(iov);
> +       return err < 0 ? err : len;
> +}
> +
>  /*
>   *     IOCTL requests applicable to the UDP protocol
>   */
> @@ -1641,6 +1768,9 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
>                 return put_user(amount, (int __user *)arg);
>         }
>
> +       case SIOUDPPENDGET:
> +               return prep_msghdr_recv_pending(sk, (void __user *)arg);
> +
>         default:
>                 return -ENOIOCTLCMD;
>         }
> @@ -1729,7 +1859,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
>                 int flags, int *addr_len)
>  {
>         struct inet_sock *inet = inet_sk(sk);
> -       DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
>         struct sk_buff *skb;
>         unsigned int ulen, copied;
>         int off, err, peeking = flags & MSG_PEEK;
> @@ -1794,18 +1923,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
>
>         sock_recv_ts_and_drops(msg, sk, skb);
>
> -       /* Copy the address. */
> -       if (sin) {
> -               sin->sin_family = AF_INET;
> -               sin->sin_port = udp_hdr(skb)->source;
> -               sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
> -               memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
> -               *addr_len = sizeof(*sin);
> -
> -               if (cgroup_bpf_enabled)
> -                       BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
> -                                                       (struct sockaddr *)sin);
> -       }
> +       udp_set_source_addr(sk, msg, addr_len, ip_hdr(skb)->saddr,
> +                           udp_hdr(skb)->source);
>
>         if (udp_sk(sk)->gro_enabled)
>                 udp_cmsg_recv(msg, sk, skb);
> diff --git a/net/socket.c b/net/socket.c
> index 2dd739fba866..bd25d528c9a0 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -217,8 +217,8 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *k
>   *     specified. Zero is returned for a success.
>   */
>
> -static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
> -                            void __user *uaddr, int __user *ulen)
> +int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
> +                     void __user *uaddr, int __user *ulen)
>  {
>         int err;
>         int len;
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2020-05-03 19:11 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-02  8:28 [PATCH v6] net: Option to retrieve the pending data from send queue of UDP socket Lese Doru Calin
2020-05-03 19:11 ` Willem de Bruijn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).