All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH RFC 1/3] vsock: support sockmap
  2023-01-19 10:39     ` Stefano Garzarella
  (?)
@ 2023-01-18 15:08     ` Bobby Eshleman
  2023-01-20  8:55         ` Stefano Garzarella
  -1 siblings, 1 reply; 20+ messages in thread
From: Bobby Eshleman @ 2023-01-18 15:08 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: Bobby Eshleman, Cong Wang, kvm, Michael S. Tsirkin,
	Alexei Starovoitov, virtualization, Song Liu, Eric Dumazet,
	Stanislav Fomichev, linux-kselftest, Shuah Khan, Mykola Lysenko,
	Daniel Borkmann, John Fastabend, Andrii Nakryiko, Yonghong Song,
	Paolo Abeni, KP Singh, Stefan Hajnoczi, Jakub Kicinski, Hao Luo,
	netdev, linux-kernel, Jiri Olsa, bpf, Martin KaFai Lau,
	David S. Miller

On Thu, Jan 19, 2023 at 11:39:36AM +0100, Stefano Garzarella wrote:
> On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
> > This patch adds sockmap support for vsock sockets. It is intended to be
> > usable by all transports, but only the virtio transport is implemented.
> > 
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> > ---
> > drivers/vhost/vsock.c                   |   1 +
> > include/linux/virtio_vsock.h            |   1 +
> > include/net/af_vsock.h                  |  17 +++
> > net/vmw_vsock/Makefile                  |   1 +
> > net/vmw_vsock/af_vsock.c                |  59 +++++++++--
> > net/vmw_vsock/virtio_transport.c        |   2 +
> > net/vmw_vsock/virtio_transport_common.c |  22 ++++
> > net/vmw_vsock/vsock_bpf.c               | 180 ++++++++++++++++++++++++++++++++
> > net/vmw_vsock/vsock_loopback.c          |   2 +
> > 9 files changed, 279 insertions(+), 6 deletions(-)
> 
> ./scripts/checkpatch.pl --strict prints some simple warnings/checks that
> I suggest to fix :-)
> 

Oops, thanks. New machine, forgot my pre-commit hook. Putting in place
now.

> > 
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index 1f3b89c885cca..3c6dc036b9044 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
> > 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
> > 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
> > 
> > +		.read_skb = virtio_transport_read_skb,
> > 	},
> > 
> > 	.send_pkt = vhost_transport_send_pkt,
> > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > index 3f9c166113063..c58453699ee98 100644
> > --- a/include/linux/virtio_vsock.h
> > +++ b/include/linux/virtio_vsock.h
> > @@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
> > void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
> > void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
> > int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
> > +int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
> > #endif /* _LINUX_VIRTIO_VSOCK_H */
> > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> > index 568a87c5e0d0f..a73f5fbd296af 100644
> > --- a/include/net/af_vsock.h
> > +++ b/include/net/af_vsock.h
> > @@ -75,6 +75,7 @@ struct vsock_sock {
> > 	void *trans;
> > };
> > 
> > +s64 vsock_connectible_has_data(struct vsock_sock *vsk);
> > s64 vsock_stream_has_data(struct vsock_sock *vsk);
> > s64 vsock_stream_has_space(struct vsock_sock *vsk);
> > struct sock *vsock_create_connected(struct sock *parent);
> > @@ -173,6 +174,9 @@ struct vsock_transport {
> > 
> > 	/* Addressing. */
> > 	u32 (*get_local_cid)(void);
> > +
> > +	/* Read a single skb */
> > +	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
> > };
> > 
> > /**** CORE ****/
> > @@ -225,5 +229,18 @@ int vsock_init_tap(void);
> > int vsock_add_tap(struct vsock_tap *vt);
> > int vsock_remove_tap(struct vsock_tap *vt);
> > void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
> > +int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> > +			      int flags);
> > +int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> > +			       size_t len, int flags);
> > +
> > +#ifdef CONFIG_BPF_SYSCALL
> > +extern struct proto vsock_proto;
> > +int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
> > +void __init vsock_bpf_build_proto(void);
> > +#else
> > +static inline void __init vsock_bpf_build_proto(void)
> > +{}
> > +#endif
> > 
> > #endif /* __AF_VSOCK_H__ */
> > diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
> > index 6a943ec95c4a5..5da74c4a9f1d1 100644
> > --- a/net/vmw_vsock/Makefile
> > +++ b/net/vmw_vsock/Makefile
> > @@ -8,6 +8,7 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
> > obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o
> > 
> > vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
> > +vsock-$(CONFIG_BPF_SYSCALL) += vsock_bpf.o
> > 
> > vsock_diag-y += diag.o
> > 
> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> > index d593d5b6d4b15..7081b3a992c1e 100644
> > --- a/net/vmw_vsock/af_vsock.c
> > +++ b/net/vmw_vsock/af_vsock.c
> > @@ -116,10 +116,13 @@ static void vsock_sk_destruct(struct sock *sk);
> > static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
> > 
> > /* Protocol family. */
> > -static struct proto vsock_proto = {
> > +struct proto vsock_proto = {
> > 	.name = "AF_VSOCK",
> > 	.owner = THIS_MODULE,
> > 	.obj_size = sizeof(struct vsock_sock),
> > +#ifdef CONFIG_BPF_SYSCALL
> > +	.psock_update_sk_prot = vsock_bpf_update_proto,
> > +#endif
> > };
> > 
> > /* The default peer timeout indicates how long we will wait for a peer response
> > @@ -865,7 +868,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
> > }
> > EXPORT_SYMBOL_GPL(vsock_stream_has_data);
> > 
> > -static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
> > +s64 vsock_connectible_has_data(struct vsock_sock *vsk)
> > {
> > 	struct sock *sk = sk_vsock(vsk);
> > 
> > @@ -874,6 +877,7 @@ static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
> > 	else
> > 		return vsock_stream_has_data(vsk);
> > }
> > +EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
> > 
> > s64 vsock_stream_has_space(struct vsock_sock *vsk)
> > {
> > @@ -1131,6 +1135,19 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
> > 	return mask;
> > }
> > 
> > +static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
> > +{
> > +	struct vsock_sock *vsk = vsock_sk(sk);
> > +
> > +	if (!vsk->transport)
> > +		return -ENODEV;
> > +
> > +	if (!vsk->transport->read_skb)
> > +		return -EOPNOTSUPP;
> > +
> > +	return vsk->transport->read_skb(vsk, read_actor);
> > +}
> > +
> > static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> > 			       size_t len)
> > {
> > @@ -1241,19 +1258,32 @@ static int vsock_dgram_connect(struct socket *sock,
> > 
> > 	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
> > 	sock->state = SS_CONNECTED;
> > +	sk->sk_state = TCP_ESTABLISHED;
> 
> Why we need this change?
> If it's a fix, we should put it in another patch.
> 

This is just required by sockmap's function that determines if a socket
is a valid one to add to a map. It will refuse to add any socket that is
not TCP_ESTABLISHED to a sockmap.

This was the approach that unix dgrams took, so I followed here.

> > 
> > out:
> > 	release_sock(sk);
> > 	return err;
> > }
> > 
> > -static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> > -			       size_t len, int flags)
> > +int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> > +			size_t len, int flags)
> > {
> > -	struct vsock_sock *vsk = vsock_sk(sock->sk);
> > +	const struct proto *prot;
> 
> We should use the guard for this statement as in
> vsock_connectible_recvmsg().
> 

Got it.

> > +	struct vsock_sock *vsk;
> > +	struct sock *sk;
> > +
> > +	sk = sock->sk;
> > +	vsk = vsock_sk(sk);
> > +
> > +#ifdef CONFIG_BPF_SYSCALL
> > +	prot = READ_ONCE(sk->sk_prot);
> > +	if (prot != &vsock_proto)
> > +		return prot->recvmsg(sk, msg, len, flags, NULL);
> > +#endif
> > 
> > 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
> > }
> > +EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
> > 
> > static const struct proto_ops vsock_dgram_ops = {
> > 	.family = PF_VSOCK,
> > @@ -1272,6 +1302,7 @@ static const struct proto_ops vsock_dgram_ops = {
> > 	.recvmsg = vsock_dgram_recvmsg,
> > 	.mmap = sock_no_mmap,
> > 	.sendpage = sock_no_sendpage,
> > +	.read_skb = vsock_read_skb,
> > };
> > 
> > static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
> > @@ -2085,13 +2116,16 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
> > 	return err;
> > }
> > 
> > -static int
> > +int
> > vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> > 			  int flags)
> > {
> > 	struct sock *sk;
> > 	struct vsock_sock *vsk;
> > 	const struct vsock_transport *transport;
> > +#ifdef CONFIG_BPF_SYSCALL
> > +	const struct proto *prot;
> > +#endif
> > 	int err;
> > 
> > 	sk = sock->sk;
> > @@ -2138,6 +2172,14 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> > 		goto out;
> > 	}
> > 
> > +#ifdef CONFIG_BPF_SYSCALL
> > +	prot = READ_ONCE(sk->sk_prot);
> > +	if (prot != &vsock_proto) {
> > +		release_sock(sk);
> > +		return prot->recvmsg(sk, msg, len, flags, NULL);
> > +	}
> > +#endif
> > +
> > 	if (sk->sk_type == SOCK_STREAM)
> > 		err = __vsock_stream_recvmsg(sk, msg, len, flags);
> > 	else
> > @@ -2147,6 +2189,7 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> > 	release_sock(sk);
> > 	return err;
> > }
> > +EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
> > 
> > static int vsock_set_rcvlowat(struct sock *sk, int val)
> > {
> > @@ -2187,6 +2230,7 @@ static const struct proto_ops vsock_stream_ops = {
> > 	.mmap = sock_no_mmap,
> > 	.sendpage = sock_no_sendpage,
> > 	.set_rcvlowat = vsock_set_rcvlowat,
> > +	.read_skb = vsock_read_skb,
> > };
> > 
> > static const struct proto_ops vsock_seqpacket_ops = {
> > @@ -2208,6 +2252,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
> > 	.recvmsg = vsock_connectible_recvmsg,
> > 	.mmap = sock_no_mmap,
> > 	.sendpage = sock_no_sendpage,
> > +	.read_skb = vsock_read_skb,
> > };
> > 
> > static int vsock_create(struct net *net, struct socket *sock,
> > @@ -2347,6 +2392,8 @@ static int __init vsock_init(void)
> > 		goto err_unregister_proto;
> > 	}
> > 
> > +	vsock_bpf_build_proto();
> > +
> > 	return 0;
> > 
> > err_unregister_proto:
> > diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> > index 28b5a8e8e0948..e95df847176b6 100644
> > --- a/net/vmw_vsock/virtio_transport.c
> > +++ b/net/vmw_vsock/virtio_transport.c
> > @@ -457,6 +457,8 @@ static struct virtio_transport virtio_transport = {
> > 		.notify_send_pre_enqueue  =
> > 		virtio_transport_notify_send_pre_enqueue,
> > 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
> > 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
> > +
> > +		.read_skb = virtio_transport_read_skb,
> > 	},
> > 
> > 	.send_pkt = virtio_transport_send_pkt,
> > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> > index a1581c77cf84a..9a87ead5b1fc5 100644
> > --- a/net/vmw_vsock/virtio_transport_common.c
> > +++ b/net/vmw_vsock/virtio_transport_common.c
> > @@ -1388,6 +1388,28 @@ int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue)
> > }
> > EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs);
> > 
> > +int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
> > +{
> > +	struct virtio_vsock_sock *vvs = vsk->trans;
> > +	struct sock *sk = sk_vsock(vsk);
> > +	struct sk_buff *skb;
> > +	int copied = 0;
> 
> We could avoid initializing `copied`, since it is overwritten later.
> 

Got it.

> > +	int off = 0;
> > +	int err;
> > +
> > +	spin_lock_bh(&vvs->rx_lock);
> > +	skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err);
> 
> Will this work also for STREAM and SEQPACKET sockets?
> 

Yep, it is used for non-datagram sockets as well because it is free of race
conditions, and handles waits/errors sensibly. For example, in
unix_accept():

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/net/unix/af_unix.c#n1696

> > +	spin_unlock_bh(&vvs->rx_lock);
> > +
> > +	if (!skb)
> > +		return err;
> > +
> > +	copied = recv_actor(sk, skb);
> > +	kfree_skb(skb);
> 
> I would have moved these steps to vsock_read_skb() to avoid duplicating
> these steps in each transport. Perhaps not all transports want to pass
> skb ownership to the caller, though, so maybe we can leave it that
> way for now.
> 

That is a good point though. I bet your initial hunch is the right one.
If even one other transport duplicates this, then I'd say it is worth
pulling up into vsock_read_skb().

> > +	return copied;
> > +}
> > +EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
> > +
> > MODULE_LICENSE("GPL v2");
> > MODULE_AUTHOR("Asias He");
> > MODULE_DESCRIPTION("common code for virtio vsock");
> > diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
> > new file mode 100644
> > index 0000000000000..9e11282d3bc1f
> > --- /dev/null
> > +++ b/net/vmw_vsock/vsock_bpf.c
> > @@ -0,0 +1,180 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com>
> > + *
> > + * Based off of net/unix/unix_bpf.c
> > + */
> > +
> > +#include <linux/bpf.h>
> > +#include <linux/module.h>
> > +#include <linux/skmsg.h>
> > +#include <linux/socket.h>
> > +#include <net/af_vsock.h>
> > +#include <net/sock.h>
> > +
> > +#define vsock_sk_has_data(__sk, __psock)				\
> > +		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
> > +			!skb_queue_empty(&__psock->ingress_skb) ||	\
> > +			!list_empty(&__psock->ingress_msg);		\
> > +		})
> > +
> > +static struct proto *vsock_dgram_prot_saved __read_mostly;
> > +static DEFINE_SPINLOCK(vsock_dgram_prot_lock);
> > +static struct proto vsock_dgram_bpf_prot;
> > +
> > +static bool vsock_has_data(struct vsock_sock *vsk, struct sk_psock *psock)
> > +{
> > +	struct sock *sk = sk_vsock(vsk);
> > +	s64 ret;
> > +
> > +	ret = vsock_connectible_has_data(vsk);
> > +	if (ret > 0)
> > +		return true;
> > +
> > +	return vsock_sk_has_data(sk, psock);
> > +}
> > +
> > +static int vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo)
> > +{
> > +	struct vsock_sock *vsk;
> > +	int err;
> > +
> > +	DEFINE_WAIT(wait);
> > +
> > +	vsk = vsock_sk(sk);
> > +	err = 0;
> > +
> > +	while (vsock_has_data(vsk, psock)) {
> > +		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
> > +
> > +		if (sk->sk_err != 0 ||
> > +		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
> > +		    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
> > +			break;
> > +		}
> > +
> > +		if (timeo == 0) {
> > +			err = -EAGAIN;
> > +			break;
> > +		}
> > +
> > +		release_sock(sk);
> > +		timeo = schedule_timeout(timeo);
> > +		lock_sock(sk);
> > +
> > +		if (signal_pending(current)) {
> > +			err = sock_intr_errno(timeo);
> > +			break;
> > +		} else if (timeo == 0) {
> > +			err = -EAGAIN;
> > +			break;
> > +		}
> > +	}
> > +
> > +	finish_wait(sk_sleep(sk), &wait);
> > +
> > +	if (err)
> > +		return err;
> > +
> > +	return 0;
> > +}
> > +
> > +static int vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
> > +{
> > +	int err;
> > +	struct socket *sock = sk->sk_socket;
> > +
> > +	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
> > +		err = vsock_connectible_recvmsg(sock, msg, len, flags);
> > +	else
> 
> Could it happen that it is not DGRAM and we should return an error in
> this case?
> 

I'm not sure but for the sake of safety, I'll add that.

> Thanks,
> Stefano
> 

Thanks for the review Stefano.

Best,
Bobby

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 0/3] vsock: add support for sockmap
  2023-01-19 10:49   ` Stefano Garzarella
  (?)
@ 2023-01-18 15:10   ` Bobby Eshleman
  -1 siblings, 0 replies; 20+ messages in thread
From: Bobby Eshleman @ 2023-01-18 15:10 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: Bobby Eshleman, Cong Wang, kvm, Michael S. Tsirkin,
	Alexei Starovoitov, virtualization, Song Liu, Eric Dumazet,
	Stanislav Fomichev, linux-kselftest, Shuah Khan, Mykola Lysenko,
	Daniel Borkmann, John Fastabend, Andrii Nakryiko, Yonghong Song,
	Paolo Abeni, KP Singh, Stefan Hajnoczi, Jakub Kicinski, Hao Luo,
	netdev, linux-kernel, Jiri Olsa, bpf, Martin KaFai Lau,
	David S. Miller

On Thu, Jan 19, 2023 at 11:49:02AM +0100, Stefano Garzarella wrote:
> Hi Bobby,
> 
> On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
> > Add support for sockmap to vsock.
> > 
> > We're testing usage of vsock as a way to redirect guest-local UDS requests to
> > the host and this patch series greatly improves the performance of such a
> > setup.
> > 
> > Compared to copying packets via userspace, this improves throughput by 221% in
> > basic testing.
> 
> Cool, nice series!
> 
> > 
> > Tested as follows.
> > 
> > Setup: guest unix dgram sender -> guest vsock redirector -> host vsock server
> > Threads: 1
> > Payload: 64k
> > No sockmap:
> > - 76.3 MB/s
> > - The guest vsock redirector was
> >  "socat VSOCK-CONNECT:2:1234 UNIX-RECV:/path/to/sock"
> > Using sockmap (this patch):
> > - 168.8 MB/s (+221%)
> 
> Assuming the absolute value is correct, there is a typo here, it would be
> +121% right?
> 

Lol, yes.

> > - The guest redirector was a simple sockmap echo server,
> >  redirecting unix ingress to vsock 2:1234 egress.
> > - Same sender and server programs
> > 
> > Only the virtio transport has been tested.
> 
> I think is fine for now.
> 
> > The loopback transport was used in
> > writing bpf/selftests, but not thoroughly tested otherwise.
> 
> I did a quick review mainly for vsock stuff.
> Hoping others can take a better look at net/vmw_vsock/vsock_bpf.c, since I'm
> not very familiar with that subsystem.
> 
> FYI I will be off the next two weeks (till Feb 7) with limited internet
> access.
> 

Roger that.

Thanks,
Bobby

> Thanks,
> Stefano
> 
> > 
> > This series requires the skb patch.
> > 
> > To: Stefan Hajnoczi <stefanha@redhat.com>
> > To: Stefano Garzarella <sgarzare@redhat.com>
> > To: "Michael S. Tsirkin" <mst@redhat.com>
> > To: Jason Wang <jasowang@redhat.com>
> > To: "David S. Miller" <davem@davemloft.net>
> > To: Eric Dumazet <edumazet@google.com>
> > To: Jakub Kicinski <kuba@kernel.org>
> > To: Paolo Abeni <pabeni@redhat.com>
> > To: Andrii Nakryiko <andrii@kernel.org>
> > To: Mykola Lysenko <mykolal@fb.com>
> > To: Alexei Starovoitov <ast@kernel.org>
> > To: Daniel Borkmann <daniel@iogearbox.net>
> > To: Martin KaFai Lau <martin.lau@linux.dev>
> > To: Song Liu <song@kernel.org>
> > To: Yonghong Song <yhs@fb.com>
> > To: John Fastabend <john.fastabend@gmail.com>
> > To: KP Singh <kpsingh@kernel.org>
> > To: Stanislav Fomichev <sdf@google.com>
> > To: Hao Luo <haoluo@google.com>
> > To: Jiri Olsa <jolsa@kernel.org>
> > To: Shuah Khan <shuah@kernel.org>
> > Cc: linux-kernel@vger.kernel.org
> > Cc: kvm@vger.kernel.org
> > Cc: virtualization@lists.linux-foundation.org
> > Cc: netdev@vger.kernel.org
> > Cc: bpf@vger.kernel.org
> > Cc: linux-kselftest@vger.kernel.org
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> > 
> > ---
> > Bobby Eshleman (3):
> >      vsock: support sockmap
> >      selftests/bpf: add vsock to vmtest.sh
> >      selftests/bpf: Add a test case for vsock sockmap
> > 
> > drivers/vhost/vsock.c                              |   1 +
> > include/linux/virtio_vsock.h                       |   1 +
> > include/net/af_vsock.h                             |  17 ++
> > net/vmw_vsock/Makefile                             |   1 +
> > net/vmw_vsock/af_vsock.c                           |  59 ++++++-
> > net/vmw_vsock/virtio_transport.c                   |   2 +
> > net/vmw_vsock/virtio_transport_common.c            |  22 +++
> > net/vmw_vsock/vsock_bpf.c                          | 180 +++++++++++++++++++++
> > net/vmw_vsock/vsock_loopback.c                     |   2 +
> > tools/testing/selftests/bpf/config.x86_64          |   4 +
> > .../selftests/bpf/prog_tests/sockmap_listen.c      | 163 +++++++++++++++++++
> > tools/testing/selftests/bpf/vmtest.sh              |   1 +
> > 12 files changed, 447 insertions(+), 6 deletions(-)
> > ---
> > base-commit: f12f4326c6a75a74e908714be6d2f0e2f0fd0d76
> > change-id: 20230118-support-vsock-sockmap-connectible-2e1297d2111a
> > 
> > Best regards,
> > -- 
> > Bobby Eshleman <bobby.eshleman@bytedance.com>
> > 
> 
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 3/3] selftests/bpf: Add a test case for vsock sockmap
  2023-01-19 10:48     ` Stefano Garzarella
  (?)
@ 2023-01-18 15:14     ` Bobby Eshleman
  -1 siblings, 0 replies; 20+ messages in thread
From: Bobby Eshleman @ 2023-01-18 15:14 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: Bobby Eshleman, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Andrii Nakryiko, Mykola Lysenko, Alexei Starovoitov,
	Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Shuah Khan, linux-kernel, kvm, virtualization, netdev, bpf,
	linux-kselftest, Cong Wang

On Thu, Jan 19, 2023 at 11:48:13AM +0100, Stefano Garzarella wrote:
> On Wed, Jan 18, 2023 at 12:27:41PM -0800, Bobby Eshleman wrote:
> > Add a test case testing the redirection from connectible AF_VSOCK
> > sockets to connectible AF_UNIX sockets.
> > 
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> > ---
> > .../selftests/bpf/prog_tests/sockmap_listen.c      | 163 +++++++++++++++++++++
> > 1 file changed, 163 insertions(+)
> > 
> > diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
> > index 2cf0c7a3fe232..8b5a2e09c9ede 100644
> > --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
> > +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
> > @@ -18,6 +18,7 @@
> > #include <string.h>
> > #include <sys/select.h>
> > #include <unistd.h>
> > +#include <linux/vm_sockets.h>
> > 
> > #include <bpf/bpf.h>
> > #include <bpf/libbpf.h>
> > @@ -249,6 +250,16 @@ static void init_addr_loopback6(struct sockaddr_storage *ss, socklen_t *len)
> > 	*len = sizeof(*addr6);
> > }
> > 
> > +static void init_addr_loopback_vsock(struct sockaddr_storage *ss, socklen_t *len)
> > +{
> > +	struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss));
> > +
> > +	addr->svm_family = AF_VSOCK;
> > +	addr->svm_port = VMADDR_PORT_ANY;
> > +	addr->svm_cid = VMADDR_CID_LOCAL;
> 
> Wait, IIUC we only use loopback, so why do we need to attach the
> vhost-vsock-pci device to QEMU?
> 
> At that point if we add CONFIG_VSOCKETS_LOOPBACK in all configurations, it
> should also work with aarch64 and s390x.
> 

Oh that is great, I'll drop the vhost-vsock-pci device then. I thought
adding it fixed the error I was getting when trying to use
VMADDR_CID_LOCAL, but it must have just been adding
CONFIG_VSOCKETS_LOOPBACK that fixed it.

Thanks,
Bobby

> > +	*len = sizeof(*addr);
> > +}
> > +
> > static void init_addr_loopback(int family, struct sockaddr_storage *ss,
> > 			       socklen_t *len)
> > {
> > @@ -259,6 +270,9 @@ static void init_addr_loopback(int family, struct sockaddr_storage *ss,
> > 	case AF_INET6:
> > 		init_addr_loopback6(ss, len);
> > 		return;
> > +	case AF_VSOCK:
> > +		init_addr_loopback_vsock(ss, len);
> > +		return;
> > 	default:
> > 		FAIL("unsupported address family %d", family);
> > 	}
> > @@ -1434,6 +1448,8 @@ static const char *family_str(sa_family_t family)
> > 		return "IPv6";
> > 	case AF_UNIX:
> > 		return "Unix";
> > +	case AF_VSOCK:
> > +		return "VSOCK";
> > 	default:
> > 		return "unknown";
> > 	}
> > @@ -1644,6 +1660,151 @@ static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *ma
> > 	unix_skb_redir_to_connected(skel, map, sotype);
> > }
> > 
> > +/* Returns two connected loopback vsock sockets */
> > +static int vsock_socketpair_connectible(int sotype, int *v0, int *v1)
> > +{
> > +	struct sockaddr_storage addr;
> > +	socklen_t len = sizeof(addr);
> > +	int s, p, c;
> > +
> > +	s = socket_loopback(AF_VSOCK, sotype);
> > +	if (s < 0)
> > +		return -1;
> > +
> > +	c = xsocket(AF_VSOCK, sotype | SOCK_NONBLOCK, 0);
> > +	if (c == -1)
> > +		goto close_srv;
> > +
> > +	if (getsockname(s, sockaddr(&addr), &len) < 0)
> > +		goto close_cli;
> > +
> > +	if (connect(c, sockaddr(&addr), len) < 0 && errno != EINPROGRESS) {
> > +		FAIL_ERRNO("connect");
> > +		goto close_cli;
> > +	}
> > +
> > +	len = sizeof(addr);
> > +	p = accept_timeout(s, sockaddr(&addr), &len, IO_TIMEOUT_SEC);
> > +	if (p < 0)
> > +		goto close_cli;
> > +
> > +	*v0 = p;
> > +	*v1 = c;
> > +
> > +	return 0;
> > +
> > +close_cli:
> > +	close(c);
> > +close_srv:
> > +	close(s);
> > +
> > +	return -1;
> > +}
> > +
> > +static void vsock_unix_redir_connectible(int sock_mapfd, int verd_mapfd,
> > +					 enum redir_mode mode, int sotype)
> > +{
> > +	const char *log_prefix = redir_mode_str(mode);
> > +	char a = 'a', b = 'b';
> > +	int u0, u1, v0, v1;
> > +	int sfd[2];
> > +	unsigned int pass;
> > +	int err, n;
> > +	u32 key;
> > +
> > +	zero_verdict_count(verd_mapfd);
> > +
> > +	if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, sfd))
> > +		return;
> > +
> > +	u0 = sfd[0];
> > +	u1 = sfd[1];
> > +
> > +	err = vsock_socketpair_connectible(sotype, &v0, &v1);
> > +	if (err) {
> > +		FAIL("vsock_socketpair_connectible() failed");
> > +		goto close_uds;
> > +	}
> > +
> > +	err = add_to_sockmap(sock_mapfd, u0, v0);
> > +	if (err) {
> > +		FAIL("add_to_sockmap failed");
> > +		goto close_vsock;
> > +	}
> > +
> > +	n = write(v1, &a, sizeof(a));
> > +	if (n < 0)
> > +		FAIL_ERRNO("%s: write", log_prefix);
> > +	if (n == 0)
> > +		FAIL("%s: incomplete write", log_prefix);
> > +	if (n < 1)
> > +		goto out;
> > +
> > +	n = recv(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), MSG_DONTWAIT);
> > +	if (n < 0)
> > +		FAIL("%s: recv() err, errno=%d", log_prefix, errno);
> > +	if (n == 0)
> > +		FAIL("%s: incomplete recv", log_prefix);
> > +	if (b != a)
> > +		FAIL("%s: vsock socket map failed, %c != %c", log_prefix, a, b);
> > +
> > +	key = SK_PASS;
> > +	err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
> > +	if (err)
> > +		goto out;
> > +	if (pass != 1)
> > +		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
> > +out:
> > +	key = 0;
> > +	bpf_map_delete_elem(sock_mapfd, &key);
> > +	key = 1;
> > +	bpf_map_delete_elem(sock_mapfd, &key);
> > +
> > +close_vsock:
> > +	close(v0);
> > +	close(v1);
> > +
> > +close_uds:
> > +	close(u0);
> > +	close(u1);
> > +}
> > +
> > +static void vsock_unix_skb_redir_connectible(struct test_sockmap_listen *skel,
> > +					     struct bpf_map *inner_map,
> > +					     int sotype)
> > +{
> > +	int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
> > +	int verdict_map = bpf_map__fd(skel->maps.verdict_map);
> > +	int sock_map = bpf_map__fd(inner_map);
> > +	int err;
> > +
> > +	err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0);
> > +	if (err)
> > +		return;
> > +
> > +	skel->bss->test_ingress = false;
> > +	vsock_unix_redir_connectible(sock_map, verdict_map, REDIR_EGRESS, sotype);
> > +	skel->bss->test_ingress = true;
> > +	vsock_unix_redir_connectible(sock_map, verdict_map, REDIR_INGRESS, sotype);
> > +
> > +	xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
> > +}
> > +
> > +static void test_vsock_redir(struct test_sockmap_listen *skel, struct bpf_map *map)
> > +{
> > +	const char *family_name, *map_name;
> > +	char s[MAX_TEST_NAME];
> > +
> > +	family_name = family_str(AF_VSOCK);
> > +	map_name = map_type_str(map);
> > +	snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__);
> > +	if (!test__start_subtest(s))
> > +		return;
> > +
> > +	vsock_unix_skb_redir_connectible(skel, map, SOCK_STREAM);
> > +	vsock_unix_skb_redir_connectible(skel, map, SOCK_SEQPACKET);
> > +}
> > +
> > static void test_reuseport(struct test_sockmap_listen *skel,
> > 			   struct bpf_map *map, int family, int sotype)
> > {
> > @@ -2015,12 +2176,14 @@ void serial_test_sockmap_listen(void)
> > 	run_tests(skel, skel->maps.sock_map, AF_INET6);
> > 	test_unix_redir(skel, skel->maps.sock_map, SOCK_DGRAM);
> > 	test_unix_redir(skel, skel->maps.sock_map, SOCK_STREAM);
> > +	test_vsock_redir(skel, skel->maps.sock_map);
> > 
> > 	skel->bss->test_sockmap = false;
> > 	run_tests(skel, skel->maps.sock_hash, AF_INET);
> > 	run_tests(skel, skel->maps.sock_hash, AF_INET6);
> > 	test_unix_redir(skel, skel->maps.sock_hash, SOCK_DGRAM);
> > 	test_unix_redir(skel, skel->maps.sock_hash, SOCK_STREAM);
> > +	test_vsock_redir(skel, skel->maps.sock_hash);
> > 
> > 	test_sockmap_listen__destroy(skel);
> > }
> > 
> > -- 
> > 2.30.2
> > 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH RFC 0/3] vsock: add support for sockmap
@ 2023-01-18 20:27 Bobby Eshleman
  2023-01-18 20:27 ` [PATCH RFC 1/3] vsock: support sockmap Bobby Eshleman
                   ` (3 more replies)
  0 siblings, 4 replies; 20+ messages in thread
From: Bobby Eshleman @ 2023-01-18 20:27 UTC (permalink / raw)
  To: Stefan Hajnoczi, Stefano Garzarella, Michael S. Tsirkin,
	Jason Wang, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Andrii Nakryiko, Mykola Lysenko, Alexei Starovoitov,
	Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Shuah Khan
  Cc: linux-kernel, kvm, virtualization, netdev, bpf, linux-kselftest,
	Bobby Eshleman, Cong Wang

Add support for sockmap to vsock.

We're testing usage of vsock as a way to redirect guest-local UDS requests to
the host and this patch series greatly improves the performance of such a
setup.

Compared to copying packets via userspace, this improves throughput by 221% in
basic testing.

Tested as follows.

Setup: guest unix dgram sender -> guest vsock redirector -> host vsock server
Threads: 1
Payload: 64k
No sockmap:
- 76.3 MB/s
- The guest vsock redirector was
  "socat VSOCK-CONNECT:2:1234 UNIX-RECV:/path/to/sock"
Using sockmap (this patch):
- 168.8 MB/s (+221%)
- The guest redirector was a simple sockmap echo server,
  redirecting unix ingress to vsock 2:1234 egress.
- Same sender and server programs

Only the virtio transport has been tested. The loopback transport was used in
writing bpf/selftests, but not thoroughly tested otherwise.

This series requires the skb patch.

To: Stefan Hajnoczi <stefanha@redhat.com>
To: Stefano Garzarella <sgarzare@redhat.com>
To: "Michael S. Tsirkin" <mst@redhat.com>
To: Jason Wang <jasowang@redhat.com>
To: "David S. Miller" <davem@davemloft.net>
To: Eric Dumazet <edumazet@google.com>
To: Jakub Kicinski <kuba@kernel.org>
To: Paolo Abeni <pabeni@redhat.com>
To: Andrii Nakryiko <andrii@kernel.org>
To: Mykola Lysenko <mykolal@fb.com>
To: Alexei Starovoitov <ast@kernel.org>
To: Daniel Borkmann <daniel@iogearbox.net>
To: Martin KaFai Lau <martin.lau@linux.dev>
To: Song Liu <song@kernel.org>
To: Yonghong Song <yhs@fb.com>
To: John Fastabend <john.fastabend@gmail.com>
To: KP Singh <kpsingh@kernel.org>
To: Stanislav Fomichev <sdf@google.com>
To: Hao Luo <haoluo@google.com>
To: Jiri Olsa <jolsa@kernel.org>
To: Shuah Khan <shuah@kernel.org>
Cc: linux-kernel@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Cc: netdev@vger.kernel.org
Cc: bpf@vger.kernel.org
Cc: linux-kselftest@vger.kernel.org
Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>

---
Bobby Eshleman (3):
      vsock: support sockmap
      selftests/bpf: add vsock to vmtest.sh
      selftests/bpf: Add a test case for vsock sockmap

 drivers/vhost/vsock.c                              |   1 +
 include/linux/virtio_vsock.h                       |   1 +
 include/net/af_vsock.h                             |  17 ++
 net/vmw_vsock/Makefile                             |   1 +
 net/vmw_vsock/af_vsock.c                           |  59 ++++++-
 net/vmw_vsock/virtio_transport.c                   |   2 +
 net/vmw_vsock/virtio_transport_common.c            |  22 +++
 net/vmw_vsock/vsock_bpf.c                          | 180 +++++++++++++++++++++
 net/vmw_vsock/vsock_loopback.c                     |   2 +
 tools/testing/selftests/bpf/config.x86_64          |   4 +
 .../selftests/bpf/prog_tests/sockmap_listen.c      | 163 +++++++++++++++++++
 tools/testing/selftests/bpf/vmtest.sh              |   1 +
 12 files changed, 447 insertions(+), 6 deletions(-)
---
base-commit: f12f4326c6a75a74e908714be6d2f0e2f0fd0d76
change-id: 20230118-support-vsock-sockmap-connectible-2e1297d2111a

Best regards,
-- 
Bobby Eshleman <bobby.eshleman@bytedance.com>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH RFC 1/3] vsock: support sockmap
  2023-01-18 20:27 [PATCH RFC 0/3] vsock: add support for sockmap Bobby Eshleman
@ 2023-01-18 20:27 ` Bobby Eshleman
  2023-01-19 10:39     ` Stefano Garzarella
  2023-01-21 19:22     ` Cong Wang
  2023-01-18 20:27 ` [PATCH RFC 2/3] selftests/bpf: add vsock to vmtest.sh Bobby Eshleman
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 20+ messages in thread
From: Bobby Eshleman @ 2023-01-18 20:27 UTC (permalink / raw)
  To: Stefan Hajnoczi, Stefano Garzarella, Michael S. Tsirkin,
	Jason Wang, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Andrii Nakryiko, Mykola Lysenko, Alexei Starovoitov,
	Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Shuah Khan
  Cc: linux-kernel, kvm, virtualization, netdev, bpf, linux-kselftest,
	Bobby Eshleman, Cong Wang

This patch adds sockmap support for vsock sockets. It is intended to be
usable by all transports, but only the virtio transport is implemented.

Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
---
 drivers/vhost/vsock.c                   |   1 +
 include/linux/virtio_vsock.h            |   1 +
 include/net/af_vsock.h                  |  17 +++
 net/vmw_vsock/Makefile                  |   1 +
 net/vmw_vsock/af_vsock.c                |  59 +++++++++--
 net/vmw_vsock/virtio_transport.c        |   2 +
 net/vmw_vsock/virtio_transport_common.c |  22 ++++
 net/vmw_vsock/vsock_bpf.c               | 180 ++++++++++++++++++++++++++++++++
 net/vmw_vsock/vsock_loopback.c          |   2 +
 9 files changed, 279 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 1f3b89c885cca..3c6dc036b9044 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
 
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 3f9c166113063..c58453699ee98 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
 void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 568a87c5e0d0f..a73f5fbd296af 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -75,6 +75,7 @@ struct vsock_sock {
 	void *trans;
 };
 
+s64 vsock_connectible_has_data(struct vsock_sock *vsk);
 s64 vsock_stream_has_data(struct vsock_sock *vsk);
 s64 vsock_stream_has_space(struct vsock_sock *vsk);
 struct sock *vsock_create_connected(struct sock *parent);
@@ -173,6 +174,9 @@ struct vsock_transport {
 
 	/* Addressing. */
 	u32 (*get_local_cid)(void);
+
+	/* Read a single skb */
+	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
 };
 
 /**** CORE ****/
@@ -225,5 +229,18 @@ int vsock_init_tap(void);
 int vsock_add_tap(struct vsock_tap *vt);
 int vsock_remove_tap(struct vsock_tap *vt);
 void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
+int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+			      int flags);
+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+			       size_t len, int flags);
+
+#ifdef CONFIG_BPF_SYSCALL
+extern struct proto vsock_proto;
+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
+void __init vsock_bpf_build_proto(void);
+#else
+static inline void __init vsock_bpf_build_proto(void)
+{}
+#endif
 
 #endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index 6a943ec95c4a5..5da74c4a9f1d1 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
 obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o
 
 vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
+vsock-$(CONFIG_BPF_SYSCALL) += vsock_bpf.o
 
 vsock_diag-y += diag.o
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index d593d5b6d4b15..7081b3a992c1e 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -116,10 +116,13 @@ static void vsock_sk_destruct(struct sock *sk);
 static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
 /* Protocol family. */
-static struct proto vsock_proto = {
+struct proto vsock_proto = {
 	.name = "AF_VSOCK",
 	.owner = THIS_MODULE,
 	.obj_size = sizeof(struct vsock_sock),
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot = vsock_bpf_update_proto,
+#endif
 };
 
 /* The default peer timeout indicates how long we will wait for a peer response
@@ -865,7 +868,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
 }
 EXPORT_SYMBOL_GPL(vsock_stream_has_data);
 
-static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
+s64 vsock_connectible_has_data(struct vsock_sock *vsk)
 {
 	struct sock *sk = sk_vsock(vsk);
 
@@ -874,6 +877,7 @@ static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
 	else
 		return vsock_stream_has_data(vsk);
 }
+EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
 
 s64 vsock_stream_has_space(struct vsock_sock *vsk)
 {
@@ -1131,6 +1135,19 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 	return mask;
 }
 
+static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+
+	if (!vsk->transport)
+		return -ENODEV;
+
+	if (!vsk->transport->read_skb)
+		return -EOPNOTSUPP;
+
+	return vsk->transport->read_skb(vsk, read_actor);
+}
+
 static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 			       size_t len)
 {
@@ -1241,19 +1258,32 @@ static int vsock_dgram_connect(struct socket *sock,
 
 	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
 	sock->state = SS_CONNECTED;
+	sk->sk_state = TCP_ESTABLISHED;
 
 out:
 	release_sock(sk);
 	return err;
 }
 
-static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
-			       size_t len, int flags)
+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+			size_t len, int flags)
 {
-	struct vsock_sock *vsk = vsock_sk(sock->sk);
+	const struct proto *prot;
+	struct vsock_sock *vsk;
+	struct sock *sk;
+
+	sk = sock->sk;
+	vsk = vsock_sk(sk);
+
+#ifdef CONFIG_BPF_SYSCALL
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot != &vsock_proto)
+		return prot->recvmsg(sk, msg, len, flags, NULL);
+#endif
 
 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
 }
+EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
 
 static const struct proto_ops vsock_dgram_ops = {
 	.family = PF_VSOCK,
@@ -1272,6 +1302,7 @@ static const struct proto_ops vsock_dgram_ops = {
 	.recvmsg = vsock_dgram_recvmsg,
 	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
+	.read_skb = vsock_read_skb,
 };
 
 static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
@@ -2085,13 +2116,16 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
 	return err;
 }
 
-static int
+int
 vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			  int flags)
 {
 	struct sock *sk;
 	struct vsock_sock *vsk;
 	const struct vsock_transport *transport;
+#ifdef CONFIG_BPF_SYSCALL
+	const struct proto *prot;
+#endif
 	int err;
 
 	sk = sock->sk;
@@ -2138,6 +2172,14 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		goto out;
 	}
 
+#ifdef CONFIG_BPF_SYSCALL
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot != &vsock_proto) {
+		release_sock(sk);
+		return prot->recvmsg(sk, msg, len, flags, NULL);
+	}
+#endif
+
 	if (sk->sk_type == SOCK_STREAM)
 		err = __vsock_stream_recvmsg(sk, msg, len, flags);
 	else
@@ -2147,6 +2189,7 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	release_sock(sk);
 	return err;
 }
+EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
 
 static int vsock_set_rcvlowat(struct sock *sk, int val)
 {
@@ -2187,6 +2230,7 @@ static const struct proto_ops vsock_stream_ops = {
 	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
 	.set_rcvlowat = vsock_set_rcvlowat,
+	.read_skb = vsock_read_skb,
 };
 
 static const struct proto_ops vsock_seqpacket_ops = {
@@ -2208,6 +2252,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
 	.recvmsg = vsock_connectible_recvmsg,
 	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
+	.read_skb = vsock_read_skb,
 };
 
 static int vsock_create(struct net *net, struct socket *sock,
@@ -2347,6 +2392,8 @@ static int __init vsock_init(void)
 		goto err_unregister_proto;
 	}
 
+	vsock_bpf_build_proto();
+
 	return 0;
 
 err_unregister_proto:
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 28b5a8e8e0948..e95df847176b6 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -457,6 +457,8 @@ static struct virtio_transport virtio_transport = {
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index a1581c77cf84a..9a87ead5b1fc5 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1388,6 +1388,28 @@ int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue)
 }
 EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs);
 
+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	struct sock *sk = sk_vsock(vsk);
+	struct sk_buff *skb;
+	int copied = 0;
+	int off = 0;
+	int err;
+
+	spin_lock_bh(&vvs->rx_lock);
+	skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err);
+	spin_unlock_bh(&vvs->rx_lock);
+
+	if (!skb)
+		return err;
+
+	copied = recv_actor(sk, skb);
+	kfree_skb(skb);
+	return copied;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
new file mode 100644
index 0000000000000..9e11282d3bc1f
--- /dev/null
+++ b/net/vmw_vsock/vsock_bpf.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com>
+ *
+ * Based off of net/unix/unix_bpf.c
+ */
+
+#include <linux/bpf.h>
+#include <linux/module.h>
+#include <linux/skmsg.h>
+#include <linux/socket.h>
+#include <net/af_vsock.h>
+#include <net/sock.h>
+
+#define vsock_sk_has_data(__sk, __psock)				\
+		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
+			!skb_queue_empty(&__psock->ingress_skb) ||	\
+			!list_empty(&__psock->ingress_msg);		\
+		})
+
+static struct proto *vsock_dgram_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(vsock_dgram_prot_lock);
+static struct proto vsock_dgram_bpf_prot;
+
+static bool vsock_has_data(struct vsock_sock *vsk, struct sk_psock *psock)
+{
+	struct sock *sk = sk_vsock(vsk);
+	s64 ret;
+
+	ret = vsock_connectible_has_data(vsk);
+	if (ret > 0)
+		return true;
+
+	return vsock_sk_has_data(sk, psock);
+}
+
+static int vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo)
+{
+	struct vsock_sock *vsk;
+	int err;
+
+	DEFINE_WAIT(wait);
+
+	vsk = vsock_sk(sk);
+	err = 0;
+
+	while (vsock_has_data(vsk, psock)) {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+		if (sk->sk_err != 0 ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
+			break;
+		}
+
+		if (timeo == 0) {
+			err = -EAGAIN;
+			break;
+		}
+
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			break;
+		} else if (timeo == 0) {
+			err = -EAGAIN;
+			break;
+		}
+	}
+
+	finish_wait(sk_sleep(sk), &wait);
+
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
+{
+	int err;
+	struct socket *sock = sk->sk_socket;
+
+	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
+		err = vsock_connectible_recvmsg(sock, msg, len, flags);
+	else
+		err = vsock_dgram_recvmsg(sock, msg, len, flags);
+
+	return err;
+}
+
+static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
+			     size_t len, int flags, int *addr_len)
+{
+	int copied;
+	struct sk_psock *psock;
+
+	lock_sock(sk);
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock)) {
+		release_sock(sk);
+		return vsock_recvmsg(sk, msg, len, flags);
+	}
+
+	if (vsock_has_data(vsock_sk(sk), psock) && sk_psock_queue_empty(psock)) {
+		sk_psock_put(sk, psock);
+		release_sock(sk);
+		return vsock_recvmsg(sk, msg, len, flags);
+	}
+
+msg_bytes_ready:
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	if (!copied) {
+		long timeo;
+		int data;
+
+		timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+		data = vsock_msg_wait_data(sk, psock, timeo);
+		if (data) {
+			if (!sk_psock_queue_empty(psock))
+				goto msg_bytes_ready;
+			sk_psock_put(sk, psock);
+			release_sock(sk);
+			return vsock_recvmsg(sk, msg, len, flags);
+		}
+		copied = -EAGAIN;
+	}
+	sk_psock_put(sk, psock);
+	release_sock(sk);
+
+	return copied;
+}
+
+/* Copy of original proto with updated sock_map methods */
+static struct proto vsock_dgram_bpf_prot = {
+	.close = sock_map_close,
+	.recvmsg = vsock_bpf_recvmsg,
+	.sock_is_readable = sk_msg_is_readable,
+	.unhash = sock_map_unhash,
+};
+
+static void vsock_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+	*prot        = *base;
+	prot->close  = sock_map_close;
+	prot->recvmsg = vsock_bpf_recvmsg;
+	prot->sock_is_readable = sk_msg_is_readable;
+}
+
+static void vsock_dgram_bpf_check_needs_rebuild(struct proto *ops)
+{
+	if (unlikely(ops != smp_load_acquire(&vsock_dgram_prot_saved))) {
+		spin_lock_bh(&vsock_dgram_prot_lock);
+		if (likely(ops != vsock_dgram_prot_saved)) {
+			vsock_dgram_bpf_rebuild_protos(&vsock_dgram_bpf_prot, ops);
+			smp_store_release(&vsock_dgram_prot_saved, ops);
+		}
+		spin_unlock_bh(&vsock_dgram_prot_lock);
+	}
+}
+
+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+	if (restore) {
+		sk->sk_write_space = psock->saved_write_space;
+		sock_replace_proto(sk, psock->sk_proto);
+		return 0;
+	}
+
+	vsock_dgram_bpf_check_needs_rebuild(psock->sk_proto);
+	sock_replace_proto(sk, &vsock_dgram_bpf_prot);
+	return 0;
+}
+
+void __init vsock_bpf_build_proto(void)
+{
+	vsock_dgram_bpf_rebuild_protos(&vsock_dgram_bpf_prot, &vsock_proto);
+}
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 671e03240fc52..40753b661c135 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -94,6 +94,8 @@ static struct virtio_transport loopback_transport = {
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = vsock_loopback_send_pkt,

-- 
2.30.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH RFC 2/3] selftests/bpf: add vsock to vmtest.sh
  2023-01-18 20:27 [PATCH RFC 0/3] vsock: add support for sockmap Bobby Eshleman
  2023-01-18 20:27 ` [PATCH RFC 1/3] vsock: support sockmap Bobby Eshleman
@ 2023-01-18 20:27 ` Bobby Eshleman
  2023-01-19 10:44     ` Stefano Garzarella
  2023-01-18 20:27 ` [PATCH RFC 3/3] selftests/bpf: Add a test case for vsock sockmap Bobby Eshleman
  2023-01-19 10:49   ` Stefano Garzarella
  3 siblings, 1 reply; 20+ messages in thread
From: Bobby Eshleman @ 2023-01-18 20:27 UTC (permalink / raw)
  To: Stefan Hajnoczi, Stefano Garzarella, Michael S. Tsirkin,
	Jason Wang, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Andrii Nakryiko, Mykola Lysenko, Alexei Starovoitov,
	Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Shuah Khan
  Cc: linux-kernel, kvm, virtualization, netdev, bpf, linux-kselftest,
	Bobby Eshleman, Cong Wang

Add the ability to use vsock in the vmtest.sh script and
the test kernel config.

Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
---
 tools/testing/selftests/bpf/config.x86_64 | 4 ++++
 tools/testing/selftests/bpf/vmtest.sh     | 1 +
 2 files changed, 5 insertions(+)

diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64
index dd97d61d325ca..db5e6b9a91711 100644
--- a/tools/testing/selftests/bpf/config.x86_64
+++ b/tools/testing/selftests/bpf/config.x86_64
@@ -234,7 +234,11 @@ CONFIG_VIRTIO_BLK=y
 CONFIG_VIRTIO_CONSOLE=y
 CONFIG_VIRTIO_NET=y
 CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_VSOCKETS=y
+CONFIG_VIRTIO_VSOCKETS_COMMON=y
 CONFIG_VLAN_8021Q=y
+CONFIG_VSOCKETS=y
+CONFIG_VSOCKETS_LOOPBACK=y
 CONFIG_X86_ACPI_CPUFREQ=y
 CONFIG_X86_CPUID=y
 CONFIG_X86_MSR=y
diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
index 316a56d680f25..aad27ffd4ec68 100755
--- a/tools/testing/selftests/bpf/vmtest.sh
+++ b/tools/testing/selftests/bpf/vmtest.sh
@@ -250,6 +250,7 @@ EOF
 		-enable-kvm \
 		-m 4G \
 		-drive file="${rootfs_img}",format=raw,index=1,media=disk,if=virtio,cache=none \
+		-device vhost-vsock-pci,guest-cid=1234 \
 		-kernel "${kernel_bzimage}" \
 		-append "root=/dev/vda rw console=${QEMU_CONSOLE}"
 }

-- 
2.30.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH RFC 3/3] selftests/bpf: Add a test case for vsock sockmap
  2023-01-18 20:27 [PATCH RFC 0/3] vsock: add support for sockmap Bobby Eshleman
  2023-01-18 20:27 ` [PATCH RFC 1/3] vsock: support sockmap Bobby Eshleman
  2023-01-18 20:27 ` [PATCH RFC 2/3] selftests/bpf: add vsock to vmtest.sh Bobby Eshleman
@ 2023-01-18 20:27 ` Bobby Eshleman
  2023-01-19 10:48     ` Stefano Garzarella
  2023-01-19 10:49   ` Stefano Garzarella
  3 siblings, 1 reply; 20+ messages in thread
From: Bobby Eshleman @ 2023-01-18 20:27 UTC (permalink / raw)
  To: Stefan Hajnoczi, Stefano Garzarella, Michael S. Tsirkin,
	Jason Wang, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Andrii Nakryiko, Mykola Lysenko, Alexei Starovoitov,
	Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Shuah Khan
  Cc: linux-kernel, kvm, virtualization, netdev, bpf, linux-kselftest,
	Bobby Eshleman, Cong Wang

Add a test case testing the redirection from connectible AF_VSOCK
sockets to connectible AF_UNIX sockets.

Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
---
 .../selftests/bpf/prog_tests/sockmap_listen.c      | 163 +++++++++++++++++++++
 1 file changed, 163 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index 2cf0c7a3fe232..8b5a2e09c9ede 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -18,6 +18,7 @@
 #include <string.h>
 #include <sys/select.h>
 #include <unistd.h>
+#include <linux/vm_sockets.h>
 
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
@@ -249,6 +250,16 @@ static void init_addr_loopback6(struct sockaddr_storage *ss, socklen_t *len)
 	*len = sizeof(*addr6);
 }
 
+static void init_addr_loopback_vsock(struct sockaddr_storage *ss, socklen_t *len)
+{
+	struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss));
+
+	addr->svm_family = AF_VSOCK;
+	addr->svm_port = VMADDR_PORT_ANY;
+	addr->svm_cid = VMADDR_CID_LOCAL;
+	*len = sizeof(*addr);
+}
+
 static void init_addr_loopback(int family, struct sockaddr_storage *ss,
 			       socklen_t *len)
 {
@@ -259,6 +270,9 @@ static void init_addr_loopback(int family, struct sockaddr_storage *ss,
 	case AF_INET6:
 		init_addr_loopback6(ss, len);
 		return;
+	case AF_VSOCK:
+		init_addr_loopback_vsock(ss, len);
+		return;
 	default:
 		FAIL("unsupported address family %d", family);
 	}
@@ -1434,6 +1448,8 @@ static const char *family_str(sa_family_t family)
 		return "IPv6";
 	case AF_UNIX:
 		return "Unix";
+	case AF_VSOCK:
+		return "VSOCK";
 	default:
 		return "unknown";
 	}
@@ -1644,6 +1660,151 @@ static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *ma
 	unix_skb_redir_to_connected(skel, map, sotype);
 }
 
+/* Returns two connected loopback vsock sockets */
+static int vsock_socketpair_connectible(int sotype, int *v0, int *v1)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int s, p, c;
+
+	s = socket_loopback(AF_VSOCK, sotype);
+	if (s < 0)
+		return -1;
+
+	c = xsocket(AF_VSOCK, sotype | SOCK_NONBLOCK, 0);
+	if (c == -1)
+		goto close_srv;
+
+	if (getsockname(s, sockaddr(&addr), &len) < 0)
+		goto close_cli;
+
+	if (connect(c, sockaddr(&addr), len) < 0 && errno != EINPROGRESS) {
+		FAIL_ERRNO("connect");
+		goto close_cli;
+	}
+
+	len = sizeof(addr);
+	p = accept_timeout(s, sockaddr(&addr), &len, IO_TIMEOUT_SEC);
+	if (p < 0)
+		goto close_cli;
+
+	*v0 = p;
+	*v1 = c;
+
+	return 0;
+
+close_cli:
+	close(c);
+close_srv:
+	close(s);
+
+	return -1;
+}
+
+static void vsock_unix_redir_connectible(int sock_mapfd, int verd_mapfd,
+					 enum redir_mode mode, int sotype)
+{
+	const char *log_prefix = redir_mode_str(mode);
+	char a = 'a', b = 'b';
+	int u0, u1, v0, v1;
+	int sfd[2];
+	unsigned int pass;
+	int err, n;
+	u32 key;
+
+	zero_verdict_count(verd_mapfd);
+
+	if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, sfd))
+		return;
+
+	u0 = sfd[0];
+	u1 = sfd[1];
+
+	err = vsock_socketpair_connectible(sotype, &v0, &v1);
+	if (err) {
+		FAIL("vsock_socketpair_connectible() failed");
+		goto close_uds;
+	}
+
+	err = add_to_sockmap(sock_mapfd, u0, v0);
+	if (err) {
+		FAIL("add_to_sockmap failed");
+		goto close_vsock;
+	}
+
+	n = write(v1, &a, sizeof(a));
+	if (n < 0)
+		FAIL_ERRNO("%s: write", log_prefix);
+	if (n == 0)
+		FAIL("%s: incomplete write", log_prefix);
+	if (n < 1)
+		goto out;
+
+	n = recv(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), MSG_DONTWAIT);
+	if (n < 0)
+		FAIL("%s: recv() err, errno=%d", log_prefix, errno);
+	if (n == 0)
+		FAIL("%s: incomplete recv", log_prefix);
+	if (b != a)
+		FAIL("%s: vsock socket map failed, %c != %c", log_prefix, a, b);
+
+	key = SK_PASS;
+	err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
+	if (err)
+		goto out;
+	if (pass != 1)
+		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
+out:
+	key = 0;
+	bpf_map_delete_elem(sock_mapfd, &key);
+	key = 1;
+	bpf_map_delete_elem(sock_mapfd, &key);
+
+close_vsock:
+	close(v0);
+	close(v1);
+
+close_uds:
+	close(u0);
+	close(u1);
+}
+
+static void vsock_unix_skb_redir_connectible(struct test_sockmap_listen *skel,
+					     struct bpf_map *inner_map,
+					     int sotype)
+{
+	int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
+	int verdict_map = bpf_map__fd(skel->maps.verdict_map);
+	int sock_map = bpf_map__fd(inner_map);
+	int err;
+
+	err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0);
+	if (err)
+		return;
+
+	skel->bss->test_ingress = false;
+	vsock_unix_redir_connectible(sock_map, verdict_map, REDIR_EGRESS, sotype);
+	skel->bss->test_ingress = true;
+	vsock_unix_redir_connectible(sock_map, verdict_map, REDIR_INGRESS, sotype);
+
+	xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
+}
+
+static void test_vsock_redir(struct test_sockmap_listen *skel, struct bpf_map *map)
+{
+	const char *family_name, *map_name;
+	char s[MAX_TEST_NAME];
+
+	family_name = family_str(AF_VSOCK);
+	map_name = map_type_str(map);
+	snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__);
+	if (!test__start_subtest(s))
+		return;
+
+	vsock_unix_skb_redir_connectible(skel, map, SOCK_STREAM);
+	vsock_unix_skb_redir_connectible(skel, map, SOCK_SEQPACKET);
+}
+
 static void test_reuseport(struct test_sockmap_listen *skel,
 			   struct bpf_map *map, int family, int sotype)
 {
@@ -2015,12 +2176,14 @@ void serial_test_sockmap_listen(void)
 	run_tests(skel, skel->maps.sock_map, AF_INET6);
 	test_unix_redir(skel, skel->maps.sock_map, SOCK_DGRAM);
 	test_unix_redir(skel, skel->maps.sock_map, SOCK_STREAM);
+	test_vsock_redir(skel, skel->maps.sock_map);
 
 	skel->bss->test_sockmap = false;
 	run_tests(skel, skel->maps.sock_hash, AF_INET);
 	run_tests(skel, skel->maps.sock_hash, AF_INET6);
 	test_unix_redir(skel, skel->maps.sock_hash, SOCK_DGRAM);
 	test_unix_redir(skel, skel->maps.sock_hash, SOCK_STREAM);
+	test_vsock_redir(skel, skel->maps.sock_hash);
 
 	test_sockmap_listen__destroy(skel);
 }

-- 
2.30.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 1/3] vsock: support sockmap
  2023-01-21 19:22     ` Cong Wang
  (?)
@ 2023-01-19  3:47     ` Bobby Eshleman
  -1 siblings, 0 replies; 20+ messages in thread
From: Bobby Eshleman @ 2023-01-19  3:47 UTC (permalink / raw)
  To: Cong Wang
  Cc: Bobby Eshleman, Stefan Hajnoczi, Stefano Garzarella,
	Michael S. Tsirkin, Jason Wang, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Andrii Nakryiko, Mykola Lysenko,
	Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
	Hao Luo, Jiri Olsa, Shuah Khan, linux-kernel, kvm,
	virtualization, netdev, bpf, linux-kselftest, Cong Wang

On Sat, Jan 21, 2023 at 11:22:31AM -0800, Cong Wang wrote:
> On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
> > +static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
> > +{
> > +	struct vsock_sock *vsk = vsock_sk(sk);
> > +
> > +	if (!vsk->transport)
> > +		return -ENODEV;
> > +
> > +	if (!vsk->transport->read_skb)
> > +		return -EOPNOTSUPP;
> 
> Can we move these two checks to sockmap update path? It would make
> vsock_read_skb() faster.
> 
> > +
> > +	return vsk->transport->read_skb(vsk, read_actor);
> > +}
> 
> Essentially can be just this one line.
> 
> Thanks.

That makes sense, will do.

Thanks,
Bobby

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 1/3] vsock: support sockmap
  2023-01-18 20:27 ` [PATCH RFC 1/3] vsock: support sockmap Bobby Eshleman
@ 2023-01-19 10:39     ` Stefano Garzarella
  2023-01-21 19:22     ` Cong Wang
  1 sibling, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-19 10:39 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Cong Wang, kvm, Michael S. Tsirkin, Alexei Starovoitov,
	virtualization, Song Liu, Eric Dumazet, Stanislav Fomichev,
	linux-kselftest, Shuah Khan, Mykola Lysenko, Daniel Borkmann,
	John Fastabend, Andrii Nakryiko, Yonghong Song, Paolo Abeni,
	KP Singh, Stefan Hajnoczi, Jakub Kicinski, Hao Luo, netdev,
	linux-kernel, Jiri Olsa, bpf, Martin KaFai Lau, David S. Miller

On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
>This patch adds sockmap support for vsock sockets. It is intended to be
>usable by all transports, but only the virtio transport is implemented.
>
>Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>---
> drivers/vhost/vsock.c                   |   1 +
> include/linux/virtio_vsock.h            |   1 +
> include/net/af_vsock.h                  |  17 +++
> net/vmw_vsock/Makefile                  |   1 +
> net/vmw_vsock/af_vsock.c                |  59 +++++++++--
> net/vmw_vsock/virtio_transport.c        |   2 +
> net/vmw_vsock/virtio_transport_common.c |  22 ++++
> net/vmw_vsock/vsock_bpf.c               | 180 ++++++++++++++++++++++++++++++++
> net/vmw_vsock/vsock_loopback.c          |   2 +
> 9 files changed, 279 insertions(+), 6 deletions(-)

./scripts/checkpatch.pl --strict prints some simple warnings/checks that
I suggest to fix :-)

>
>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>index 1f3b89c885cca..3c6dc036b9044 100644
>--- a/drivers/vhost/vsock.c
>+++ b/drivers/vhost/vsock.c
>@@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
> 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
> 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>
>+		.read_skb = virtio_transport_read_skb,
> 	},
>
> 	.send_pkt = vhost_transport_send_pkt,
>diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>index 3f9c166113063..c58453699ee98 100644
>--- a/include/linux/virtio_vsock.h
>+++ b/include/linux/virtio_vsock.h
>@@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
> void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
> void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
> int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
> #endif /* _LINUX_VIRTIO_VSOCK_H */
>diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>index 568a87c5e0d0f..a73f5fbd296af 100644
>--- a/include/net/af_vsock.h
>+++ b/include/net/af_vsock.h
>@@ -75,6 +75,7 @@ struct vsock_sock {
> 	void *trans;
> };
>
>+s64 vsock_connectible_has_data(struct vsock_sock *vsk);
> s64 vsock_stream_has_data(struct vsock_sock *vsk);
> s64 vsock_stream_has_space(struct vsock_sock *vsk);
> struct sock *vsock_create_connected(struct sock *parent);
>@@ -173,6 +174,9 @@ struct vsock_transport {
>
> 	/* Addressing. */
> 	u32 (*get_local_cid)(void);
>+
>+	/* Read a single skb */
>+	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
> };
>
> /**** CORE ****/
>@@ -225,5 +229,18 @@ int vsock_init_tap(void);
> int vsock_add_tap(struct vsock_tap *vt);
> int vsock_remove_tap(struct vsock_tap *vt);
> void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
>+int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>+			      int flags);
>+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>+			       size_t len, int flags);
>+
>+#ifdef CONFIG_BPF_SYSCALL
>+extern struct proto vsock_proto;
>+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
>+void __init vsock_bpf_build_proto(void);
>+#else
>+static inline void __init vsock_bpf_build_proto(void)
>+{}
>+#endif
>
> #endif /* __AF_VSOCK_H__ */
>diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
>index 6a943ec95c4a5..5da74c4a9f1d1 100644
>--- a/net/vmw_vsock/Makefile
>+++ b/net/vmw_vsock/Makefile
>@@ -8,6 +8,7 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
> obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o
>
> vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
>+vsock-$(CONFIG_BPF_SYSCALL) += vsock_bpf.o
>
> vsock_diag-y += diag.o
>
>diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>index d593d5b6d4b15..7081b3a992c1e 100644
>--- a/net/vmw_vsock/af_vsock.c
>+++ b/net/vmw_vsock/af_vsock.c
>@@ -116,10 +116,13 @@ static void vsock_sk_destruct(struct sock *sk);
> static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
>
> /* Protocol family. */
>-static struct proto vsock_proto = {
>+struct proto vsock_proto = {
> 	.name = "AF_VSOCK",
> 	.owner = THIS_MODULE,
> 	.obj_size = sizeof(struct vsock_sock),
>+#ifdef CONFIG_BPF_SYSCALL
>+	.psock_update_sk_prot = vsock_bpf_update_proto,
>+#endif
> };
>
> /* The default peer timeout indicates how long we will wait for a peer response
>@@ -865,7 +868,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
> }
> EXPORT_SYMBOL_GPL(vsock_stream_has_data);
>
>-static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
>+s64 vsock_connectible_has_data(struct vsock_sock *vsk)
> {
> 	struct sock *sk = sk_vsock(vsk);
>
>@@ -874,6 +877,7 @@ static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
> 	else
> 		return vsock_stream_has_data(vsk);
> }
>+EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
>
> s64 vsock_stream_has_space(struct vsock_sock *vsk)
> {
>@@ -1131,6 +1135,19 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
> 	return mask;
> }
>
>+static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
>+{
>+	struct vsock_sock *vsk = vsock_sk(sk);
>+
>+	if (!vsk->transport)
>+		return -ENODEV;
>+
>+	if (!vsk->transport->read_skb)
>+		return -EOPNOTSUPP;
>+
>+	return vsk->transport->read_skb(vsk, read_actor);
>+}
>+
> static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> 			       size_t len)
> {
>@@ -1241,19 +1258,32 @@ static int vsock_dgram_connect(struct socket *sock,
>
> 	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
> 	sock->state = SS_CONNECTED;
>+	sk->sk_state = TCP_ESTABLISHED;

Why we need this change?
If it's a fix, we should put it in another patch.

>
> out:
> 	release_sock(sk);
> 	return err;
> }
>
>-static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>-			       size_t len, int flags)
>+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>+			size_t len, int flags)
> {
>-	struct vsock_sock *vsk = vsock_sk(sock->sk);
>+	const struct proto *prot;

We should use the guard for this statement as in
vsock_connectible_recvmsg().

>+	struct vsock_sock *vsk;
>+	struct sock *sk;
>+
>+	sk = sock->sk;
>+	vsk = vsock_sk(sk);
>+
>+#ifdef CONFIG_BPF_SYSCALL
>+	prot = READ_ONCE(sk->sk_prot);
>+	if (prot != &vsock_proto)
>+		return prot->recvmsg(sk, msg, len, flags, NULL);
>+#endif
>
> 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
> }
>+EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
>
> static const struct proto_ops vsock_dgram_ops = {
> 	.family = PF_VSOCK,
>@@ -1272,6 +1302,7 @@ static const struct proto_ops vsock_dgram_ops = {
> 	.recvmsg = vsock_dgram_recvmsg,
> 	.mmap = sock_no_mmap,
> 	.sendpage = sock_no_sendpage,
>+	.read_skb = vsock_read_skb,
> };
>
> static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
>@@ -2085,13 +2116,16 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
> 	return err;
> }
>
>-static int
>+int
> vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> 			  int flags)
> {
> 	struct sock *sk;
> 	struct vsock_sock *vsk;
> 	const struct vsock_transport *transport;
>+#ifdef CONFIG_BPF_SYSCALL
>+	const struct proto *prot;
>+#endif
> 	int err;
>
> 	sk = sock->sk;
>@@ -2138,6 +2172,14 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> 		goto out;
> 	}
>
>+#ifdef CONFIG_BPF_SYSCALL
>+	prot = READ_ONCE(sk->sk_prot);
>+	if (prot != &vsock_proto) {
>+		release_sock(sk);
>+		return prot->recvmsg(sk, msg, len, flags, NULL);
>+	}
>+#endif
>+
> 	if (sk->sk_type == SOCK_STREAM)
> 		err = __vsock_stream_recvmsg(sk, msg, len, flags);
> 	else
>@@ -2147,6 +2189,7 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> 	release_sock(sk);
> 	return err;
> }
>+EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
>
> static int vsock_set_rcvlowat(struct sock *sk, int val)
> {
>@@ -2187,6 +2230,7 @@ static const struct proto_ops vsock_stream_ops = {
> 	.mmap = sock_no_mmap,
> 	.sendpage = sock_no_sendpage,
> 	.set_rcvlowat = vsock_set_rcvlowat,
>+	.read_skb = vsock_read_skb,
> };
>
> static const struct proto_ops vsock_seqpacket_ops = {
>@@ -2208,6 +2252,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
> 	.recvmsg = vsock_connectible_recvmsg,
> 	.mmap = sock_no_mmap,
> 	.sendpage = sock_no_sendpage,
>+	.read_skb = vsock_read_skb,
> };
>
> static int vsock_create(struct net *net, struct socket *sock,
>@@ -2347,6 +2392,8 @@ static int __init vsock_init(void)
> 		goto err_unregister_proto;
> 	}
>
>+	vsock_bpf_build_proto();
>+
> 	return 0;
>
> err_unregister_proto:
>diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>index 28b5a8e8e0948..e95df847176b6 100644
>--- a/net/vmw_vsock/virtio_transport.c
>+++ b/net/vmw_vsock/virtio_transport.c
>@@ -457,6 +457,8 @@ static struct virtio_transport virtio_transport = {
> 		.notify_send_pre_enqueue  = 
> 		virtio_transport_notify_send_pre_enqueue,
> 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
> 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>+
>+		.read_skb = virtio_transport_read_skb,
> 	},
>
> 	.send_pkt = virtio_transport_send_pkt,
>diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>index a1581c77cf84a..9a87ead5b1fc5 100644
>--- a/net/vmw_vsock/virtio_transport_common.c
>+++ b/net/vmw_vsock/virtio_transport_common.c
>@@ -1388,6 +1388,28 @@ int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue)
> }
> EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs);
>
>+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
>+{
>+	struct virtio_vsock_sock *vvs = vsk->trans;
>+	struct sock *sk = sk_vsock(vsk);
>+	struct sk_buff *skb;
>+	int copied = 0;

We could avoid initializing `copied`, since it is overwritten later.

>+	int off = 0;
>+	int err;
>+
>+	spin_lock_bh(&vvs->rx_lock);
>+	skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err);

Will this work also for STREAM and SEQPACKET sockets?

>+	spin_unlock_bh(&vvs->rx_lock);
>+
>+	if (!skb)
>+		return err;
>+
>+	copied = recv_actor(sk, skb);
>+	kfree_skb(skb);

I would have moved these steps to vsock_read_skb() to avoid duplicating
these steps in each transport. Perhaps not all transports want to pass
skb ownership to the caller, though, so maybe we can leave it that
way for now.

>+	return copied;
>+}
>+EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
>+
> MODULE_LICENSE("GPL v2");
> MODULE_AUTHOR("Asias He");
> MODULE_DESCRIPTION("common code for virtio vsock");
>diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
>new file mode 100644
>index 0000000000000..9e11282d3bc1f
>--- /dev/null
>+++ b/net/vmw_vsock/vsock_bpf.c
>@@ -0,0 +1,180 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com>
>+ *
>+ * Based off of net/unix/unix_bpf.c
>+ */
>+
>+#include <linux/bpf.h>
>+#include <linux/module.h>
>+#include <linux/skmsg.h>
>+#include <linux/socket.h>
>+#include <net/af_vsock.h>
>+#include <net/sock.h>
>+
>+#define vsock_sk_has_data(__sk, __psock)				\
>+		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
>+			!skb_queue_empty(&__psock->ingress_skb) ||	\
>+			!list_empty(&__psock->ingress_msg);		\
>+		})
>+
>+static struct proto *vsock_dgram_prot_saved __read_mostly;
>+static DEFINE_SPINLOCK(vsock_dgram_prot_lock);
>+static struct proto vsock_dgram_bpf_prot;
>+
>+static bool vsock_has_data(struct vsock_sock *vsk, struct sk_psock *psock)
>+{
>+	struct sock *sk = sk_vsock(vsk);
>+	s64 ret;
>+
>+	ret = vsock_connectible_has_data(vsk);
>+	if (ret > 0)
>+		return true;
>+
>+	return vsock_sk_has_data(sk, psock);
>+}
>+
>+static int vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo)
>+{
>+	struct vsock_sock *vsk;
>+	int err;
>+
>+	DEFINE_WAIT(wait);
>+
>+	vsk = vsock_sk(sk);
>+	err = 0;
>+
>+	while (vsock_has_data(vsk, psock)) {
>+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
>+
>+		if (sk->sk_err != 0 ||
>+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
>+		    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
>+			break;
>+		}
>+
>+		if (timeo == 0) {
>+			err = -EAGAIN;
>+			break;
>+		}
>+
>+		release_sock(sk);
>+		timeo = schedule_timeout(timeo);
>+		lock_sock(sk);
>+
>+		if (signal_pending(current)) {
>+			err = sock_intr_errno(timeo);
>+			break;
>+		} else if (timeo == 0) {
>+			err = -EAGAIN;
>+			break;
>+		}
>+	}
>+
>+	finish_wait(sk_sleep(sk), &wait);
>+
>+	if (err)
>+		return err;
>+
>+	return 0;
>+}
>+
>+static int vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
>+{
>+	int err;
>+	struct socket *sock = sk->sk_socket;
>+
>+	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
>+		err = vsock_connectible_recvmsg(sock, msg, len, flags);
>+	else

Could it happen that it is not DGRAM and we should return an error in
this case?

Thanks,
Stefano

>+		err = vsock_dgram_recvmsg(sock, msg, len, flags);
>+
>+	return err;
>+}
>+
>+static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
>+			     size_t len, int flags, int *addr_len)
>+{
>+	int copied;
>+	struct sk_psock *psock;
>+
>+	lock_sock(sk);
>+	psock = sk_psock_get(sk);
>+	if (unlikely(!psock)) {
>+		release_sock(sk);
>+		return vsock_recvmsg(sk, msg, len, flags);
>+	}
>+
>+	if (vsock_has_data(vsock_sk(sk), psock) && sk_psock_queue_empty(psock)) {
>+		sk_psock_put(sk, psock);
>+		release_sock(sk);
>+		return vsock_recvmsg(sk, msg, len, flags);
>+	}
>+
>+msg_bytes_ready:
>+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
>+	if (!copied) {
>+		long timeo;
>+		int data;
>+
>+		timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
>+		data = vsock_msg_wait_data(sk, psock, timeo);
>+		if (data) {
>+			if (!sk_psock_queue_empty(psock))
>+				goto msg_bytes_ready;
>+			sk_psock_put(sk, psock);
>+			release_sock(sk);
>+			return vsock_recvmsg(sk, msg, len, flags);
>+		}
>+		copied = -EAGAIN;
>+	}
>+	sk_psock_put(sk, psock);
>+	release_sock(sk);
>+
>+	return copied;
>+}
>+
>+/* Copy of original proto with updated sock_map methods */
>+static struct proto vsock_dgram_bpf_prot = {
>+	.close = sock_map_close,
>+	.recvmsg = vsock_bpf_recvmsg,
>+	.sock_is_readable = sk_msg_is_readable,
>+	.unhash = sock_map_unhash,
>+};
>+
>+static void vsock_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
>+{
>+	*prot        = *base;
>+	prot->close  = sock_map_close;
>+	prot->recvmsg = vsock_bpf_recvmsg;
>+	prot->sock_is_readable = sk_msg_is_readable;
>+}
>+
>+static void vsock_dgram_bpf_check_needs_rebuild(struct proto *ops)
>+{
>+	if (unlikely(ops != smp_load_acquire(&vsock_dgram_prot_saved))) {
>+		spin_lock_bh(&vsock_dgram_prot_lock);
>+		if (likely(ops != vsock_dgram_prot_saved)) {
>+			vsock_dgram_bpf_rebuild_protos(&vsock_dgram_bpf_prot, ops);
>+			smp_store_release(&vsock_dgram_prot_saved, ops);
>+		}
>+		spin_unlock_bh(&vsock_dgram_prot_lock);
>+	}
>+}
>+
>+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
>+{
>+	if (restore) {
>+		sk->sk_write_space = psock->saved_write_space;
>+		sock_replace_proto(sk, psock->sk_proto);
>+		return 0;
>+	}
>+
>+	vsock_dgram_bpf_check_needs_rebuild(psock->sk_proto);
>+	sock_replace_proto(sk, &vsock_dgram_bpf_prot);
>+	return 0;
>+}
>+
>+void __init vsock_bpf_build_proto(void)
>+{
>+	vsock_dgram_bpf_rebuild_protos(&vsock_dgram_bpf_prot, &vsock_proto);
>+}
>diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
>index 671e03240fc52..40753b661c135 100644
>--- a/net/vmw_vsock/vsock_loopback.c
>+++ b/net/vmw_vsock/vsock_loopback.c
>@@ -94,6 +94,8 @@ static struct virtio_transport loopback_transport = {
> 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
> 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
> 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>+
>+		.read_skb = virtio_transport_read_skb,
> 	},
>
> 	.send_pkt = vsock_loopback_send_pkt,
>
>-- 
>2.30.2
>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 1/3] vsock: support sockmap
@ 2023-01-19 10:39     ` Stefano Garzarella
  0 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-19 10:39 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrii Nakryiko,
	Mykola Lysenko, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Shuah Khan,
	linux-kernel, kvm, virtualization, netdev, bpf, linux-kselftest,
	Cong Wang

On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
>This patch adds sockmap support for vsock sockets. It is intended to be
>usable by all transports, but only the virtio transport is implemented.
>
>Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>---
> drivers/vhost/vsock.c                   |   1 +
> include/linux/virtio_vsock.h            |   1 +
> include/net/af_vsock.h                  |  17 +++
> net/vmw_vsock/Makefile                  |   1 +
> net/vmw_vsock/af_vsock.c                |  59 +++++++++--
> net/vmw_vsock/virtio_transport.c        |   2 +
> net/vmw_vsock/virtio_transport_common.c |  22 ++++
> net/vmw_vsock/vsock_bpf.c               | 180 ++++++++++++++++++++++++++++++++
> net/vmw_vsock/vsock_loopback.c          |   2 +
> 9 files changed, 279 insertions(+), 6 deletions(-)

./scripts/checkpatch.pl --strict prints some simple warnings/checks that
I suggest to fix :-)

>
>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>index 1f3b89c885cca..3c6dc036b9044 100644
>--- a/drivers/vhost/vsock.c
>+++ b/drivers/vhost/vsock.c
>@@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
> 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
> 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>
>+		.read_skb = virtio_transport_read_skb,
> 	},
>
> 	.send_pkt = vhost_transport_send_pkt,
>diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>index 3f9c166113063..c58453699ee98 100644
>--- a/include/linux/virtio_vsock.h
>+++ b/include/linux/virtio_vsock.h
>@@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
> void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
> void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
> int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
> #endif /* _LINUX_VIRTIO_VSOCK_H */
>diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>index 568a87c5e0d0f..a73f5fbd296af 100644
>--- a/include/net/af_vsock.h
>+++ b/include/net/af_vsock.h
>@@ -75,6 +75,7 @@ struct vsock_sock {
> 	void *trans;
> };
>
>+s64 vsock_connectible_has_data(struct vsock_sock *vsk);
> s64 vsock_stream_has_data(struct vsock_sock *vsk);
> s64 vsock_stream_has_space(struct vsock_sock *vsk);
> struct sock *vsock_create_connected(struct sock *parent);
>@@ -173,6 +174,9 @@ struct vsock_transport {
>
> 	/* Addressing. */
> 	u32 (*get_local_cid)(void);
>+
>+	/* Read a single skb */
>+	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
> };
>
> /**** CORE ****/
>@@ -225,5 +229,18 @@ int vsock_init_tap(void);
> int vsock_add_tap(struct vsock_tap *vt);
> int vsock_remove_tap(struct vsock_tap *vt);
> void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
>+int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>+			      int flags);
>+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>+			       size_t len, int flags);
>+
>+#ifdef CONFIG_BPF_SYSCALL
>+extern struct proto vsock_proto;
>+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
>+void __init vsock_bpf_build_proto(void);
>+#else
>+static inline void __init vsock_bpf_build_proto(void)
>+{}
>+#endif
>
> #endif /* __AF_VSOCK_H__ */
>diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
>index 6a943ec95c4a5..5da74c4a9f1d1 100644
>--- a/net/vmw_vsock/Makefile
>+++ b/net/vmw_vsock/Makefile
>@@ -8,6 +8,7 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
> obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o
>
> vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
>+vsock-$(CONFIG_BPF_SYSCALL) += vsock_bpf.o
>
> vsock_diag-y += diag.o
>
>diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>index d593d5b6d4b15..7081b3a992c1e 100644
>--- a/net/vmw_vsock/af_vsock.c
>+++ b/net/vmw_vsock/af_vsock.c
>@@ -116,10 +116,13 @@ static void vsock_sk_destruct(struct sock *sk);
> static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
>
> /* Protocol family. */
>-static struct proto vsock_proto = {
>+struct proto vsock_proto = {
> 	.name = "AF_VSOCK",
> 	.owner = THIS_MODULE,
> 	.obj_size = sizeof(struct vsock_sock),
>+#ifdef CONFIG_BPF_SYSCALL
>+	.psock_update_sk_prot = vsock_bpf_update_proto,
>+#endif
> };
>
> /* The default peer timeout indicates how long we will wait for a peer response
>@@ -865,7 +868,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
> }
> EXPORT_SYMBOL_GPL(vsock_stream_has_data);
>
>-static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
>+s64 vsock_connectible_has_data(struct vsock_sock *vsk)
> {
> 	struct sock *sk = sk_vsock(vsk);
>
>@@ -874,6 +877,7 @@ static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
> 	else
> 		return vsock_stream_has_data(vsk);
> }
>+EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
>
> s64 vsock_stream_has_space(struct vsock_sock *vsk)
> {
>@@ -1131,6 +1135,19 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
> 	return mask;
> }
>
>+static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
>+{
>+	struct vsock_sock *vsk = vsock_sk(sk);
>+
>+	if (!vsk->transport)
>+		return -ENODEV;
>+
>+	if (!vsk->transport->read_skb)
>+		return -EOPNOTSUPP;
>+
>+	return vsk->transport->read_skb(vsk, read_actor);
>+}
>+
> static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> 			       size_t len)
> {
>@@ -1241,19 +1258,32 @@ static int vsock_dgram_connect(struct socket *sock,
>
> 	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
> 	sock->state = SS_CONNECTED;
>+	sk->sk_state = TCP_ESTABLISHED;

Why we need this change?
If it's a fix, we should put it in another patch.

>
> out:
> 	release_sock(sk);
> 	return err;
> }
>
>-static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>-			       size_t len, int flags)
>+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>+			size_t len, int flags)
> {
>-	struct vsock_sock *vsk = vsock_sk(sock->sk);
>+	const struct proto *prot;

We should use the guard for this statement as in
vsock_connectible_recvmsg().

>+	struct vsock_sock *vsk;
>+	struct sock *sk;
>+
>+	sk = sock->sk;
>+	vsk = vsock_sk(sk);
>+
>+#ifdef CONFIG_BPF_SYSCALL
>+	prot = READ_ONCE(sk->sk_prot);
>+	if (prot != &vsock_proto)
>+		return prot->recvmsg(sk, msg, len, flags, NULL);
>+#endif
>
> 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
> }
>+EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
>
> static const struct proto_ops vsock_dgram_ops = {
> 	.family = PF_VSOCK,
>@@ -1272,6 +1302,7 @@ static const struct proto_ops vsock_dgram_ops = {
> 	.recvmsg = vsock_dgram_recvmsg,
> 	.mmap = sock_no_mmap,
> 	.sendpage = sock_no_sendpage,
>+	.read_skb = vsock_read_skb,
> };
>
> static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
>@@ -2085,13 +2116,16 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
> 	return err;
> }
>
>-static int
>+int
> vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> 			  int flags)
> {
> 	struct sock *sk;
> 	struct vsock_sock *vsk;
> 	const struct vsock_transport *transport;
>+#ifdef CONFIG_BPF_SYSCALL
>+	const struct proto *prot;
>+#endif
> 	int err;
>
> 	sk = sock->sk;
>@@ -2138,6 +2172,14 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> 		goto out;
> 	}
>
>+#ifdef CONFIG_BPF_SYSCALL
>+	prot = READ_ONCE(sk->sk_prot);
>+	if (prot != &vsock_proto) {
>+		release_sock(sk);
>+		return prot->recvmsg(sk, msg, len, flags, NULL);
>+	}
>+#endif
>+
> 	if (sk->sk_type == SOCK_STREAM)
> 		err = __vsock_stream_recvmsg(sk, msg, len, flags);
> 	else
>@@ -2147,6 +2189,7 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> 	release_sock(sk);
> 	return err;
> }
>+EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
>
> static int vsock_set_rcvlowat(struct sock *sk, int val)
> {
>@@ -2187,6 +2230,7 @@ static const struct proto_ops vsock_stream_ops = {
> 	.mmap = sock_no_mmap,
> 	.sendpage = sock_no_sendpage,
> 	.set_rcvlowat = vsock_set_rcvlowat,
>+	.read_skb = vsock_read_skb,
> };
>
> static const struct proto_ops vsock_seqpacket_ops = {
>@@ -2208,6 +2252,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
> 	.recvmsg = vsock_connectible_recvmsg,
> 	.mmap = sock_no_mmap,
> 	.sendpage = sock_no_sendpage,
>+	.read_skb = vsock_read_skb,
> };
>
> static int vsock_create(struct net *net, struct socket *sock,
>@@ -2347,6 +2392,8 @@ static int __init vsock_init(void)
> 		goto err_unregister_proto;
> 	}
>
>+	vsock_bpf_build_proto();
>+
> 	return 0;
>
> err_unregister_proto:
>diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>index 28b5a8e8e0948..e95df847176b6 100644
>--- a/net/vmw_vsock/virtio_transport.c
>+++ b/net/vmw_vsock/virtio_transport.c
>@@ -457,6 +457,8 @@ static struct virtio_transport virtio_transport = {
> 		.notify_send_pre_enqueue  = 
> 		virtio_transport_notify_send_pre_enqueue,
> 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
> 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>+
>+		.read_skb = virtio_transport_read_skb,
> 	},
>
> 	.send_pkt = virtio_transport_send_pkt,
>diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>index a1581c77cf84a..9a87ead5b1fc5 100644
>--- a/net/vmw_vsock/virtio_transport_common.c
>+++ b/net/vmw_vsock/virtio_transport_common.c
>@@ -1388,6 +1388,28 @@ int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue)
> }
> EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs);
>
>+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
>+{
>+	struct virtio_vsock_sock *vvs = vsk->trans;
>+	struct sock *sk = sk_vsock(vsk);
>+	struct sk_buff *skb;
>+	int copied = 0;

We could avoid initializing `copied`, since it is overwritten later.

>+	int off = 0;
>+	int err;
>+
>+	spin_lock_bh(&vvs->rx_lock);
>+	skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err);

Will this work also for STREAM and SEQPACKET sockets?

>+	spin_unlock_bh(&vvs->rx_lock);
>+
>+	if (!skb)
>+		return err;
>+
>+	copied = recv_actor(sk, skb);
>+	kfree_skb(skb);

I would have moved these steps to vsock_read_skb() to avoid duplicating
these steps in each transport. Perhaps not all transports want to pass
skb ownership to the caller, though, so maybe we can leave it that
way for now.

>+	return copied;
>+}
>+EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
>+
> MODULE_LICENSE("GPL v2");
> MODULE_AUTHOR("Asias He");
> MODULE_DESCRIPTION("common code for virtio vsock");
>diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
>new file mode 100644
>index 0000000000000..9e11282d3bc1f
>--- /dev/null
>+++ b/net/vmw_vsock/vsock_bpf.c
>@@ -0,0 +1,180 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com>
>+ *
>+ * Based off of net/unix/unix_bpf.c
>+ */
>+
>+#include <linux/bpf.h>
>+#include <linux/module.h>
>+#include <linux/skmsg.h>
>+#include <linux/socket.h>
>+#include <net/af_vsock.h>
>+#include <net/sock.h>
>+
>+#define vsock_sk_has_data(__sk, __psock)				\
>+		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
>+			!skb_queue_empty(&__psock->ingress_skb) ||	\
>+			!list_empty(&__psock->ingress_msg);		\
>+		})
>+
>+static struct proto *vsock_dgram_prot_saved __read_mostly;
>+static DEFINE_SPINLOCK(vsock_dgram_prot_lock);
>+static struct proto vsock_dgram_bpf_prot;
>+
>+static bool vsock_has_data(struct vsock_sock *vsk, struct sk_psock *psock)
>+{
>+	struct sock *sk = sk_vsock(vsk);
>+	s64 ret;
>+
>+	ret = vsock_connectible_has_data(vsk);
>+	if (ret > 0)
>+		return true;
>+
>+	return vsock_sk_has_data(sk, psock);
>+}
>+
>+static int vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo)
>+{
>+	struct vsock_sock *vsk;
>+	int err;
>+
>+	DEFINE_WAIT(wait);
>+
>+	vsk = vsock_sk(sk);
>+	err = 0;
>+
>+	while (vsock_has_data(vsk, psock)) {
>+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
>+
>+		if (sk->sk_err != 0 ||
>+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
>+		    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
>+			break;
>+		}
>+
>+		if (timeo == 0) {
>+			err = -EAGAIN;
>+			break;
>+		}
>+
>+		release_sock(sk);
>+		timeo = schedule_timeout(timeo);
>+		lock_sock(sk);
>+
>+		if (signal_pending(current)) {
>+			err = sock_intr_errno(timeo);
>+			break;
>+		} else if (timeo == 0) {
>+			err = -EAGAIN;
>+			break;
>+		}
>+	}
>+
>+	finish_wait(sk_sleep(sk), &wait);
>+
>+	if (err)
>+		return err;
>+
>+	return 0;
>+}
>+
>+static int vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
>+{
>+	int err;
>+	struct socket *sock = sk->sk_socket;
>+
>+	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
>+		err = vsock_connectible_recvmsg(sock, msg, len, flags);
>+	else

Could it happen that it is not DGRAM and we should return an error in
this case?

Thanks,
Stefano

>+		err = vsock_dgram_recvmsg(sock, msg, len, flags);
>+
>+	return err;
>+}
>+
>+static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
>+			     size_t len, int flags, int *addr_len)
>+{
>+	int copied;
>+	struct sk_psock *psock;
>+
>+	lock_sock(sk);
>+	psock = sk_psock_get(sk);
>+	if (unlikely(!psock)) {
>+		release_sock(sk);
>+		return vsock_recvmsg(sk, msg, len, flags);
>+	}
>+
>+	if (vsock_has_data(vsock_sk(sk), psock) && sk_psock_queue_empty(psock)) {
>+		sk_psock_put(sk, psock);
>+		release_sock(sk);
>+		return vsock_recvmsg(sk, msg, len, flags);
>+	}
>+
>+msg_bytes_ready:
>+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
>+	if (!copied) {
>+		long timeo;
>+		int data;
>+
>+		timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
>+		data = vsock_msg_wait_data(sk, psock, timeo);
>+		if (data) {
>+			if (!sk_psock_queue_empty(psock))
>+				goto msg_bytes_ready;
>+			sk_psock_put(sk, psock);
>+			release_sock(sk);
>+			return vsock_recvmsg(sk, msg, len, flags);
>+		}
>+		copied = -EAGAIN;
>+	}
>+	sk_psock_put(sk, psock);
>+	release_sock(sk);
>+
>+	return copied;
>+}
>+
>+/* Copy of original proto with updated sock_map methods */
>+static struct proto vsock_dgram_bpf_prot = {
>+	.close = sock_map_close,
>+	.recvmsg = vsock_bpf_recvmsg,
>+	.sock_is_readable = sk_msg_is_readable,
>+	.unhash = sock_map_unhash,
>+};
>+
>+static void vsock_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
>+{
>+	*prot        = *base;
>+	prot->close  = sock_map_close;
>+	prot->recvmsg = vsock_bpf_recvmsg;
>+	prot->sock_is_readable = sk_msg_is_readable;
>+}
>+
>+static void vsock_dgram_bpf_check_needs_rebuild(struct proto *ops)
>+{
>+	if (unlikely(ops != smp_load_acquire(&vsock_dgram_prot_saved))) {
>+		spin_lock_bh(&vsock_dgram_prot_lock);
>+		if (likely(ops != vsock_dgram_prot_saved)) {
>+			vsock_dgram_bpf_rebuild_protos(&vsock_dgram_bpf_prot, ops);
>+			smp_store_release(&vsock_dgram_prot_saved, ops);
>+		}
>+		spin_unlock_bh(&vsock_dgram_prot_lock);
>+	}
>+}
>+
>+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
>+{
>+	if (restore) {
>+		sk->sk_write_space = psock->saved_write_space;
>+		sock_replace_proto(sk, psock->sk_proto);
>+		return 0;
>+	}
>+
>+	vsock_dgram_bpf_check_needs_rebuild(psock->sk_proto);
>+	sock_replace_proto(sk, &vsock_dgram_bpf_prot);
>+	return 0;
>+}
>+
>+void __init vsock_bpf_build_proto(void)
>+{
>+	vsock_dgram_bpf_rebuild_protos(&vsock_dgram_bpf_prot, &vsock_proto);
>+}
>diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
>index 671e03240fc52..40753b661c135 100644
>--- a/net/vmw_vsock/vsock_loopback.c
>+++ b/net/vmw_vsock/vsock_loopback.c
>@@ -94,6 +94,8 @@ static struct virtio_transport loopback_transport = {
> 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
> 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
> 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>+
>+		.read_skb = virtio_transport_read_skb,
> 	},
>
> 	.send_pkt = vsock_loopback_send_pkt,
>
>-- 
>2.30.2
>


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 2/3] selftests/bpf: add vsock to vmtest.sh
  2023-01-18 20:27 ` [PATCH RFC 2/3] selftests/bpf: add vsock to vmtest.sh Bobby Eshleman
@ 2023-01-19 10:44     ` Stefano Garzarella
  0 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-19 10:44 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Cong Wang, kvm, Michael S. Tsirkin, Alexei Starovoitov,
	virtualization, Song Liu, Eric Dumazet, Stanislav Fomichev,
	linux-kselftest, Shuah Khan, Mykola Lysenko, Daniel Borkmann,
	John Fastabend, Andrii Nakryiko, Yonghong Song, Paolo Abeni,
	KP Singh, Stefan Hajnoczi, Jakub Kicinski, Hao Luo, netdev,
	linux-kernel, Jiri Olsa, bpf, Martin KaFai Lau, David S. Miller

On Wed, Jan 18, 2023 at 12:27:40PM -0800, Bobby Eshleman wrote:
>Add the ability to use vsock in the vmtest.sh script and
>the test kernel config.
>
>Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>---
> tools/testing/selftests/bpf/config.x86_64 | 4 ++++
> tools/testing/selftests/bpf/vmtest.sh     | 1 +
> 2 files changed, 5 insertions(+)
>
>diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64
>index dd97d61d325ca..db5e6b9a91711 100644
>--- a/tools/testing/selftests/bpf/config.x86_64
>+++ b/tools/testing/selftests/bpf/config.x86_64
>@@ -234,7 +234,11 @@ CONFIG_VIRTIO_BLK=y
> CONFIG_VIRTIO_CONSOLE=y
> CONFIG_VIRTIO_NET=y
> CONFIG_VIRTIO_PCI=y
>+CONFIG_VIRTIO_VSOCKETS=y
>+CONFIG_VIRTIO_VSOCKETS_COMMON=y
> CONFIG_VLAN_8021Q=y
>+CONFIG_VSOCKETS=y
>+CONFIG_VSOCKETS_LOOPBACK=y
> CONFIG_X86_ACPI_CPUFREQ=y
> CONFIG_X86_CPUID=y
> CONFIG_X86_MSR=y
>diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
>index 316a56d680f25..aad27ffd4ec68 100755
>--- a/tools/testing/selftests/bpf/vmtest.sh
>+++ b/tools/testing/selftests/bpf/vmtest.sh
>@@ -250,6 +250,7 @@ EOF
> 		-enable-kvm \
> 		-m 4G \
> 		-drive file="${rootfs_img}",format=raw,index=1,media=disk,if=virtio,cache=none \
>+		-device vhost-vsock-pci,guest-cid=1234 \

I'm not sure if this will work with qemu-system-s390x and 
qemu-system-aarch64.

Maybe the "virt" machine of qemu-system-aarch64 supports PCI, but IIRC 
s390x doesn't support it and we should use the vhost-vsock-ccw device.

In addition, we are changing only config.x86_64, so maybe we should add 
the vhost-vsock-pci device only for x86_64 (maybe in the QEMU_FLAGS or a 
new VSOCK_DEV variable), and run the tests only for that architecture.

Thanks,
Stefano

> 		-kernel "${kernel_bzimage}" \
> 		-append "root=/dev/vda rw console=${QEMU_CONSOLE}"
> }
>
>-- 
>2.30.2
>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 2/3] selftests/bpf: add vsock to vmtest.sh
@ 2023-01-19 10:44     ` Stefano Garzarella
  0 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-19 10:44 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrii Nakryiko,
	Mykola Lysenko, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Shuah Khan,
	linux-kernel, kvm, virtualization, netdev, bpf, linux-kselftest,
	Cong Wang

On Wed, Jan 18, 2023 at 12:27:40PM -0800, Bobby Eshleman wrote:
>Add the ability to use vsock in the vmtest.sh script and
>the test kernel config.
>
>Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>---
> tools/testing/selftests/bpf/config.x86_64 | 4 ++++
> tools/testing/selftests/bpf/vmtest.sh     | 1 +
> 2 files changed, 5 insertions(+)
>
>diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64
>index dd97d61d325ca..db5e6b9a91711 100644
>--- a/tools/testing/selftests/bpf/config.x86_64
>+++ b/tools/testing/selftests/bpf/config.x86_64
>@@ -234,7 +234,11 @@ CONFIG_VIRTIO_BLK=y
> CONFIG_VIRTIO_CONSOLE=y
> CONFIG_VIRTIO_NET=y
> CONFIG_VIRTIO_PCI=y
>+CONFIG_VIRTIO_VSOCKETS=y
>+CONFIG_VIRTIO_VSOCKETS_COMMON=y
> CONFIG_VLAN_8021Q=y
>+CONFIG_VSOCKETS=y
>+CONFIG_VSOCKETS_LOOPBACK=y
> CONFIG_X86_ACPI_CPUFREQ=y
> CONFIG_X86_CPUID=y
> CONFIG_X86_MSR=y
>diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
>index 316a56d680f25..aad27ffd4ec68 100755
>--- a/tools/testing/selftests/bpf/vmtest.sh
>+++ b/tools/testing/selftests/bpf/vmtest.sh
>@@ -250,6 +250,7 @@ EOF
> 		-enable-kvm \
> 		-m 4G \
> 		-drive file="${rootfs_img}",format=raw,index=1,media=disk,if=virtio,cache=none \
>+		-device vhost-vsock-pci,guest-cid=1234 \

I'm not sure if this will work with qemu-system-s390x and 
qemu-system-aarch64.

Maybe the "virt" machine of qemu-system-aarch64 supports PCI, but IIRC 
s390x doesn't support it and we should use the vhost-vsock-ccw device.

In addition, we are changing only config.x86_64, so maybe we should add 
the vhost-vsock-pci device only for x86_64 (maybe in the QEMU_FLAGS or a 
new VSOCK_DEV variable), and run the tests only for that architecture.

Thanks,
Stefano

> 		-kernel "${kernel_bzimage}" \
> 		-append "root=/dev/vda rw console=${QEMU_CONSOLE}"
> }
>
>-- 
>2.30.2
>


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 3/3] selftests/bpf: Add a test case for vsock sockmap
  2023-01-18 20:27 ` [PATCH RFC 3/3] selftests/bpf: Add a test case for vsock sockmap Bobby Eshleman
@ 2023-01-19 10:48     ` Stefano Garzarella
  0 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-19 10:48 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrii Nakryiko,
	Mykola Lysenko, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Shuah Khan,
	linux-kernel, kvm, virtualization, netdev, bpf, linux-kselftest,
	Cong Wang

On Wed, Jan 18, 2023 at 12:27:41PM -0800, Bobby Eshleman wrote:
>Add a test case testing the redirection from connectible AF_VSOCK
>sockets to connectible AF_UNIX sockets.
>
>Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>---
> .../selftests/bpf/prog_tests/sockmap_listen.c      | 163 +++++++++++++++++++++
> 1 file changed, 163 insertions(+)
>
>diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
>index 2cf0c7a3fe232..8b5a2e09c9ede 100644
>--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
>+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
>@@ -18,6 +18,7 @@
> #include <string.h>
> #include <sys/select.h>
> #include <unistd.h>
>+#include <linux/vm_sockets.h>
>
> #include <bpf/bpf.h>
> #include <bpf/libbpf.h>
>@@ -249,6 +250,16 @@ static void init_addr_loopback6(struct sockaddr_storage *ss, socklen_t *len)
> 	*len = sizeof(*addr6);
> }
>
>+static void init_addr_loopback_vsock(struct sockaddr_storage *ss, socklen_t *len)
>+{
>+	struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss));
>+
>+	addr->svm_family = AF_VSOCK;
>+	addr->svm_port = VMADDR_PORT_ANY;
>+	addr->svm_cid = VMADDR_CID_LOCAL;

Wait, IIUC we only use loopback, so why do we need to attach the 
vhost-vsock-pci device to QEMU?

At that point if we add CONFIG_VSOCKETS_LOOPBACK in all configurations, 
it should also work with aarch64 and s390x.

Thanks,
Stefano

>+	*len = sizeof(*addr);
>+}
>+
> static void init_addr_loopback(int family, struct sockaddr_storage *ss,
> 			       socklen_t *len)
> {
>@@ -259,6 +270,9 @@ static void init_addr_loopback(int family, struct sockaddr_storage *ss,
> 	case AF_INET6:
> 		init_addr_loopback6(ss, len);
> 		return;
>+	case AF_VSOCK:
>+		init_addr_loopback_vsock(ss, len);
>+		return;
> 	default:
> 		FAIL("unsupported address family %d", family);
> 	}
>@@ -1434,6 +1448,8 @@ static const char *family_str(sa_family_t family)
> 		return "IPv6";
> 	case AF_UNIX:
> 		return "Unix";
>+	case AF_VSOCK:
>+		return "VSOCK";
> 	default:
> 		return "unknown";
> 	}
>@@ -1644,6 +1660,151 @@ static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *ma
> 	unix_skb_redir_to_connected(skel, map, sotype);
> }
>
>+/* Returns two connected loopback vsock sockets */
>+static int vsock_socketpair_connectible(int sotype, int *v0, int *v1)
>+{
>+	struct sockaddr_storage addr;
>+	socklen_t len = sizeof(addr);
>+	int s, p, c;
>+
>+	s = socket_loopback(AF_VSOCK, sotype);
>+	if (s < 0)
>+		return -1;
>+
>+	c = xsocket(AF_VSOCK, sotype | SOCK_NONBLOCK, 0);
>+	if (c == -1)
>+		goto close_srv;
>+
>+	if (getsockname(s, sockaddr(&addr), &len) < 0)
>+		goto close_cli;
>+
>+	if (connect(c, sockaddr(&addr), len) < 0 && errno != EINPROGRESS) {
>+		FAIL_ERRNO("connect");
>+		goto close_cli;
>+	}
>+
>+	len = sizeof(addr);
>+	p = accept_timeout(s, sockaddr(&addr), &len, IO_TIMEOUT_SEC);
>+	if (p < 0)
>+		goto close_cli;
>+
>+	*v0 = p;
>+	*v1 = c;
>+
>+	return 0;
>+
>+close_cli:
>+	close(c);
>+close_srv:
>+	close(s);
>+
>+	return -1;
>+}
>+
>+static void vsock_unix_redir_connectible(int sock_mapfd, int verd_mapfd,
>+					 enum redir_mode mode, int sotype)
>+{
>+	const char *log_prefix = redir_mode_str(mode);
>+	char a = 'a', b = 'b';
>+	int u0, u1, v0, v1;
>+	int sfd[2];
>+	unsigned int pass;
>+	int err, n;
>+	u32 key;
>+
>+	zero_verdict_count(verd_mapfd);
>+
>+	if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, sfd))
>+		return;
>+
>+	u0 = sfd[0];
>+	u1 = sfd[1];
>+
>+	err = vsock_socketpair_connectible(sotype, &v0, &v1);
>+	if (err) {
>+		FAIL("vsock_socketpair_connectible() failed");
>+		goto close_uds;
>+	}
>+
>+	err = add_to_sockmap(sock_mapfd, u0, v0);
>+	if (err) {
>+		FAIL("add_to_sockmap failed");
>+		goto close_vsock;
>+	}
>+
>+	n = write(v1, &a, sizeof(a));
>+	if (n < 0)
>+		FAIL_ERRNO("%s: write", log_prefix);
>+	if (n == 0)
>+		FAIL("%s: incomplete write", log_prefix);
>+	if (n < 1)
>+		goto out;
>+
>+	n = recv(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), MSG_DONTWAIT);
>+	if (n < 0)
>+		FAIL("%s: recv() err, errno=%d", log_prefix, errno);
>+	if (n == 0)
>+		FAIL("%s: incomplete recv", log_prefix);
>+	if (b != a)
>+		FAIL("%s: vsock socket map failed, %c != %c", log_prefix, a, b);
>+
>+	key = SK_PASS;
>+	err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
>+	if (err)
>+		goto out;
>+	if (pass != 1)
>+		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
>+out:
>+	key = 0;
>+	bpf_map_delete_elem(sock_mapfd, &key);
>+	key = 1;
>+	bpf_map_delete_elem(sock_mapfd, &key);
>+
>+close_vsock:
>+	close(v0);
>+	close(v1);
>+
>+close_uds:
>+	close(u0);
>+	close(u1);
>+}
>+
>+static void vsock_unix_skb_redir_connectible(struct test_sockmap_listen *skel,
>+					     struct bpf_map *inner_map,
>+					     int sotype)
>+{
>+	int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
>+	int verdict_map = bpf_map__fd(skel->maps.verdict_map);
>+	int sock_map = bpf_map__fd(inner_map);
>+	int err;
>+
>+	err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0);
>+	if (err)
>+		return;
>+
>+	skel->bss->test_ingress = false;
>+	vsock_unix_redir_connectible(sock_map, verdict_map, REDIR_EGRESS, sotype);
>+	skel->bss->test_ingress = true;
>+	vsock_unix_redir_connectible(sock_map, verdict_map, REDIR_INGRESS, sotype);
>+
>+	xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
>+}
>+
>+static void test_vsock_redir(struct test_sockmap_listen *skel, struct bpf_map *map)
>+{
>+	const char *family_name, *map_name;
>+	char s[MAX_TEST_NAME];
>+
>+	family_name = family_str(AF_VSOCK);
>+	map_name = map_type_str(map);
>+	snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__);
>+	if (!test__start_subtest(s))
>+		return;
>+
>+	vsock_unix_skb_redir_connectible(skel, map, SOCK_STREAM);
>+	vsock_unix_skb_redir_connectible(skel, map, SOCK_SEQPACKET);
>+}
>+
> static void test_reuseport(struct test_sockmap_listen *skel,
> 			   struct bpf_map *map, int family, int sotype)
> {
>@@ -2015,12 +2176,14 @@ void serial_test_sockmap_listen(void)
> 	run_tests(skel, skel->maps.sock_map, AF_INET6);
> 	test_unix_redir(skel, skel->maps.sock_map, SOCK_DGRAM);
> 	test_unix_redir(skel, skel->maps.sock_map, SOCK_STREAM);
>+	test_vsock_redir(skel, skel->maps.sock_map);
>
> 	skel->bss->test_sockmap = false;
> 	run_tests(skel, skel->maps.sock_hash, AF_INET);
> 	run_tests(skel, skel->maps.sock_hash, AF_INET6);
> 	test_unix_redir(skel, skel->maps.sock_hash, SOCK_DGRAM);
> 	test_unix_redir(skel, skel->maps.sock_hash, SOCK_STREAM);
>+	test_vsock_redir(skel, skel->maps.sock_hash);
>
> 	test_sockmap_listen__destroy(skel);
> }
>
>-- 
>2.30.2
>


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 3/3] selftests/bpf: Add a test case for vsock sockmap
@ 2023-01-19 10:48     ` Stefano Garzarella
  0 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-19 10:48 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Cong Wang, kvm, Michael S. Tsirkin, Alexei Starovoitov,
	virtualization, Song Liu, Eric Dumazet, Stanislav Fomichev,
	linux-kselftest, Shuah Khan, Mykola Lysenko, Daniel Borkmann,
	John Fastabend, Andrii Nakryiko, Yonghong Song, Paolo Abeni,
	KP Singh, Stefan Hajnoczi, Jakub Kicinski, Hao Luo, netdev,
	linux-kernel, Jiri Olsa, bpf, Martin KaFai Lau, David S. Miller

On Wed, Jan 18, 2023 at 12:27:41PM -0800, Bobby Eshleman wrote:
>Add a test case testing the redirection from connectible AF_VSOCK
>sockets to connectible AF_UNIX sockets.
>
>Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>---
> .../selftests/bpf/prog_tests/sockmap_listen.c      | 163 +++++++++++++++++++++
> 1 file changed, 163 insertions(+)
>
>diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
>index 2cf0c7a3fe232..8b5a2e09c9ede 100644
>--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
>+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
>@@ -18,6 +18,7 @@
> #include <string.h>
> #include <sys/select.h>
> #include <unistd.h>
>+#include <linux/vm_sockets.h>
>
> #include <bpf/bpf.h>
> #include <bpf/libbpf.h>
>@@ -249,6 +250,16 @@ static void init_addr_loopback6(struct sockaddr_storage *ss, socklen_t *len)
> 	*len = sizeof(*addr6);
> }
>
>+static void init_addr_loopback_vsock(struct sockaddr_storage *ss, socklen_t *len)
>+{
>+	struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss));
>+
>+	addr->svm_family = AF_VSOCK;
>+	addr->svm_port = VMADDR_PORT_ANY;
>+	addr->svm_cid = VMADDR_CID_LOCAL;

Wait, IIUC we only use loopback, so why do we need to attach the 
vhost-vsock-pci device to QEMU?

At that point if we add CONFIG_VSOCKETS_LOOPBACK in all configurations, 
it should also work with aarch64 and s390x.

Thanks,
Stefano

>+	*len = sizeof(*addr);
>+}
>+
> static void init_addr_loopback(int family, struct sockaddr_storage *ss,
> 			       socklen_t *len)
> {
>@@ -259,6 +270,9 @@ static void init_addr_loopback(int family, struct sockaddr_storage *ss,
> 	case AF_INET6:
> 		init_addr_loopback6(ss, len);
> 		return;
>+	case AF_VSOCK:
>+		init_addr_loopback_vsock(ss, len);
>+		return;
> 	default:
> 		FAIL("unsupported address family %d", family);
> 	}
>@@ -1434,6 +1448,8 @@ static const char *family_str(sa_family_t family)
> 		return "IPv6";
> 	case AF_UNIX:
> 		return "Unix";
>+	case AF_VSOCK:
>+		return "VSOCK";
> 	default:
> 		return "unknown";
> 	}
>@@ -1644,6 +1660,151 @@ static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *ma
> 	unix_skb_redir_to_connected(skel, map, sotype);
> }
>
>+/* Returns two connected loopback vsock sockets */
>+static int vsock_socketpair_connectible(int sotype, int *v0, int *v1)
>+{
>+	struct sockaddr_storage addr;
>+	socklen_t len = sizeof(addr);
>+	int s, p, c;
>+
>+	s = socket_loopback(AF_VSOCK, sotype);
>+	if (s < 0)
>+		return -1;
>+
>+	c = xsocket(AF_VSOCK, sotype | SOCK_NONBLOCK, 0);
>+	if (c == -1)
>+		goto close_srv;
>+
>+	if (getsockname(s, sockaddr(&addr), &len) < 0)
>+		goto close_cli;
>+
>+	if (connect(c, sockaddr(&addr), len) < 0 && errno != EINPROGRESS) {
>+		FAIL_ERRNO("connect");
>+		goto close_cli;
>+	}
>+
>+	len = sizeof(addr);
>+	p = accept_timeout(s, sockaddr(&addr), &len, IO_TIMEOUT_SEC);
>+	if (p < 0)
>+		goto close_cli;
>+
>+	*v0 = p;
>+	*v1 = c;
>+
>+	return 0;
>+
>+close_cli:
>+	close(c);
>+close_srv:
>+	close(s);
>+
>+	return -1;
>+}
>+
>+static void vsock_unix_redir_connectible(int sock_mapfd, int verd_mapfd,
>+					 enum redir_mode mode, int sotype)
>+{
>+	const char *log_prefix = redir_mode_str(mode);
>+	char a = 'a', b = 'b';
>+	int u0, u1, v0, v1;
>+	int sfd[2];
>+	unsigned int pass;
>+	int err, n;
>+	u32 key;
>+
>+	zero_verdict_count(verd_mapfd);
>+
>+	if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, sfd))
>+		return;
>+
>+	u0 = sfd[0];
>+	u1 = sfd[1];
>+
>+	err = vsock_socketpair_connectible(sotype, &v0, &v1);
>+	if (err) {
>+		FAIL("vsock_socketpair_connectible() failed");
>+		goto close_uds;
>+	}
>+
>+	err = add_to_sockmap(sock_mapfd, u0, v0);
>+	if (err) {
>+		FAIL("add_to_sockmap failed");
>+		goto close_vsock;
>+	}
>+
>+	n = write(v1, &a, sizeof(a));
>+	if (n < 0)
>+		FAIL_ERRNO("%s: write", log_prefix);
>+	if (n == 0)
>+		FAIL("%s: incomplete write", log_prefix);
>+	if (n < 1)
>+		goto out;
>+
>+	n = recv(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), MSG_DONTWAIT);
>+	if (n < 0)
>+		FAIL("%s: recv() err, errno=%d", log_prefix, errno);
>+	if (n == 0)
>+		FAIL("%s: incomplete recv", log_prefix);
>+	if (b != a)
>+		FAIL("%s: vsock socket map failed, %c != %c", log_prefix, a, b);
>+
>+	key = SK_PASS;
>+	err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
>+	if (err)
>+		goto out;
>+	if (pass != 1)
>+		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
>+out:
>+	key = 0;
>+	bpf_map_delete_elem(sock_mapfd, &key);
>+	key = 1;
>+	bpf_map_delete_elem(sock_mapfd, &key);
>+
>+close_vsock:
>+	close(v0);
>+	close(v1);
>+
>+close_uds:
>+	close(u0);
>+	close(u1);
>+}
>+
>+static void vsock_unix_skb_redir_connectible(struct test_sockmap_listen *skel,
>+					     struct bpf_map *inner_map,
>+					     int sotype)
>+{
>+	int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
>+	int verdict_map = bpf_map__fd(skel->maps.verdict_map);
>+	int sock_map = bpf_map__fd(inner_map);
>+	int err;
>+
>+	err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0);
>+	if (err)
>+		return;
>+
>+	skel->bss->test_ingress = false;
>+	vsock_unix_redir_connectible(sock_map, verdict_map, REDIR_EGRESS, sotype);
>+	skel->bss->test_ingress = true;
>+	vsock_unix_redir_connectible(sock_map, verdict_map, REDIR_INGRESS, sotype);
>+
>+	xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
>+}
>+
>+static void test_vsock_redir(struct test_sockmap_listen *skel, struct bpf_map *map)
>+{
>+	const char *family_name, *map_name;
>+	char s[MAX_TEST_NAME];
>+
>+	family_name = family_str(AF_VSOCK);
>+	map_name = map_type_str(map);
>+	snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__);
>+	if (!test__start_subtest(s))
>+		return;
>+
>+	vsock_unix_skb_redir_connectible(skel, map, SOCK_STREAM);
>+	vsock_unix_skb_redir_connectible(skel, map, SOCK_SEQPACKET);
>+}
>+
> static void test_reuseport(struct test_sockmap_listen *skel,
> 			   struct bpf_map *map, int family, int sotype)
> {
>@@ -2015,12 +2176,14 @@ void serial_test_sockmap_listen(void)
> 	run_tests(skel, skel->maps.sock_map, AF_INET6);
> 	test_unix_redir(skel, skel->maps.sock_map, SOCK_DGRAM);
> 	test_unix_redir(skel, skel->maps.sock_map, SOCK_STREAM);
>+	test_vsock_redir(skel, skel->maps.sock_map);
>
> 	skel->bss->test_sockmap = false;
> 	run_tests(skel, skel->maps.sock_hash, AF_INET);
> 	run_tests(skel, skel->maps.sock_hash, AF_INET6);
> 	test_unix_redir(skel, skel->maps.sock_hash, SOCK_DGRAM);
> 	test_unix_redir(skel, skel->maps.sock_hash, SOCK_STREAM);
>+	test_vsock_redir(skel, skel->maps.sock_hash);
>
> 	test_sockmap_listen__destroy(skel);
> }
>
>-- 
>2.30.2
>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 0/3] vsock: add support for sockmap
  2023-01-18 20:27 [PATCH RFC 0/3] vsock: add support for sockmap Bobby Eshleman
@ 2023-01-19 10:49   ` Stefano Garzarella
  2023-01-18 20:27 ` [PATCH RFC 2/3] selftests/bpf: add vsock to vmtest.sh Bobby Eshleman
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-19 10:49 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Cong Wang, kvm, Michael S. Tsirkin, Alexei Starovoitov,
	virtualization, Song Liu, Eric Dumazet, Stanislav Fomichev,
	linux-kselftest, Shuah Khan, Mykola Lysenko, Daniel Borkmann,
	John Fastabend, Andrii Nakryiko, Yonghong Song, Paolo Abeni,
	KP Singh, Stefan Hajnoczi, Jakub Kicinski, Hao Luo, netdev,
	linux-kernel, Jiri Olsa, bpf, Martin KaFai Lau, David S. Miller

Hi Bobby,

On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
>Add support for sockmap to vsock.
>
>We're testing usage of vsock as a way to redirect guest-local UDS requests to
>the host and this patch series greatly improves the performance of such a
>setup.
>
>Compared to copying packets via userspace, this improves throughput by 221% in
>basic testing.

Cool, nice series!

>
>Tested as follows.
>
>Setup: guest unix dgram sender -> guest vsock redirector -> host vsock server
>Threads: 1
>Payload: 64k
>No sockmap:
>- 76.3 MB/s
>- The guest vsock redirector was
>  "socat VSOCK-CONNECT:2:1234 UNIX-RECV:/path/to/sock"
>Using sockmap (this patch):
>- 168.8 MB/s (+221%)

Assuming the absolute value is correct, there is a typo here, it would 
be +121% right?

>- The guest redirector was a simple sockmap echo server,
>  redirecting unix ingress to vsock 2:1234 egress.
>- Same sender and server programs
>
>Only the virtio transport has been tested.

I think is fine for now.

>The loopback transport was used in
>writing bpf/selftests, but not thoroughly tested otherwise.

I did a quick review mainly for vsock stuff.
Hoping others can take a better look at net/vmw_vsock/vsock_bpf.c, since 
I'm not very familiar with that subsystem.

FYI I will be off the next two weeks (till Feb 7) with limited internet 
access.

Thanks,
Stefano

>
>This series requires the skb patch.
>
>To: Stefan Hajnoczi <stefanha@redhat.com>
>To: Stefano Garzarella <sgarzare@redhat.com>
>To: "Michael S. Tsirkin" <mst@redhat.com>
>To: Jason Wang <jasowang@redhat.com>
>To: "David S. Miller" <davem@davemloft.net>
>To: Eric Dumazet <edumazet@google.com>
>To: Jakub Kicinski <kuba@kernel.org>
>To: Paolo Abeni <pabeni@redhat.com>
>To: Andrii Nakryiko <andrii@kernel.org>
>To: Mykola Lysenko <mykolal@fb.com>
>To: Alexei Starovoitov <ast@kernel.org>
>To: Daniel Borkmann <daniel@iogearbox.net>
>To: Martin KaFai Lau <martin.lau@linux.dev>
>To: Song Liu <song@kernel.org>
>To: Yonghong Song <yhs@fb.com>
>To: John Fastabend <john.fastabend@gmail.com>
>To: KP Singh <kpsingh@kernel.org>
>To: Stanislav Fomichev <sdf@google.com>
>To: Hao Luo <haoluo@google.com>
>To: Jiri Olsa <jolsa@kernel.org>
>To: Shuah Khan <shuah@kernel.org>
>Cc: linux-kernel@vger.kernel.org
>Cc: kvm@vger.kernel.org
>Cc: virtualization@lists.linux-foundation.org
>Cc: netdev@vger.kernel.org
>Cc: bpf@vger.kernel.org
>Cc: linux-kselftest@vger.kernel.org
>Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>
>---
>Bobby Eshleman (3):
>      vsock: support sockmap
>      selftests/bpf: add vsock to vmtest.sh
>      selftests/bpf: Add a test case for vsock sockmap
>
> drivers/vhost/vsock.c                              |   1 +
> include/linux/virtio_vsock.h                       |   1 +
> include/net/af_vsock.h                             |  17 ++
> net/vmw_vsock/Makefile                             |   1 +
> net/vmw_vsock/af_vsock.c                           |  59 ++++++-
> net/vmw_vsock/virtio_transport.c                   |   2 +
> net/vmw_vsock/virtio_transport_common.c            |  22 +++
> net/vmw_vsock/vsock_bpf.c                          | 180 +++++++++++++++++++++
> net/vmw_vsock/vsock_loopback.c                     |   2 +
> tools/testing/selftests/bpf/config.x86_64          |   4 +
> .../selftests/bpf/prog_tests/sockmap_listen.c      | 163 +++++++++++++++++++
> tools/testing/selftests/bpf/vmtest.sh              |   1 +
> 12 files changed, 447 insertions(+), 6 deletions(-)
>---
>base-commit: f12f4326c6a75a74e908714be6d2f0e2f0fd0d76
>change-id: 20230118-support-vsock-sockmap-connectible-2e1297d2111a
>
>Best regards,
>-- 
>Bobby Eshleman <bobby.eshleman@bytedance.com>
>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 0/3] vsock: add support for sockmap
@ 2023-01-19 10:49   ` Stefano Garzarella
  0 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-19 10:49 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrii Nakryiko,
	Mykola Lysenko, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Shuah Khan,
	linux-kernel, kvm, virtualization, netdev, bpf, linux-kselftest,
	Cong Wang

Hi Bobby,

On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
>Add support for sockmap to vsock.
>
>We're testing usage of vsock as a way to redirect guest-local UDS requests to
>the host and this patch series greatly improves the performance of such a
>setup.
>
>Compared to copying packets via userspace, this improves throughput by 221% in
>basic testing.

Cool, nice series!

>
>Tested as follows.
>
>Setup: guest unix dgram sender -> guest vsock redirector -> host vsock server
>Threads: 1
>Payload: 64k
>No sockmap:
>- 76.3 MB/s
>- The guest vsock redirector was
>  "socat VSOCK-CONNECT:2:1234 UNIX-RECV:/path/to/sock"
>Using sockmap (this patch):
>- 168.8 MB/s (+221%)

Assuming the absolute value is correct, there is a typo here, it would 
be +121% right?

>- The guest redirector was a simple sockmap echo server,
>  redirecting unix ingress to vsock 2:1234 egress.
>- Same sender and server programs
>
>Only the virtio transport has been tested.

I think is fine for now.

>The loopback transport was used in
>writing bpf/selftests, but not thoroughly tested otherwise.

I did a quick review mainly for vsock stuff.
Hoping others can take a better look at net/vmw_vsock/vsock_bpf.c, since 
I'm not very familiar with that subsystem.

FYI I will be off the next two weeks (till Feb 7) with limited internet 
access.

Thanks,
Stefano

>
>This series requires the skb patch.
>
>To: Stefan Hajnoczi <stefanha@redhat.com>
>To: Stefano Garzarella <sgarzare@redhat.com>
>To: "Michael S. Tsirkin" <mst@redhat.com>
>To: Jason Wang <jasowang@redhat.com>
>To: "David S. Miller" <davem@davemloft.net>
>To: Eric Dumazet <edumazet@google.com>
>To: Jakub Kicinski <kuba@kernel.org>
>To: Paolo Abeni <pabeni@redhat.com>
>To: Andrii Nakryiko <andrii@kernel.org>
>To: Mykola Lysenko <mykolal@fb.com>
>To: Alexei Starovoitov <ast@kernel.org>
>To: Daniel Borkmann <daniel@iogearbox.net>
>To: Martin KaFai Lau <martin.lau@linux.dev>
>To: Song Liu <song@kernel.org>
>To: Yonghong Song <yhs@fb.com>
>To: John Fastabend <john.fastabend@gmail.com>
>To: KP Singh <kpsingh@kernel.org>
>To: Stanislav Fomichev <sdf@google.com>
>To: Hao Luo <haoluo@google.com>
>To: Jiri Olsa <jolsa@kernel.org>
>To: Shuah Khan <shuah@kernel.org>
>Cc: linux-kernel@vger.kernel.org
>Cc: kvm@vger.kernel.org
>Cc: virtualization@lists.linux-foundation.org
>Cc: netdev@vger.kernel.org
>Cc: bpf@vger.kernel.org
>Cc: linux-kselftest@vger.kernel.org
>Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>
>---
>Bobby Eshleman (3):
>      vsock: support sockmap
>      selftests/bpf: add vsock to vmtest.sh
>      selftests/bpf: Add a test case for vsock sockmap
>
> drivers/vhost/vsock.c                              |   1 +
> include/linux/virtio_vsock.h                       |   1 +
> include/net/af_vsock.h                             |  17 ++
> net/vmw_vsock/Makefile                             |   1 +
> net/vmw_vsock/af_vsock.c                           |  59 ++++++-
> net/vmw_vsock/virtio_transport.c                   |   2 +
> net/vmw_vsock/virtio_transport_common.c            |  22 +++
> net/vmw_vsock/vsock_bpf.c                          | 180 +++++++++++++++++++++
> net/vmw_vsock/vsock_loopback.c                     |   2 +
> tools/testing/selftests/bpf/config.x86_64          |   4 +
> .../selftests/bpf/prog_tests/sockmap_listen.c      | 163 +++++++++++++++++++
> tools/testing/selftests/bpf/vmtest.sh              |   1 +
> 12 files changed, 447 insertions(+), 6 deletions(-)
>---
>base-commit: f12f4326c6a75a74e908714be6d2f0e2f0fd0d76
>change-id: 20230118-support-vsock-sockmap-connectible-2e1297d2111a
>
>Best regards,
>-- 
>Bobby Eshleman <bobby.eshleman@bytedance.com>
>


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 1/3] vsock: support sockmap
  2023-01-18 15:08     ` Bobby Eshleman
@ 2023-01-20  8:55         ` Stefano Garzarella
  0 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-20  8:55 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Cong Wang, Bobby Eshleman, kvm, Michael S. Tsirkin, KP Singh,
	virtualization, Song Liu, Eric Dumazet, Stanislav Fomichev,
	linux-kselftest, Shuah Khan, Mykola Lysenko, Daniel Borkmann,
	John Fastabend, Andrii Nakryiko, Jakub Kicinski, Paolo Abeni,
	Alexei Starovoitov, Stefan Hajnoczi, Yonghong Song, Hao Luo,
	netdev, linux-kernel, Jiri Olsa, bpf, Martin KaFai Lau,
	David S. Miller

On Wed, Jan 18, 2023 at 03:08:11PM +0000, Bobby Eshleman wrote:
>On Thu, Jan 19, 2023 at 11:39:36AM +0100, Stefano Garzarella wrote:
>> On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
>> > This patch adds sockmap support for vsock sockets. It is intended to be
>> > usable by all transports, but only the virtio transport is implemented.
>> >
>> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>> > ---
>> > drivers/vhost/vsock.c                   |   1 +
>> > include/linux/virtio_vsock.h            |   1 +
>> > include/net/af_vsock.h                  |  17 +++
>> > net/vmw_vsock/Makefile                  |   1 +
>> > net/vmw_vsock/af_vsock.c                |  59 +++++++++--
>> > net/vmw_vsock/virtio_transport.c        |   2 +
>> > net/vmw_vsock/virtio_transport_common.c |  22 ++++
>> > net/vmw_vsock/vsock_bpf.c               | 180 ++++++++++++++++++++++++++++++++
>> > net/vmw_vsock/vsock_loopback.c          |   2 +
>> > 9 files changed, 279 insertions(+), 6 deletions(-)
>>
>> ./scripts/checkpatch.pl --strict prints some simple warnings/checks that
>> I suggest to fix :-)
>>
>
>Oops, thanks. New machine, forgot my pre-commit hook. Putting in place
>now.
>
>> >
>> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> > index 1f3b89c885cca..3c6dc036b9044 100644
>> > --- a/drivers/vhost/vsock.c
>> > +++ b/drivers/vhost/vsock.c
>> > @@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
>> > 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
>> > 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>> >
>> > +		.read_skb = virtio_transport_read_skb,
>> > 	},
>> >
>> > 	.send_pkt = vhost_transport_send_pkt,
>> > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>> > index 3f9c166113063..c58453699ee98 100644
>> > --- a/include/linux/virtio_vsock.h
>> > +++ b/include/linux/virtio_vsock.h
>> > @@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
>> > void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
>> > void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>> > int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>> > +int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
>> > #endif /* _LINUX_VIRTIO_VSOCK_H */
>> > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>> > index 568a87c5e0d0f..a73f5fbd296af 100644
>> > --- a/include/net/af_vsock.h
>> > +++ b/include/net/af_vsock.h
>> > @@ -75,6 +75,7 @@ struct vsock_sock {
>> > 	void *trans;
>> > };
>> >
>> > +s64 vsock_connectible_has_data(struct vsock_sock *vsk);
>> > s64 vsock_stream_has_data(struct vsock_sock *vsk);
>> > s64 vsock_stream_has_space(struct vsock_sock *vsk);
>> > struct sock *vsock_create_connected(struct sock *parent);
>> > @@ -173,6 +174,9 @@ struct vsock_transport {
>> >
>> > 	/* Addressing. */
>> > 	u32 (*get_local_cid)(void);
>> > +
>> > +	/* Read a single skb */
>> > +	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
>> > };
>> >
>> > /**** CORE ****/
>> > @@ -225,5 +229,18 @@ int vsock_init_tap(void);
>> > int vsock_add_tap(struct vsock_tap *vt);
>> > int vsock_remove_tap(struct vsock_tap *vt);
>> > void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
>> > +int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>> > +			      int flags);
>> > +int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>> > +			       size_t len, int flags);
>> > +
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +extern struct proto vsock_proto;
>> > +int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
>> > +void __init vsock_bpf_build_proto(void);
>> > +#else
>> > +static inline void __init vsock_bpf_build_proto(void)
>> > +{}
>> > +#endif
>> >
>> > #endif /* __AF_VSOCK_H__ */
>> > diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
>> > index 6a943ec95c4a5..5da74c4a9f1d1 100644
>> > --- a/net/vmw_vsock/Makefile
>> > +++ b/net/vmw_vsock/Makefile
>> > @@ -8,6 +8,7 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
>> > obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o
>> >
>> > vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
>> > +vsock-$(CONFIG_BPF_SYSCALL) += vsock_bpf.o
>> >
>> > vsock_diag-y += diag.o
>> >
>> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>> > index d593d5b6d4b15..7081b3a992c1e 100644
>> > --- a/net/vmw_vsock/af_vsock.c
>> > +++ b/net/vmw_vsock/af_vsock.c
>> > @@ -116,10 +116,13 @@ static void vsock_sk_destruct(struct sock *sk);
>> > static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
>> >
>> > /* Protocol family. */
>> > -static struct proto vsock_proto = {
>> > +struct proto vsock_proto = {
>> > 	.name = "AF_VSOCK",
>> > 	.owner = THIS_MODULE,
>> > 	.obj_size = sizeof(struct vsock_sock),
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +	.psock_update_sk_prot = vsock_bpf_update_proto,
>> > +#endif
>> > };
>> >
>> > /* The default peer timeout indicates how long we will wait for a peer response
>> > @@ -865,7 +868,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
>> > }
>> > EXPORT_SYMBOL_GPL(vsock_stream_has_data);
>> >
>> > -static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
>> > +s64 vsock_connectible_has_data(struct vsock_sock *vsk)
>> > {
>> > 	struct sock *sk = sk_vsock(vsk);
>> >
>> > @@ -874,6 +877,7 @@ static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
>> > 	else
>> > 		return vsock_stream_has_data(vsk);
>> > }
>> > +EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
>> >
>> > s64 vsock_stream_has_space(struct vsock_sock *vsk)
>> > {
>> > @@ -1131,6 +1135,19 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
>> > 	return mask;
>> > }
>> >
>> > +static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
>> > +{
>> > +	struct vsock_sock *vsk = vsock_sk(sk);
>> > +
>> > +	if (!vsk->transport)
>> > +		return -ENODEV;
>> > +
>> > +	if (!vsk->transport->read_skb)
>> > +		return -EOPNOTSUPP;
>> > +
>> > +	return vsk->transport->read_skb(vsk, read_actor);
>> > +}
>> > +
>> > static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
>> > 			       size_t len)
>> > {
>> > @@ -1241,19 +1258,32 @@ static int vsock_dgram_connect(struct socket *sock,
>> >
>> > 	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
>> > 	sock->state = SS_CONNECTED;
>> > +	sk->sk_state = TCP_ESTABLISHED;
>>
>> Why we need this change?
>> If it's a fix, we should put it in another patch.
>>
>
>This is just required by sockmap's function that determines if a socket
>is a valid one to add to a map. It will refuse to add any socket that is
>not TCP_ESTABLISHED to a sockmap.
>
>This was the approach that unix dgrams took, so I followed here.

Okay, make sense to me.
Maybe mention that in the commit message.

>
>> >
>> > out:
>> > 	release_sock(sk);
>> > 	return err;
>> > }
>> >
>> > -static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>> > -			       size_t len, int flags)
>> > +int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>> > +			size_t len, int flags)
>> > {
>> > -	struct vsock_sock *vsk = vsock_sk(sock->sk);
>> > +	const struct proto *prot;
>>
>> We should use the guard for this statement as in
>> vsock_connectible_recvmsg().
>>
>
>Got it.
>
>> > +	struct vsock_sock *vsk;
>> > +	struct sock *sk;
>> > +
>> > +	sk = sock->sk;
>> > +	vsk = vsock_sk(sk);
>> > +
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +	prot = READ_ONCE(sk->sk_prot);
>> > +	if (prot != &vsock_proto)
>> > +		return prot->recvmsg(sk, msg, len, flags, NULL);
>> > +#endif
>> >
>> > 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
>> > }
>> > +EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
>> >
>> > static const struct proto_ops vsock_dgram_ops = {
>> > 	.family = PF_VSOCK,
>> > @@ -1272,6 +1302,7 @@ static const struct proto_ops vsock_dgram_ops = {
>> > 	.recvmsg = vsock_dgram_recvmsg,
>> > 	.mmap = sock_no_mmap,
>> > 	.sendpage = sock_no_sendpage,
>> > +	.read_skb = vsock_read_skb,
>> > };
>> >
>> > static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
>> > @@ -2085,13 +2116,16 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
>> > 	return err;
>> > }
>> >
>> > -static int
>> > +int
>> > vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>> > 			  int flags)
>> > {
>> > 	struct sock *sk;
>> > 	struct vsock_sock *vsk;
>> > 	const struct vsock_transport *transport;
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +	const struct proto *prot;
>> > +#endif
>> > 	int err;
>> >
>> > 	sk = sock->sk;
>> > @@ -2138,6 +2172,14 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>> > 		goto out;
>> > 	}
>> >
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +	prot = READ_ONCE(sk->sk_prot);
>> > +	if (prot != &vsock_proto) {
>> > +		release_sock(sk);
>> > +		return prot->recvmsg(sk, msg, len, flags, NULL);
>> > +	}
>> > +#endif
>> > +
>> > 	if (sk->sk_type == SOCK_STREAM)
>> > 		err = __vsock_stream_recvmsg(sk, msg, len, flags);
>> > 	else
>> > @@ -2147,6 +2189,7 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>> > 	release_sock(sk);
>> > 	return err;
>> > }
>> > +EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
>> >
>> > static int vsock_set_rcvlowat(struct sock *sk, int val)
>> > {
>> > @@ -2187,6 +2230,7 @@ static const struct proto_ops vsock_stream_ops = {
>> > 	.mmap = sock_no_mmap,
>> > 	.sendpage = sock_no_sendpage,
>> > 	.set_rcvlowat = vsock_set_rcvlowat,
>> > +	.read_skb = vsock_read_skb,
>> > };
>> >
>> > static const struct proto_ops vsock_seqpacket_ops = {
>> > @@ -2208,6 +2252,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
>> > 	.recvmsg = vsock_connectible_recvmsg,
>> > 	.mmap = sock_no_mmap,
>> > 	.sendpage = sock_no_sendpage,
>> > +	.read_skb = vsock_read_skb,
>> > };
>> >
>> > static int vsock_create(struct net *net, struct socket *sock,
>> > @@ -2347,6 +2392,8 @@ static int __init vsock_init(void)
>> > 		goto err_unregister_proto;
>> > 	}
>> >
>> > +	vsock_bpf_build_proto();
>> > +
>> > 	return 0;
>> >
>> > err_unregister_proto:
>> > diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>> > index 28b5a8e8e0948..e95df847176b6 100644
>> > --- a/net/vmw_vsock/virtio_transport.c
>> > +++ b/net/vmw_vsock/virtio_transport.c
>> > @@ -457,6 +457,8 @@ static struct virtio_transport virtio_transport = {
>> > 		.notify_send_pre_enqueue  =
>> > 		virtio_transport_notify_send_pre_enqueue,
>> > 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
>> > 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>> > +
>> > +		.read_skb = virtio_transport_read_skb,
>> > 	},
>> >
>> > 	.send_pkt = virtio_transport_send_pkt,
>> > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>> > index a1581c77cf84a..9a87ead5b1fc5 100644
>> > --- a/net/vmw_vsock/virtio_transport_common.c
>> > +++ b/net/vmw_vsock/virtio_transport_common.c
>> > @@ -1388,6 +1388,28 @@ int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue)
>> > }
>> > EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs);
>> >
>> > +int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
>> > +{
>> > +	struct virtio_vsock_sock *vvs = vsk->trans;
>> > +	struct sock *sk = sk_vsock(vsk);
>> > +	struct sk_buff *skb;
>> > +	int copied = 0;
>>
>> We could avoid initializing `copied`, since it is overwritten later.
>>
>
>Got it.
>
>> > +	int off = 0;
>> > +	int err;
>> > +
>> > +	spin_lock_bh(&vvs->rx_lock);
>> > +	skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err);
>>
>> Will this work also for STREAM and SEQPACKET sockets?
>>
>
>Yep, it is used for non-datagram sockets as well because it is free of race
>conditions, and handles waits/errors sensibly. For example, in
>unix_accept():
>
>https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/net/unix/af_unix.c#n1696

Good to know :-)
I'd add a comment, just because the function name suggests something 
else.

>
>> > +	spin_unlock_bh(&vvs->rx_lock);
>> > +
>> > +	if (!skb)
>> > +		return err;
>> > +
>> > +	copied = recv_actor(sk, skb);
>> > +	kfree_skb(skb);
>>
>> I would have moved these steps to vsock_read_skb() to avoid duplicating
>> these steps in each transport. Perhaps not all transports want to pass
>> skb ownership to the caller, though, so maybe we can leave it that
>> way for now.
>>
>
>That is a good point though. I bet your initial hunch is the right one.
>If even one other transport duplicates this, then I'd say it is worth
>pulling up into vsock_read_skb().

Make sense to me :-)

>
>> > +	return copied;
>> > +}
>> > +EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
>> > +
>> > MODULE_LICENSE("GPL v2");
>> > MODULE_AUTHOR("Asias He");
>> > MODULE_DESCRIPTION("common code for virtio vsock");
>> > diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
>> > new file mode 100644
>> > index 0000000000000..9e11282d3bc1f
>> > --- /dev/null
>> > +++ b/net/vmw_vsock/vsock_bpf.c
>> > @@ -0,0 +1,180 @@
>> > +// SPDX-License-Identifier: GPL-2.0
>> > +/* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com>
>> > + *
>> > + * Based off of net/unix/unix_bpf.c
>> > + */
>> > +
>> > +#include <linux/bpf.h>
>> > +#include <linux/module.h>
>> > +#include <linux/skmsg.h>
>> > +#include <linux/socket.h>
>> > +#include <net/af_vsock.h>
>> > +#include <net/sock.h>
>> > +
>> > +#define vsock_sk_has_data(__sk, __psock)				\
>> > +		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
>> > +			!skb_queue_empty(&__psock->ingress_skb) ||	\
>> > +			!list_empty(&__psock->ingress_msg);		\
>> > +		})
>> > +
>> > +static struct proto *vsock_dgram_prot_saved __read_mostly;
>> > +static DEFINE_SPINLOCK(vsock_dgram_prot_lock);
>> > +static struct proto vsock_dgram_bpf_prot;
>> > +
>> > +static bool vsock_has_data(struct vsock_sock *vsk, struct sk_psock *psock)
>> > +{
>> > +	struct sock *sk = sk_vsock(vsk);
>> > +	s64 ret;
>> > +
>> > +	ret = vsock_connectible_has_data(vsk);
>> > +	if (ret > 0)
>> > +		return true;
>> > +
>> > +	return vsock_sk_has_data(sk, psock);
>> > +}
>> > +
>> > +static int vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo)
>> > +{
>> > +	struct vsock_sock *vsk;
>> > +	int err;
>> > +
>> > +	DEFINE_WAIT(wait);
>> > +
>> > +	vsk = vsock_sk(sk);
>> > +	err = 0;
>> > +
>> > +	while (vsock_has_data(vsk, psock)) {
>> > +		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
>> > +
>> > +		if (sk->sk_err != 0 ||
>> > +		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
>> > +		    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
>> > +			break;
>> > +		}
>> > +
>> > +		if (timeo == 0) {
>> > +			err = -EAGAIN;
>> > +			break;
>> > +		}
>> > +
>> > +		release_sock(sk);
>> > +		timeo = schedule_timeout(timeo);
>> > +		lock_sock(sk);
>> > +
>> > +		if (signal_pending(current)) {
>> > +			err = sock_intr_errno(timeo);
>> > +			break;
>> > +		} else if (timeo == 0) {
>> > +			err = -EAGAIN;
>> > +			break;
>> > +		}
>> > +	}
>> > +
>> > +	finish_wait(sk_sleep(sk), &wait);
>> > +
>> > +	if (err)
>> > +		return err;
>> > +
>> > +	return 0;
>> > +}
>> > +
>> > +static int vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
>> > +{
>> > +	int err;
>> > +	struct socket *sock = sk->sk_socket;
>> > +
>> > +	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
>> > +		err = vsock_connectible_recvmsg(sock, msg, len, flags);
>> > +	else
>>
>> Could it happen that it is not DGRAM and we should return an error in
>> this case?
>>
>
>I'm not sure but for the sake of safety, I'll add that.

Yep, I agree.

Thanks,
Stefano

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 1/3] vsock: support sockmap
@ 2023-01-20  8:55         ` Stefano Garzarella
  0 siblings, 0 replies; 20+ messages in thread
From: Stefano Garzarella @ 2023-01-20  8:55 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Bobby Eshleman, Cong Wang, kvm, Michael S. Tsirkin,
	Alexei Starovoitov, virtualization, Song Liu, Eric Dumazet,
	Stanislav Fomichev, linux-kselftest, Shuah Khan, Mykola Lysenko,
	Daniel Borkmann, John Fastabend, Andrii Nakryiko, Yonghong Song,
	Paolo Abeni, KP Singh, Stefan Hajnoczi, Jakub Kicinski, Hao Luo,
	netdev, linux-kernel, Jiri Olsa, bpf, Martin KaFai Lau,
	David S. Miller

On Wed, Jan 18, 2023 at 03:08:11PM +0000, Bobby Eshleman wrote:
>On Thu, Jan 19, 2023 at 11:39:36AM +0100, Stefano Garzarella wrote:
>> On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
>> > This patch adds sockmap support for vsock sockets. It is intended to be
>> > usable by all transports, but only the virtio transport is implemented.
>> >
>> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>> > ---
>> > drivers/vhost/vsock.c                   |   1 +
>> > include/linux/virtio_vsock.h            |   1 +
>> > include/net/af_vsock.h                  |  17 +++
>> > net/vmw_vsock/Makefile                  |   1 +
>> > net/vmw_vsock/af_vsock.c                |  59 +++++++++--
>> > net/vmw_vsock/virtio_transport.c        |   2 +
>> > net/vmw_vsock/virtio_transport_common.c |  22 ++++
>> > net/vmw_vsock/vsock_bpf.c               | 180 ++++++++++++++++++++++++++++++++
>> > net/vmw_vsock/vsock_loopback.c          |   2 +
>> > 9 files changed, 279 insertions(+), 6 deletions(-)
>>
>> ./scripts/checkpatch.pl --strict prints some simple warnings/checks that
>> I suggest to fix :-)
>>
>
>Oops, thanks. New machine, forgot my pre-commit hook. Putting in place
>now.
>
>> >
>> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> > index 1f3b89c885cca..3c6dc036b9044 100644
>> > --- a/drivers/vhost/vsock.c
>> > +++ b/drivers/vhost/vsock.c
>> > @@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
>> > 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
>> > 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>> >
>> > +		.read_skb = virtio_transport_read_skb,
>> > 	},
>> >
>> > 	.send_pkt = vhost_transport_send_pkt,
>> > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>> > index 3f9c166113063..c58453699ee98 100644
>> > --- a/include/linux/virtio_vsock.h
>> > +++ b/include/linux/virtio_vsock.h
>> > @@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
>> > void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
>> > void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>> > int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>> > +int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
>> > #endif /* _LINUX_VIRTIO_VSOCK_H */
>> > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>> > index 568a87c5e0d0f..a73f5fbd296af 100644
>> > --- a/include/net/af_vsock.h
>> > +++ b/include/net/af_vsock.h
>> > @@ -75,6 +75,7 @@ struct vsock_sock {
>> > 	void *trans;
>> > };
>> >
>> > +s64 vsock_connectible_has_data(struct vsock_sock *vsk);
>> > s64 vsock_stream_has_data(struct vsock_sock *vsk);
>> > s64 vsock_stream_has_space(struct vsock_sock *vsk);
>> > struct sock *vsock_create_connected(struct sock *parent);
>> > @@ -173,6 +174,9 @@ struct vsock_transport {
>> >
>> > 	/* Addressing. */
>> > 	u32 (*get_local_cid)(void);
>> > +
>> > +	/* Read a single skb */
>> > +	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
>> > };
>> >
>> > /**** CORE ****/
>> > @@ -225,5 +229,18 @@ int vsock_init_tap(void);
>> > int vsock_add_tap(struct vsock_tap *vt);
>> > int vsock_remove_tap(struct vsock_tap *vt);
>> > void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
>> > +int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>> > +			      int flags);
>> > +int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>> > +			       size_t len, int flags);
>> > +
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +extern struct proto vsock_proto;
>> > +int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
>> > +void __init vsock_bpf_build_proto(void);
>> > +#else
>> > +static inline void __init vsock_bpf_build_proto(void)
>> > +{}
>> > +#endif
>> >
>> > #endif /* __AF_VSOCK_H__ */
>> > diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
>> > index 6a943ec95c4a5..5da74c4a9f1d1 100644
>> > --- a/net/vmw_vsock/Makefile
>> > +++ b/net/vmw_vsock/Makefile
>> > @@ -8,6 +8,7 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
>> > obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o
>> >
>> > vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
>> > +vsock-$(CONFIG_BPF_SYSCALL) += vsock_bpf.o
>> >
>> > vsock_diag-y += diag.o
>> >
>> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>> > index d593d5b6d4b15..7081b3a992c1e 100644
>> > --- a/net/vmw_vsock/af_vsock.c
>> > +++ b/net/vmw_vsock/af_vsock.c
>> > @@ -116,10 +116,13 @@ static void vsock_sk_destruct(struct sock *sk);
>> > static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
>> >
>> > /* Protocol family. */
>> > -static struct proto vsock_proto = {
>> > +struct proto vsock_proto = {
>> > 	.name = "AF_VSOCK",
>> > 	.owner = THIS_MODULE,
>> > 	.obj_size = sizeof(struct vsock_sock),
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +	.psock_update_sk_prot = vsock_bpf_update_proto,
>> > +#endif
>> > };
>> >
>> > /* The default peer timeout indicates how long we will wait for a peer response
>> > @@ -865,7 +868,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
>> > }
>> > EXPORT_SYMBOL_GPL(vsock_stream_has_data);
>> >
>> > -static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
>> > +s64 vsock_connectible_has_data(struct vsock_sock *vsk)
>> > {
>> > 	struct sock *sk = sk_vsock(vsk);
>> >
>> > @@ -874,6 +877,7 @@ static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
>> > 	else
>> > 		return vsock_stream_has_data(vsk);
>> > }
>> > +EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
>> >
>> > s64 vsock_stream_has_space(struct vsock_sock *vsk)
>> > {
>> > @@ -1131,6 +1135,19 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
>> > 	return mask;
>> > }
>> >
>> > +static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
>> > +{
>> > +	struct vsock_sock *vsk = vsock_sk(sk);
>> > +
>> > +	if (!vsk->transport)
>> > +		return -ENODEV;
>> > +
>> > +	if (!vsk->transport->read_skb)
>> > +		return -EOPNOTSUPP;
>> > +
>> > +	return vsk->transport->read_skb(vsk, read_actor);
>> > +}
>> > +
>> > static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
>> > 			       size_t len)
>> > {
>> > @@ -1241,19 +1258,32 @@ static int vsock_dgram_connect(struct socket *sock,
>> >
>> > 	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
>> > 	sock->state = SS_CONNECTED;
>> > +	sk->sk_state = TCP_ESTABLISHED;
>>
>> Why we need this change?
>> If it's a fix, we should put it in another patch.
>>
>
>This is just required by sockmap's function that determines if a socket
>is a valid one to add to a map. It will refuse to add any socket that is
>not TCP_ESTABLISHED to a sockmap.
>
>This was the approach that unix dgrams took, so I followed here.

Okay, make sense to me.
Maybe mention that in the commit message.

>
>> >
>> > out:
>> > 	release_sock(sk);
>> > 	return err;
>> > }
>> >
>> > -static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>> > -			       size_t len, int flags)
>> > +int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>> > +			size_t len, int flags)
>> > {
>> > -	struct vsock_sock *vsk = vsock_sk(sock->sk);
>> > +	const struct proto *prot;
>>
>> We should use the guard for this statement as in
>> vsock_connectible_recvmsg().
>>
>
>Got it.
>
>> > +	struct vsock_sock *vsk;
>> > +	struct sock *sk;
>> > +
>> > +	sk = sock->sk;
>> > +	vsk = vsock_sk(sk);
>> > +
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +	prot = READ_ONCE(sk->sk_prot);
>> > +	if (prot != &vsock_proto)
>> > +		return prot->recvmsg(sk, msg, len, flags, NULL);
>> > +#endif
>> >
>> > 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
>> > }
>> > +EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
>> >
>> > static const struct proto_ops vsock_dgram_ops = {
>> > 	.family = PF_VSOCK,
>> > @@ -1272,6 +1302,7 @@ static const struct proto_ops vsock_dgram_ops = {
>> > 	.recvmsg = vsock_dgram_recvmsg,
>> > 	.mmap = sock_no_mmap,
>> > 	.sendpage = sock_no_sendpage,
>> > +	.read_skb = vsock_read_skb,
>> > };
>> >
>> > static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
>> > @@ -2085,13 +2116,16 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
>> > 	return err;
>> > }
>> >
>> > -static int
>> > +int
>> > vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>> > 			  int flags)
>> > {
>> > 	struct sock *sk;
>> > 	struct vsock_sock *vsk;
>> > 	const struct vsock_transport *transport;
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +	const struct proto *prot;
>> > +#endif
>> > 	int err;
>> >
>> > 	sk = sock->sk;
>> > @@ -2138,6 +2172,14 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>> > 		goto out;
>> > 	}
>> >
>> > +#ifdef CONFIG_BPF_SYSCALL
>> > +	prot = READ_ONCE(sk->sk_prot);
>> > +	if (prot != &vsock_proto) {
>> > +		release_sock(sk);
>> > +		return prot->recvmsg(sk, msg, len, flags, NULL);
>> > +	}
>> > +#endif
>> > +
>> > 	if (sk->sk_type == SOCK_STREAM)
>> > 		err = __vsock_stream_recvmsg(sk, msg, len, flags);
>> > 	else
>> > @@ -2147,6 +2189,7 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>> > 	release_sock(sk);
>> > 	return err;
>> > }
>> > +EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
>> >
>> > static int vsock_set_rcvlowat(struct sock *sk, int val)
>> > {
>> > @@ -2187,6 +2230,7 @@ static const struct proto_ops vsock_stream_ops = {
>> > 	.mmap = sock_no_mmap,
>> > 	.sendpage = sock_no_sendpage,
>> > 	.set_rcvlowat = vsock_set_rcvlowat,
>> > +	.read_skb = vsock_read_skb,
>> > };
>> >
>> > static const struct proto_ops vsock_seqpacket_ops = {
>> > @@ -2208,6 +2252,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
>> > 	.recvmsg = vsock_connectible_recvmsg,
>> > 	.mmap = sock_no_mmap,
>> > 	.sendpage = sock_no_sendpage,
>> > +	.read_skb = vsock_read_skb,
>> > };
>> >
>> > static int vsock_create(struct net *net, struct socket *sock,
>> > @@ -2347,6 +2392,8 @@ static int __init vsock_init(void)
>> > 		goto err_unregister_proto;
>> > 	}
>> >
>> > +	vsock_bpf_build_proto();
>> > +
>> > 	return 0;
>> >
>> > err_unregister_proto:
>> > diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>> > index 28b5a8e8e0948..e95df847176b6 100644
>> > --- a/net/vmw_vsock/virtio_transport.c
>> > +++ b/net/vmw_vsock/virtio_transport.c
>> > @@ -457,6 +457,8 @@ static struct virtio_transport virtio_transport = {
>> > 		.notify_send_pre_enqueue  =
>> > 		virtio_transport_notify_send_pre_enqueue,
>> > 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
>> > 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
>> > +
>> > +		.read_skb = virtio_transport_read_skb,
>> > 	},
>> >
>> > 	.send_pkt = virtio_transport_send_pkt,
>> > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>> > index a1581c77cf84a..9a87ead5b1fc5 100644
>> > --- a/net/vmw_vsock/virtio_transport_common.c
>> > +++ b/net/vmw_vsock/virtio_transport_common.c
>> > @@ -1388,6 +1388,28 @@ int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue)
>> > }
>> > EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs);
>> >
>> > +int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
>> > +{
>> > +	struct virtio_vsock_sock *vvs = vsk->trans;
>> > +	struct sock *sk = sk_vsock(vsk);
>> > +	struct sk_buff *skb;
>> > +	int copied = 0;
>>
>> We could avoid initializing `copied`, since it is overwritten later.
>>
>
>Got it.
>
>> > +	int off = 0;
>> > +	int err;
>> > +
>> > +	spin_lock_bh(&vvs->rx_lock);
>> > +	skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err);
>>
>> Will this work also for STREAM and SEQPACKET sockets?
>>
>
>Yep, it is used for non-datagram sockets as well because it is free of race
>conditions, and handles waits/errors sensibly. For example, in
>unix_accept():
>
>https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/net/unix/af_unix.c#n1696

Good to know :-)
I'd add a comment, just because the function name suggests something 
else.

>
>> > +	spin_unlock_bh(&vvs->rx_lock);
>> > +
>> > +	if (!skb)
>> > +		return err;
>> > +
>> > +	copied = recv_actor(sk, skb);
>> > +	kfree_skb(skb);
>>
>> I would have moved these steps to vsock_read_skb() to avoid duplicating
>> these steps in each transport. Perhaps not all transports want to pass
>> skb ownership to the caller, though, so maybe we can leave it that
>> way for now.
>>
>
>That is a good point though. I bet your initial hunch is the right one.
>If even one other transport duplicates this, then I'd say it is worth
>pulling up into vsock_read_skb().

Make sense to me :-)

>
>> > +	return copied;
>> > +}
>> > +EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
>> > +
>> > MODULE_LICENSE("GPL v2");
>> > MODULE_AUTHOR("Asias He");
>> > MODULE_DESCRIPTION("common code for virtio vsock");
>> > diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
>> > new file mode 100644
>> > index 0000000000000..9e11282d3bc1f
>> > --- /dev/null
>> > +++ b/net/vmw_vsock/vsock_bpf.c
>> > @@ -0,0 +1,180 @@
>> > +// SPDX-License-Identifier: GPL-2.0
>> > +/* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com>
>> > + *
>> > + * Based off of net/unix/unix_bpf.c
>> > + */
>> > +
>> > +#include <linux/bpf.h>
>> > +#include <linux/module.h>
>> > +#include <linux/skmsg.h>
>> > +#include <linux/socket.h>
>> > +#include <net/af_vsock.h>
>> > +#include <net/sock.h>
>> > +
>> > +#define vsock_sk_has_data(__sk, __psock)				\
>> > +		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
>> > +			!skb_queue_empty(&__psock->ingress_skb) ||	\
>> > +			!list_empty(&__psock->ingress_msg);		\
>> > +		})
>> > +
>> > +static struct proto *vsock_dgram_prot_saved __read_mostly;
>> > +static DEFINE_SPINLOCK(vsock_dgram_prot_lock);
>> > +static struct proto vsock_dgram_bpf_prot;
>> > +
>> > +static bool vsock_has_data(struct vsock_sock *vsk, struct sk_psock *psock)
>> > +{
>> > +	struct sock *sk = sk_vsock(vsk);
>> > +	s64 ret;
>> > +
>> > +	ret = vsock_connectible_has_data(vsk);
>> > +	if (ret > 0)
>> > +		return true;
>> > +
>> > +	return vsock_sk_has_data(sk, psock);
>> > +}
>> > +
>> > +static int vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo)
>> > +{
>> > +	struct vsock_sock *vsk;
>> > +	int err;
>> > +
>> > +	DEFINE_WAIT(wait);
>> > +
>> > +	vsk = vsock_sk(sk);
>> > +	err = 0;
>> > +
>> > +	while (vsock_has_data(vsk, psock)) {
>> > +		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
>> > +
>> > +		if (sk->sk_err != 0 ||
>> > +		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
>> > +		    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
>> > +			break;
>> > +		}
>> > +
>> > +		if (timeo == 0) {
>> > +			err = -EAGAIN;
>> > +			break;
>> > +		}
>> > +
>> > +		release_sock(sk);
>> > +		timeo = schedule_timeout(timeo);
>> > +		lock_sock(sk);
>> > +
>> > +		if (signal_pending(current)) {
>> > +			err = sock_intr_errno(timeo);
>> > +			break;
>> > +		} else if (timeo == 0) {
>> > +			err = -EAGAIN;
>> > +			break;
>> > +		}
>> > +	}
>> > +
>> > +	finish_wait(sk_sleep(sk), &wait);
>> > +
>> > +	if (err)
>> > +		return err;
>> > +
>> > +	return 0;
>> > +}
>> > +
>> > +static int vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
>> > +{
>> > +	int err;
>> > +	struct socket *sock = sk->sk_socket;
>> > +
>> > +	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
>> > +		err = vsock_connectible_recvmsg(sock, msg, len, flags);
>> > +	else
>>
>> Could it happen that it is not DGRAM and we should return an error in
>> this case?
>>
>
>I'm not sure but for the sake of safety, I'll add that.

Yep, I agree.

Thanks,
Stefano


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 1/3] vsock: support sockmap
  2023-01-18 20:27 ` [PATCH RFC 1/3] vsock: support sockmap Bobby Eshleman
@ 2023-01-21 19:22     ` Cong Wang
  2023-01-21 19:22     ` Cong Wang
  1 sibling, 0 replies; 20+ messages in thread
From: Cong Wang @ 2023-01-21 19:22 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Stefan Hajnoczi, Stefano Garzarella, Michael S. Tsirkin,
	Jason Wang, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Andrii Nakryiko, Mykola Lysenko, Alexei Starovoitov,
	Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Shuah Khan, linux-kernel, kvm, virtualization, netdev, bpf,
	linux-kselftest, Cong Wang

On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
> +static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
> +{
> +	struct vsock_sock *vsk = vsock_sk(sk);
> +
> +	if (!vsk->transport)
> +		return -ENODEV;
> +
> +	if (!vsk->transport->read_skb)
> +		return -EOPNOTSUPP;

Can we move these two checks to sockmap update path? It would make
vsock_read_skb() faster.

> +
> +	return vsk->transport->read_skb(vsk, read_actor);
> +}

Essentially can be just this one line.

Thanks.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC 1/3] vsock: support sockmap
@ 2023-01-21 19:22     ` Cong Wang
  0 siblings, 0 replies; 20+ messages in thread
From: Cong Wang @ 2023-01-21 19:22 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Cong Wang, kvm, Michael S. Tsirkin, Alexei Starovoitov,
	virtualization, Song Liu, Eric Dumazet, Stanislav Fomichev,
	linux-kselftest, Shuah Khan, Mykola Lysenko, Daniel Borkmann,
	John Fastabend, Andrii Nakryiko, Yonghong Song, Paolo Abeni,
	KP Singh, Stefan Hajnoczi, Jakub Kicinski, Hao Luo, netdev,
	linux-kernel, Jiri Olsa, bpf, Martin KaFai Lau, David S. Miller

On Wed, Jan 18, 2023 at 12:27:39PM -0800, Bobby Eshleman wrote:
> +static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
> +{
> +	struct vsock_sock *vsk = vsock_sk(sk);
> +
> +	if (!vsk->transport)
> +		return -ENODEV;
> +
> +	if (!vsk->transport->read_skb)
> +		return -EOPNOTSUPP;

Can we move these two checks to sockmap update path? It would make
vsock_read_skb() faster.

> +
> +	return vsk->transport->read_skb(vsk, read_actor);
> +}

Essentially can be just this one line.

Thanks.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2023-01-22  5:03 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-18 20:27 [PATCH RFC 0/3] vsock: add support for sockmap Bobby Eshleman
2023-01-18 20:27 ` [PATCH RFC 1/3] vsock: support sockmap Bobby Eshleman
2023-01-19 10:39   ` Stefano Garzarella
2023-01-19 10:39     ` Stefano Garzarella
2023-01-18 15:08     ` Bobby Eshleman
2023-01-20  8:55       ` Stefano Garzarella
2023-01-20  8:55         ` Stefano Garzarella
2023-01-21 19:22   ` Cong Wang
2023-01-21 19:22     ` Cong Wang
2023-01-19  3:47     ` Bobby Eshleman
2023-01-18 20:27 ` [PATCH RFC 2/3] selftests/bpf: add vsock to vmtest.sh Bobby Eshleman
2023-01-19 10:44   ` Stefano Garzarella
2023-01-19 10:44     ` Stefano Garzarella
2023-01-18 20:27 ` [PATCH RFC 3/3] selftests/bpf: Add a test case for vsock sockmap Bobby Eshleman
2023-01-19 10:48   ` Stefano Garzarella
2023-01-19 10:48     ` Stefano Garzarella
2023-01-18 15:14     ` Bobby Eshleman
2023-01-19 10:49 ` [PATCH RFC 0/3] vsock: add support for sockmap Stefano Garzarella
2023-01-19 10:49   ` Stefano Garzarella
2023-01-18 15:10   ` Bobby Eshleman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.