netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] vsock: add multiple transports support for dgram
@ 2021-04-06 18:31 Jiang Wang
  2021-04-07  9:51 ` Jorgen Hansen
  0 siblings, 1 reply; 9+ messages in thread
From: Jiang Wang @ 2021-04-06 18:31 UTC (permalink / raw)
  Cc: virtualization, stefanha, cong.wang, duanxiongchun, xieyongji,
	jiang.wang, David S. Miller, Jakub Kicinski, Stefano Garzarella,
	Jorgen Hansen, Colin Ian King, Norbert Slusarek, Andra Paraschiv,
	Alexander Popov, netdev, linux-kernel

From: "jiang.wang" <jiang.wang@bytedance.com>

Currently, only VMCI supports dgram sockets. To support the
nested VM use case, this patch removes transport_dgram and
uses transport_g2h and transport_h2g for dgram too.

The transport is assigned when sending every packet and
receiving every packet on dgram sockets.

Signed-off-by: Jiang Wang <jiang.wang@bytedance.com>
---
This patch is not tested. I don't have a VMware testing
environment. Could someone help me to test it? 

 include/net/af_vsock.h         |  2 --
 net/vmw_vsock/af_vsock.c       | 63 +++++++++++++++++++++---------------------
 net/vmw_vsock/vmci_transport.c | 20 +++++++++-----
 3 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index b1c717286993..aba241e0d202 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -96,8 +96,6 @@ struct vsock_transport_send_notify_data {
 #define VSOCK_TRANSPORT_F_H2G		0x00000001
 /* Transport provides guest->host communication */
 #define VSOCK_TRANSPORT_F_G2H		0x00000002
-/* Transport provides DGRAM communication */
-#define VSOCK_TRANSPORT_F_DGRAM		0x00000004
 /* Transport provides local (loopback) communication */
 #define VSOCK_TRANSPORT_F_LOCAL		0x00000008
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 92a72f0e0d94..7739ab2521a1 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -449,8 +449,6 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
 
 	switch (sk->sk_type) {
 	case SOCK_DGRAM:
-		new_transport = transport_dgram;
-		break;
 	case SOCK_STREAM:
 		if (vsock_use_local_transport(remote_cid))
 			new_transport = transport_local;
@@ -1096,7 +1094,6 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 	struct sock *sk;
 	struct vsock_sock *vsk;
 	struct sockaddr_vm *remote_addr;
-	const struct vsock_transport *transport;
 
 	if (msg->msg_flags & MSG_OOB)
 		return -EOPNOTSUPP;
@@ -1108,25 +1105,30 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 
 	lock_sock(sk);
 
-	transport = vsk->transport;
-
 	err = vsock_auto_bind(vsk);
 	if (err)
 		goto out;
 
-
 	/* If the provided message contains an address, use that.  Otherwise
 	 * fall back on the socket's remote handle (if it has been connected).
 	 */
 	if (msg->msg_name &&
 	    vsock_addr_cast(msg->msg_name, msg->msg_namelen,
 			    &remote_addr) == 0) {
+		vsock_addr_init(&vsk->remote_addr, remote_addr->svm_cid,
+			remote_addr->svm_port);
+
+		err = vsock_assign_transport(vsk, NULL);
+		if (err) {
+			err = -EINVAL;
+			goto out;
+		}
+
 		/* Ensure this address is of the right type and is a valid
 		 * destination.
 		 */
-
 		if (remote_addr->svm_cid == VMADDR_CID_ANY)
-			remote_addr->svm_cid = transport->get_local_cid();
+			remote_addr->svm_cid = vsk->transport->get_local_cid();
 
 		if (!vsock_addr_bound(remote_addr)) {
 			err = -EINVAL;
@@ -1136,7 +1138,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 		remote_addr = &vsk->remote_addr;
 
 		if (remote_addr->svm_cid == VMADDR_CID_ANY)
-			remote_addr->svm_cid = transport->get_local_cid();
+			remote_addr->svm_cid = vsk->transport->get_local_cid();
 
 		/* XXX Should connect() or this function ensure remote_addr is
 		 * bound?
@@ -1150,13 +1152,13 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 		goto out;
 	}
 
-	if (!transport->dgram_allow(remote_addr->svm_cid,
+	if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
 				    remote_addr->svm_port)) {
 		err = -EINVAL;
 		goto out;
 	}
 
-	err = transport->dgram_enqueue(vsk, remote_addr, msg, len);
+	err = vsk->transport->dgram_enqueue(vsk, remote_addr, msg, len);
 
 out:
 	release_sock(sk);
@@ -1191,13 +1193,20 @@ static int vsock_dgram_connect(struct socket *sock,
 	if (err)
 		goto out;
 
+	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
+
+	err = vsock_assign_transport(vsk, NULL);
+	if (err) {
+		err = -EINVAL;
+		goto out;
+	}
+
 	if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
 					 remote_addr->svm_port)) {
 		err = -EINVAL;
 		goto out;
 	}
 
-	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
 	sock->state = SS_CONNECTED;
 
 out:
@@ -1209,6 +1218,16 @@ static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 			       size_t len, int flags)
 {
 	struct vsock_sock *vsk = vsock_sk(sock->sk);
+	long timeo;
+
+	timeo = sock_rcvtimeo(sock->sk, flags & MSG_DONTWAIT);
+	do {
+		if (vsk->transport)
+			break;
+	} while (timeo && !vsk->transport);
+
+	if (!vsk->transport)
+		return -EAGAIN;
 
 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
 }
@@ -2055,14 +2074,6 @@ static int vsock_create(struct net *net, struct socket *sock,
 
 	vsk = vsock_sk(sk);
 
-	if (sock->type == SOCK_DGRAM) {
-		ret = vsock_assign_transport(vsk, NULL);
-		if (ret < 0) {
-			sock_put(sk);
-			return ret;
-		}
-	}
-
 	vsock_insert_unbound(vsk);
 
 	return 0;
@@ -2182,7 +2193,7 @@ EXPORT_SYMBOL_GPL(vsock_core_get_transport);
 
 int vsock_core_register(const struct vsock_transport *t, int features)
 {
-	const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local;
+	const struct vsock_transport *t_h2g, *t_g2h, *t_local;
 	int err = mutex_lock_interruptible(&vsock_register_mutex);
 
 	if (err)
@@ -2190,7 +2201,6 @@ int vsock_core_register(const struct vsock_transport *t, int features)
 
 	t_h2g = transport_h2g;
 	t_g2h = transport_g2h;
-	t_dgram = transport_dgram;
 	t_local = transport_local;
 
 	if (features & VSOCK_TRANSPORT_F_H2G) {
@@ -2209,14 +2219,6 @@ int vsock_core_register(const struct vsock_transport *t, int features)
 		t_g2h = t;
 	}
 
-	if (features & VSOCK_TRANSPORT_F_DGRAM) {
-		if (t_dgram) {
-			err = -EBUSY;
-			goto err_busy;
-		}
-		t_dgram = t;
-	}
-
 	if (features & VSOCK_TRANSPORT_F_LOCAL) {
 		if (t_local) {
 			err = -EBUSY;
@@ -2227,7 +2229,6 @@ int vsock_core_register(const struct vsock_transport *t, int features)
 
 	transport_h2g = t_h2g;
 	transport_g2h = t_g2h;
-	transport_dgram = t_dgram;
 	transport_local = t_local;
 
 err_busy:
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 8b65323207db..42ea2a1c92ce 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -613,6 +613,7 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
 	size_t size;
 	struct sk_buff *skb;
 	struct vsock_sock *vsk;
+	int err;
 
 	sk = (struct sock *)data;
 
@@ -629,6 +630,17 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
 	if (!vmci_transport_allow_dgram(vsk, dg->src.context))
 		return VMCI_ERROR_NO_ACCESS;
 
+	vsock_addr_init(&vsk->remote_addr, dg->src.context,
+				dg->src.resource);
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		err = vsock_assign_transport(vsk, NULL);
+		if (err)
+			return err;
+	}
+	bh_unlock_sock(sk);
+
 	size = VMCI_DG_SIZE(dg);
 
 	/* Attach the packet to the socket's receive queue as an sk_buff. */
@@ -2093,13 +2105,7 @@ static int __init vmci_transport_init(void)
 		goto err_destroy_stream_handle;
 	}
 
-	/* Register only with dgram feature, other features (H2G, G2H) will be
-	 * registered when the first host or guest becomes active.
-	 */
-	err = vsock_core_register(&vmci_transport, VSOCK_TRANSPORT_F_DGRAM);
-	if (err < 0)
-		goto err_unsubscribe;
-
+	/* H2G, G2H will be registered when the first host or guest becomes active. */
 	err = vmci_register_vsock_callback(vmci_vsock_transport_cb);
 	if (err < 0)
 		goto err_unregister;
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC] vsock: add multiple transports support for dgram
  2021-04-06 18:31 [RFC] vsock: add multiple transports support for dgram Jiang Wang
@ 2021-04-07  9:51 ` Jorgen Hansen
  2021-04-07 18:25   ` [External] " Jiang Wang .
  0 siblings, 1 reply; 9+ messages in thread
From: Jorgen Hansen @ 2021-04-07  9:51 UTC (permalink / raw)
  To: Jiang Wang
  Cc: virtualization, Stefan Hajnoczi, cong.wang, duanxiongchun,
	xieyongji, David S. Miller, Jakub Kicinski, Stefano Garzarella,
	Colin Ian King, Norbert Slusarek, Andra Paraschiv,
	Alexander Popov, netdev, linux-kernel


> On 6 Apr 2021, at 20:31, Jiang Wang <jiang.wang@bytedance.com> wrote:
> 
> From: "jiang.wang" <jiang.wang@bytedance.com>
> 
> Currently, only VMCI supports dgram sockets. To supported
> nested VM use case, this patch removes transport_dgram and
> uses transport_g2h and transport_h2g for dgram too.

Could you provide some background for introducing this change - are you
looking at introducing datagrams for a different transport? VMCI datagrams
already support the nested use case, but if we need to support multiple datagram
transports we need to rework how we administer port assignment for datagrams.
One specific issue is that the vmci transport won’t receive any datagrams for a
port unless the datagram socket has already been assigned the vmci transport
and the port bound to the underlying VMCI device (see below for more details).


> The transport is assgined when sending every packet and
> receiving every packet on dgram sockets.

Is the intent that the same datagram socket can be used for sending packets both
in the host-to-guest and the guest-to-host directions? 

Also, as mentioned above the vSocket datagram needs to be bound to a port in the
VMCI transport before we can receive any datagrams on that port. This means that
vmci_transport_recv_dgram_cb won’t be called unless it is already associated with
a socket as the transport, and therefore we can’t delay the transport assignment to
that point.


> Signed-off-by: Jiang Wang <jiang.wang@bytedance.com>
> ---
> This patch is not tested. I don't have a VMWare testing
> environment. Could someone help me to test it? 
> 
> include/net/af_vsock.h         |  2 --
> net/vmw_vsock/af_vsock.c       | 63 +++++++++++++++++++++---------------------
> net/vmw_vsock/vmci_transport.c | 20 +++++++++-----
> 3 files changed, 45 insertions(+), 40 deletions(-)
> 
> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> index b1c717286993..aba241e0d202 100644
> --- a/include/net/af_vsock.h
> +++ b/include/net/af_vsock.h
> @@ -96,8 +96,6 @@ struct vsock_transport_send_notify_data {
> #define VSOCK_TRANSPORT_F_H2G		0x00000001
> /* Transport provides guest->host communication */
> #define VSOCK_TRANSPORT_F_G2H		0x00000002
> -/* Transport provides DGRAM communication */
> -#define VSOCK_TRANSPORT_F_DGRAM		0x00000004
> /* Transport provides local (loopback) communication */
> #define VSOCK_TRANSPORT_F_LOCAL		0x00000008
> 
> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> index 92a72f0e0d94..7739ab2521a1 100644
> --- a/net/vmw_vsock/af_vsock.c
> +++ b/net/vmw_vsock/af_vsock.c
> @@ -449,8 +449,6 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
> 
> 	switch (sk->sk_type) {
> 	case SOCK_DGRAM:
> -		new_transport = transport_dgram;
> -		break;
> 	case SOCK_STREAM:
> 		if (vsock_use_local_transport(remote_cid))
> 			new_transport = transport_local;
> @@ -1096,7 +1094,6 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> 	struct sock *sk;
> 	struct vsock_sock *vsk;
> 	struct sockaddr_vm *remote_addr;
> -	const struct vsock_transport *transport;
> 
> 	if (msg->msg_flags & MSG_OOB)
> 		return -EOPNOTSUPP;
> @@ -1108,25 +1105,30 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> 
> 	lock_sock(sk);
> 
> -	transport = vsk->transport;
> -
> 	err = vsock_auto_bind(vsk);
> 	if (err)
> 		goto out;
> 
> -
> 	/* If the provided message contains an address, use that.  Otherwise
> 	 * fall back on the socket's remote handle (if it has been connected).
> 	 */
> 	if (msg->msg_name &&
> 	    vsock_addr_cast(msg->msg_name, msg->msg_namelen,
> 			    &remote_addr) == 0) {
> +		vsock_addr_init(&vsk->remote_addr, remote_addr->svm_cid,
> +			remote_addr->svm_port);
> +
> +		err = vsock_assign_transport(vsk, NULL);
> +		if (err) {
> +			err = -EINVAL;
> +			goto out;
> +		}
> +
> 		/* Ensure this address is of the right type and is a valid
> 		 * destination.
> 		 */
> -
> 		if (remote_addr->svm_cid == VMADDR_CID_ANY)
> -			remote_addr->svm_cid = transport->get_local_cid();
> +			remote_addr->svm_cid = vsk->transport->get_local_cid();
> 
> 		if (!vsock_addr_bound(remote_addr)) {
> 			err = -EINVAL;
> @@ -1136,7 +1138,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> 		remote_addr = &vsk->remote_addr;
> 
> 		if (remote_addr->svm_cid == VMADDR_CID_ANY)
> -			remote_addr->svm_cid = transport->get_local_cid();
> +			remote_addr->svm_cid = vsk->transport->get_local_cid();
> 
> 		/* XXX Should connect() or this function ensure remote_addr is
> 		 * bound?
> @@ -1150,13 +1152,13 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> 		goto out;
> 	}
> 
> -	if (!transport->dgram_allow(remote_addr->svm_cid,
> +	if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
> 				    remote_addr->svm_port)) {
> 		err = -EINVAL;
> 		goto out;
> 	}
> 
> -	err = transport->dgram_enqueue(vsk, remote_addr, msg, len);
> +	err = vsk->transport->dgram_enqueue(vsk, remote_addr, msg, len);
> 
> out:
> 	release_sock(sk);
> @@ -1191,13 +1193,20 @@ static int vsock_dgram_connect(struct socket *sock,
> 	if (err)
> 		goto out;
> 
> +	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
> +
> +	err = vsock_assign_transport(vsk, NULL);
> +	if (err) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> 	if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
> 					 remote_addr->svm_port)) {
> 		err = -EINVAL;
> 		goto out;
> 	}
> 
> -	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
> 	sock->state = SS_CONNECTED;
> 
> out:
> @@ -1209,6 +1218,16 @@ static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> 			       size_t len, int flags)
> {
> 	struct vsock_sock *vsk = vsock_sk(sock->sk);
> +	long timeo;
> +
> +	timeo = sock_rcvtimeo(sock->sk, flags & MSG_DONTWAIT);
> +	do {
> +		if (vsk->transport)
> +			break;
> +	} while (timeo && !vsk->transport);
> +
> +	if (!vsk->transport)
> +		return -EAGAIN;
> 
> 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
> }
> @@ -2055,14 +2074,6 @@ static int vsock_create(struct net *net, struct socket *sock,
> 
> 	vsk = vsock_sk(sk);
> 
> -	if (sock->type == SOCK_DGRAM) {
> -		ret = vsock_assign_transport(vsk, NULL);
> -		if (ret < 0) {
> -			sock_put(sk);
> -			return ret;
> -		}
> -	}
> -
> 	vsock_insert_unbound(vsk);
> 
> 	return 0;
> @@ -2182,7 +2193,7 @@ EXPORT_SYMBOL_GPL(vsock_core_get_transport);
> 
> int vsock_core_register(const struct vsock_transport *t, int features)
> {
> -	const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local;
> +	const struct vsock_transport *t_h2g, *t_g2h, *t_local;
> 	int err = mutex_lock_interruptible(&vsock_register_mutex);
> 
> 	if (err)
> @@ -2190,7 +2201,6 @@ int vsock_core_register(const struct vsock_transport *t, int features)
> 
> 	t_h2g = transport_h2g;
> 	t_g2h = transport_g2h;
> -	t_dgram = transport_dgram;
> 	t_local = transport_local;
> 
> 	if (features & VSOCK_TRANSPORT_F_H2G) {
> @@ -2209,14 +2219,6 @@ int vsock_core_register(const struct vsock_transport *t, int features)
> 		t_g2h = t;
> 	}
> 
> -	if (features & VSOCK_TRANSPORT_F_DGRAM) {
> -		if (t_dgram) {
> -			err = -EBUSY;
> -			goto err_busy;
> -		}
> -		t_dgram = t;
> -	}
> -
> 	if (features & VSOCK_TRANSPORT_F_LOCAL) {
> 		if (t_local) {
> 			err = -EBUSY;
> @@ -2227,7 +2229,6 @@ int vsock_core_register(const struct vsock_transport *t, int features)
> 
> 	transport_h2g = t_h2g;
> 	transport_g2h = t_g2h;
> -	transport_dgram = t_dgram;
> 	transport_local = t_local;
> 
> err_busy:
> diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
> index 8b65323207db..42ea2a1c92ce 100644
> --- a/net/vmw_vsock/vmci_transport.c
> +++ b/net/vmw_vsock/vmci_transport.c
> @@ -613,6 +613,7 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
> 	size_t size;
> 	struct sk_buff *skb;
> 	struct vsock_sock *vsk;
> +	int err;
> 
> 	sk = (struct sock *)data;
> 
> @@ -629,6 +630,17 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
> 	if (!vmci_transport_allow_dgram(vsk, dg->src.context))
> 		return VMCI_ERROR_NO_ACCESS;
> 
> +	vsock_addr_init(&vsk->remote_addr, dg->src.context,
> +				dg->src.resource);
> +
> +	bh_lock_sock(sk);
> +	if (!sock_owned_by_user(sk)) {
> +		err = vsock_assign_transport(vsk, NULL);
> +		if (err)
> +			return err;
> +	}
> +	bh_unlock_sock(sk);
> +
> 	size = VMCI_DG_SIZE(dg);
> 
> 	/* Attach the packet to the socket's receive queue as an sk_buff. */
> @@ -2093,13 +2105,7 @@ static int __init vmci_transport_init(void)
> 		goto err_destroy_stream_handle;
> 	}
> 
> -	/* Register only with dgram feature, other features (H2G, G2H) will be
> -	 * registered when the first host or guest becomes active.
> -	 */
> -	err = vsock_core_register(&vmci_transport, VSOCK_TRANSPORT_F_DGRAM);
> -	if (err < 0)
> -		goto err_unsubscribe;
> -
> +	/* H2G, G2H will be registered when the first host or guest becomes active. */
> 	err = vmci_register_vsock_callback(vmci_vsock_transport_cb);
> 	if (err < 0)
> 		goto err_unregister;
> -- 
> 2.11.0
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [External] Re: [RFC] vsock: add multiple transports support for dgram
  2021-04-07  9:51 ` Jorgen Hansen
@ 2021-04-07 18:25   ` Jiang Wang .
  2021-04-12 14:04     ` Stefano Garzarella
  2021-04-13  9:02     ` [External] " Jorgen Hansen
  0 siblings, 2 replies; 9+ messages in thread
From: Jiang Wang . @ 2021-04-07 18:25 UTC (permalink / raw)
  To: Jorgen Hansen
  Cc: virtualization, Stefan Hajnoczi, cong.wang, duanxiongchun,
	xieyongji, David S. Miller, Jakub Kicinski, Stefano Garzarella,
	Colin Ian King, Norbert Slusarek, Andra Paraschiv,
	Alexander Popov, netdev, linux-kernel

On Wed, Apr 7, 2021 at 2:51 AM Jorgen Hansen <jhansen@vmware.com> wrote:
>
>
> > On 6 Apr 2021, at 20:31, Jiang Wang <jiang.wang@bytedance.com> wrote:
> >
> > From: "jiang.wang" <jiang.wang@bytedance.com>
> >
> > Currently, only VMCI supports dgram sockets. To supported
> > nested VM use case, this patch removes transport_dgram and
> > uses transport_g2h and transport_h2g for dgram too.
>
> Could you provide some background for introducing this change - are you
> looking at introducing datagrams for a different transport? VMCI datagrams
> already support the nested use case,

Yes, I am trying to introduce datagram for virtio transport. I wrote a
spec patch for
virtio dgram support and also a code patch, but the code patch is still WIP.
When I wrote this commit message, I was thinking nested VM is the same as
multiple transport support. But now, I realize they are different.
Nested VMs may use
the same virtualization layer(KVM on KVM), or different virtualization layers
(KVM on ESXi). Thanks for letting me know that VMCI already supported nested
use cases. I think you mean VMCI on VMCI, right?

> but if we need to support multiple datagram
> transports we need to rework how we administer port assignment for datagrams.
> One specific issue is that the vmci transport won’t receive any datagrams for a
> port unless the datagram socket has already been assigned the vmci transport
> and the port bound to the underlying VMCI device (see below for more details).
>
I see.

> > The transport is assgined when sending every packet and
> > receiving every packet on dgram sockets.
>
> Is the intent that the same datagram socket can be used for sending packets both
> In the host to guest, and the guest to directions?

Nope. One datagram socket will only send packets in one direction, either to the
host or to the guest. My above description is wrong. When sending packets, the
transport is assigned with the first packet (with auto_bind).

The problem is when receiving packets. The listener can bind to the
VMADDR_CID_ANY
address. Then it is unclear which transport we should use. For stream
sockets, there will be a new socket for each connection, and transport
can be decided
at that time. For datagram sockets, I am not sure how to handle that.
For VMCI, does the same transport can be used for both receiving from
host and from
the guest?

For virtio, the h2g and g2h transports are different, so we have to choose
one of them. My original thought is to wait until the first packet arrives.

Another idea is that we always bind to host addr and use h2g
transport because I think that might
be more common. If a listener wants to recv packets from the host, then it
should bind to the guest addr instead of CID_ANY.
Any other suggestions?

> Also, as mentioned above the vSocket datagram needs to be bound to a port in the
> VMCI transport before we can receive any datagrams on that port. This means that
> vmci_transport_recv_dgram_cb won’t be called unless it is already associated with
> a socket as the transport, and therefore we can’t delay the transport assignment to
> that point.

Got it. Thanks. Please see the above replies.

>
> > Signed-off-by: Jiang Wang <jiang.wang@bytedance.com>
> > ---
> > This patch is not tested. I don't have a VMWare testing
> > environment. Could someone help me to test it?
> >
> > include/net/af_vsock.h         |  2 --
> > net/vmw_vsock/af_vsock.c       | 63 +++++++++++++++++++++---------------------
> > net/vmw_vsock/vmci_transport.c | 20 +++++++++-----
> > 3 files changed, 45 insertions(+), 40 deletions(-)
> >
> > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> > index b1c717286993..aba241e0d202 100644
> > --- a/include/net/af_vsock.h
> > +++ b/include/net/af_vsock.h
> > @@ -96,8 +96,6 @@ struct vsock_transport_send_notify_data {
> > #define VSOCK_TRANSPORT_F_H2G         0x00000001
> > /* Transport provides guest->host communication */
> > #define VSOCK_TRANSPORT_F_G2H         0x00000002
> > -/* Transport provides DGRAM communication */
> > -#define VSOCK_TRANSPORT_F_DGRAM              0x00000004
> > /* Transport provides local (loopback) communication */
> > #define VSOCK_TRANSPORT_F_LOCAL               0x00000008
> >
> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> > index 92a72f0e0d94..7739ab2521a1 100644
> > --- a/net/vmw_vsock/af_vsock.c
> > +++ b/net/vmw_vsock/af_vsock.c
> > @@ -449,8 +449,6 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
> >
> >       switch (sk->sk_type) {
> >       case SOCK_DGRAM:
> > -             new_transport = transport_dgram;
> > -             break;
> >       case SOCK_STREAM:
> >               if (vsock_use_local_transport(remote_cid))
> >                       new_transport = transport_local;
> > @@ -1096,7 +1094,6 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> >       struct sock *sk;
> >       struct vsock_sock *vsk;
> >       struct sockaddr_vm *remote_addr;
> > -     const struct vsock_transport *transport;
> >
> >       if (msg->msg_flags & MSG_OOB)
> >               return -EOPNOTSUPP;
> > @@ -1108,25 +1105,30 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> >
> >       lock_sock(sk);
> >
> > -     transport = vsk->transport;
> > -
> >       err = vsock_auto_bind(vsk);
> >       if (err)
> >               goto out;
> >
> > -
> >       /* If the provided message contains an address, use that.  Otherwise
> >        * fall back on the socket's remote handle (if it has been connected).
> >        */
> >       if (msg->msg_name &&
> >           vsock_addr_cast(msg->msg_name, msg->msg_namelen,
> >                           &remote_addr) == 0) {
> > +             vsock_addr_init(&vsk->remote_addr, remote_addr->svm_cid,
> > +                     remote_addr->svm_port);
> > +
> > +             err = vsock_assign_transport(vsk, NULL);
> > +             if (err) {
> > +                     err = -EINVAL;
> > +                     goto out;
> > +             }
> > +
> >               /* Ensure this address is of the right type and is a valid
> >                * destination.
> >                */
> > -
> >               if (remote_addr->svm_cid == VMADDR_CID_ANY)
> > -                     remote_addr->svm_cid = transport->get_local_cid();
> > +                     remote_addr->svm_cid = vsk->transport->get_local_cid();
> >
> >               if (!vsock_addr_bound(remote_addr)) {
> >                       err = -EINVAL;
> > @@ -1136,7 +1138,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> >               remote_addr = &vsk->remote_addr;
> >
> >               if (remote_addr->svm_cid == VMADDR_CID_ANY)
> > -                     remote_addr->svm_cid = transport->get_local_cid();
> > +                     remote_addr->svm_cid = vsk->transport->get_local_cid();
> >
> >               /* XXX Should connect() or this function ensure remote_addr is
> >                * bound?
> > @@ -1150,13 +1152,13 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> >               goto out;
> >       }
> >
> > -     if (!transport->dgram_allow(remote_addr->svm_cid,
> > +     if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
> >                                   remote_addr->svm_port)) {
> >               err = -EINVAL;
> >               goto out;
> >       }
> >
> > -     err = transport->dgram_enqueue(vsk, remote_addr, msg, len);
> > +     err = vsk->transport->dgram_enqueue(vsk, remote_addr, msg, len);
> >
> > out:
> >       release_sock(sk);
> > @@ -1191,13 +1193,20 @@ static int vsock_dgram_connect(struct socket *sock,
> >       if (err)
> >               goto out;
> >
> > +     memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
> > +
> > +     err = vsock_assign_transport(vsk, NULL);
> > +     if (err) {
> > +             err = -EINVAL;
> > +             goto out;
> > +     }
> > +
> >       if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
> >                                        remote_addr->svm_port)) {
> >               err = -EINVAL;
> >               goto out;
> >       }
> >
> > -     memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
> >       sock->state = SS_CONNECTED;
> >
> > out:
> > @@ -1209,6 +1218,16 @@ static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
> >                              size_t len, int flags)
> > {
> >       struct vsock_sock *vsk = vsock_sk(sock->sk);
> > +     long timeo;
> > +
> > +     timeo = sock_rcvtimeo(sock->sk, flags & MSG_DONTWAIT);
> > +     do {
> > +             if (vsk->transport)
> > +                     break;
> > +     } while (timeo && !vsk->transport);
> > +
> > +     if (!vsk->transport)
> > +             return -EAGAIN;
> >
> >       return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
> > }
> > @@ -2055,14 +2074,6 @@ static int vsock_create(struct net *net, struct socket *sock,
> >
> >       vsk = vsock_sk(sk);
> >
> > -     if (sock->type == SOCK_DGRAM) {
> > -             ret = vsock_assign_transport(vsk, NULL);
> > -             if (ret < 0) {
> > -                     sock_put(sk);
> > -                     return ret;
> > -             }
> > -     }
> > -
> >       vsock_insert_unbound(vsk);
> >
> >       return 0;
> > @@ -2182,7 +2193,7 @@ EXPORT_SYMBOL_GPL(vsock_core_get_transport);
> >
> > int vsock_core_register(const struct vsock_transport *t, int features)
> > {
> > -     const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local;
> > +     const struct vsock_transport *t_h2g, *t_g2h, *t_local;
> >       int err = mutex_lock_interruptible(&vsock_register_mutex);
> >
> >       if (err)
> > @@ -2190,7 +2201,6 @@ int vsock_core_register(const struct vsock_transport *t, int features)
> >
> >       t_h2g = transport_h2g;
> >       t_g2h = transport_g2h;
> > -     t_dgram = transport_dgram;
> >       t_local = transport_local;
> >
> >       if (features & VSOCK_TRANSPORT_F_H2G) {
> > @@ -2209,14 +2219,6 @@ int vsock_core_register(const struct vsock_transport *t, int features)
> >               t_g2h = t;
> >       }
> >
> > -     if (features & VSOCK_TRANSPORT_F_DGRAM) {
> > -             if (t_dgram) {
> > -                     err = -EBUSY;
> > -                     goto err_busy;
> > -             }
> > -             t_dgram = t;
> > -     }
> > -
> >       if (features & VSOCK_TRANSPORT_F_LOCAL) {
> >               if (t_local) {
> >                       err = -EBUSY;
> > @@ -2227,7 +2229,6 @@ int vsock_core_register(const struct vsock_transport *t, int features)
> >
> >       transport_h2g = t_h2g;
> >       transport_g2h = t_g2h;
> > -     transport_dgram = t_dgram;
> >       transport_local = t_local;
> >
> > err_busy:
> > diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
> > index 8b65323207db..42ea2a1c92ce 100644
> > --- a/net/vmw_vsock/vmci_transport.c
> > +++ b/net/vmw_vsock/vmci_transport.c
> > @@ -613,6 +613,7 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
> >       size_t size;
> >       struct sk_buff *skb;
> >       struct vsock_sock *vsk;
> > +     int err;
> >
> >       sk = (struct sock *)data;
> >
> > @@ -629,6 +630,17 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
> >       if (!vmci_transport_allow_dgram(vsk, dg->src.context))
> >               return VMCI_ERROR_NO_ACCESS;
> >
> > +     vsock_addr_init(&vsk->remote_addr, dg->src.context,
> > +                             dg->src.resource);
> > +
> > +     bh_lock_sock(sk);
> > +     if (!sock_owned_by_user(sk)) {
> > +             err = vsock_assign_transport(vsk, NULL);
> > +             if (err)
> > +                     return err;
> > +     }
> > +     bh_unlock_sock(sk);
> > +
> >       size = VMCI_DG_SIZE(dg);
> >
> >       /* Attach the packet to the socket's receive queue as an sk_buff. */
> > @@ -2093,13 +2105,7 @@ static int __init vmci_transport_init(void)
> >               goto err_destroy_stream_handle;
> >       }
> >
> > -     /* Register only with dgram feature, other features (H2G, G2H) will be
> > -      * registered when the first host or guest becomes active.
> > -      */
> > -     err = vsock_core_register(&vmci_transport, VSOCK_TRANSPORT_F_DGRAM);
> > -     if (err < 0)
> > -             goto err_unsubscribe;
> > -
> > +     /* H2G, G2H will be registered when the first host or guest becomes active. */
> >       err = vmci_register_vsock_callback(vmci_vsock_transport_cb);
> >       if (err < 0)
> >               goto err_unregister;
> > --
> > 2.11.0
> >
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [External] Re: [RFC] vsock: add multiple transports support for dgram
  2021-04-07 18:25   ` [External] " Jiang Wang .
@ 2021-04-12 14:04     ` Stefano Garzarella
  2021-04-12 18:53       ` Jiang Wang .
  2021-04-13  9:02     ` [External] " Jorgen Hansen
  1 sibling, 1 reply; 9+ messages in thread
From: Stefano Garzarella @ 2021-04-12 14:04 UTC (permalink / raw)
  To: Jiang Wang ., Jorgen Hansen
  Cc: virtualization, Stefan Hajnoczi, cong.wang, duanxiongchun,
	xieyongji, David S. Miller, Jakub Kicinski, Colin Ian King,
	Norbert Slusarek, Andra Paraschiv, Alexander Popov, netdev,
	linux-kernel

Hi Jiang,
thanks for re-starting the multi-transport support for dgram!

On Wed, Apr 07, 2021 at 11:25:36AM -0700, Jiang Wang . wrote:
>On Wed, Apr 7, 2021 at 2:51 AM Jorgen Hansen <jhansen@vmware.com> wrote:
>>
>>
>> > On 6 Apr 2021, at 20:31, Jiang Wang <jiang.wang@bytedance.com> wrote:
>> >
>> > From: "jiang.wang" <jiang.wang@bytedance.com>
>> >
>> > Currently, only VMCI supports dgram sockets. To supported
>> > nested VM use case, this patch removes transport_dgram and
>> > uses transport_g2h and transport_h2g for dgram too.

I agree on this part, I think that's the direction to go.  
transport_dgram was added as a shortcut.

>>
>> Could you provide some background for introducing this change - are you
>> looking at introducing datagrams for a different transport? VMCI datagrams
>> already support the nested use case,
>
>Yes, I am trying to introduce datagram for virtio transport. I wrote a
>spec patch for
>virtio dgram support and also a code patch, but the code patch is still WIP.
>When I wrote this commit message, I was thinking nested VM is the same as
>multiple transport support. But now, I realize they are different.
>Nested VMs may use
>the same virtualization layer(KVM on KVM), or different virtualization layers
>(KVM on ESXi). Thanks for letting me know that VMCI already supported nested
>use cases. I think you mean VMCI on VMCI, right?
>
>> but if we need to support multiple datagram
>> transports we need to rework how we administer port assignment for datagrams.
>> One specific issue is that the vmci transport won’t receive any datagrams for a
>> port unless the datagram socket has already been assigned the vmci transport
>> and the port bound to the underlying VMCI device (see below for more details).
>>
>I see.
>
>> > The transport is assgined when sending every packet and
>> > receiving every packet on dgram sockets.
>>
>> Is the intent that the same datagram socket can be used for sending packets both
>> In the host to guest, and the guest to directions?
>
>Nope. One datagram socket will only send packets to one direction, either to the
>host or to the guest. My above description is wrong. When sending packets, the
>transport is assigned with the first packet (with auto_bind).

I'm not sure this is right.
The auto_bind on the first packet should only assign a local port to the 
socket, but does not affect the transport to be used.

A user could send one packet to the nested guest and another to the host 
using the same socket, or am I wrong?

>
>The problem is when receiving packets. The listener can bind to the
>VMADDR_CID_ANY
>address. Then it is unclear which transport we should use. For stream
>sockets, there will be a new socket for each connection, and transport
>can be decided
>at that time. For datagram sockets, I am not sure how to handle that.

yes, this I think is the main problem, but maybe the sender one is even 
more complicated.

Maybe we should remove the 1:1 association we have now between vsk and 
transport.

At least for DGRAM, for connected sockets I think the association makes 
sense.

>For VMCI, does the same transport can be used for both receiving from
>host and from
>the guest?

Yes, they're registered at different times, but it's the same transport.

>
>For virtio, the h2g and g2h transports are different, so we have to 
>choose
>one of them. My original thought is to wait until the first packet 
>arrives.
>
>Another idea is that we always bind to host addr and use h2g
>transport because I think that might
>be more common. If a listener wants to recv packets from the host, then 
>it
>should bind to the guest addr instead of CID_ANY.

Yes, I remember we discussed this idea, this would simplify the 
receiving, but there is still the issue of a user wanting to receive 
packets from both the nested guest and the host.

>Any other suggestions?
>

I think one solution could be to remove the 1:1 association between 
DGRAM socket and transport.

IIUC VMCI creates a skb for each received packet and queues it through 
sk_receive_skb() directly in the struct sock.

Then the .dgram_dequeue() callback dequeues them using 
skb_recv_datagram().

We can move these parts in the vsock core, and create some helpers to 
allow the transports to enqueue received DGRAM packets in the same way 
(and with the same format) directly in the struct sock.


What do you think?

Thanks,
Stefano


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Re: [RFC] vsock: add multiple transports support for dgram
  2021-04-12 14:04     ` Stefano Garzarella
@ 2021-04-12 18:53       ` Jiang Wang .
       [not found]         ` <2EE65DBC-30AC-4E11-BFD5-73586B94C985@vmware.com>
  0 siblings, 1 reply; 9+ messages in thread
From: Jiang Wang . @ 2021-04-12 18:53 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: Jorgen Hansen, virtualization, Stefan Hajnoczi, cong.wang,
	duanxiongchun, xieyongji, David S. Miller, Jakub Kicinski,
	Colin Ian King, Norbert Slusarek, Andra Paraschiv,
	Alexander Popov, netdev, linux-kernel

On Mon, Apr 12, 2021 at 7:04 AM Stefano Garzarella <sgarzare@redhat.com> wrote:
>
> Hi Jiang,
> thanks for re-starting the multi-transport support for dgram!

No problem.

> On Wed, Apr 07, 2021 at 11:25:36AM -0700, Jiang Wang . wrote:
> >On Wed, Apr 7, 2021 at 2:51 AM Jorgen Hansen <jhansen@vmware.com> wrote:
> >>
> >>
> >> > On 6 Apr 2021, at 20:31, Jiang Wang <jiang.wang@bytedance.com> wrote:
> >> >
> >> > From: "jiang.wang" <jiang.wang@bytedance.com>
> >> >
> >> > Currently, only VMCI supports dgram sockets. To supported
> >> > nested VM use case, this patch removes transport_dgram and
> >> > uses transport_g2h and transport_h2g for dgram too.
>
> I agree on this part, I think that's the direction to go.
> transport_dgram was added as a shortcut.

Got it.

> >>
> >> Could you provide some background for introducing this change - are you
> >> looking at introducing datagrams for a different transport? VMCI datagrams
> >> already support the nested use case,
> >
> >Yes, I am trying to introduce datagram for virtio transport. I wrote a
> >spec patch for
> >virtio dgram support and also a code patch, but the code patch is still WIP.
> >When I wrote this commit message, I was thinking nested VM is the same as
> >multiple transport support. But now, I realize they are different.
> >Nested VMs may use
> >the same virtualization layer(KVM on KVM), or different virtualization layers
> >(KVM on ESXi). Thanks for letting me know that VMCI already supported nested
> >use cases. I think you mean VMCI on VMCI, right?
> >
> >> but if we need to support multiple datagram
> >> transports we need to rework how we administer port assignment for datagrams.
> >> One specific issue is that the vmci transport won’t receive any datagrams for a
> >> port unless the datagram socket has already been assigned the vmci transport
> >> and the port bound to the underlying VMCI device (see below for more details).
> >>
> >I see.
> >
> >> > The transport is assgined when sending every packet and
> >> > receiving every packet on dgram sockets.
> >>
> >> Is the intent that the same datagram socket can be used for sending packets both
> >> In the host to guest, and the guest to directions?
> >
> >Nope. One datagram socket will only send packets to one direction, either to the
> >host or to the guest. My above description is wrong. When sending packets, the
> >transport is assigned with the first packet (with auto_bind).
>
> I'm not sure this is right.
> The auto_bind on the first packet should only assign a local port to the
> socket, but does not affect the transport to be used.
>
> A user could send one packet to the nested guest and another to the host
> using the same socket, or am I wrong?

OK. I think you are right.

> >
> >The problem is when receiving packets. The listener can bind to the
> >VMADDR_CID_ANY
> >address. Then it is unclear which transport we should use. For stream
> >sockets, there will be a new socket for each connection, and transport
> >can be decided
> >at that time. For datagram sockets, I am not sure how to handle that.
>
> yes, this I think is the main problem, but maybe the sender one is even
> more complicated.
>
> Maybe we should remove the 1:1 association we have now between vsk and
> transport.

Yes, I thought about that too. One idea is to define two transports in vsk.
For sending pkts, we can pick the right transport when we get the packet, like
in virtio_transport_send_pkt_info(). For receiving pkts, we have to check
and call both
transports' dequeue callbacks if the local cid is CID_ANY.

> At least for DGRAM, for connected sockets I think the association makes
> sense.

Yeah. For a connected socket, we will only use one transport.

> >For VMCI, does the same transport can be used for both receiving from
> >host and from
> >the guest?
>
> Yes, they're registered at different times, but it's the same transport.
>
> >
> >For virtio, the h2g and g2h transports are different,, so we have to
> >choose
> >one of them. My original thought is to wait until the first packet
> >arrives.
> >
> >Another idea is that we always bind to host addr and use h2g
> >transport because I think that might
> >be more common. If a listener wants to recv packets from the host, then
> >it
> >should bind to the guest addr instead of CID_ANY.
>
> Yes, I remember we discussed this idea, this would simplify the
> receiving, but there is still the issue of a user wanting to receive
> packets from both the nested guest and the host.

OK. Agree.

> >Any other suggestions?
> >
>
> I think one solution could be to remove the 1:1 association between
> DGRAM socket and transport.
>
> IIUC VMCI creates a skb for each received packet and queues it through
> sk_receive_skb() directly in the struct sock.
>
> Then the .dgram_dequeue() callback dequeues them using
> skb_recv_datagram().
>
> We can move these parts in the vsock core, and create some helpers to
> allow the transports to enqueue received DGRAM packets in the same way
> (and with the same format) directly in the struct sock.
>

I agree to use skbs (and move them to vsock core). We have another use case
which will need to use skbs. But I am not sure how this helps with multiple
transport cases. Each transport has a dgram_dequeue callback. So we still
need to let vsk have multiple transports somehow. Could you elaborate how
using skbs helps with multiple transport support? Will that be similar to what I
mentioned above? Thanks.

Regards,

Jiang

> What do you think?
>
> Thanks,
> Stefano
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [External] [RFC] vsock: add multiple transports support for dgram
  2021-04-07 18:25   ` [External] " Jiang Wang .
  2021-04-12 14:04     ` Stefano Garzarella
@ 2021-04-13  9:02     ` Jorgen Hansen
  2021-04-13 22:32       ` Jiang Wang .
  1 sibling, 1 reply; 9+ messages in thread
From: Jorgen Hansen @ 2021-04-13  9:02 UTC (permalink / raw)
  To: Jiang Wang .
  Cc: virtualization, Stefan Hajnoczi, cong.wang, duanxiongchun,
	xieyongji, David S. Miller, Jakub Kicinski, Stefano Garzarella,
	Colin Ian King, Norbert Slusarek, Andra Paraschiv,
	Alexander Popov, netdev, linux-kernel



> On 7 Apr 2021, at 20:25, Jiang Wang . <jiang.wang@bytedance.com> wrote:
> 
> On Wed, Apr 7, 2021 at 2:51 AM Jorgen Hansen <jhansen@vmware.com> wrote:
>> 
>> 
>>> On 6 Apr 2021, at 20:31, Jiang Wang <jiang.wang@bytedance.com> wrote:
>>> 
>>> From: "jiang.wang" <jiang.wang@bytedance.com>
>>> 
>>> Currently, only VMCI supports dgram sockets. To supported
>>> nested VM use case, this patch removes transport_dgram and
>>> uses transport_g2h and transport_h2g for dgram too.
>> 
>> Could you provide some background for introducing this change - are you
>> looking at introducing datagrams for a different transport? VMCI datagrams
>> already support the nested use case,
> 
> Yes, I am trying to introduce datagram for virtio transport. I wrote a
> spec patch for
> virtio dgram support and also a code patch, but the code patch is still WIP.

Oh ok. Cool. I must have missed the spec patch - could you provide a reference to
it?

> When I wrote this commit message, I was thinking nested VM is the same as
> multiple transport support. But now, I realize they are different.
> Nested VMs may use
> the same virtualization layer(KVM on KVM), or different virtualization layers
> (KVM on ESXi). Thanks for letting me know that VMCI already supported nested
> use cases. I think you mean VMCI on VMCI, right?

Right, only VMCI on VMCI. 

I’ll respond to Stefano’s email for the rest of the discussion.

Thanks,
Jorgen

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] vsock: add multiple transports support for dgram
       [not found]         ` <2EE65DBC-30AC-4E11-BFD5-73586B94C985@vmware.com>
@ 2021-04-13 12:52           ` Stefano Garzarella
  2021-04-13 22:41             ` Jiang Wang .
  0 siblings, 1 reply; 9+ messages in thread
From: Stefano Garzarella @ 2021-04-13 12:52 UTC (permalink / raw)
  To: Jorgen Hansen
  Cc: Jiang Wang .,
	virtualization, Stefan Hajnoczi, cong.wang, duanxiongchun,
	xieyongji, David S. Miller, Jakub Kicinski, Colin Ian King,
	Norbert Slusarek, Andra Paraschiv, Alexander Popov, netdev,
	linux-kernel

On Tue, Apr 13, 2021 at 12:12:50PM +0000, Jorgen Hansen wrote:
>
>
>On 12 Apr 2021, at 20:53, Jiang Wang . <jiang.wang@bytedance.com<mailto:jiang.wang@bytedance.com>> wrote:
>
>On Mon, Apr 12, 2021 at 7:04 AM Stefano Garzarella <sgarzare@redhat.com<mailto:sgarzare@redhat.com>> wrote:
>
>Hi Jiang,
>thanks for re-starting the multi-transport support for dgram!
>
>No problem.
>
>On Wed, Apr 07, 2021 at 11:25:36AM -0700, Jiang Wang . wrote:
>On Wed, Apr 7, 2021 at 2:51 AM Jorgen Hansen <jhansen@vmware.com<mailto:jhansen@vmware.com>> wrote:
>
>
>On 6 Apr 2021, at 20:31, Jiang Wang <jiang.wang@bytedance.com<mailto:jiang.wang@bytedance.com>> wrote:
>
>From: "jiang.wang<http://jiang.wang>" <jiang.wang@bytedance.com<mailto:jiang.wang@bytedance.com>>
>
>Currently, only VMCI supports dgram sockets. To supported
>nested VM use case, this patch removes transport_dgram and
>uses transport_g2h and transport_h2g for dgram too.
>
>I agree on this part, I think that's the direction to go.
>transport_dgram was added as a shortcut.
>
>Got it.
>
>
>Could you provide some background for introducing this change - are you
>looking at introducing datagrams for a different transport? VMCI datagrams
>already support the nested use case,
>
>Yes, I am trying to introduce datagram for virtio transport. I wrote a
>spec patch for
>virtio dgram support and also a code patch, but the code patch is still WIP.
>When I wrote this commit message, I was thinking nested VM is the same as
>multiple transport support. But now, I realize they are different.
>Nested VMs may use
>the same virtualization layer(KVM on KVM), or different virtualization layers
>(KVM on ESXi). Thanks for letting me know that VMCI already supported nested
>use cases. I think you mean VMCI on VMCI, right?
>
>but if we need to support multiple datagram
>transports we need to rework how we administer port assignment for datagrams.
>One specific issue is that the vmci transport won’t receive any datagrams for a
>port unless the datagram socket has already been assigned the vmci transport
>and the port bound to the underlying VMCI device (see below for more details).
>
>I see.
>
>The transport is assgined when sending every packet and
>receiving every packet on dgram sockets.
>
>Is the intent that the same datagram socket can be used for sending packets both
>In the host to guest, and the guest to directions?
>
>Nope. One datagram socket will only send packets to one direction, either to the
>host or to the guest. My above description is wrong. When sending packets, the
>transport is assigned with the first packet (with auto_bind).
>
>I'm not sure this is right.
>The auto_bind on the first packet should only assign a local port to the
>socket, but does not affect the transport to be used.
>
>A user could send one packet to the nested guest and another to the host
>using the same socket, or am I wrong?
>
>OK. I think you are right.
>
>
>The problem is when receiving packets. The listener can bind to the
>VMADDR_CID_ANY
>address. Then it is unclear which transport we should use. For stream
>sockets, there will be a new socket for each connection, and transport
>can be decided
>at that time. For datagram sockets, I am not sure how to handle that.
>
>yes, this I think is the main problem, but maybe the sender one is even
>more complicated.
>
>Maybe we should remove the 1:1 association we have now between vsk and
>transport.
>
>Yes, I thought about that too. One idea is to define two transports in vsk.
>For sending pkt, we can pick the right transport when we get the packet, like
>in virtio_transport_send_pkt_info(). For receiving pkts, we have to check
>and call both
>transports dequeue callbacks if the local cid is CID_ANY.
>
>At least for DGRAM, for connected sockets I think the association makes
>sense.
>
>Yeah. For a connected socket, we will only use one transport.
>
>For VMCI, does the same transport can be used for both receiving from
>host and from
>the guest?
>
>Yes, they're registered at different times, but it's the same transport.
>
>
>For virtio, the h2g and g2h transports are different,, so we have to
>choose
>one of them. My original thought is to wait until the first packet
>arrives.
>
>Another idea is that we always bind to host addr and use h2g
>transport because I think that might
>be more common. If a listener wants to recv packets from the host, then
>it
>should bind to the guest addr instead of CID_ANY.
>
>Yes, I remember we discussed this idea, this would simplify the
>receiving, but there is still the issue of a user wanting to receive
>packets from both the nested guest and the host.
>
>OK. Agree.
>
>Any other suggestions?
>
>
>I think one solution could be to remove the 1:1 association between
>DGRAM socket and transport.
>
>IIUC VMCI creates a skb for each received packet and queues it through
>sk_receive_skb() directly in the struct sock.
>
>Then the .dgram_dequeue() callback dequeues them using
>skb_recv_datagram().
>
>We can move these parts in the vsock core, and create some helpers to
>allow the transports to enqueue received DGRAM packets in the same way
>(and with the same format) directly in the struct sock.
>
>
>I agree to use skbs (and move them to vscok core). We have another use case
>which will need to use skb. But I am not sure how this helps with multiple
>transport cases. Each transport has a dgram_dequeue callback. So we still
>need to let vsk have multiple transports somehow. Could you elaborate how
>using skb help with multiple transport support? Will that be similar to what I
>mentioned above? Thanks.
>
>Moving away from the 1:1 association between DGRAM socket and transports sounds
>like the right approach to me. A dgram socket bound to CID_ANY would be able to
>use either h2g or g2h on a per dgram basis. If the socket is bound to a specific CID -
>either host or the guest CID, it should only use either the h2g for host CID or g2h
>for the guest CID. This would match the logic for the stream sockets.
>
>I like the idea of removing the dgram_dequeue callback from the transports and instead
>having a call that allow the transports to enqueue received dgrams into the socket
>receive queue as skbs. This is what the VMCI transport does today. Then the
>vsock_dgram_recvmsg function will provide functionality similar to what
>vmci_transport_dgram_dequeue does today. The current datagram format used was
>created specifically for VMCI datagrams, but the header just contains source and dest
>CID and port, so we should be able to use it as is.
>
>For sends from CID_ANY, the same logic as for streams in vsock_assign_transport can
>be applied on each send - but without locking the dgram socket to a specific transport.
>
>So the above is mostly restating what Stefano proposed, so this was a verbose way
>of agreeing with that.

Jorgen, thank you very much!
This is exactly what I had in mind, explained much better :-)

We should look at the datagram header better because virtio-vsock uses 
64 bits for CID and port, but I don't think it's a big problem.

@Jiang, I think Jorgen answered your questions, but feel free to ask more 
if it's not clear.

>
>With respect to binding a dgram socket to a port, we could introduce a bound list for
>dgram sockets just like we have for streams. However, for VMCI, the port space
>is shared with other VMCI datagram clients (at the VMCI device level), so if a
>dgram socket can potentially use the vmci transport, it should reserve the port
>with the VMCI transport before assigning it to the socket. So similar to how
>__vsock_bind_stream checks if an port is already bound/in use, the dgram socket
>would have an additional call to potential transports to reserve the port. If the
>port cannot be reserved with the transport, move on to the next port in the case
>of VMADDR_PORT_ANY, or return EADDRINUSE otherwise. Once reserved,
>It will ensure that VMCI can deliver datagrams to the specified port. A reserved
>port should be released when the socket is removed from the bound list.

Yes, I agree, it seems the right way to go.

Thanks,
Stefano


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC] vsock: add multiple transports support for dgram
  2021-04-13  9:02     ` [External] " Jorgen Hansen
@ 2021-04-13 22:32       ` Jiang Wang .
  0 siblings, 0 replies; 9+ messages in thread
From: Jiang Wang . @ 2021-04-13 22:32 UTC (permalink / raw)
  To: Jorgen Hansen
  Cc: virtualization, Stefan Hajnoczi, cong.wang, duanxiongchun,
	xieyongji, David S. Miller, Jakub Kicinski, Stefano Garzarella,
	Colin Ian King, Norbert Slusarek, Andra Paraschiv,
	Alexander Popov, netdev, linux-kernel

On Tue, Apr 13, 2021 at 2:02 AM Jorgen Hansen <jhansen@vmware.com> wrote:
>
>
>
> > On 7 Apr 2021, at 20:25, Jiang Wang . <jiang.wang@bytedance.com> wrote:
> >
> > On Wed, Apr 7, 2021 at 2:51 AM Jorgen Hansen <jhansen@vmware.com> wrote:
> >>
> >>
> >>> On 6 Apr 2021, at 20:31, Jiang Wang <jiang.wang@bytedance.com> wrote:
> >>>
> >>> From: "jiang.wang" <jiang.wang@bytedance.com>
> >>>
> >>> Currently, only VMCI supports dgram sockets. To supported
> >>> nested VM use case, this patch removes transport_dgram and
> >>> uses transport_g2h and transport_h2g for dgram too.
> >>
> >> Could you provide some background for introducing this change - are you
> >> looking at introducing datagrams for a different transport? VMCI datagrams
> >> already support the nested use case,
> >
> > Yes, I am trying to introduce datagram for virtio transport. I wrote a
> > spec patch for
> > virtio dgram support and also a code patch, but the code patch is still WIP.
>
> Oh ok. Cool. I must have missed the spec patch - could you provide a reference to
> it?

Sure. here is the link:
https://lists.linuxfoundation.org/pipermail/virtualization/2021-April/053543.html

> > When I wrote this commit message, I was thinking nested VM is the same as
> > multiple transport support. But now, I realize they are different.
> > Nested VMs may use
> > the same virtualization layer(KVM on KVM), or different virtualization layers
> > (KVM on ESXi). Thanks for letting me know that VMCI already supported nested
> > use cases. I think you mean VMCI on VMCI, right?
>
> Right, only VMCI on VMCI.

Got it. thanks.

> I’ll respond to Stefano’s email for the rest of the discussion.
>
> Thanks,
> Jorgen

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Re: [RFC] vsock: add multiple transports support for dgram
  2021-04-13 12:52           ` Stefano Garzarella
@ 2021-04-13 22:41             ` Jiang Wang .
  0 siblings, 0 replies; 9+ messages in thread
From: Jiang Wang . @ 2021-04-13 22:41 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: Jorgen Hansen, virtualization, Stefan Hajnoczi, cong.wang,
	duanxiongchun, xieyongji, David S. Miller, Jakub Kicinski,
	Colin Ian King, Norbert Slusarek, Andra Paraschiv,
	Alexander Popov, netdev, linux-kernel

Hi Jorgen,

Thanks for the detailed explanation and I agree with you. For the bind list,
my prototype is doing something similar to that. I will double check it.

Hi Stefano,

I don't have other questions for now. Thanks.

Regards,

Jiang

On Tue, Apr 13, 2021 at 5:52 AM Stefano Garzarella <sgarzare@redhat.com> wrote:
>
> On Tue, Apr 13, 2021 at 12:12:50PM +0000, Jorgen Hansen wrote:
> >
> >
> >On 12 Apr 2021, at 20:53, Jiang Wang . <jiang.wang@bytedance.com<mailto:jiang.wang@bytedance.com>> wrote:
> >
> >On Mon, Apr 12, 2021 at 7:04 AM Stefano Garzarella <sgarzare@redhat.com<mailto:sgarzare@redhat.com>> wrote:
> >
> >Hi Jiang,
> >thanks for re-starting the multi-transport support for dgram!
> >
> >No problem.
> >
> >On Wed, Apr 07, 2021 at 11:25:36AM -0700, Jiang Wang . wrote:
> >On Wed, Apr 7, 2021 at 2:51 AM Jorgen Hansen <jhansen@vmware.com<mailto:jhansen@vmware.com>> wrote:
> >
> >
> >On 6 Apr 2021, at 20:31, Jiang Wang <jiang.wang@bytedance.com<mailto:jiang.wang@bytedance.com>> wrote:
> >
> >From: "jiang.wang<http://jiang.wang>" <jiang.wang@bytedance.com<mailto:jiang.wang@bytedance.com>>
> >
> >Currently, only VMCI supports dgram sockets. To supported
> >nested VM use case, this patch removes transport_dgram and
> >uses transport_g2h and transport_h2g for dgram too.
> >
> >I agree on this part, I think that's the direction to go.
> >transport_dgram was added as a shortcut.
> >
> >Got it.
> >
> >
> >Could you provide some background for introducing this change - are you
> >looking at introducing datagrams for a different transport? VMCI datagrams
> >already support the nested use case,
> >
> >Yes, I am trying to introduce datagram for virtio transport. I wrote a
> >spec patch for
> >virtio dgram support and also a code patch, but the code patch is still WIP.
> >When I wrote this commit message, I was thinking nested VM is the same as
> >multiple transport support. But now, I realize they are different.
> >Nested VMs may use
> >the same virtualization layer(KVM on KVM), or different virtualization layers
> >(KVM on ESXi). Thanks for letting me know that VMCI already supported nested
> >use cases. I think you mean VMCI on VMCI, right?
> >
> >but if we need to support multiple datagram
> >transports we need to rework how we administer port assignment for datagrams.
> >One specific issue is that the vmci transport won’t receive any datagrams for a
> >port unless the datagram socket has already been assigned the vmci transport
> >and the port bound to the underlying VMCI device (see below for more details).
> >
> >I see.
> >
> >The transport is assgined when sending every packet and
> >receiving every packet on dgram sockets.
> >
> >Is the intent that the same datagram socket can be used for sending packets both
> >In the host to guest, and the guest to directions?
> >
> >Nope. One datagram socket will only send packets to one direction, either to the
> >host or to the guest. My above description is wrong. When sending packets, the
> >transport is assigned with the first packet (with auto_bind).
> >
> >I'm not sure this is right.
> >The auto_bind on the first packet should only assign a local port to the
> >socket, but does not affect the transport to be used.
> >
> >A user could send one packet to the nested guest and another to the host
> >using the same socket, or am I wrong?
> >
> >OK. I think you are right.
> >
> >
> >The problem is when receiving packets. The listener can bind to the
> >VMADDR_CID_ANY
> >address. Then it is unclear which transport we should use. For stream
> >sockets, there will be a new socket for each connection, and transport
> >can be decided
> >at that time. For datagram sockets, I am not sure how to handle that.
> >
> >yes, this I think is the main problem, but maybe the sender one is even
> >more complicated.
> >
> >Maybe we should remove the 1:1 association we have now between vsk and
> >transport.
> >
> >Yes, I thought about that too. One idea is to define two transports in vsk.
> >For sending pkt, we can pick the right transport when we get the packet, like
> >in virtio_transport_send_pkt_info(). For receiving pkts, we have to check
> >and call both
> >transports dequeue callbacks if the local cid is CID_ANY.
> >
> >At least for DGRAM, for connected sockets I think the association makes
> >sense.
> >
> >Yeah. For a connected socket, we will only use one transport.
> >
> >For VMCI, does the same transport can be used for both receiving from
> >host and from
> >the guest?
> >
> >Yes, they're registered at different times, but it's the same transport.
> >
> >
> >For virtio, the h2g and g2h transports are different,, so we have to
> >choose
> >one of them. My original thought is to wait until the first packet
> >arrives.
> >
> >Another idea is that we always bind to host addr and use h2g
> >transport because I think that might
> >be more common. If a listener wants to recv packets from the host, then
> >it
> >should bind to the guest addr instead of CID_ANY.
> >
> >Yes, I remember we discussed this idea, this would simplify the
> >receiving, but there is still the issue of a user wanting to receive
> >packets from both the nested guest and the host.
> >
> >OK. Agree.
> >
> >Any other suggestions?
> >
> >
> >I think one solution could be to remove the 1:1 association between
> >DGRAM socket and transport.
> >
> >IIUC VMCI creates a skb for each received packet and queues it through
> >sk_receive_skb() directly in the struct sock.
> >
> >Then the .dgram_dequeue() callback dequeues them using
> >skb_recv_datagram().
> >
> >We can move these parts in the vsock core, and create some helpers to
> >allow the transports to enqueue received DGRAM packets in the same way
> >(and with the same format) directly in the struct sock.
> >
> >
> >I agree to use skbs (and move them to vscok core). We have another use case
> >which will need to use skb. But I am not sure how this helps with multiple
> >transport cases. Each transport has a dgram_dequeue callback. So we still
> >need to let vsk have multiple transports somehow. Could you elaborate how
> >using skb help with multiple transport support? Will that be similar to what I
> >mentioned above? Thanks.
> >
> >Moving away from the 1:1 association between DGRAM socket and transports sounds
> >like the right approach to me. A dgram socket bound to CID_ANY would be able to
> >use either h2g or g2h on a per dgram basis. If the socket is bound to a specific CID -
> >either host or the guest CID, it should only use either the h2g for host CID or g2h
> >for the guest CID. This would match the logic for the stream sockets.
> >
> >I like the idea of removing the dgram_dequeue callback from the transports and instead
> >having a call that allow the transports to enqueue received dgrams into the socket
> >receive queue as skbs. This is what the VMCI transport does today. Then the
> >vsock_dgram_recvmsg function will provide functionality similar to what
> >vmci_transport_dgram_dequeue does today. The current datagram format used was
> >created specifically for VMCI datagrams, but the header just contains source and dest
> >CID and port, so we should be able to use it as is.
> >
> >For sends from CID_ANY, the same logic as for streams in vsock_assign_transport can
> >be applied on each send - but without locking the dgram socket to a specific transport.
> >
> >So the above is mostly restating what Stefano proposed, so this was a verbose way
> >of agreeing with that.
>
> Jorgen, thank you very much!
> This is exactly what I had in mind, explained much better :-)
>
> We should look at the datagram header better because virtio-vsock uses
> 64 bits for CID and port, but I don't think it's a big problem.
>
> > @Jiang, I think Jorgen answered your questions, but feel free to ask more
> if it's not clear.
>
> >
> >With respect to binding a dgram socket to a port, we could introduce a bound list for
> >dgram sockets just like we have for streams. However, for VMCI, the port space
> >is shared with other VMCI datagram clients (at the VMCI device level), so if a
> >dgram socket can potentially use the vmci transport, it should reserve the port
> >with the VMCI transport before assigning it to the socket. So similar to how
> >__vsock_bind_stream checks if a port is already bound/in use, the dgram socket
> >would have an additional call to potential transports to reserve the port. If the
> >port cannot be reserved with the transport, move on to the next port in the case
> >of VMADDR_PORT_ANY, or return EADDRINUSE otherwise. Once reserved,
> >it will ensure that VMCI can deliver datagrams to the specified port. A reserved
> >port should be released when the socket is removed from the bound list.
>
> Yes, I agree, it seems the right way to go.
>
> Thanks,
> Stefano
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-04-13 22:41 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-06 18:31 [RFC] vsock: add multiple transports support for dgram Jiang Wang
2021-04-07  9:51 ` Jorgen Hansen
2021-04-07 18:25   ` [External] " Jiang Wang .
2021-04-12 14:04     ` Stefano Garzarella
2021-04-12 18:53       ` Jiang Wang .
     [not found]         ` <2EE65DBC-30AC-4E11-BFD5-73586B94C985@vmware.com>
2021-04-13 12:52           ` Stefano Garzarella
2021-04-13 22:41             ` Jiang Wang .
2021-04-13  9:02     ` [External] " Jorgen Hansen
2021-04-13 22:32       ` Jiang Wang .

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).