linux-kernel.vger.kernel.org archive mirror
* [PATCH] hv_sock: Fix data loss upon socket close
@ 2019-05-08 23:10 Sunil Muthuswamy
  2019-05-09 20:58 ` David Miller
  2019-05-11  3:56 ` Dexuan Cui
  0 siblings, 2 replies; 5+ messages in thread
From: Sunil Muthuswamy @ 2019-05-08 23:10 UTC (permalink / raw)
  To: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	David S. Miller, Dexuan Cui, Michael Kelley
  Cc: netdev, linux-hyperv, linux-kernel

Currently, when an hvsock socket is closed, the socket is shut down and
an RST is sent immediately. There is no wait for the FIN packet to arrive
from the other end. This can lead to data loss, since the connection is
terminated abruptly. This can manifest easily in the case of a fast guest
hvsock writer and a much slower host hvsock reader. Essentially, hvsock is
not following the proper STREAM (TCP) closing handshake.

The fix adds support for a delayed close of hvsock sockets, which is
in line with other socket providers such as virtio. While closing, the
socket waits a constant timeout for the FIN packet to arrive from the
other end. On timeout, it terminates the connection (i.e. sends an RST).
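
For reference, the delayed close builds on the kernel's standard
delayed-work machinery. The following is a minimal, self-contained sketch
of that generic pattern only; every demo_* name is made up for
illustration, and none of this code is part of the patch itself:

/* Schedule a close timeout; cancel it if the peer answers first. */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define DEMO_CLOSE_TIMEOUT (8 * HZ)	/* same 8-second budget as HVS_CLOSE_TIMEOUT */

static struct delayed_work demo_close_work;

static void demo_close_timeout(struct work_struct *work)
{
	/* The peer never answered in time: force the teardown here. */
	pr_info("demo: close timed out, forcing teardown\n");
}

static int __init demo_init(void)
{
	INIT_DELAYED_WORK(&demo_close_work, demo_close_timeout);
	schedule_delayed_work(&demo_close_work, DEMO_CLOSE_TIMEOUT);
	return 0;
}

static void __exit demo_exit(void)
{
	/* In the real code, seeing the peer's FIN cancels the pending work. */
	cancel_delayed_work_sync(&demo_close_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");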

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
---
 net/vmw_vsock/hyperv_transport.c | 122 ++++++++++++++++++++++++++++-----------
 1 file changed, 87 insertions(+), 35 deletions(-)

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index a827547..62b986d 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -35,6 +35,9 @@
 /* The MTU is 16KB per the host side's design */
 #define HVS_MTU_SIZE		(1024 * 16)
 
+/* How long to wait for graceful shutdown of a connection */
+#define HVS_CLOSE_TIMEOUT (8 * HZ)
+
 struct vmpipe_proto_header {
 	u32 pkt_type;
 	u32 data_size;
@@ -305,19 +308,33 @@ static void hvs_channel_cb(void *ctx)
 		sk->sk_write_space(sk);
 }
 
-static void hvs_close_connection(struct vmbus_channel *chan)
+static void hvs_do_close_lock_held(struct vsock_sock *vsk,
+				   bool cancel_timeout)
 {
-	struct sock *sk = get_per_channel_state(chan);
-	struct vsock_sock *vsk = vsock_sk(sk);
-
-	lock_sock(sk);
+	struct sock *sk = sk_vsock(vsk);
 
-	sk->sk_state = TCP_CLOSE;
 	sock_set_flag(sk, SOCK_DONE);
-	vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN;
-
+	vsk->peer_shutdown = SHUTDOWN_MASK;
+	if (vsock_stream_has_data(vsk) <= 0)
+		sk->sk_state = TCP_CLOSING;
 	sk->sk_state_change(sk);
+	if (vsk->close_work_scheduled &&
+	    (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) {
+		vsk->close_work_scheduled = false;
+		vsock_remove_sock(vsk);
+
+		/* Release the reference taken while scheduling the timeout */
+		sock_put(sk);
+	}
+}
+
+/* Equivalent of a RST */
+static void hvs_close_connection(struct vmbus_channel *chan)
+{
+	struct sock *sk = get_per_channel_state(chan);
 
+	lock_sock(sk);
+	hvs_do_close_lock_held(vsock_sk(sk), true);
 	release_sock(sk);
 }
 
@@ -452,50 +469,80 @@ static int hvs_connect(struct vsock_sock *vsk)
 	return vmbus_send_tl_connect_request(&h->vm_srv_id, &h->host_srv_id);
 }
 
+static inline void hvs_shutdown_lock_held(struct hvsock *hvs, int mode)
+{
+	struct vmpipe_proto_header hdr;
+
+	if (hvs->fin_sent || !hvs->chan)
+		return;
+
+	/* It can't fail: see hvs_channel_writable_bytes(). */
+	(void)hvs_send_data(hvs->chan, (struct hvs_send_buf *)&hdr, 0);
+	hvs->fin_sent = true;
+}
+
 static int hvs_shutdown(struct vsock_sock *vsk, int mode)
 {
 	struct sock *sk = sk_vsock(vsk);
-	struct vmpipe_proto_header hdr;
-	struct hvs_send_buf *send_buf;
-	struct hvsock *hvs;
 
 	if (!(mode & SEND_SHUTDOWN))
 		return 0;
 
 	lock_sock(sk);
+	hvs_shutdown_lock_held(vsk->trans, mode);
+	release_sock(sk);
+	return 0;
+}
 
-	hvs = vsk->trans;
-	if (hvs->fin_sent)
-		goto out;
-
-	send_buf = (struct hvs_send_buf *)&hdr;
+static void hvs_close_timeout(struct work_struct *work)
+{
+	struct vsock_sock *vsk =
+		container_of(work, struct vsock_sock, close_work.work);
+	struct sock *sk = sk_vsock(vsk);
 
-	/* It can't fail: see hvs_channel_writable_bytes(). */
-	(void)hvs_send_data(hvs->chan, send_buf, 0);
+	sock_hold(sk);
+	lock_sock(sk);
+	if (!sock_flag(sk, SOCK_DONE))
+		hvs_do_close_lock_held(vsk, false);
 
-	hvs->fin_sent = true;
-out:
+	vsk->close_work_scheduled = false;
 	release_sock(sk);
-	return 0;
+	sock_put(sk);
 }
 
-static void hvs_release(struct vsock_sock *vsk)
+/* Returns true, if it is safe to remove socket; false otherwise */
+static bool hvs_close_lock_held(struct vsock_sock *vsk)
 {
 	struct sock *sk = sk_vsock(vsk);
-	struct hvsock *hvs = vsk->trans;
-	struct vmbus_channel *chan;
 
-	lock_sock(sk);
+	if (!(sk->sk_state == TCP_ESTABLISHED ||
+	      sk->sk_state == TCP_CLOSING))
+		return true;
 
-	sk->sk_state = TCP_CLOSING;
-	vsock_remove_sock(vsk);
+	if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
+		hvs_shutdown_lock_held(vsk->trans, SHUTDOWN_MASK);
 
-	release_sock(sk);
+	if (sock_flag(sk, SOCK_DONE))
+		return true;
 
-	chan = hvs->chan;
-	if (chan)
-		hvs_shutdown(vsk, RCV_SHUTDOWN | SEND_SHUTDOWN);
+	/* This reference will be dropped by the delayed close routine */
+	sock_hold(sk);
+	INIT_DELAYED_WORK(&vsk->close_work, hvs_close_timeout);
+	vsk->close_work_scheduled = true;
+	schedule_delayed_work(&vsk->close_work, HVS_CLOSE_TIMEOUT);
+	return false;
+}
 
+static void hvs_release(struct vsock_sock *vsk)
+{
+	struct sock *sk = sk_vsock(vsk);
+	bool remove_sock;
+
+	lock_sock(sk);
+	remove_sock = hvs_close_lock_held(vsk);
+	release_sock(sk);
+	if (remove_sock)
+		vsock_remove_sock(vsk);
 }
 
 static void hvs_destruct(struct vsock_sock *vsk)
@@ -532,10 +579,11 @@ static bool hvs_dgram_allow(u32 cid, u32 port)
 	return false;
 }
 
-static int hvs_update_recv_data(struct hvsock *hvs)
+static int hvs_update_recv_data(struct vsock_sock *vsk)
 {
 	struct hvs_recv_buf *recv_buf;
 	u32 payload_len;
+	struct hvsock *hvs = vsk->trans;
 
 	recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1);
 	payload_len = recv_buf->hdr.data_size;
@@ -543,8 +591,12 @@ static int hvs_update_recv_data(struct hvsock *hvs)
 	if (payload_len > HVS_MTU_SIZE)
 		return -EIO;
 
-	if (payload_len == 0)
+	/* Peer shutdown */
+	if (payload_len == 0) {
+		struct sock *sk = sk_vsock(vsk);
 		hvs->vsk->peer_shutdown |= SEND_SHUTDOWN;
+		sk->sk_state_change(sk);
+	}
 
 	hvs->recv_data_len = payload_len;
 	hvs->recv_data_off = 0;
@@ -566,7 +618,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
 
 	if (need_refill) {
 		hvs->recv_desc = hv_pkt_iter_first(hvs->chan);
-		ret = hvs_update_recv_data(hvs);
+		ret = hvs_update_recv_data(vsk);
 		if (ret)
 			return ret;
 	}
@@ -581,7 +633,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
 	if (hvs->recv_data_len == 0) {
 		hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc);
 		if (hvs->recv_desc) {
-			ret = hvs_update_recv_data(hvs);
+			ret = hvs_update_recv_data(vsk);
 			if (ret)
 				return ret;
 		}
-- 
2.7.4



* Re: [PATCH] hv_sock: Fix data loss upon socket close
  2019-05-08 23:10 [PATCH] hv_sock: Fix data loss upon socket close Sunil Muthuswamy
@ 2019-05-09 20:58 ` David Miller
  2019-05-14 16:33   ` Sunil Muthuswamy
  2019-05-11  3:56 ` Dexuan Cui
  1 sibling, 1 reply; 5+ messages in thread
From: David Miller @ 2019-05-09 20:58 UTC (permalink / raw)
  To: sunilmut
  Cc: kys, haiyangz, sthemmin, sashal, decui, mikelley, netdev,
	linux-hyperv, linux-kernel

From: Sunil Muthuswamy <sunilmut@microsoft.com>
Date: Wed, 8 May 2019 23:10:35 +0000

> +static inline void hvs_shutdown_lock_held(struct hvsock *hvs, int mode)

Please do not use the inline keyword in foo.c files; let the compiler decide.

Also, as a longer-term thing, I notice that vsock_remove_socket() is very
inefficient locking-wise.  It takes the table lock to do the placement
test, and then takes it again to do the removal.  It might even be racy.
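
Roughly, the race-free shape is to do the membership test and the unlink
in a single critical section. A userspace sketch of that pattern only (not
vsock code; the node/table names are invented for illustration):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *table_head;

/* Check for membership and remove while still holding the lock. */
static bool remove_if_present(struct node *n)
{
	struct node **pp;
	bool removed = false;

	pthread_mutex_lock(&table_lock);
	for (pp = &table_head; *pp; pp = &(*pp)->next) {
		if (*pp == n) {
			*pp = n->next;	/* unlink under the same lock hold */
			removed = true;
			break;
		}
	}
	pthread_mutex_unlock(&table_lock);
	return removed;
}

int main(void)
{
	struct node a = { .next = NULL, .id = 1 };

	table_head = &a;
	printf("removed: %d\n", remove_if_present(&a));
	return 0;
}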


* RE: [PATCH] hv_sock: Fix data loss upon socket close
  2019-05-08 23:10 [PATCH] hv_sock: Fix data loss upon socket close Sunil Muthuswamy
  2019-05-09 20:58 ` David Miller
@ 2019-05-11  3:56 ` Dexuan Cui
  2019-05-14 20:40   ` Sunil Muthuswamy
  1 sibling, 1 reply; 5+ messages in thread
From: Dexuan Cui @ 2019-05-11  3:56 UTC (permalink / raw)
  To: Sunil Muthuswamy, KY Srinivasan, Haiyang Zhang,
	Stephen Hemminger, Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev, linux-hyperv, linux-kernel

> From: Sunil Muthuswamy <sunilmut@microsoft.com>
> Sent: Wednesday, May 8, 2019 4:11 PM
>
> Currently, when an hvsock socket is closed, the socket is shut down and
> an RST is sent immediately. There is no wait for the FIN packet to arrive
> from the other end. This can lead to data loss, since the connection is
> terminated abruptly. This can manifest easily in the case of a fast guest
> hvsock writer and a much slower host hvsock reader. Essentially, hvsock is
> not following the proper STREAM (TCP) closing handshake.

Hi Sunil,
It looks to me like the above description is inaccurate.

In the upstream Linux kernel, closing a hv_sock file descriptor may hang
in vmbus_hvsock_device_unregister() -> msleep(), until the host side of
the connection is closed. This is bad and should be fixed, but I don't think
the current code can cause data loss: when Linux calls hvs_destruct() ->
vmbus_hvsock_device_unregister() -> vmbus_device_unregister() -> ...
-> vmbus_close() to close the channel, Linux knows the host app has
already called close(), and normally that means the host app has
received all the data from the connection.

BTW, technically speaking, in hv_sock there is no RST packet, while there
is indeed a payload_len==0 packet, which is similar to TCP FIN.
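
(To make that concrete, here is a tiny illustrative userspace snippet, not
driver code: the "FIN" is just the fixed pipe header with data_size set to
zero; the pkt_type value and the actual ring-buffer send are elided.)

#include <stdio.h>
#include <stdint.h>

struct vmpipe_proto_header {	/* the two fields shown in the diff above */
	uint32_t pkt_type;
	uint32_t data_size;
};

int main(void)
{
	/* A zero-length payload is what the peer treats as a shutdown. */
	struct vmpipe_proto_header fin = { .data_size = 0 };

	printf("shutdown packet: data_size=%u\n", (unsigned)fin.data_size);
	return 0;
}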

I think by saying "an RST is sent" you mean the Linux VM closes the channel.

> The fix adds support for a delayed close of hvsock sockets, which is
> in line with other socket providers such as virtio.

With this "delayed close" patch, Linux's close() will no longer hang waiting
for the host to also close the connection. This is good!

> While closing, the
> socket waits a constant timeout for the FIN packet to arrive from the
> other end. On timeout, it terminates the connection (i.e. sends an RST).

As I mentioned above, I suppose the "RST" means Linux closes the channel.

When Linux closes a connection, the FIN packet is written into the shared
guest-to-host channel ringbuffer immediately, so the host is able to see it
immediately. But the real question is: what if the host kernel and/or the host
app cannot receive the data from the ringbuffer, including the FIN, in time?

Does the host kernel guarantee that it *always* fetches/caches all the data
from a connection in a timely manner, even if the host app has not accept()'d
the connection, or the host app is reading from the connection too slowly?

If the host doesn't guarantee that, then even with this patch there is still
a chance that Linux can time out and close the channel before the host
finishes receiving all the data.

I'm curious how the Windows guest implements the "async close".
Does the Windows guest also use the same timeout strategy here? If yes,
what's the timeout value used?

> diff --git a/net/vmw_vsock/hyperv_transport.c
> b/net/vmw_vsock/hyperv_transport.c
> index a827547..62b986d 100644

Sorry, I need more time to review the rest of the patch. Will try to reply ASAP.

> -static int hvs_update_recv_data(struct hvsock *hvs)
> +static int hvs_update_recv_data(struct vsock_sock *vsk)
>  {
>       struct hvs_recv_buf *recv_buf;
>       u32 payload_len;
> +     struct hvsock *hvs = vsk->trans;
>
>       recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1);
>       payload_len = recv_buf->hdr.data_size;
> @@ -543,8 +591,12 @@ static int hvs_update_recv_data(struct hvsock *hvs)
>       if (payload_len > HVS_MTU_SIZE)
>               return -EIO;
>
> -     if (payload_len == 0)
> +     /* Peer shutdown */
> +     if (payload_len == 0) {
> +             struct sock *sk = sk_vsock(vsk);
>               hvs->vsk->peer_shutdown |= SEND_SHUTDOWN;
> +             sk->sk_state_change(sk);
> +     }

Can you please explain why we need to call this sk->sk_state_change()?

When we call hvs_update_recv_data(), we hold the lock_sock(sk) lock, and we
know there is at least one byte to read. Since we hold the lock, the other
code paths, which normally are also required to acquire the lock before
checking vsk->peer_shutdown, cannot race with us.

Thanks,
-- Dexuan


* RE: [PATCH] hv_sock: Fix data loss upon socket close
  2019-05-09 20:58 ` David Miller
@ 2019-05-14 16:33   ` Sunil Muthuswamy
  0 siblings, 0 replies; 5+ messages in thread
From: Sunil Muthuswamy @ 2019-05-14 16:33 UTC (permalink / raw)
  To: David Miller
  Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, sashal,
	Dexuan Cui, Michael Kelley, netdev, linux-hyperv, linux-kernel



> -----Original Message-----
> From: linux-hyperv-owner@vger.kernel.org <linux-hyperv-owner@vger.kernel.org> On Behalf Of David Miller
> Sent: Thursday, May 9, 2019 1:58 PM
> To: Sunil Muthuswamy <sunilmut@microsoft.com>
> Cc: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>; Stephen Hemminger
> <sthemmin@microsoft.com>; sashal@kernel.org; Dexuan Cui <decui@microsoft.com>; Michael Kelley <mikelley@microsoft.com>;
> netdev@vger.kernel.org; linux-hyperv@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] hv_sock: Fix data loss upon socket close
> 
> From: Sunil Muthuswamy <sunilmut@microsoft.com>
> Date: Wed, 8 May 2019 23:10:35 +0000
> 
> > +static inline void hvs_shutdown_lock_held(struct hvsock *hvs, int mode)
> 
> Please do not use the inline keyword in foo.c files; let the compiler decide.
> 
Thanks, will fix in the next version.
> Also, as a longer-term thing, I notice that vsock_remove_socket() is very
> inefficient locking-wise.  It takes the table lock to do the placement
> test, and then takes it again to do the removal.  It might even be racy.
Agreed. The check & remove should be done as an atomic operation.
This can be taken up as a separate patch.


* RE: [PATCH] hv_sock: Fix data loss upon socket close
  2019-05-11  3:56 ` Dexuan Cui
@ 2019-05-14 20:40   ` Sunil Muthuswamy
  0 siblings, 0 replies; 5+ messages in thread
From: Sunil Muthuswamy @ 2019-05-14 20:40 UTC (permalink / raw)
  To: Dexuan Cui, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, David S. Miller, Michael Kelley
  Cc: netdev, linux-hyperv, linux-kernel



> -----Original Message-----
> From: Dexuan Cui <decui@microsoft.com>
> Sent: Friday, May 10, 2019 8:57 PM
> To: Sunil Muthuswamy <sunilmut@microsoft.com>; KY Srinivasan <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; Sasha Levin <sashal@kernel.org>; David S. Miller <davem@davemloft.net>;
> Michael Kelley <mikelley@microsoft.com>
> Cc: netdev@vger.kernel.org; linux-hyperv@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: RE: [PATCH] hv_sock: Fix data loss upon socket close
> 
> > From: Sunil Muthuswamy <sunilmut@microsoft.com>
> > Sent: Wednesday, May 8, 2019 4:11 PM
> >
> > Currently, when an hvsock socket is closed, the socket is shut down and
> > an RST is sent immediately. There is no wait for the FIN packet to arrive
> > from the other end. This can lead to data loss, since the connection is
> > terminated abruptly. This can manifest easily in the case of a fast guest
> > hvsock writer and a much slower host hvsock reader. Essentially, hvsock is
> > not following the proper STREAM (TCP) closing handshake.
> 
> Hi Sunil,
> It looks to me like the above description is inaccurate.
> 
> In the upstream Linux kernel, closing a hv_sock file descriptor may hang
> in vmbus_hvsock_device_unregister() -> msleep(), until the host side of
> the connection is closed. This is bad and should be fixed, but I don't think
> the current code can cause data loss: when Linux calls hvs_destruct() ->
> vmbus_hvsock_device_unregister() -> vmbus_device_unregister() -> ...
> -> vmbus_close() to close the channel, Linux knows the host app has
> already called close(), and normally that means the host app has
> received all the data from the connection.
> 
> BTW, technically speaking, in hv_sock there is no RST packet, while there
> is indeed a payload_len==0 packet, which is similar to TCP FIN.
> 
> I think by saying "an RST is sent" you mean the Linux VM closes the channel.
> 
> > The fix adds support for a delayed close of hvsock sockets, which is
> > in line with other socket providers such as virtio.
> 
> With this "delayed close" patch, Linux's close() will no longer hang waiting
> for the host to also close the connection. This is good!
> 
The next version of the patch will focus only on implementing the delayed
(or background) close logic. I will update the title and the description in
the next version to reflect the change more accurately.
> > While closing, the
> > socket waits a constant timeout for the FIN packet to arrive from the
> > other end. On timeout, it terminates the connection (i.e. sends an RST).
> 
> As I mentioned above, I suppose the "RST" means Linux closes the channel.
> 
> When Linux closes a connection, the FIN packet is written into the shared
> guest-to-host channel ringbuffer immediately, so the host is able to see it
> immediately. But the real question is: what if the host kernel and/or the host
> app cannot receive the data from the ringbuffer, including the FIN, in time?
>
> Does the host kernel guarantee that it *always* fetches/caches all the data
> from a connection in a timely manner, even if the host app has not accept()'d
> the connection, or the host app is reading from the connection too slowly?
>
> If the host doesn't guarantee that, then even with this patch there is still
> a chance that Linux can time out and close the channel before the host
> finishes receiving all the data.
> 
The TCP protocol does not guarantee that all the data gets delivered,
especially during close. Applications are required to mitigate this by using
a combination of shutdown() and a subsequent read() on both the client and
the server side.
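
For example, a minimal userspace sketch of that pattern (error handling
trimmed; 'fd' is assumed to be a connected stream socket):

#include <sys/socket.h>
#include <unistd.h>

void graceful_close(int fd)
{
	char buf[4096];
	ssize_t n;

	shutdown(fd, SHUT_WR);			/* send our FIN, keep reading */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;				/* drain until the peer's EOF */
	close(fd);				/* both directions are done */
}
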
> I'm curious how the Windows guest implements the "async close".
> Does the Windows guest also use the same timeout strategy here? If yes,
> what's the timeout value used?
> 
Windows also implements the delayed close logic in a similar fashion. You can
look up the MSDN article on 'graceful shutdown'.
> > diff --git a/net/vmw_vsock/hyperv_transport.c
> > b/net/vmw_vsock/hyperv_transport.c
> > index a827547..62b986d 100644
> 
> Sorry, I need more time to review the rest of the patch. Will try to reply ASAP.
> 
> > -static int hvs_update_recv_data(struct hvsock *hvs)
> > +static int hvs_update_recv_data(struct vsock_sock *vsk)
> >  {
> >  	struct hvs_recv_buf *recv_buf;
> >  	u32 payload_len;
> > +	struct hvsock *hvs = vsk->trans;
> >
> >  	recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1);
> >  	payload_len = recv_buf->hdr.data_size;
> > @@ -543,8 +591,12 @@ static int hvs_update_recv_data(struct hvsock *hvs)
> >  	if (payload_len > HVS_MTU_SIZE)
> >  		return -EIO;
> >
> > -	if (payload_len == 0)
> > +	/* Peer shutdown */
> > +	if (payload_len == 0) {
> > +		struct sock *sk = sk_vsock(vsk);
> >  		hvs->vsk->peer_shutdown |= SEND_SHUTDOWN;
> > +		sk->sk_state_change(sk);
> > +	}
> 
> Can you please explain why we need to call this sk->sk_state_change()?
> 
> When we call hvs_update_recv_data(), we hold the lock_sock(sk) lock, and we
> know there is at least one byte to read. Since we hold the lock, the other
> code paths, which normally are also required to acquire the lock before
> checking vsk->peer_shutdown, cannot race with us.
> 
This was to wake up any waiters on a socket state change. Since the updated
patch now focuses only on adding the delayed close logic, I will remove this in
the next version.
Thanks for the review so far.
> Thanks,
> -- Dexuan


