linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 1/4] Tools: hv: Reopen the devices if read() or write() returns errors
@ 2020-01-13  6:30 Dexuan Cui
  2020-01-22 16:53 ` Michael Kelley
  0 siblings, 1 reply; 3+ messages in thread
From: Dexuan Cui @ 2020-01-13  6:30 UTC (permalink / raw)
  To: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, sashal,
	Sasha Levin, linux-hyperv, Michael Kelley, vkuznets,
	linux-kernel


The state machine in the hv_utils driver can run out of order in some
corner cases, e.g. if the kvp daemon doesn't call write() fast enough
due to some reason, kvp_timeout_func() can run first and move the state
to HVUTIL_READY; next, when kvp_on_msg() is called it returns -EINVAL
since kvp_transaction.state is smaller than HVUTIL_USERSPACE_REQ; later,
the daemon's write() gets an error -EINVAL, and the daemon will exit().

We can reproduce the issue by sending a SIGSTOP signal to the daemon, wait
for 1 minute, and send a SIGCONT signal to the daemon: the daemon will
exit() quickly.

We can fix the issue by forcing a reset of the device (which means the
daemon can close() and open() the device again) and doing extra necessary
clean-up.

Signed-off-by: Dexuan Cui <decui@microsoft.com>
---
 tools/hv/hv_fcopy_daemon.c | 19 +++++++++++++++----
 tools/hv/hv_kvp_daemon.c   | 25 ++++++++++++++-----------
 tools/hv/hv_vss_daemon.c   | 25 +++++++++++++++++++------
 3 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c
index aea2d91ab364..a78a5292589b 100644
--- a/tools/hv/hv_fcopy_daemon.c
+++ b/tools/hv/hv_fcopy_daemon.c
@@ -21,7 +21,7 @@
 #include <fcntl.h>
 #include <getopt.h>
 
-static int target_fd;
+static int target_fd = -1;
 static char target_fname[PATH_MAX];
 static unsigned long long filesize;
 
@@ -80,6 +80,8 @@ static int hv_start_fcopy(struct hv_start_fcopy *smsg)
 
 	error = 0;
 done:
+	if (error)
+		memset(target_fname, 0, sizeof(target_fname));
 	return error;
 }
 
@@ -111,12 +113,16 @@ static int hv_copy_data(struct hv_do_fcopy *cpmsg)
 static int hv_copy_finished(void)
 {
 	close(target_fd);
+	target_fd = -1;
+	memset(target_fname, 0, sizeof(target_fname));
 	return 0;
 }
 static int hv_copy_cancel(void)
 {
 	close(target_fd);
+	target_fd = -1;
 	unlink(target_fname);
+	memset(target_fname, 0, sizeof(target_fname));
 	return 0;
 
 }
@@ -141,7 +147,7 @@ int main(int argc, char *argv[])
 		struct hv_do_fcopy copy;
 		__u32 kernel_modver;
 	} buffer = { };
-	int in_handshake = 1;
+	int in_handshake;
 
 	static struct option long_options[] = {
 		{"help",	no_argument,	   0,  'h' },
@@ -170,6 +176,9 @@ int main(int argc, char *argv[])
 	openlog("HV_FCOPY", 0, LOG_USER);
 	syslog(LOG_INFO, "starting; pid is:%d", getpid());
 
+reopen_fcopy_fd:
+	hv_copy_cancel();
+	in_handshake = 1;
 	fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR);
 
 	if (fcopy_fd < 0) {
@@ -196,7 +205,8 @@ int main(int argc, char *argv[])
 		len = pread(fcopy_fd, &buffer, sizeof(buffer), 0);
 		if (len < 0) {
 			syslog(LOG_ERR, "pread failed: %s", strerror(errno));
-			exit(EXIT_FAILURE);
+			close(fcopy_fd);
+			goto reopen_fcopy_fd;
 		}
 
 		if (in_handshake) {
@@ -233,7 +243,8 @@ int main(int argc, char *argv[])
 
 		if (pwrite(fcopy_fd, &error, sizeof(int), 0) != sizeof(int)) {
 			syslog(LOG_ERR, "pwrite failed: %s", strerror(errno));
-			exit(EXIT_FAILURE);
+			close(fcopy_fd);
+			goto reopen_fcopy_fd;
 		}
 	}
 }
diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index e9ef4ca6a655..3282d48c4487 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c
@@ -76,7 +76,7 @@ enum {
 	DNS
 };
 
-static int in_hand_shake = 1;
+static int in_hand_shake;
 
 static char *os_name = "";
 static char *os_major = "";
@@ -1400,14 +1400,6 @@ int main(int argc, char *argv[])
 	openlog("KVP", 0, LOG_USER);
 	syslog(LOG_INFO, "KVP starting; pid is:%d", getpid());
 
-	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);
-
-	if (kvp_fd < 0) {
-		syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
-			errno, strerror(errno));
-		exit(EXIT_FAILURE);
-	}
-
 	/*
 	 * Retrieve OS release information.
 	 */
@@ -1423,6 +1415,16 @@ int main(int argc, char *argv[])
 		exit(EXIT_FAILURE);
 	}
 
+reopen_kvp_fd:
+	in_hand_shake = 1;
+	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);
+
+	if (kvp_fd < 0) {
+		syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
+		       errno, strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
 	/*
 	 * Register ourselves with the kernel.
 	 */
@@ -1458,7 +1460,7 @@ int main(int argc, char *argv[])
 			       errno, strerror(errno));
 
 			close(kvp_fd);
-			return EXIT_FAILURE;
+			goto reopen_kvp_fd;
 		}
 
 		/*
@@ -1623,7 +1625,8 @@ int main(int argc, char *argv[])
 		if (len != sizeof(struct hv_kvp_msg)) {
 			syslog(LOG_ERR, "write failed; error: %d %s", errno,
 			       strerror(errno));
-			exit(EXIT_FAILURE);
+			close(kvp_fd);
+			goto reopen_kvp_fd;
 		}
 	}
 
diff --git a/tools/hv/hv_vss_daemon.c b/tools/hv/hv_vss_daemon.c
index 92902a88f671..e70fed66a5ae 100644
--- a/tools/hv/hv_vss_daemon.c
+++ b/tools/hv/hv_vss_daemon.c
@@ -28,6 +28,8 @@
 #include <stdbool.h>
 #include <dirent.h>
 
+static bool fs_frozen;
+
 /* Don't use syslog() in the function since that can cause write to disk */
 static int vss_do_freeze(char *dir, unsigned int cmd)
 {
@@ -155,8 +157,11 @@ static int vss_operate(int operation)
 			continue;
 		}
 		error |= vss_do_freeze(ent->mnt_dir, cmd);
-		if (error && operation == VSS_OP_FREEZE)
-			goto err;
+		if (operation == VSS_OP_FREEZE) {
+			if (error)
+				goto err;
+			fs_frozen = true;
+		}
 	}
 
 	endmntent(mounts);
@@ -167,6 +172,9 @@ static int vss_operate(int operation)
 			goto err;
 	}
 
+	if (operation == VSS_OP_THAW && !error)
+		fs_frozen = false;
+
 	goto out;
 err:
 	save_errno = errno;
@@ -175,6 +183,7 @@ static int vss_operate(int operation)
 		endmntent(mounts);
 	}
 	vss_operate(VSS_OP_THAW);
+	fs_frozen = false;
 	/* Call syslog after we thaw all filesystems */
 	if (ent)
 		syslog(LOG_ERR, "FREEZE of %s failed; error:%d %s",
@@ -202,7 +211,7 @@ int main(int argc, char *argv[])
 	int	op;
 	struct hv_vss_msg vss_msg[1];
 	int daemonize = 1, long_index = 0, opt;
-	int in_handshake = 1;
+	int in_handshake;
 	__u32 kernel_modver;
 
 	static struct option long_options[] = {
@@ -232,6 +241,10 @@ int main(int argc, char *argv[])
 	openlog("Hyper-V VSS", 0, LOG_USER);
 	syslog(LOG_INFO, "VSS starting; pid is:%d", getpid());
 
+reopen_vss_fd:
+	if (fs_frozen)
+		vss_operate(VSS_OP_THAW);
+	in_handshake = 1;
 	vss_fd = open("/dev/vmbus/hv_vss", O_RDWR);
 	if (vss_fd < 0) {
 		syslog(LOG_ERR, "open /dev/vmbus/hv_vss failed; error: %d %s",
@@ -285,7 +298,7 @@ int main(int argc, char *argv[])
 			syslog(LOG_ERR, "read failed; error:%d %s",
 			       errno, strerror(errno));
 			close(vss_fd);
-			return EXIT_FAILURE;
+			goto reopen_vss_fd;
 		}
 
 		op = vss_msg->vss_hdr.operation;
@@ -318,8 +331,8 @@ int main(int argc, char *argv[])
 			syslog(LOG_ERR, "write failed; error: %d %s", errno,
 			       strerror(errno));
 
-			if (op == VSS_OP_FREEZE)
-				vss_operate(VSS_OP_THAW);
+			close(vss_fd);
+			goto reopen_vss_fd;
 		}
 	}
 
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* RE: [PATCH v2 1/4] Tools: hv: Reopen the devices if read() or write() returns errors
  2020-01-13  6:30 [PATCH v2 1/4] Tools: hv: Reopen the devices if read() or write() returns errors Dexuan Cui
@ 2020-01-22 16:53 ` Michael Kelley
  2020-01-23  8:10   ` Dexuan Cui
  0 siblings, 1 reply; 3+ messages in thread
From: Michael Kelley @ 2020-01-22 16:53 UTC (permalink / raw)
  To: Dexuan Cui, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal, Sasha Levin, linux-hyperv, vkuznets, linux-kernel

From: Dexuan Cui <decui@microsoft.com> Sent: Sunday, January 12, 2020 10:30 PM
> 
> The state machine in the hv_utils driver can run out of order in some
> corner cases, e.g. if the kvp daemon doesn't call write() fast enough
> due to some reason, kvp_timeout_func() can run first and move the state
> to HVUTIL_READY; next, when kvp_on_msg() is called it returns -EINVAL
> since kvp_transaction.state is smaller than HVUTIL_USERSPACE_REQ; later,
> the daemon's write() gets an error -EINVAL, and the daemon will exit().
> 
> We can reproduce the issue by sending a SIGSTOP signal to the daemon, wait
> for 1 minute, and send a SIGCONT signal to the daemon: the daemon will
> exit() quickly.
> 
> We can fix the issue by forcing a reset of the device (which means the
> daemon can close() and open() the device again) and doing extra necessary
> clean-up.
> 
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  tools/hv/hv_fcopy_daemon.c | 19 +++++++++++++++----
>  tools/hv/hv_kvp_daemon.c   | 25 ++++++++++++++-----------
>  tools/hv/hv_vss_daemon.c   | 25 +++++++++++++++++++------
>  3 files changed, 48 insertions(+), 21 deletions(-)
> 
> diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c
> index aea2d91ab364..a78a5292589b 100644
> --- a/tools/hv/hv_fcopy_daemon.c
> +++ b/tools/hv/hv_fcopy_daemon.c
> @@ -21,7 +21,7 @@
>  #include <fcntl.h>
>  #include <getopt.h>
> 
> -static int target_fd;
> +static int target_fd = -1;
>  static char target_fname[PATH_MAX];
>  static unsigned long long filesize;
> 
> @@ -80,6 +80,8 @@ static int hv_start_fcopy(struct hv_start_fcopy *smsg)
> 
>  	error = 0;
>  done:
> +	if (error)
> +		memset(target_fname, 0, sizeof(target_fname));
>  	return error;
>  }
> 
> @@ -111,12 +113,16 @@ static int hv_copy_data(struct hv_do_fcopy *cpmsg)
>  static int hv_copy_finished(void)
>  {
>  	close(target_fd);
> +	target_fd = -1;
> +	memset(target_fname, 0, sizeof(target_fname));

I'm not completely clear on why target_fd and target_fname need to
be reset.  Could you add a comment with an explanation?  Also,
since target_fname is a null terminated string, it seems like
target_fname[0] = 0 would be sufficient vs. zero'ing all 4096 bytes
(PATH_MAX).

>  	return 0;
>  }
>  static int hv_copy_cancel(void)
>  {
>  	close(target_fd);
> +	target_fd = -1;
>  	unlink(target_fname);
> +	memset(target_fname, 0, sizeof(target_fname));
>  	return 0;
> 
>  }
> @@ -141,7 +147,7 @@ int main(int argc, char *argv[])
>  		struct hv_do_fcopy copy;
>  		__u32 kernel_modver;
>  	} buffer = { };
> -	int in_handshake = 1;
> +	int in_handshake;
> 
>  	static struct option long_options[] = {
>  		{"help",	no_argument,	   0,  'h' },
> @@ -170,6 +176,9 @@ int main(int argc, char *argv[])
>  	openlog("HV_FCOPY", 0, LOG_USER);
>  	syslog(LOG_INFO, "starting; pid is:%d", getpid());
> 
> +reopen_fcopy_fd:
> +	hv_copy_cancel();
> +	in_handshake = 1;
>  	fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR);
> 
>  	if (fcopy_fd < 0) {
> @@ -196,7 +205,8 @@ int main(int argc, char *argv[])
>  		len = pread(fcopy_fd, &buffer, sizeof(buffer), 0);
>  		if (len < 0) {
>  			syslog(LOG_ERR, "pread failed: %s", strerror(errno));
> -			exit(EXIT_FAILURE);
> +			close(fcopy_fd);
> +			goto reopen_fcopy_fd;

In this case and all similar cases in this patch, there may be some risk
of getting stuck in a tight loop doing reopens if things are broken
in some strange and bizarre way.   Having an absolute limit on the
number of reopens is potentially too restrictive as it could limit the
number of times a VM could be hibernated and resumed.  Ideally
there could a simple rate limit on the reopens -- if it happens too frequently,
go ahead and exit like the current code does.  Thoughts?

>  		}
> 
>  		if (in_handshake) {
> @@ -233,7 +243,8 @@ int main(int argc, char *argv[])
> 
>  		if (pwrite(fcopy_fd, &error, sizeof(int), 0) != sizeof(int)) {
>  			syslog(LOG_ERR, "pwrite failed: %s", strerror(errno));
> -			exit(EXIT_FAILURE);
> +			close(fcopy_fd);
> +			goto reopen_fcopy_fd;
>  		}
>  	}
>  }
> diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
> index e9ef4ca6a655..3282d48c4487 100644
> --- a/tools/hv/hv_kvp_daemon.c
> +++ b/tools/hv/hv_kvp_daemon.c
> @@ -76,7 +76,7 @@ enum {
>  	DNS
>  };
> 
> -static int in_hand_shake = 1;
> +static int in_hand_shake;
> 
>  static char *os_name = "";
>  static char *os_major = "";
> @@ -1400,14 +1400,6 @@ int main(int argc, char *argv[])
>  	openlog("KVP", 0, LOG_USER);
>  	syslog(LOG_INFO, "KVP starting; pid is:%d", getpid());
> 
> -	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);
> -
> -	if (kvp_fd < 0) {
> -		syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
> -			errno, strerror(errno));
> -		exit(EXIT_FAILURE);
> -	}
> -
>  	/*
>  	 * Retrieve OS release information.
>  	 */
> @@ -1423,6 +1415,16 @@ int main(int argc, char *argv[])
>  		exit(EXIT_FAILURE);
>  	}
> 
> +reopen_kvp_fd:
> +	in_hand_shake = 1;
> +	kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC);
> +
> +	if (kvp_fd < 0) {
> +		syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s",
> +		       errno, strerror(errno));
> +		exit(EXIT_FAILURE);
> +	}
> +
>  	/*
>  	 * Register ourselves with the kernel.
>  	 */
> @@ -1458,7 +1460,7 @@ int main(int argc, char *argv[])
>  			       errno, strerror(errno));
> 
>  			close(kvp_fd);
> -			return EXIT_FAILURE;
> +			goto reopen_kvp_fd;
>  		}
> 
>  		/*
> @@ -1623,7 +1625,8 @@ int main(int argc, char *argv[])
>  		if (len != sizeof(struct hv_kvp_msg)) {
>  			syslog(LOG_ERR, "write failed; error: %d %s", errno,
>  			       strerror(errno));
> -			exit(EXIT_FAILURE);
> +			close(kvp_fd);
> +			goto reopen_kvp_fd;
>  		}
>  	}
> 
> diff --git a/tools/hv/hv_vss_daemon.c b/tools/hv/hv_vss_daemon.c
> index 92902a88f671..e70fed66a5ae 100644
> --- a/tools/hv/hv_vss_daemon.c
> +++ b/tools/hv/hv_vss_daemon.c
> @@ -28,6 +28,8 @@
>  #include <stdbool.h>
>  #include <dirent.h>
> 
> +static bool fs_frozen;
> +
>  /* Don't use syslog() in the function since that can cause write to disk */
>  static int vss_do_freeze(char *dir, unsigned int cmd)
>  {
> @@ -155,8 +157,11 @@ static int vss_operate(int operation)
>  			continue;
>  		}
>  		error |= vss_do_freeze(ent->mnt_dir, cmd);
> -		if (error && operation == VSS_OP_FREEZE)
> -			goto err;
> +		if (operation == VSS_OP_FREEZE) {
> +			if (error)
> +				goto err;
> +			fs_frozen = true;
> +		}
>  	}
> 
>  	endmntent(mounts);

Shortly after the above code, there's code specifically to
do the root filesystem last.  It has the same error test as above,
and it seems like it should also be setting fs_frozen = true if
it is successful.

> @@ -167,6 +172,9 @@ static int vss_operate(int operation)
>  			goto err;
>  	}
> 
> +	if (operation == VSS_OP_THAW && !error)
> +		fs_frozen = false;
> +
>  	goto out;
>  err:
>  	save_errno = errno;
> @@ -175,6 +183,7 @@ static int vss_operate(int operation)
>  		endmntent(mounts);
>  	}
>  	vss_operate(VSS_OP_THAW);
> +	fs_frozen = false;
>  	/* Call syslog after we thaw all filesystems */
>  	if (ent)
>  		syslog(LOG_ERR, "FREEZE of %s failed; error:%d %s",
> @@ -202,7 +211,7 @@ int main(int argc, char *argv[])
>  	int	op;
>  	struct hv_vss_msg vss_msg[1];
>  	int daemonize = 1, long_index = 0, opt;
> -	int in_handshake = 1;
> +	int in_handshake;
>  	__u32 kernel_modver;
> 
>  	static struct option long_options[] = {
> @@ -232,6 +241,10 @@ int main(int argc, char *argv[])
>  	openlog("Hyper-V VSS", 0, LOG_USER);
>  	syslog(LOG_INFO, "VSS starting; pid is:%d", getpid());
> 
> +reopen_vss_fd:
> +	if (fs_frozen)
> +		vss_operate(VSS_OP_THAW);

Need to set fs_frozen = false after the above statement?

> +	in_handshake = 1;
>  	vss_fd = open("/dev/vmbus/hv_vss", O_RDWR);
>  	if (vss_fd < 0) {
>  		syslog(LOG_ERR, "open /dev/vmbus/hv_vss failed; error: %d %s",
> @@ -285,7 +298,7 @@ int main(int argc, char *argv[])
>  			syslog(LOG_ERR, "read failed; error:%d %s",
>  			       errno, strerror(errno));
>  			close(vss_fd);
> -			return EXIT_FAILURE;
> +			goto reopen_vss_fd;
>  		}
> 
>  		op = vss_msg->vss_hdr.operation;
> @@ -318,8 +331,8 @@ int main(int argc, char *argv[])
>  			syslog(LOG_ERR, "write failed; error: %d %s", errno,
>  			       strerror(errno));
> 
> -			if (op == VSS_OP_FREEZE)
> -				vss_operate(VSS_OP_THAW);
> +			close(vss_fd);
> +			goto reopen_vss_fd;
>  		}
>  	}
> 
> --
> 2.19.1


^ permalink raw reply	[flat|nested] 3+ messages in thread

* RE: [PATCH v2 1/4] Tools: hv: Reopen the devices if read() or write() returns errors
  2020-01-22 16:53 ` Michael Kelley
@ 2020-01-23  8:10   ` Dexuan Cui
  0 siblings, 0 replies; 3+ messages in thread
From: Dexuan Cui @ 2020-01-23  8:10 UTC (permalink / raw)
  To: Michael Kelley, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal, Sasha Levin, linux-hyperv, vkuznets, linux-kernel

> From: Michael Kelley <mikelley@microsoft.com>
> Sent: Wednesday, January 22, 2020 8:54 AM
> > ...
> > @@ -111,12 +113,16 @@ static int hv_copy_data(struct hv_do_fcopy
> *cpmsg)
> >  static int hv_copy_finished(void)
> >  {
> >  	close(target_fd);
> > +	target_fd = -1;
> > +	memset(target_fname, 0, sizeof(target_fname));
> 
> I'm not completely clear on why target_fd and target_fname need to
> be reset.  Could you add a comment with an explanation?  Also,

After checking the code again, now I think we indeed don't need
"target_fd = -1;", but we need to reset target_fname because the kernel
hv_utils hibernation patch fakes a CANCEL_FCOPY message upon suspend,
and later when the VM resumes back the hv_fcopy_daemon may need
to handle the message by calling hv_copy_cancel(), which tries to remove
a file specified by target_fname. 

If some file was copied successfully before suspend, currently 
hv_copy_finished() doesn't reset target_fname; so after resume, when
the daemon handles the faked CANCEL_FCOPY message, hv_copy_cancel()
removes the file unexpectedly.

This patch resets target_fname in hv_copy_finished() to avoid the described
issue. 

In case a file is not fully copied before suspend (which means hv_copy_finished()
is not called), we'd better also reset taret_fname hv_copy_cancel(), since we'd
better make sure we only try to remove the file once, when we suspend/resume
multiple times.

I'll add a comment in the next version of the patch.

> since target_fname is a null terminated string, it seems like
> target_fname[0] = 0 would be sufficient vs. zero'ing all 4096 bytes
> (PATH_MAX).

I agree. Will use this better method.

> > +reopen_fcopy_fd:
> > +	hv_copy_cancel();
> > +	in_handshake = 1;
> >  	fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR);
> >
> >  	if (fcopy_fd < 0) {
> > @@ -196,7 +205,8 @@ int main(int argc, char *argv[])
> >  		len = pread(fcopy_fd, &buffer, sizeof(buffer), 0);
> >  		if (len < 0) {
> >  			syslog(LOG_ERR, "pread failed: %s", strerror(errno));
> > -			exit(EXIT_FAILURE);
> > +			close(fcopy_fd);
> > +			goto reopen_fcopy_fd;
> 
> In this case and all similar cases in this patch, there may be some risk
> of getting stuck in a tight loop doing reopens if things are broken
> in some strange and bizarre way.   Having an absolute limit on the
> number of reopens is potentially too restrictive as it could limit the
> number of times a VM could be hibernated and resumed.  Ideally
> there could a simple rate limit on the reopens -- if it happens too frequently,
> go ahead and exit like the current code does.  Thoughts?

If there is a bug, IMO it's better to get stuck in a tight loop, so the user
can notice it more quickly by the "top" command. :-)

With the patch, the daemon can get stuck in the loop only when the daemon
successfully reopens the dev file and registers itself to the kernel, but fails
to handle one of the messages. IMO the chance is pretty small, and if that
happens, there must be a bug in the hv_utils driver we need to fix, so, again,
IMO it's better to make the bug more noticeable by the tight loop. :-)

If the daemon fails to reopen the dev file or fails to register it to the kernel,
the daemon still calls exit().

> >  static int vss_do_freeze(char *dir, unsigned int cmd)
> >  {
> > @@ -155,8 +157,11 @@ static int vss_operate(int operation)
> >  			continue;
> >  		}
> >  		error |= vss_do_freeze(ent->mnt_dir, cmd);
> > -		if (error && operation == VSS_OP_FREEZE)
> > -			goto err;
> > +		if (operation == VSS_OP_FREEZE) {
> > +			if (error)
> > +				goto err;
> > +			fs_frozen = true;
> > +		}
> >  	}
> >
> >  	endmntent(mounts);
> 
> Shortly after the above code, there's code specifically to
> do the root filesystem last.  It has the same error test as above,
> and it seems like it should also be setting fs_frozen = true if
> it is successful.

Yes, I missed that. Will add code for that.
 
> > @@ -167,6 +172,9 @@ static int vss_operate(int operation)
> >  			goto err;
> >  	}
> >
> > +	if (operation == VSS_OP_THAW && !error)
> > +		fs_frozen = false;
> > +
> >  	goto out;
> >  err:
> >  	save_errno = errno;
> > @@ -175,6 +183,7 @@ static int vss_operate(int operation)
> >  		endmntent(mounts);
> >  	}
> >  	vss_operate(VSS_OP_THAW);
> > +	fs_frozen = false;
> >  	/* Call syslog after we thaw all filesystems */
> >  	if (ent)
> >  		syslog(LOG_ERR, "FREEZE of %s failed; error:%d %s",
> > @@ -202,7 +211,7 @@ int main(int argc, char *argv[])
> >  	int	op;
> >  	struct hv_vss_msg vss_msg[1];
> >  	int daemonize = 1, long_index = 0, opt;
> > -	int in_handshake = 1;
> > +	int in_handshake;
> >  	__u32 kernel_modver;
> >
> >  	static struct option long_options[] = {
> > @@ -232,6 +241,10 @@ int main(int argc, char *argv[])
> >  	openlog("Hyper-V VSS", 0, LOG_USER);
> >  	syslog(LOG_INFO, "VSS starting; pid is:%d", getpid());
> >
> > +reopen_vss_fd:
> > +	if (fs_frozen)
> > +		vss_operate(VSS_OP_THAW);
> 
> Need to set fs_frozen = false after the above statement?

fs_frozen is set to false in vss_operate(), but there is a chance
vss_operate() may fail due to some reason, and hence fs_frozen may
remain to be true. I'll add the below code:

@@ -242,8 +242,14 @@ int main(int argc, char *argv[])
        syslog(LOG_INFO, "VSS starting; pid is:%d", getpid());

 reopen_vss_fd:
-       if (fs_frozen)
-               vss_operate(VSS_OP_THAW);
+       if (fs_frozen) {
+               if (vss_operate(VSS_OP_THAW) || fs_frozen) {
+                       syslog(LOG_ERR, "failed to thaw file system: err=%d",
+                              errno);
+                       exit(EXIT_FAILURE);
+               }
+       }
+
        in_handshake = 1;
        vss_fd = open("/dev/vmbus/hv_vss", O_RDWR);

Thanks,
-- Dexuan

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-01-23  8:11 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-01-13  6:30 [PATCH v2 1/4] Tools: hv: Reopen the devices if read() or write() returns errors Dexuan Cui
2020-01-22 16:53 ` Michael Kelley
2020-01-23  8:10   ` Dexuan Cui

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).