All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] vchan-socket-proxy: add reconnect marker support
@ 2022-09-05 13:50 Marek Marczykowski-Górecki
  2022-09-05 13:50 ` [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy Marek Marczykowski-Górecki
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Marek Marczykowski-Górecki @ 2022-09-05 13:50 UTC (permalink / raw)
  To: xen-devel
  Cc: Jason Andryuk, Marek Marczykowski-Górecki, Wei Liu, Anthony PERARD

When vchan client reconnect quickly, the server may not notice it. This
means, it won't reconnect the UNIX socket either. For QMP, it will
prevent the client to see the QMP protocol handshake, and the
communication will timeout.
Solve the issue by sending in-band connect marker. Whenever server sees
one (elsewhere than the first byte in the connection), handle it as a
client had reconnected. The marker is a one byte, and the user need to
choose something that doesn't appear in the data stream elsewhere.

Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
---
 tools/vchan/vchan-socket-proxy.c | 51 +++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/tools/vchan/vchan-socket-proxy.c b/tools/vchan/vchan-socket-proxy.c
index e1d959c6d15c..1e7defe9bae7 100644
--- a/tools/vchan/vchan-socket-proxy.c
+++ b/tools/vchan/vchan-socket-proxy.c
@@ -31,6 +31,7 @@
  * One client is served at a time, clients needs to coordinate this themselves.
  */
 
+#define _GNU_SOURCE
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -54,6 +55,9 @@ static void usage(char** argv)
         "\t-m, --mode=client|server - vchan connection mode (client by default)\n"
         "\t-s, --state-path=path - xenstore path where write \"running\" to \n"
         "\t                        at startup\n"
+        "\t-r, --reconnect-marker=value - send(client)/expect(server) a\n"
+        "\t                single-byte marker to detect quick reconnects and\n"
+        "\t                force reconnecting UNIX socket\n"
         "\t-v, --verbose - verbose logging\n"
         "\n"
         "client: client of a vchan connection, fourth parameter can be:\n"
@@ -61,7 +65,7 @@ static void usage(char** argv)
         "\t             whenever new connection is accepted;\n"
         "\t             handle multiple _subsequent_ connections, until terminated\n"
         "\n"
-        "\tfile-no:     except open FD of a socket in listen mode;\n"
+        "\tfile-no:     expect open FD of a socket in listen mode;\n"
         "\t             otherwise similar to socket-path\n"
         "\n"
         "\t-:           open vchan connection immediately and pass the data\n"
@@ -88,6 +92,7 @@ char outbuf[BUFSIZE];
 int insiz = 0;
 int outsiz = 0;
 int verbose = 0;
+int reconnect_marker_value = -1;
 
 struct vchan_proxy_state {
     struct libxenvchan *ctrl;
@@ -291,6 +296,7 @@ int data_loop(struct vchan_proxy_state *state)
     int ret;
     int libxenvchan_fd;
     int max_fd;
+    bool just_connected = true;
 
     libxenvchan_fd = libxenvchan_fd_for_select(state->ctrl);
     for (;;) {
@@ -368,8 +374,33 @@ int data_loop(struct vchan_proxy_state *state)
                 exit(1);
             if (verbose)
                 fprintf(stderr, "from-vchan: %.*s\n", ret, outbuf + outsiz);
+            if (reconnect_marker_value != -1) {
+                char *reconnect_found =
+                    memrchr(outbuf + outsiz, reconnect_marker_value, ret);
+                if (just_connected && reconnect_found == outbuf + outsiz) {
+                    /* skip reconnect marker at the very first byte of the data
+                     * stream */
+                    memmove(outbuf + outsiz, outbuf + outsiz + 1, ret - 1);
+                    ret -= 1;
+                } else if (reconnect_found) {
+                    size_t newsiz = outbuf + outsiz + ret - reconnect_found - 1;
+                    if (verbose)
+                        fprintf(stderr, "reconnect marker found\n");
+                    /* discard everything before and including the reconnect
+                     * marker */
+                    memmove(outbuf, reconnect_found + 1, newsiz);
+                    outsiz = newsiz;
+                    /* then handle it as the client had just disconnected */
+                    close(state->output_fd);
+                    state->output_fd = -1;
+                    close(state->input_fd);
+                    state->input_fd = -1;
+                    return 0;
+                }
+            }
             outsiz += ret;
             socket_wr(state->output_fd);
+            just_connected = false;
         }
     }
     return 0;
@@ -385,6 +416,7 @@ static struct option options[] = {
     { "mode",       required_argument, NULL, 'm' },
     { "verbose",          no_argument, NULL, 'v' },
     { "state-path", required_argument, NULL, 's' },
+    { "reconnect-marker", required_argument, NULL, 'r' },
     { }
 };
 
@@ -421,6 +453,14 @@ int main(int argc, char **argv)
             case 's':
                 state_path = optarg;
                 break;
+            case 'r':
+                reconnect_marker_value = atoi(optarg);
+                if (reconnect_marker_value < 0 || reconnect_marker_value > 255) {
+                    fprintf(stderr, "invalid argument for --reconnect-marker, "
+                                    "must be a number between 0 and 255\n");
+                    usage(argv);
+                }
+                break;
             case '?':
                 usage(argv);
         }
@@ -509,6 +549,15 @@ int main(int argc, char **argv)
                 ret = 1;
                 break;
             }
+            if (reconnect_marker_value != -1) {
+                const char marker_buf[] = { reconnect_marker_value };
+
+                if (libxenvchan_write(state.ctrl, marker_buf, sizeof(marker_buf))
+                        != sizeof(marker_buf)) {
+                    fprintf(stderr, "failed to send reconnect marker\n");
+                    break;
+                }
+            }
             if (data_loop(&state) != 0)
                 break;
             /* don't reconnect if output was stdout */
-- 
2.35.3



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy
  2022-09-05 13:50 [PATCH 1/2] vchan-socket-proxy: add reconnect marker support Marek Marczykowski-Górecki
@ 2022-09-05 13:50 ` Marek Marczykowski-Górecki
  2022-09-06 12:45   ` Jason Andryuk
  2022-12-07 15:26   ` Anthony PERARD
  2022-09-06 12:44 ` [PATCH 1/2] vchan-socket-proxy: add reconnect marker support Jason Andryuk
  2022-12-07 15:22 ` Anthony PERARD
  2 siblings, 2 replies; 8+ messages in thread
From: Marek Marczykowski-Górecki @ 2022-09-05 13:50 UTC (permalink / raw)
  To: xen-devel
  Cc: Jason Andryuk, Marek Marczykowski-Górecki, Wei Liu,
	Anthony PERARD, Juergen Gross

This enables stubdom reliably detect when it needs to reconnect QMP
socket. It is critical, as otherwise QEMU will not send its handshake,
and so libxl will timeout while waiting on one. When it happens during
domain startup, it can result in error like this:

libxl: libxl_pci.c:1772:device_pci_add_done: Domain 3:libxl__device_pci_add failed for PCI device 0:0:14.0 (rc -9)
libxl: libxl_create.c:1904:domcreate_attach_devices: Domain 3:unable to add pci devices

See vchan-socket-proxy commit message for details about this reconnect
corner case.

Stubdomain side needs to use --reconnect-marker=1 option too.

Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
---
 tools/libs/light/libxl_dm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
index fc264a3a13a6..cc9c5bea1e7f 100644
--- a/tools/libs/light/libxl_dm.c
+++ b/tools/libs/light/libxl_dm.c
@@ -2625,10 +2625,11 @@ static void spawn_qmp_proxy(libxl__egc *egc,
     sdss->qmp_proxy_spawn.failure_cb = qmp_proxy_startup_failed;
     sdss->qmp_proxy_spawn.detached_cb = qmp_proxy_detached;
 
-    const int arraysize = 6;
+    const int arraysize = 7;
     GCNEW_ARRAY(args, arraysize);
     args[nr++] = STUBDOM_QMP_PROXY_PATH;
     args[nr++] = GCSPRINTF("--state-path=%s", sdss->qmp_proxy_spawn.xspath);
+    args[nr++] = "--reconnect-marker=1";
     args[nr++] = GCSPRINTF("%u", dm_domid);
     args[nr++] = GCSPRINTF("%s/device-model/%u/qmp-vchan", dom_path, guest_domid);
     args[nr++] = (char*)libxl__qemu_qmp_path(gc, guest_domid);
-- 
2.35.3



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] vchan-socket-proxy: add reconnect marker support
  2022-09-05 13:50 [PATCH 1/2] vchan-socket-proxy: add reconnect marker support Marek Marczykowski-Górecki
  2022-09-05 13:50 ` [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy Marek Marczykowski-Górecki
@ 2022-09-06 12:44 ` Jason Andryuk
  2022-12-07 15:22 ` Anthony PERARD
  2 siblings, 0 replies; 8+ messages in thread
From: Jason Andryuk @ 2022-09-06 12:44 UTC (permalink / raw)
  To: Marek Marczykowski-Górecki; +Cc: xen-devel, Wei Liu, Anthony PERARD

On Mon, Sep 5, 2022 at 9:50 AM Marek Marczykowski-Górecki
<marmarek@invisiblethingslab.com> wrote:
>
> When vchan client reconnect quickly, the server may not notice it. This
> means, it won't reconnect the UNIX socket either. For QMP, it will
> prevent the client to see the QMP protocol handshake, and the
> communication will timeout.
> Solve the issue by sending in-band connect marker. Whenever server sees
> one (elsewhere than the first byte in the connection), handle it as a
> client had reconnected. The marker is a one byte, and the user need to
> choose something that doesn't appear in the data stream elsewhere.
>
> Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>

Reviewed-by: Jason Andryuk <jandryuk@gmail.com>

Here are some of my thoughts on the correctness of this approach:
The QMP data is json - ascii (utf-8?) - so using \x01 is fine as it
won't be in the data stream.  The dropping of any partial message
works since the close and re-open of the QMP socket resets the data
stream.  There shouldn't be any partial data in vchan-socket-proxy to
drop since the client side should not have sent any new data before it
closed and re-opened.  If there is partial data, we shouldn't send it
into QEMU since the new QMP client isn't expecting any response it
would generate.  So that all seems okay.

Regards,
Jason


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy
  2022-09-05 13:50 ` [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy Marek Marczykowski-Górecki
@ 2022-09-06 12:45   ` Jason Andryuk
  2022-09-27 13:15     ` Jason Andryuk
  2022-12-07 15:26   ` Anthony PERARD
  1 sibling, 1 reply; 8+ messages in thread
From: Jason Andryuk @ 2022-09-06 12:45 UTC (permalink / raw)
  To: Marek Marczykowski-Górecki
  Cc: xen-devel, Wei Liu, Anthony PERARD, Juergen Gross

On Mon, Sep 5, 2022 at 9:50 AM Marek Marczykowski-Górecki
<marmarek@invisiblethingslab.com> wrote:
>
> This enables stubdom reliably detect when it needs to reconnect QMP
> socket. It is critical, as otherwise QEMU will not send its handshake,
> and so libxl will timeout while waiting on one. When it happens during
> domain startup, it can result in error like this:
>
> libxl: libxl_pci.c:1772:device_pci_add_done: Domain 3:libxl__device_pci_add failed for PCI device 0:0:14.0 (rc -9)
> libxl: libxl_create.c:1904:domcreate_attach_devices: Domain 3:unable to add pci devices
>
> See vchan-socket-proxy commit message for details about this reconnect
> corner case.
>
> Stubdomain side needs to use --reconnect-marker=1 option too.
>
> Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>

Reviewed-by: Jason Andryuk <jandryuk@gmail.com>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy
  2022-09-06 12:45   ` Jason Andryuk
@ 2022-09-27 13:15     ` Jason Andryuk
  0 siblings, 0 replies; 8+ messages in thread
From: Jason Andryuk @ 2022-09-27 13:15 UTC (permalink / raw)
  To: Marek Marczykowski-Górecki
  Cc: xen-devel, Wei Liu, Anthony PERARD, Juergen Gross

On Tue, Sep 6, 2022 at 8:45 AM Jason Andryuk <jandryuk@gmail.com> wrote:
>
> On Mon, Sep 5, 2022 at 9:50 AM Marek Marczykowski-Górecki
> <marmarek@invisiblethingslab.com> wrote:
> >
> > This enables stubdom reliably detect when it needs to reconnect QMP
> > socket. It is critical, as otherwise QEMU will not send its handshake,
> > and so libxl will timeout while waiting on one. When it happens during
> > domain startup, it can result in error like this:
> >
> > libxl: libxl_pci.c:1772:device_pci_add_done: Domain 3:libxl__device_pci_add failed for PCI device 0:0:14.0 (rc -9)
> > libxl: libxl_create.c:1904:domcreate_attach_devices: Domain 3:unable to add pci devices
> >
> > See vchan-socket-proxy commit message for details about this reconnect
> > corner case.
> >
> > Stubdomain side needs to use --reconnect-marker=1 option too.
> >
> > Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
>
> Reviewed-by: Jason Andryuk <jandryuk@gmail.com>

Also
Tested-by: Jason Andryuk <jandryuk@gmail.com>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] vchan-socket-proxy: add reconnect marker support
  2022-09-05 13:50 [PATCH 1/2] vchan-socket-proxy: add reconnect marker support Marek Marczykowski-Górecki
  2022-09-05 13:50 ` [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy Marek Marczykowski-Górecki
  2022-09-06 12:44 ` [PATCH 1/2] vchan-socket-proxy: add reconnect marker support Jason Andryuk
@ 2022-12-07 15:22 ` Anthony PERARD
  2022-12-07 16:35   ` Marek Marczykowski-Górecki
  2 siblings, 1 reply; 8+ messages in thread
From: Anthony PERARD @ 2022-12-07 15:22 UTC (permalink / raw)
  To: Marek Marczykowski-Górecki; +Cc: xen-devel, Jason Andryuk, Wei Liu

On Mon, Sep 05, 2022 at 03:50:18PM +0200, Marek Marczykowski-Górecki wrote:
> +                reconnect_marker_value = atoi(optarg);

atoi() isn't great, if there's garbage it just return 0, which is
validated below.

Would there be value here in using strtol() which can detect input
error? To at least help with maybe hard to debug issues?

> +                if (reconnect_marker_value < 0 || reconnect_marker_value > 255) {
> +                    fprintf(stderr, "invalid argument for --reconnect-marker, "
> +                                    "must be a number between 0 and 255\n");
> +                    usage(argv);
> +                }
> +                break;
>              case '?':
>                  usage(argv);
>          }
> @@ -509,6 +549,15 @@ int main(int argc, char **argv)
>                  ret = 1;
>                  break;
>              }
> +            if (reconnect_marker_value != -1) {
> +                const char marker_buf[] = { reconnect_marker_value };
> +
> +                if (libxenvchan_write(state.ctrl, marker_buf, sizeof(marker_buf))
> +                        != sizeof(marker_buf)) {
> +                    fprintf(stderr, "failed to send reconnect marker\n");

Is this an expected failure? If so, maybe adding "(ignored)" might be
valuable to someone reading the logs?

> +                    break;
> +                }
> +            }
>              if (data_loop(&state) != 0)
>                  break;
>              /* don't reconnect if output was stdout */


Otherwise, the patch looks fine. Even if kept as is:
Acked-by: Anthony PERARD <anthony.perard@citrix.com>

Thanks,

-- 
Anthony PERARD


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy
  2022-09-05 13:50 ` [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy Marek Marczykowski-Górecki
  2022-09-06 12:45   ` Jason Andryuk
@ 2022-12-07 15:26   ` Anthony PERARD
  1 sibling, 0 replies; 8+ messages in thread
From: Anthony PERARD @ 2022-12-07 15:26 UTC (permalink / raw)
  To: Marek Marczykowski-Górecki
  Cc: xen-devel, Jason Andryuk, Wei Liu, Juergen Gross

On Mon, Sep 05, 2022 at 03:50:19PM +0200, Marek Marczykowski-Górecki wrote:
> This enables stubdom reliably detect when it needs to reconnect QMP
> socket. It is critical, as otherwise QEMU will not send its handshake,
> and so libxl will timeout while waiting on one. When it happens during
> domain startup, it can result in error like this:
> 
> libxl: libxl_pci.c:1772:device_pci_add_done: Domain 3:libxl__device_pci_add failed for PCI device 0:0:14.0 (rc -9)
> libxl: libxl_create.c:1904:domcreate_attach_devices: Domain 3:unable to add pci devices
> 
> See vchan-socket-proxy commit message for details about this reconnect
> corner case.
> 
> Stubdomain side needs to use --reconnect-marker=1 option too.
> 
> Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>

Acked-by: Anthony PERARD <anthony.perard@citrix.com>

Thanks,

-- 
Anthony PERARD


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] vchan-socket-proxy: add reconnect marker support
  2022-12-07 15:22 ` Anthony PERARD
@ 2022-12-07 16:35   ` Marek Marczykowski-Górecki
  0 siblings, 0 replies; 8+ messages in thread
From: Marek Marczykowski-Górecki @ 2022-12-07 16:35 UTC (permalink / raw)
  To: Anthony PERARD; +Cc: xen-devel, Jason Andryuk, Wei Liu

[-- Attachment #1: Type: text/plain, Size: 2119 bytes --]

On Wed, Dec 07, 2022 at 03:22:46PM +0000, Anthony PERARD wrote:
> On Mon, Sep 05, 2022 at 03:50:18PM +0200, Marek Marczykowski-Górecki wrote:
> > +                reconnect_marker_value = atoi(optarg);
> 
> atoi() isn't great, if there's garbage it just return 0, which is
> validated below.
> 
> Would there be value here in using strtol() which can detect input
> error? To at least help with maybe hard to debug issues?

I'll send v2 with this changed.

> > +                if (reconnect_marker_value < 0 || reconnect_marker_value > 255) {
> > +                    fprintf(stderr, "invalid argument for --reconnect-marker, "
> > +                                    "must be a number between 0 and 255\n");
> > +                    usage(argv);
> > +                }
> > +                break;
> >              case '?':
> >                  usage(argv);
> >          }
> > @@ -509,6 +549,15 @@ int main(int argc, char **argv)
> >                  ret = 1;
> >                  break;
> >              }
> > +            if (reconnect_marker_value != -1) {
> > +                const char marker_buf[] = { reconnect_marker_value };
> > +
> > +                if (libxenvchan_write(state.ctrl, marker_buf, sizeof(marker_buf))
> > +                        != sizeof(marker_buf)) {
> > +                    fprintf(stderr, "failed to send reconnect marker\n");
> 
> Is this an expected failure? If so, maybe adding "(ignored)" might be
> valuable to someone reading the logs?

No, it isn't, it may mean server is gone or other fatal communication
error. The break below will terminate the process (after performing
cleanup).

> > +                    break;
> > +                }
> > +            }
> >              if (data_loop(&state) != 0)
> >                  break;
> >              /* don't reconnect if output was stdout */
> 
> 
> Otherwise, the patch looks fine. Even if kept as is:
> Acked-by: Anthony PERARD <anthony.perard@citrix.com>
> 
> Thanks,
> 
> -- 
> Anthony PERARD

-- 
Best Regards,
Marek Marczykowski-Górecki
Invisible Things Lab

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2022-12-07 16:36 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-05 13:50 [PATCH 1/2] vchan-socket-proxy: add reconnect marker support Marek Marczykowski-Górecki
2022-09-05 13:50 ` [PATCH 2/2] tools/libxl: enable in-band reconnect marker for stubdom QMP proxy Marek Marczykowski-Górecki
2022-09-06 12:45   ` Jason Andryuk
2022-09-27 13:15     ` Jason Andryuk
2022-12-07 15:26   ` Anthony PERARD
2022-09-06 12:44 ` [PATCH 1/2] vchan-socket-proxy: add reconnect marker support Jason Andryuk
2022-12-07 15:22 ` Anthony PERARD
2022-12-07 16:35   ` Marek Marczykowski-Górecki

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.