All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] Fix multifd + cancel + multifd
@ 2019-12-18  5:04 Juan Quintela
  2019-12-18  5:04 ` [PATCH 1/4] qemu-file: Don't do IO after shutdown Juan Quintela
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Juan Quintela @ 2019-12-18  5:04 UTC (permalink / raw)
  To: qemu-devel
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth,
	Dr. David Alan Gilbert, Juan Quintela

Hi

This series:
- create a test that does:
  launch multifd on target
  migrate to target
  cancel on source
  create another target
  migrate again

- And fixes the cases that made it fail:
* Make sure that we don't try ever IO after shutdown/error

Please, review.

Juan Quintela (4):
  qemu-file: Don't do IO after shutdown
  multifd: Make sure that we don't do any IO after an error
  migration-test: Make sure that multifd and cancel works
  migration: Make sure that we don't call write() in case of error

 migration/qemu-file.c  |  13 +++++
 migration/ram.c        |  41 ++++++++++++----
 tests/migration-test.c | 108 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 152 insertions(+), 10 deletions(-)

-- 
2.23.0



^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/4] qemu-file: Don't do IO after shutdown
  2019-12-18  5:04 [PATCH 0/4] Fix multifd + cancel + multifd Juan Quintela
@ 2019-12-18  5:04 ` Juan Quintela
  2019-12-18 12:27   ` Dr. David Alan Gilbert
  2019-12-18  5:04 ` [PATCH 2/4] multifd: Make sure that we don't do any IO after an error Juan Quintela
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: Juan Quintela @ 2019-12-18  5:04 UTC (permalink / raw)
  To: qemu-devel
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth,
	Dr. David Alan Gilbert, Juan Quintela

Be sure that we are not doing neither read/write after shutdown of the
QEMUFile.

Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/qemu-file.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 26fb25ddc1..1e5543a279 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -53,6 +53,8 @@ struct QEMUFile {
 
     int last_error;
     Error *last_error_obj;
+    /* has the file has been shutdown */
+    bool shutdown;
 };
 
 /*
@@ -61,6 +63,7 @@ struct QEMUFile {
  */
 int qemu_file_shutdown(QEMUFile *f)
 {
+    f->shutdown = true;
     if (!f->ops->shut_down) {
         return -ENOSYS;
     }
@@ -214,6 +217,9 @@ void qemu_fflush(QEMUFile *f)
         return;
     }
 
+    if (f->shutdown) {
+        return;
+    }
     if (f->iovcnt > 0) {
         expect = iov_size(f->iov, f->iovcnt);
         ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos,
@@ -328,6 +334,10 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
     f->buf_index = 0;
     f->buf_size = pending;
 
+    if (f->shutdown) {
+        return 0;
+    }
+
     len = f->ops->get_buffer(f->opaque, f->buf + pending, f->pos,
                              IO_BUF_SIZE - pending, &local_error);
     if (len > 0) {
@@ -642,6 +652,9 @@ int64_t qemu_ftell(QEMUFile *f)
 
 int qemu_file_rate_limit(QEMUFile *f)
 {
+    if (f->shutdown) {
+        return 1;
+    }
     if (qemu_file_get_error(f)) {
         return 1;
     }
-- 
2.23.0



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/4] multifd: Make sure that we don't do any IO after an error
  2019-12-18  5:04 [PATCH 0/4] Fix multifd + cancel + multifd Juan Quintela
  2019-12-18  5:04 ` [PATCH 1/4] qemu-file: Don't do IO after shutdown Juan Quintela
@ 2019-12-18  5:04 ` Juan Quintela
  2019-12-18 13:57   ` Dr. David Alan Gilbert
  2019-12-18  5:04 ` [PATCH 3/4] migration-test: Make sure that multifd and cancel works Juan Quintela
  2019-12-18  5:04 ` [PATCH 4/4] migration: Make sure that we don't call write() in case of error Juan Quintela
  3 siblings, 1 reply; 17+ messages in thread
From: Juan Quintela @ 2019-12-18  5:04 UTC (permalink / raw)
  To: qemu-devel
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth,
	Dr. David Alan Gilbert, Juan Quintela

Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/ram.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index db90237977..4b44578e57 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -4132,7 +4132,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
     RAMState **temp = opaque;
     RAMState *rs = *temp;
-    int ret;
+    int ret = 0;
     int i;
     int64_t t0;
     int done = 0;
@@ -4203,12 +4203,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
 
 out:
-    multifd_send_sync_main(rs);
-    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
-    qemu_fflush(f);
-    ram_counters.transferred += 8;
+    if (ret >= 0) {
+        multifd_send_sync_main(rs);
+        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+        qemu_fflush(f);
+        ram_counters.transferred += 8;
 
-    ret = qemu_file_get_error(f);
+        ret = qemu_file_get_error(f);
+    }
     if (ret < 0) {
         return ret;
     }
@@ -4260,9 +4262,11 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
     }
 
-    multifd_send_sync_main(rs);
-    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
-    qemu_fflush(f);
+    if (ret >= 0) {
+        multifd_send_sync_main(rs);
+        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+        qemu_fflush(f);
+    }
 
     return ret;
 }
-- 
2.23.0



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/4] migration-test: Make sure that multifd and cancel works
  2019-12-18  5:04 [PATCH 0/4] Fix multifd + cancel + multifd Juan Quintela
  2019-12-18  5:04 ` [PATCH 1/4] qemu-file: Don't do IO after shutdown Juan Quintela
  2019-12-18  5:04 ` [PATCH 2/4] multifd: Make sure that we don't do any IO after an error Juan Quintela
@ 2019-12-18  5:04 ` Juan Quintela
  2019-12-18 16:25   ` Dr. David Alan Gilbert
  2019-12-18  5:04 ` [PATCH 4/4] migration: Make sure that we don't call write() in case of error Juan Quintela
  3 siblings, 1 reply; 17+ messages in thread
From: Juan Quintela @ 2019-12-18  5:04 UTC (permalink / raw)
  To: qemu-devel
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth,
	Dr. David Alan Gilbert, Juan Quintela

Test that this sequerce works:

- launch source
- launch target
- start migration
- cancel migration
- relaunch target
- do migration again

Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 tests/migration-test.c | 108 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)

diff --git a/tests/migration-test.c b/tests/migration-test.c
index 7588f50b9b..1c93b3e5bc 100644
--- a/tests/migration-test.c
+++ b/tests/migration-test.c
@@ -527,6 +527,14 @@ static void migrate_recover(QTestState *who, const char *uri)
     qobject_unref(rsp);
 }
 
+static void migrate_cancel(QTestState *who)
+{
+    QDict *rsp;
+
+    rsp = wait_command(who, "{ 'execute': 'migrate_cancel' }");
+    qobject_unref(rsp);
+}
+
 static void migrate_set_capability(QTestState *who, const char *capability,
                                    bool value)
 {
@@ -583,6 +591,8 @@ static void migrate_postcopy_start(QTestState *from, QTestState *to)
 typedef struct {
     bool hide_stderr;
     bool use_shmem;
+    /* only launch the target process */
+    bool only_target;
     char *opts_source;
     char *opts_target;
 } MigrateStart;
@@ -704,7 +714,9 @@ static int test_migrate_start(QTestState **from, QTestState **to,
                                  arch_source, shmem_opts, args->opts_source,
                                  ignore_stderr);
     g_free(arch_source);
-    *from = qtest_init(cmd_source);
+    if (!args->only_target) {
+        *from = qtest_init(cmd_source);
+    }
     g_free(cmd_source);
 
     cmd_target = g_strdup_printf("-machine %saccel=kvm:tcg%s "
@@ -1470,6 +1482,99 @@ static void test_multifd_tcp_zstd(void)
     test_multifd_tcp("zstd");
 }
 
+/*
+ * This test does:
+ *  source               target
+ *                       migrate_incoming
+ *     migrate
+ *     migrate_cancel
+ *                       launch another target
+ *     migrate
+ *
+ *  And see that it works
+ */
+
+static void test_multifd_tcp_cancel(void)
+{
+    MigrateStart *args = migrate_start_new();
+    QTestState *from, *to;
+    QDict *rsp;
+    char *uri;
+
+    if (test_migrate_start(&from, &to, "defer", args)) {
+        return;
+    }
+
+    /*
+     * We want to pick a speed slow enough that the test completes
+     * quickly, but that it doesn't complete precopy even on a slow
+     * machine, so also set the downtime.
+     */
+    /* 1 ms should make it not converge*/
+    migrate_set_parameter_int(from, "downtime-limit", 1);
+    /* 1GB/s */
+    migrate_set_parameter_int(from, "max-bandwidth", 1000000000);
+
+    migrate_set_parameter_int(from, "multifd-channels", 16);
+    migrate_set_parameter_int(to, "multifd-channels", 16);
+
+    migrate_set_capability(from, "multifd", "true");
+    migrate_set_capability(to, "multifd", "true");
+
+    /* Start incoming migration from the 1st socket */
+    rsp = wait_command(to, "{ 'execute': 'migrate-incoming',"
+                           "  'arguments': { 'uri': 'tcp:127.0.0.1:0' }}");
+    qobject_unref(rsp);
+
+    /* Wait for the first serial output from the source */
+    wait_for_serial("src_serial");
+
+    uri = migrate_get_socket_address(to, "socket-address");
+
+    migrate(from, uri, "{}");
+
+    wait_for_migration_pass(from);
+
+    printf("before cancel\n");
+    migrate_cancel(from);
+    printf("after cancel\n");
+
+    args = migrate_start_new();
+    args->only_target = true;
+
+    if (test_migrate_start(&from, &to, "defer", args)) {
+        return;
+    }
+
+    migrate_set_parameter_int(to, "multifd-channels", 16);
+
+    migrate_set_capability(to, "multifd", "true");
+
+    /* Start incoming migration from the 1st socket */
+    rsp = wait_command(to, "{ 'execute': 'migrate-incoming',"
+                           "  'arguments': { 'uri': 'tcp:127.0.0.1:0' }}");
+    qobject_unref(rsp);
+
+    /* 300ms it should converge */
+    migrate_set_parameter_int(from, "downtime-limit", 600);
+
+    uri = migrate_get_socket_address(to, "socket-address");
+
+    migrate(from, uri, "{}");
+
+    wait_for_migration_pass(from);
+
+    if (!got_stop) {
+        qtest_qmp_eventwait(from, "STOP");
+    }
+    qtest_qmp_eventwait(to, "RESUME");
+
+    wait_for_serial("dest_serial");
+    wait_for_migration_complete(from);
+    test_migrate_end(from, to, true);
+    free(uri);
+}
+
 int main(int argc, char **argv)
 {
     char template[] = "/tmp/migration-test-XXXXXX";
@@ -1537,6 +1642,7 @@ int main(int argc, char **argv)
     qtest_add_func("/migration/multifd/tcp/none", test_multifd_tcp_none);
     qtest_add_func("/migration/multifd/tcp/zlib", test_multifd_tcp_zlib);
     qtest_add_func("/migration/multifd/tcp/zstd", test_multifd_tcp_zstd);
+    qtest_add_func("/migration/multifd/tcp/cancel", test_multifd_tcp_cancel);
 
     ret = g_test_run();
 
-- 
2.23.0



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/4] migration: Make sure that we don't call write() in case of error
  2019-12-18  5:04 [PATCH 0/4] Fix multifd + cancel + multifd Juan Quintela
                   ` (2 preceding siblings ...)
  2019-12-18  5:04 ` [PATCH 3/4] migration-test: Make sure that multifd and cancel works Juan Quintela
@ 2019-12-18  5:04 ` Juan Quintela
  2019-12-18 16:33   ` Dr. David Alan Gilbert
  3 siblings, 1 reply; 17+ messages in thread
From: Juan Quintela @ 2019-12-18  5:04 UTC (permalink / raw)
  To: qemu-devel
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth,
	Dr. David Alan Gilbert, Juan Quintela

If we are exiting due to an error/finish/.... Just don't try to even
touch the channel with one IO operation.

Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/ram.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 4b44578e57..909ef6d237 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1601,6 +1601,12 @@ struct {
     QemuSemaphore channels_ready;
     /* multifd ops */
     MultiFDMethods *ops;
+    /*
+     * Have we already run terminate threads.  There is a race when it
+     * happens that we got one error while we are exiting.
+     * We will use atomic operations.  Only valid values are 0 and 1.
+     */
+    int exiting;
 } *multifd_send_state;
 
 /*
@@ -1629,6 +1635,10 @@ static int multifd_send_pages(RAMState *rs)
     MultiFDPages_t *pages = multifd_send_state->pages;
     uint64_t transferred;
 
+    if (atomic_read(&multifd_send_state->exiting)) {
+        return -1;
+    }
+
     qemu_sem_wait(&multifd_send_state->channels_ready);
     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
         p = &multifd_send_state->params[i];
@@ -1710,6 +1720,10 @@ static void multifd_send_terminate_threads(Error *err)
         }
     }
 
+    if (atomic_xchg(&multifd_send_state->exiting, 1)) {
+        return;
+    }
+
     for (i = 0; i < migrate_multifd_channels(); i++) {
         MultiFDSendParams *p = &multifd_send_state->params[i];
 
@@ -1824,6 +1838,10 @@ static void *multifd_send_thread(void *opaque)
 
     while (true) {
         qemu_sem_wait(&p->sem);
+
+        if (atomic_read(&multifd_send_state->exiting)) {
+            break;
+        }
         qemu_mutex_lock(&p->mutex);
 
         if (p->pending_job) {
@@ -1938,6 +1956,7 @@ int multifd_save_setup(Error **errp)
     multifd_send_state->pages = multifd_pages_init(page_count);
     qemu_sem_init(&multifd_send_state->channels_ready, 0);
     multifd_send_state->ops = multifd_ops[migrate_multifd_method()];
+    atomic_set(&multifd_send_state->exiting, 0);
 
     for (i = 0; i < thread_count; i++) {
         MultiFDSendParams *p = &multifd_send_state->params[i];
-- 
2.23.0



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] qemu-file: Don't do IO after shutdown
  2019-12-18  5:04 ` [PATCH 1/4] qemu-file: Don't do IO after shutdown Juan Quintela
@ 2019-12-18 12:27   ` Dr. David Alan Gilbert
  2019-12-18 14:35     ` Juan Quintela
                       ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Dr. David Alan Gilbert @ 2019-12-18 12:27 UTC (permalink / raw)
  To: Juan Quintela; +Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

* Juan Quintela (quintela@redhat.com) wrote:
> Be sure that we are not doing neither read/write after shutdown of the
> QEMUFile.
> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> ---
>  migration/qemu-file.c | 13 +++++++++++++
>  1 file changed, 13 insertions(+)
> 
> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> index 26fb25ddc1..1e5543a279 100644
> --- a/migration/qemu-file.c
> +++ b/migration/qemu-file.c
> @@ -53,6 +53,8 @@ struct QEMUFile {
>  
>      int last_error;
>      Error *last_error_obj;
> +    /* has the file has been shutdown */
> +    bool shutdown;
>  };
>  
>  /*
> @@ -61,6 +63,7 @@ struct QEMUFile {
>   */
>  int qemu_file_shutdown(QEMUFile *f)
>  {
> +    f->shutdown = true;
>      if (!f->ops->shut_down) {
>          return -ENOSYS;
>      }
> @@ -214,6 +217,9 @@ void qemu_fflush(QEMUFile *f)
>          return;
>      }
>  
> +    if (f->shutdown) {
> +        return;
> +    }

OK, I did wonder if you need to free the iovec.

>      if (f->iovcnt > 0) {
>          expect = iov_size(f->iov, f->iovcnt);
>          ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos,
> @@ -328,6 +334,10 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>      f->buf_index = 0;
>      f->buf_size = pending;
>  
> +    if (f->shutdown) {
> +        return 0;
> +    }

I also wondered if perhaps an error would be reasonable here; but I'm
not sure what a read(2) does after a shutdown(2).

Still,


Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

>      len = f->ops->get_buffer(f->opaque, f->buf + pending, f->pos,
>                               IO_BUF_SIZE - pending, &local_error);
>      if (len > 0) {
> @@ -642,6 +652,9 @@ int64_t qemu_ftell(QEMUFile *f)
>  
>  int qemu_file_rate_limit(QEMUFile *f)
>  {
> +    if (f->shutdown) {
> +        return 1;
> +    }
>      if (qemu_file_get_error(f)) {
>          return 1;
>      }
> -- 
> 2.23.0
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/4] multifd: Make sure that we don't do any IO after an error
  2019-12-18  5:04 ` [PATCH 2/4] multifd: Make sure that we don't do any IO after an error Juan Quintela
@ 2019-12-18 13:57   ` Dr. David Alan Gilbert
  2019-12-18 14:15     ` Juan Quintela
  0 siblings, 1 reply; 17+ messages in thread
From: Dr. David Alan Gilbert @ 2019-12-18 13:57 UTC (permalink / raw)
  To: Juan Quintela; +Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

* Juan Quintela (quintela@redhat.com) wrote:
> Signed-off-by: Juan Quintela <quintela@redhat.com>

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

I wonder if the fflush's should happen anyway?

> ---
>  migration/ram.c | 22 +++++++++++++---------
>  1 file changed, 13 insertions(+), 9 deletions(-)
> 
> diff --git a/migration/ram.c b/migration/ram.c
> index db90237977..4b44578e57 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -4132,7 +4132,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>  {
>      RAMState **temp = opaque;
>      RAMState *rs = *temp;
> -    int ret;
> +    int ret = 0;
>      int i;
>      int64_t t0;
>      int done = 0;
> @@ -4203,12 +4203,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>      ram_control_after_iterate(f, RAM_CONTROL_ROUND);
>  
>  out:
> -    multifd_send_sync_main(rs);
> -    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> -    qemu_fflush(f);
> -    ram_counters.transferred += 8;
> +    if (ret >= 0) {
> +        multifd_send_sync_main(rs);
> +        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> +        qemu_fflush(f);
> +        ram_counters.transferred += 8;
>  
> -    ret = qemu_file_get_error(f);
> +        ret = qemu_file_get_error(f);
> +    }
>      if (ret < 0) {
>          return ret;
>      }
> @@ -4260,9 +4262,11 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
>          ram_control_after_iterate(f, RAM_CONTROL_FINISH);
>      }
>  
> -    multifd_send_sync_main(rs);
> -    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> -    qemu_fflush(f);
> +    if (ret >= 0) {
> +        multifd_send_sync_main(rs);
> +        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> +        qemu_fflush(f);
> +    }
>  
>      return ret;
>  }
> -- 
> 2.23.0
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/4] multifd: Make sure that we don't do any IO after an error
  2019-12-18 13:57   ` Dr. David Alan Gilbert
@ 2019-12-18 14:15     ` Juan Quintela
  0 siblings, 0 replies; 17+ messages in thread
From: Juan Quintela @ 2019-12-18 14:15 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> * Juan Quintela (quintela@redhat.com) wrote:
>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>
> I wonder if the fflush's should happen anyway?

No, that is the problem that I am really trying to fix.  We tried to do
a write() after we knew that we have closed it (due to error or
cancel).  And we get a nice error message on the screen:

Unable to write to socket()

And everybody gets scared about that message.  When we really know that
we don't want it.

In an ideal world, we would just remove ->last_error() and make every
function return errors and check return codes.  But this is qemu.  This
is migration.  And we don't do it.  Sniff.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] qemu-file: Don't do IO after shutdown
  2019-12-18 12:27   ` Dr. David Alan Gilbert
@ 2019-12-18 14:35     ` Juan Quintela
  2019-12-18 15:24       ` Dr. David Alan Gilbert
  2019-12-29 18:20     ` Juan Quintela
  2020-01-08  9:49     ` Dr. David Alan Gilbert
  2 siblings, 1 reply; 17+ messages in thread
From: Juan Quintela @ 2019-12-18 14:35 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> * Juan Quintela (quintela@redhat.com) wrote:
>> Be sure that we are not doing neither read/write after shutdown of the
>> QEMUFile.
>> 
>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>> ---
>>  migration/qemu-file.c | 13 +++++++++++++
>>  1 file changed, 13 insertions(+)
>> 
>> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
>> index 26fb25ddc1..1e5543a279 100644
>> --- a/migration/qemu-file.c
>> +++ b/migration/qemu-file.c
>> @@ -53,6 +53,8 @@ struct QEMUFile {
>>  
>>      int last_error;
>>      Error *last_error_obj;
>> +    /* has the file has been shutdown */
>> +    bool shutdown;
>>  };
>>  
>>  /*
>> @@ -61,6 +63,7 @@ struct QEMUFile {
>>   */
>>  int qemu_file_shutdown(QEMUFile *f)
>>  {
>> +    f->shutdown = true;
>>      if (!f->ops->shut_down) {
>>          return -ENOSYS;
>>      }
>> @@ -214,6 +217,9 @@ void qemu_fflush(QEMUFile *f)
>>          return;
>>      }
>>  
>> +    if (f->shutdown) {
>> +        return;
>> +    }
>
> OK, I did wonder if you need to free the iovec.

I will think about this one.

>>      if (f->iovcnt > 0) {
>>          expect = iov_size(f->iov, f->iovcnt);
>>          ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos,
>> @@ -328,6 +334,10 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>>      f->buf_index = 0;
>>      f->buf_size = pending;
>>  
>> +    if (f->shutdown) {
>> +        return 0;
>> +    }
>
> I also wondered if perhaps an error would be reasonable here; but I'm
> not sure what a read(2) does after a shutdown(2).

A fast google shows that it is .... implementation dependant.  And
worse, only really works for sockets.


> Still,
>
>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

Thanks.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] qemu-file: Don't do IO after shutdown
  2019-12-18 14:35     ` Juan Quintela
@ 2019-12-18 15:24       ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 17+ messages in thread
From: Dr. David Alan Gilbert @ 2019-12-18 15:24 UTC (permalink / raw)
  To: Juan Quintela; +Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

* Juan Quintela (quintela@redhat.com) wrote:
> "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> > * Juan Quintela (quintela@redhat.com) wrote:
> >> Be sure that we are not doing neither read/write after shutdown of the
> >> QEMUFile.
> >> 
> >> Signed-off-by: Juan Quintela <quintela@redhat.com>
> >> ---
> >>  migration/qemu-file.c | 13 +++++++++++++
> >>  1 file changed, 13 insertions(+)
> >> 
> >> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> >> index 26fb25ddc1..1e5543a279 100644
> >> --- a/migration/qemu-file.c
> >> +++ b/migration/qemu-file.c
> >> @@ -53,6 +53,8 @@ struct QEMUFile {
> >>  
> >>      int last_error;
> >>      Error *last_error_obj;
> >> +    /* has the file has been shutdown */
> >> +    bool shutdown;
> >>  };
> >>  
> >>  /*
> >> @@ -61,6 +63,7 @@ struct QEMUFile {
> >>   */
> >>  int qemu_file_shutdown(QEMUFile *f)
> >>  {
> >> +    f->shutdown = true;
> >>      if (!f->ops->shut_down) {
> >>          return -ENOSYS;
> >>      }
> >> @@ -214,6 +217,9 @@ void qemu_fflush(QEMUFile *f)
> >>          return;
> >>      }
> >>  
> >> +    if (f->shutdown) {
> >> +        return;
> >> +    }
> >
> > OK, I did wonder if you need to free the iovec.
> 
> I will think about this one.
> 
> >>      if (f->iovcnt > 0) {
> >>          expect = iov_size(f->iov, f->iovcnt);
> >>          ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos,
> >> @@ -328,6 +334,10 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
> >>      f->buf_index = 0;
> >>      f->buf_size = pending;
> >>  
> >> +    if (f->shutdown) {
> >> +        return 0;
> >> +    }
> >
> > I also wondered if perhaps an error would be reasonable here; but I'm
> > not sure what a read(2) does after a shutdown(2).
> 
> A fast google shows that it is .... implementation dependant.  And
> worse, only really works for sockets.

Yeh, so our main reason for using it is for hung sockets; in particular,
if a machine just disappears, then you get a many-minute hang waiting
for TCP to timeout.  Using 'shutdown(2)' means you can migrate_cancel
even in that situation.  The same thing happens when you're using
sockets in both directions, if you get an error on one direction you can
shutdown(2) the other direction so you know any thread doesn't get stuck
on it.

Dave

> 
> > Still,
> >
> >
> > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> 
> Thanks.
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/4] migration-test: Make sure that multifd and cancel works
  2019-12-18  5:04 ` [PATCH 3/4] migration-test: Make sure that multifd and cancel works Juan Quintela
@ 2019-12-18 16:25   ` Dr. David Alan Gilbert
  2019-12-29 18:27     ` Juan Quintela
  0 siblings, 1 reply; 17+ messages in thread
From: Dr. David Alan Gilbert @ 2019-12-18 16:25 UTC (permalink / raw)
  To: Juan Quintela; +Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

* Juan Quintela (quintela@redhat.com) wrote:
> Test that this sequerce works:
> 
> - launch source
> - launch target
> - start migration
> - cancel migration
> - relaunch target
> - do migration again
> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> ---
>  tests/migration-test.c | 108 ++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 107 insertions(+), 1 deletion(-)
> 
> diff --git a/tests/migration-test.c b/tests/migration-test.c
> index 7588f50b9b..1c93b3e5bc 100644
> --- a/tests/migration-test.c
> +++ b/tests/migration-test.c
> @@ -527,6 +527,14 @@ static void migrate_recover(QTestState *who, const char *uri)
>      qobject_unref(rsp);
>  }
>  
> +static void migrate_cancel(QTestState *who)
> +{
> +    QDict *rsp;
> +
> +    rsp = wait_command(who, "{ 'execute': 'migrate_cancel' }");
> +    qobject_unref(rsp);
> +}
> +
>  static void migrate_set_capability(QTestState *who, const char *capability,
>                                     bool value)
>  {
> @@ -583,6 +591,8 @@ static void migrate_postcopy_start(QTestState *from, QTestState *to)
>  typedef struct {
>      bool hide_stderr;
>      bool use_shmem;
> +    /* only launch the target process */
> +    bool only_target;
>      char *opts_source;
>      char *opts_target;
>  } MigrateStart;
> @@ -704,7 +714,9 @@ static int test_migrate_start(QTestState **from, QTestState **to,
>                                   arch_source, shmem_opts, args->opts_source,
>                                   ignore_stderr);
>      g_free(arch_source);
> -    *from = qtest_init(cmd_source);
> +    if (!args->only_target) {
> +        *from = qtest_init(cmd_source);
> +    }
>      g_free(cmd_source);
>  
>      cmd_target = g_strdup_printf("-machine %saccel=kvm:tcg%s "
> @@ -1470,6 +1482,99 @@ static void test_multifd_tcp_zstd(void)
>      test_multifd_tcp("zstd");
>  }
>  
> +/*
> + * This test does:
> + *  source               target
> + *                       migrate_incoming
> + *     migrate
> + *     migrate_cancel
> + *                       launch another target
> + *     migrate
> + *
> + *  And see that it works
> + */
> +
> +static void test_multifd_tcp_cancel(void)
> +{
> +    MigrateStart *args = migrate_start_new();
> +    QTestState *from, *to;
> +    QDict *rsp;
> +    char *uri;
> +
> +    if (test_migrate_start(&from, &to, "defer", args)) {
> +        return;
> +    }
> +
> +    /*
> +     * We want to pick a speed slow enough that the test completes
> +     * quickly, but that it doesn't complete precopy even on a slow
> +     * machine, so also set the downtime.
> +     */
> +    /* 1 ms should make it not converge*/
> +    migrate_set_parameter_int(from, "downtime-limit", 1);
> +    /* 1GB/s */
> +    migrate_set_parameter_int(from, "max-bandwidth", 1000000000);

This is copied from postcopy_prepare, note that I dropped that bandwidth
quite a bit in 513aa2c because we were seeing TCG on slow hosts converge
even at 1ms, because the vCPU wasn't dirtying pages quickly.

> +    migrate_set_parameter_int(from, "multifd-channels", 16);
> +    migrate_set_parameter_int(to, "multifd-channels", 16);
> +
> +    migrate_set_capability(from, "multifd", "true");
> +    migrate_set_capability(to, "multifd", "true");
> +
> +    /* Start incoming migration from the 1st socket */
> +    rsp = wait_command(to, "{ 'execute': 'migrate-incoming',"
> +                           "  'arguments': { 'uri': 'tcp:127.0.0.1:0' }}");
> +    qobject_unref(rsp);
> +
> +    /* Wait for the first serial output from the source */
> +    wait_for_serial("src_serial");
> +
> +    uri = migrate_get_socket_address(to, "socket-address");
> +
> +    migrate(from, uri, "{}");
> +
> +    wait_for_migration_pass(from);
> +
> +    printf("before cancel\n");
> +    migrate_cancel(from);
> +    printf("after cancel\n");

Do you really want those printf's for normal operation?

> +    args = migrate_start_new();
> +    args->only_target = true;
> +
> +    if (test_migrate_start(&from, &to, "defer", args)) {
> +        return;
> +    }
> +
> +    migrate_set_parameter_int(to, "multifd-channels", 16);
> +
> +    migrate_set_capability(to, "multifd", "true");
> +
> +    /* Start incoming migration from the 1st socket */
> +    rsp = wait_command(to, "{ 'execute': 'migrate-incoming',"
> +                           "  'arguments': { 'uri': 'tcp:127.0.0.1:0' }}");
> +    qobject_unref(rsp);
> +
> +    /* 300ms it should converge */
> +    migrate_set_parameter_int(from, "downtime-limit", 600);

Comment doesn't match parameter!

With those fixed;


Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

> +    uri = migrate_get_socket_address(to, "socket-address");
> +
> +    migrate(from, uri, "{}");
> +
> +    wait_for_migration_pass(from);
> +
> +    if (!got_stop) {
> +        qtest_qmp_eventwait(from, "STOP");
> +    }
> +    qtest_qmp_eventwait(to, "RESUME");
> +
> +    wait_for_serial("dest_serial");
> +    wait_for_migration_complete(from);
> +    test_migrate_end(from, to, true);
> +    free(uri);
> +}
> +
>  int main(int argc, char **argv)
>  {
>      char template[] = "/tmp/migration-test-XXXXXX";
> @@ -1537,6 +1642,7 @@ int main(int argc, char **argv)
>      qtest_add_func("/migration/multifd/tcp/none", test_multifd_tcp_none);
>      qtest_add_func("/migration/multifd/tcp/zlib", test_multifd_tcp_zlib);
>      qtest_add_func("/migration/multifd/tcp/zstd", test_multifd_tcp_zstd);
> +    qtest_add_func("/migration/multifd/tcp/cancel", test_multifd_tcp_cancel);
>  
>      ret = g_test_run();
>  
> -- 
> 2.23.0
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] migration: Make sure that we don't call write() in case of error
  2019-12-18  5:04 ` [PATCH 4/4] migration: Make sure that we don't call write() in case of error Juan Quintela
@ 2019-12-18 16:33   ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 17+ messages in thread
From: Dr. David Alan Gilbert @ 2019-12-18 16:33 UTC (permalink / raw)
  To: Juan Quintela; +Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

* Juan Quintela (quintela@redhat.com) wrote:
> If we are exiting due to an error/finish/.... Just don't try to even
> touch the channel with one IO operation.
> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> ---
>  migration/ram.c | 19 +++++++++++++++++++
>  1 file changed, 19 insertions(+)
> 
> diff --git a/migration/ram.c b/migration/ram.c
> index 4b44578e57..909ef6d237 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1601,6 +1601,12 @@ struct {
>      QemuSemaphore channels_ready;
>      /* multifd ops */
>      MultiFDMethods *ops;
> +    /*
> +     * Have we already run terminate threads.  There is a race when it
> +     * happens that we got one error while we are exiting.
> +     * We will use atomic operations.  Only valid values are 0 and 1.
> +     */
> +    int exiting;
>  } *multifd_send_state;
>  
>  /*
> @@ -1629,6 +1635,10 @@ static int multifd_send_pages(RAMState *rs)
>      MultiFDPages_t *pages = multifd_send_state->pages;
>      uint64_t transferred;
>  
> +    if (atomic_read(&multifd_send_state->exiting)) {
> +        return -1;
> +    }
> +
>      qemu_sem_wait(&multifd_send_state->channels_ready);
>      for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
>          p = &multifd_send_state->params[i];
> @@ -1710,6 +1720,10 @@ static void multifd_send_terminate_threads(Error *err)
>          }
>      }
>  
> +    if (atomic_xchg(&multifd_send_state->exiting, 1)) {
> +        return;
> +    }

That could do with a comment on it;  I think what you're saying is
'don't do send_terminate_threads twice'

With a comment,


Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

>      for (i = 0; i < migrate_multifd_channels(); i++) {
>          MultiFDSendParams *p = &multifd_send_state->params[i];
>  
> @@ -1824,6 +1838,10 @@ static void *multifd_send_thread(void *opaque)
>  
>      while (true) {
>          qemu_sem_wait(&p->sem);
> +
> +        if (atomic_read(&multifd_send_state->exiting)) {
> +            break;
> +        }
>          qemu_mutex_lock(&p->mutex);
>  
>          if (p->pending_job) {
> @@ -1938,6 +1956,7 @@ int multifd_save_setup(Error **errp)
>      multifd_send_state->pages = multifd_pages_init(page_count);
>      qemu_sem_init(&multifd_send_state->channels_ready, 0);
>      multifd_send_state->ops = multifd_ops[migrate_multifd_method()];
> +    atomic_set(&multifd_send_state->exiting, 0);
>  
>      for (i = 0; i < thread_count; i++) {
>          MultiFDSendParams *p = &multifd_send_state->params[i];
> -- 
> 2.23.0
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] qemu-file: Don't do IO after shutdown
  2019-12-18 12:27   ` Dr. David Alan Gilbert
  2019-12-18 14:35     ` Juan Quintela
@ 2019-12-29 18:20     ` Juan Quintela
  2020-01-08  9:49     ` Dr. David Alan Gilbert
  2 siblings, 0 replies; 17+ messages in thread
From: Juan Quintela @ 2019-12-29 18:20 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> * Juan Quintela (quintela@redhat.com) wrote:
>> Be sure that we are not doing neither read/write after shutdown of the
>> QEMUFile.
>> 
>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>> ---
>>  migration/qemu-file.c | 13 +++++++++++++
>>  1 file changed, 13 insertions(+)
>> 
>> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
>> index 26fb25ddc1..1e5543a279 100644
>> --- a/migration/qemu-file.c
>> +++ b/migration/qemu-file.c
>> @@ -53,6 +53,8 @@ struct QEMUFile {
>>  
>>      int last_error;
>>      Error *last_error_obj;
>> +    /* has the file has been shutdown */
>> +    bool shutdown;
>>  };
>>  
>>  /*
>> @@ -61,6 +63,7 @@ struct QEMUFile {
>>   */
>>  int qemu_file_shutdown(QEMUFile *f)
>>  {
>> +    f->shutdown = true;
>>      if (!f->ops->shut_down) {
>>          return -ENOSYS;
>>      }
>> @@ -214,6 +217,9 @@ void qemu_fflush(QEMUFile *f)
>>          return;
>>      }
>>  
>> +    if (f->shutdown) {
>> +        return;
>> +    }
>
> OK, I did wonder if you need to free the iovec.

We need to improve things here.  We should free it on the 1st
error/shutdown.  Withought fixing all callers, I don't feel "safe" doing
it.

>
>>      if (f->iovcnt > 0) {
>>          expect = iov_size(f->iov, f->iovcnt);
>>          ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos,
>> @@ -328,6 +334,10 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>>      f->buf_index = 0;
>>      f->buf_size = pending;
>>  
>> +    if (f->shutdown) {
>> +        return 0;
>> +    }
>
> I also wondered if perhaps an error would be reasonable here; but I'm
> not sure what a read(2) does after a shutdown(2).

We should check this sooner.  Same than prevoious.  If there has been an
error anywhere else, we should fail qemu_fill_buffer().  Right now we
don't do it. and we should.

qemu_get_error() and the setter should dissapear.  And we should just
return errors in all functions.  Especially now that we have migration
thread, and we don't have callbacks anymore.

> Still,
>
>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

Thanks, Juan.

>>      len = f->ops->get_buffer(f->opaque, f->buf + pending, f->pos,
>>                               IO_BUF_SIZE - pending, &local_error);
>>      if (len > 0) {
>> @@ -642,6 +652,9 @@ int64_t qemu_ftell(QEMUFile *f)
>>  
>>  int qemu_file_rate_limit(QEMUFile *f)
>>  {
>> +    if (f->shutdown) {
>> +        return 1;
>> +    }
>>      if (qemu_file_get_error(f)) {
>>          return 1;
>>      }
>> -- 
>> 2.23.0
>> 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/4] migration-test: Make sure that multifd and cancel works
  2019-12-18 16:25   ` Dr. David Alan Gilbert
@ 2019-12-29 18:27     ` Juan Quintela
  0 siblings, 0 replies; 17+ messages in thread
From: Juan Quintela @ 2019-12-29 18:27 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> * Juan Quintela (quintela@redhat.com) wrote:
>> Test that this sequerce works:

>> +    /* 1 ms should make it not converge*/
>> +    migrate_set_parameter_int(from, "downtime-limit", 1);
>> +    /* 1GB/s */
>> +    migrate_set_parameter_int(from, "max-bandwidth", 1000000000);
>
> This is copied from postcopy_prepare, note that I dropped that bandwidth
> quite a bit in 513aa2c because we were seeing TCG on slow hosts converge
> even at 1ms, because the vCPU wasn't dirtying pages quickly.
>

We have to use a #define to have everything using the same.  Right now,
I am using the same that preoopy_tcp and that multifd :-(

>> +    migrate_set_parameter_int(from, "multifd-channels", 16);
>> +    migrate_set_parameter_int(to, "multifd-channels", 16);
>> +
>> +    migrate_set_capability(from, "multifd", "true");
>> +    migrate_set_capability(to, "multifd", "true");
>> +
>> +    /* Start incoming migration from the 1st socket */
>> +    rsp = wait_command(to, "{ 'execute': 'migrate-incoming',"
>> +                           "  'arguments': { 'uri': 'tcp:127.0.0.1:0' }}");
>> +    qobject_unref(rsp);
>> +
>> +    /* Wait for the first serial output from the source */
>> +    wait_for_serial("src_serial");
>> +
>> +    uri = migrate_get_socket_address(to, "socket-address");
>> +
>> +    migrate(from, uri, "{}");
>> +
>> +    wait_for_migration_pass(from);
>> +
>> +    printf("before cancel\n");
>> +    migrate_cancel(from);
>> +    printf("after cancel\n");
>
> Do you really want those printf's for normal operation?

Obviously no, thanks.

>> +
>> +    /* 300ms it should converge */
>> +    migrate_set_parameter_int(from, "downtime-limit", 600);
>
> Comment doesn't match parameter!

Ooops.

>
> With those fixed;

>
>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

Thanks.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] qemu-file: Don't do IO after shutdown
  2019-12-18 12:27   ` Dr. David Alan Gilbert
  2019-12-18 14:35     ` Juan Quintela
  2019-12-29 18:20     ` Juan Quintela
@ 2020-01-08  9:49     ` Dr. David Alan Gilbert
  2020-01-08 12:40       ` Juan Quintela
  2 siblings, 1 reply; 17+ messages in thread
From: Dr. David Alan Gilbert @ 2020-01-08  9:49 UTC (permalink / raw)
  To: Juan Quintela; +Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

* Dr. David Alan Gilbert (dgilbert@redhat.com) wrote:
> * Juan Quintela (quintela@redhat.com) wrote:
> > Be sure that we are not doing neither read/write after shutdown of the
> > QEMUFile.
> > 
> > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > ---
> >  migration/qemu-file.c | 13 +++++++++++++
> >  1 file changed, 13 insertions(+)
> > 
> > diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> > index 26fb25ddc1..1e5543a279 100644
> > --- a/migration/qemu-file.c
> > +++ b/migration/qemu-file.c
> > @@ -53,6 +53,8 @@ struct QEMUFile {
> >  
> >      int last_error;
> >      Error *last_error_obj;
> > +    /* has the file has been shutdown */
> > +    bool shutdown;
> >  };
> >  
> >  /*
> > @@ -61,6 +63,7 @@ struct QEMUFile {
> >   */
> >  int qemu_file_shutdown(QEMUFile *f)
> >  {
> > +    f->shutdown = true;
> >      if (!f->ops->shut_down) {
> >          return -ENOSYS;
> >      }
> > @@ -214,6 +217,9 @@ void qemu_fflush(QEMUFile *f)
> >          return;
> >      }
> >  
> > +    if (f->shutdown) {
> > +        return;
> > +    }
> 
> OK, I did wonder if you need to free the iovec.
> 
> >      if (f->iovcnt > 0) {
> >          expect = iov_size(f->iov, f->iovcnt);
> >          ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos,
> > @@ -328,6 +334,10 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
> >      f->buf_index = 0;
> >      f->buf_size = pending;
> >  
> > +    if (f->shutdown) {
> > +        return 0;
> > +    }
> 
> I also wondered if perhaps an error would be reasonable here; but I'm
> not sure what a read(2) does after a shutdown(2).
> 
> Still,
> 
> 
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

Actually, it turns out this breaks an assumption - 'shutdown' must cause
reads/writes/etc to fail and for the qemu_file to go into error state.
There's a few places we loop doing IO until we either change migration
state or the file goes into error.


diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 1e5543a279..bbb2b63927 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -63,11 +63,18 @@ struct QEMUFile {
  */
 int qemu_file_shutdown(QEMUFile *f)
 {
+    int ret;
+
     f->shutdown = true;
     if (!f->ops->shut_down) {
         return -ENOSYS;
     }
-    return f->ops->shut_down(f->opaque, true, true, NULL);
+    ret = f->ops->shut_down(f->opaque, true, true, NULL);
+
+    if (!f->last_error) {
+        qemu_file_set_error(f, -EIO);
+    }
+    return ret;
 }
 
 /*


seems to fix it for me.

Dave

> >      len = f->ops->get_buffer(f->opaque, f->buf + pending, f->pos,
> >                               IO_BUF_SIZE - pending, &local_error);
> >      if (len > 0) {
> > @@ -642,6 +652,9 @@ int64_t qemu_ftell(QEMUFile *f)
> >  
> >  int qemu_file_rate_limit(QEMUFile *f)
> >  {
> > +    if (f->shutdown) {
> > +        return 1;
> > +    }
> >      if (qemu_file_get_error(f)) {
> >          return 1;
> >      }
> > -- 
> > 2.23.0
> > 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] qemu-file: Don't do IO after shutdown
  2020-01-08  9:49     ` Dr. David Alan Gilbert
@ 2020-01-08 12:40       ` Juan Quintela
  2020-01-16 10:04         ` Juan Quintela
  0 siblings, 1 reply; 17+ messages in thread
From: Juan Quintela @ 2020-01-08 12:40 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> * Dr. David Alan Gilbert (dgilbert@redhat.com) wrote:
>> * Juan Quintela (quintela@redhat.com) wrote:
>> > Be sure that we are not doing neither read/write after shutdown of the
>> > QEMUFile.
>> > 
>> > Signed-off-by: Juan Quintela <quintela@redhat.com>
>> > ---
>> >  migration/qemu-file.c | 13 +++++++++++++
>> >  1 file changed, 13 insertions(+)
>> > 
>> > diff --git a/migration/qemu-file.c b/migration/qemu-file.c
>> > index 26fb25ddc1..1e5543a279 100644
>> > --- a/migration/qemu-file.c
>> > +++ b/migration/qemu-file.c
>> > @@ -53,6 +53,8 @@ struct QEMUFile {
>> >  
>> >      int last_error;
>> >      Error *last_error_obj;
>> > +    /* has the file has been shutdown */
>> > +    bool shutdown;
>> >  };
>> >  
>> >  /*
>> > @@ -61,6 +63,7 @@ struct QEMUFile {
>> >   */
>> >  int qemu_file_shutdown(QEMUFile *f)
>> >  {
>> > +    f->shutdown = true;
>> >      if (!f->ops->shut_down) {
>> >          return -ENOSYS;
>> >      }
>> > @@ -214,6 +217,9 @@ void qemu_fflush(QEMUFile *f)
>> >          return;
>> >      }
>> >  
>> > +    if (f->shutdown) {
>> > +        return;
>> > +    }
>> 
>> OK, I did wonder if you need to free the iovec.
>> 
>> >      if (f->iovcnt > 0) {
>> >          expect = iov_size(f->iov, f->iovcnt);
>> >          ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos,
>> > @@ -328,6 +334,10 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>> >      f->buf_index = 0;
>> >      f->buf_size = pending;
>> >  
>> > +    if (f->shutdown) {
>> > +        return 0;
>> > +    }
>> 
>> I also wondered if perhaps an error would be reasonable here; but I'm
>> not sure what a read(2) does after a shutdown(2).
>> 
>> Still,
>> 
>> 
>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>
> Actually, it turns out this breaks an assumption - 'shutdown' must cause
> reads/writes/etc to fail and for the qemu_file to go into error state.
> There's a few places we loop doing IO until we either change migration
> state or the file goes into error.
>
>
> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> index 1e5543a279..bbb2b63927 100644
> --- a/migration/qemu-file.c
> +++ b/migration/qemu-file.c
> @@ -63,11 +63,18 @@ struct QEMUFile {
>   */
>  int qemu_file_shutdown(QEMUFile *f)
>  {
> +    int ret;
> +
>      f->shutdown = true;
>      if (!f->ops->shut_down) {
>          return -ENOSYS;
>      }
> -    return f->ops->shut_down(f->opaque, true, true, NULL);
> +    ret = f->ops->shut_down(f->opaque, true, true, NULL);
> +
> +    if (!f->last_error) {
> +        qemu_file_set_error(f, -EIO);
> +    }
> +    return ret;
>  }
>  
>  /*
>
>
> seems to fix it for me.

will gve it a try later.

Thanks.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] qemu-file: Don't do IO after shutdown
  2020-01-08 12:40       ` Juan Quintela
@ 2020-01-16 10:04         ` Juan Quintela
  0 siblings, 0 replies; 17+ messages in thread
From: Juan Quintela @ 2020-01-16 10:04 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Laurent Vivier, Paolo Bonzini, Thomas Huth, qemu-devel

Juan Quintela <quintela@redhat.com> wrote:
> "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
>> * Dr. David Alan Gilbert (dgilbert@redhat.com) wrote:
>>> * Juan Quintela (quintela@redhat.com) wrote:

>>
>> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
>> index 1e5543a279..bbb2b63927 100644
>> --- a/migration/qemu-file.c
>> +++ b/migration/qemu-file.c
>> @@ -63,11 +63,18 @@ struct QEMUFile {
>>   */
>>  int qemu_file_shutdown(QEMUFile *f)
>>  {
>> +    int ret;
>> +
>>      f->shutdown = true;
>>      if (!f->ops->shut_down) {
>>          return -ENOSYS;
>>      }
>> -    return f->ops->shut_down(f->opaque, true, true, NULL);
>> +    ret = f->ops->shut_down(f->opaque, true, true, NULL);
>> +
>> +    if (!f->last_error) {
>> +        qemu_file_set_error(f, -EIO);
>> +    }
>> +    return ret;
>>  }
>>  
>>  /*
>>
>>
>> seems to fix it for me.
>
> will gve it a try later.


With this it passes my tests.

Thanks a lot.



^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2020-01-16 10:05 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-18  5:04 [PATCH 0/4] Fix multifd + cancel + multifd Juan Quintela
2019-12-18  5:04 ` [PATCH 1/4] qemu-file: Don't do IO after shutdown Juan Quintela
2019-12-18 12:27   ` Dr. David Alan Gilbert
2019-12-18 14:35     ` Juan Quintela
2019-12-18 15:24       ` Dr. David Alan Gilbert
2019-12-29 18:20     ` Juan Quintela
2020-01-08  9:49     ` Dr. David Alan Gilbert
2020-01-08 12:40       ` Juan Quintela
2020-01-16 10:04         ` Juan Quintela
2019-12-18  5:04 ` [PATCH 2/4] multifd: Make sure that we don't do any IO after an error Juan Quintela
2019-12-18 13:57   ` Dr. David Alan Gilbert
2019-12-18 14:15     ` Juan Quintela
2019-12-18  5:04 ` [PATCH 3/4] migration-test: Make sure that multifd and cancel works Juan Quintela
2019-12-18 16:25   ` Dr. David Alan Gilbert
2019-12-29 18:27     ` Juan Quintela
2019-12-18  5:04 ` [PATCH 4/4] migration: Make sure that we don't call write() in case of error Juan Quintela
2019-12-18 16:33   ` Dr. David Alan Gilbert

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.