xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] tools/libxl: Fixes to stream v2 task joining logic
@ 2015-07-23 11:09 Andrew Cooper
  2015-07-23 11:38 ` Wei Liu
  2015-07-24 11:41 ` Ian Jackson
  0 siblings, 2 replies; 8+ messages in thread
From: Andrew Cooper @ 2015-07-23 11:09 UTC (permalink / raw)
  To: Xen-devel; +Cc: Andrew Cooper, Ian Jackson, Ian Campbell, Wei Liu

During review of the libxl migration v2 series, I changes the task
joining logic, but clearly didn't think the result through
properly. This patch fixes several errors.

1) Do not call check_all_finished() in the success cases of
libxl__xc_domain_{save,restore}_done().  It serves no specific purpose
as the save helper state will report itself as inactive by this point,
and avoids triggering a second stream->completion_callback() in the case
that write_toolstack_record()/stream_continue() record errors
synchronously themselves.

2) Only ever set stream->rc in stream_complete().  The first version of
the migration v2 series had separate rc and joined_rc parameters, where
this logic worked.  However when combining the two, the teardown path
fails to trigger if stream_done() records stream->rc itself.  A side
effect of this is that stream_done() needs to take an rc parameter.

3) Avoid stacking of check_all_finished() via synchronous teardown of
tasks.  If the _abort() functions call back synchronously,
stream->completion_callback() ends up getting called twice, as first and
last check_all_finished() frames observe each task being finished.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
CC: Ian Campbell <Ian.Campbell@citrix.com>
CC: Ian Jackson <Ian.Jackson@eu.citrix.com>
CC: Wei Liu <wei.liu2@citrix.com>

---
I found this while working to fix the toolstack record issue, but am
posting this bugfix ahead of the other work as OSSTest has tripped over
the issue.
---
 tools/libxl/libxl_internal.h     |    2 ++
 tools/libxl/libxl_stream_read.c  |   32 ++++++++++++++++++++++++++------
 tools/libxl/libxl_stream_write.c |   32 ++++++++++++++++++++++++++------
 3 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 3c09668..73b4d62 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2996,6 +2996,7 @@ struct libxl__stream_write_state {
     int rc;
     bool running;
     bool in_checkpoint;
+    bool sync_teardown;
     libxl__save_helper_state shs;
 
     /* Main stream-writing data. */
@@ -3337,6 +3338,7 @@ struct libxl__stream_read_state {
     int rc;
     bool running;
     bool in_checkpoint;
+    bool sync_teardown;
     libxl__save_helper_state shs;
     libxl__conversion_helper_state chs;
 
diff --git a/tools/libxl/libxl_stream_read.c b/tools/libxl/libxl_stream_read.c
index 32a3551..70d3e5f 100644
--- a/tools/libxl/libxl_stream_read.c
+++ b/tools/libxl/libxl_stream_read.c
@@ -112,7 +112,7 @@ static void stream_complete(libxl__egc *egc,
 static void checkpoint_done(libxl__egc *egc,
                             libxl__stream_read_state *stream, int rc);
 static void stream_done(libxl__egc *egc,
-                        libxl__stream_read_state *stream);
+                        libxl__stream_read_state *stream, int rc);
 static void conversion_done(libxl__egc *egc,
                             libxl__conversion_helper_state *chs, int rc);
 static void check_all_finished(libxl__egc *egc,
@@ -176,6 +176,7 @@ void libxl__stream_read_init(libxl__stream_read_state *stream)
     stream->rc = 0;
     stream->running = false;
     stream->in_checkpoint = false;
+    stream->sync_teardown = false;
     libxl__save_helper_init(&stream->shs);
     libxl__conversion_helper_init(&stream->chs);
     FILLZERO(stream->dc);
@@ -669,9 +670,7 @@ static void stream_complete(libxl__egc *egc,
         return;
     }
 
-    if (!stream->rc)
-        stream->rc = rc;
-    stream_done(egc, stream);
+    stream_done(egc, stream, rc);
 }
 
 static void checkpoint_done(libxl__egc *egc,
@@ -695,7 +694,7 @@ static void checkpoint_done(libxl__egc *egc,
 }
 
 static void stream_done(libxl__egc *egc,
-                        libxl__stream_read_state *stream)
+                        libxl__stream_read_state *stream, int rc)
 {
     libxl__sr_record_buf *rec, *trec;
 
@@ -720,7 +719,7 @@ static void stream_done(libxl__egc *egc,
     LIBXL_STAILQ_FOREACH_SAFE(rec, &stream->record_queue, entry, trec)
         free_record(rec);
 
-    check_all_finished(egc, stream, stream->rc);
+    check_all_finished(egc, stream, rc);
 }
 
 void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
@@ -744,6 +743,7 @@ void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
      * libxl records from it.
      */
     stream_continue(egc, stream);
+    return;
 
  err:
     check_all_finished(egc, stream, rc);
@@ -762,13 +762,33 @@ static void check_all_finished(libxl__egc *egc,
 {
     STATE_AO_GC(stream->ao);
 
+    /*
+     * In the case of a failure, the _abort()'s below might cancel
+     * synchronously on top of us, or asynchronously at a later point.
+     *
+     * We must avoid the situation where all _abort() cancel
+     * synchronously and the completion_callback() gets called twice;
+     * once by the first error and once by the final stacked abort(),
+     * both of whom will find that all of the tasks have stopped.
+     *
+     * To avoid this problem, any stacked re-entry into this function is
+     * ineligible to fire the completion callback.  The outermost
+     * instance will take care of completing, once the stack has
+     * unwound.
+     */
+    if (stream->sync_teardown)
+        return;
+
     if (!stream->rc && rc) {
         /* First reported failure. Tear everything down. */
         stream->rc = rc;
+        stream->sync_teardown = true;
 
         libxl__stream_read_abort(egc, stream, rc);
         libxl__save_helper_abort(egc, &stream->shs);
         libxl__conversion_helper_abort(egc, &stream->chs, rc);
+
+        stream->sync_teardown = false;
     }
 
     /* Don't fire the callback until all our parallel tasks have stopped. */
diff --git a/tools/libxl/libxl_stream_write.c b/tools/libxl/libxl_stream_write.c
index 5bff52b..5135bc3 100644
--- a/tools/libxl/libxl_stream_write.c
+++ b/tools/libxl/libxl_stream_write.c
@@ -55,7 +55,7 @@ static void stream_success(libxl__egc *egc,
 static void stream_complete(libxl__egc *egc,
                             libxl__stream_write_state *stream, int rc);
 static void stream_done(libxl__egc *egc,
-                        libxl__stream_write_state *stream);
+                        libxl__stream_write_state *stream, int rc);
 static void checkpoint_done(libxl__egc *egc,
                             libxl__stream_write_state *stream,
                             int rc);
@@ -158,6 +158,7 @@ void libxl__stream_write_init(libxl__stream_write_state *stream)
     stream->rc = 0;
     stream->running = false;
     stream->in_checkpoint = false;
+    stream->sync_teardown = false;
     FILLZERO(stream->dc);
     stream->record_done_callback = NULL;
     FILLZERO(stream->emu_dc);
@@ -275,6 +276,7 @@ void libxl__xc_domain_save_done(libxl__egc *egc, void *dss_void,
     }
 
     write_toolstack_record(egc, stream);
+    return;
 
  err:
     check_all_finished(egc, stream, rc);
@@ -492,13 +494,11 @@ static void stream_complete(libxl__egc *egc,
         return;
     }
 
-    if (!stream->rc)
-        stream->rc = rc;
-    stream_done(egc, stream);
+    stream_done(egc, stream, rc);
 }
 
 static void stream_done(libxl__egc *egc,
-                        libxl__stream_write_state *stream)
+                        libxl__stream_write_state *stream, int rc)
 {
     assert(stream->running);
     stream->running = false;
@@ -507,7 +507,7 @@ static void stream_done(libxl__egc *egc,
         libxl__carefd_close(stream->emu_carefd);
     free(stream->emu_body);
 
-    check_all_finished(egc, stream, stream->rc);
+    check_all_finished(egc, stream, rc);
 }
 
 static void checkpoint_done(libxl__egc *egc,
@@ -526,12 +526,32 @@ static void check_all_finished(libxl__egc *egc,
 {
     STATE_AO_GC(stream->ao);
 
+    /*
+     * In the case of a failure, the _abort()'s below might cancel
+     * synchronously on top of us, or asynchronously at a later point.
+     *
+     * We must avoid the situation where all _abort() cancel
+     * synchronously and the completion_callback() gets called twice;
+     * once by the first error and once by the final stacked abort(),
+     * both of whom will find that all of the tasks have stopped.
+     *
+     * To avoid this problem, any stacked re-entry into this function is
+     * ineligible to fire the completion callback.  The outermost
+     * instance will take care of completing, once the stack has
+     * unwound.
+     */
+    if (stream->sync_teardown)
+        return;
+
     if (!stream->rc && rc) {
         /* First reported failure. Tear everything down. */
         stream->rc = rc;
+        stream->sync_teardown = true;
 
         libxl__stream_write_abort(egc, stream, rc);
         libxl__save_helper_abort(egc, &stream->shs);
+
+        stream->sync_teardown = false;
     }
 
     /* Don't fire the callback until all our parallel tasks have stopped. */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] tools/libxl: Fixes to stream v2 task joining logic
  2015-07-23 11:09 [PATCH] tools/libxl: Fixes to stream v2 task joining logic Andrew Cooper
@ 2015-07-23 11:38 ` Wei Liu
  2015-07-23 11:42   ` Andrew Cooper
  2015-07-24 11:41 ` Ian Jackson
  1 sibling, 1 reply; 8+ messages in thread
From: Wei Liu @ 2015-07-23 11:38 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Wei Liu, Ian Jackson, Ian Campbell, Xen-devel

On Thu, Jul 23, 2015 at 12:09:38PM +0100, Andrew Cooper wrote:
> During review of the libxl migration v2 series, I changes the task
> joining logic, but clearly didn't think the result through
> properly. This patch fixes several errors.
> 
> 1) Do not call check_all_finished() in the success cases of
> libxl__xc_domain_{save,restore}_done().  It serves no specific purpose
> as the save helper state will report itself as inactive by this point,
> and avoids triggering a second stream->completion_callback() in the case
> that write_toolstack_record()/stream_continue() record errors
> synchronously themselves.
> 
> 2) Only ever set stream->rc in stream_complete().  The first version of
> the migration v2 series had separate rc and joined_rc parameters, where
> this logic worked.  However when combining the two, the teardown path
> fails to trigger if stream_done() records stream->rc itself.  A side
> effect of this is that stream_done() needs to take an rc parameter.
> 
> 3) Avoid stacking of check_all_finished() via synchronous teardown of
> tasks.  If the _abort() functions call back synchronously,
> stream->completion_callback() ends up getting called twice, as first and
> last check_all_finished() frames observe each task being finished.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> CC: Ian Campbell <Ian.Campbell@citrix.com>
> CC: Ian Jackson <Ian.Jackson@eu.citrix.com>
> CC: Wei Liu <wei.liu2@citrix.com>
> 
> ---
> I found this while working to fix the toolstack record issue, but am
> posting this bugfix ahead of the other work as OSSTest has tripped over
> the issue.

This change itself doesn't seem to have anything to do with libxc. In
OSSTest the error that triggers this knock on effect is the failure of
xc_map_foreign_bulk. Does that mean this patch only fix half of the
problem seen in OSSTest?

Wei.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] tools/libxl: Fixes to stream v2 task joining logic
  2015-07-23 11:38 ` Wei Liu
@ 2015-07-23 11:42   ` Andrew Cooper
  2015-07-23 12:03     ` Wei Liu
  0 siblings, 1 reply; 8+ messages in thread
From: Andrew Cooper @ 2015-07-23 11:42 UTC (permalink / raw)
  To: Wei Liu; +Cc: Ian Jackson, Ian Campbell, Xen-devel

On 23/07/15 12:38, Wei Liu wrote:
> On Thu, Jul 23, 2015 at 12:09:38PM +0100, Andrew Cooper wrote:
>> During review of the libxl migration v2 series, I changes the task
>> joining logic, but clearly didn't think the result through
>> properly. This patch fixes several errors.
>>
>> 1) Do not call check_all_finished() in the success cases of
>> libxl__xc_domain_{save,restore}_done().  It serves no specific purpose
>> as the save helper state will report itself as inactive by this point,
>> and avoids triggering a second stream->completion_callback() in the case
>> that write_toolstack_record()/stream_continue() record errors
>> synchronously themselves.
>>
>> 2) Only ever set stream->rc in stream_complete().  The first version of
>> the migration v2 series had separate rc and joined_rc parameters, where
>> this logic worked.  However when combining the two, the teardown path
>> fails to trigger if stream_done() records stream->rc itself.  A side
>> effect of this is that stream_done() needs to take an rc parameter.
>>
>> 3) Avoid stacking of check_all_finished() via synchronous teardown of
>> tasks.  If the _abort() functions call back synchronously,
>> stream->completion_callback() ends up getting called twice, as first and
>> last check_all_finished() frames observe each task being finished.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> CC: Ian Campbell <Ian.Campbell@citrix.com>
>> CC: Ian Jackson <Ian.Jackson@eu.citrix.com>
>> CC: Wei Liu <wei.liu2@citrix.com>
>>
>> ---
>> I found this while working to fix the toolstack record issue, but am
>> posting this bugfix ahead of the other work as OSSTest has tripped over
>> the issue.
> This change itself doesn't seem to have anything to do with libxc. In
> OSSTest the error that triggers this knock on effect is the failure of
> xc_map_foreign_bulk. Does that mean this patch only fix half of the
> problem seen in OSSTest?

Saving to image new xl format (info 0x3/0x0/1797)
xc: info: In experimental xc_domain_save2
xc: info: Saving domain 2, type x86 HVM
xc: progress: Memory: 67584/1344513    5%
xc: progress: Memory: 135168/1344513   10%
xc: progress: Memory: 201728/1344513   15%
xc: progress: Memory: 269312/1344513   20%
xc: progress: Memory: 336896/1344513   25%
xc: progress: Memory: 403456/1344513   30%
xc: progress: Memory: 471040/1344513   35%
xc: progress: Memory: 538624/1344513   40%
xc: progress: Memory: 605184/1344513   45%
xc: progress: Memory: 672768/1344513   50%
xc: progress: Memory: 740352/1344513   55%
xc: error: xc_map_foreign_bulk: mmap failed (12 = Cannot allocate
memory): Internal error
xc: error: Failed to map guest pages (12 = Cannot allocate memory):
Internal error
xc: error: Save failed (12 = Cannot allocate memory): Internal error
libxl: error: libxl_stream_write.c:267:libxl__xc_domain_save_done:
saving domain: domain responded to suspend request: Cannot allocate memory
xl: libxl_event.c:1866: libxl__ao_inprogress_gc: Assertion
`!ao->complete' failed.


I don't know why xc_map_foreign_bulk() failed, but at a guess I would
day dom0 is out of RAM.

This patch fixes the libxl assertion.

~Andrew

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] tools/libxl: Fixes to stream v2 task joining logic
  2015-07-23 11:42   ` Andrew Cooper
@ 2015-07-23 12:03     ` Wei Liu
  2015-07-23 13:07       ` Andrew Cooper
  0 siblings, 1 reply; 8+ messages in thread
From: Wei Liu @ 2015-07-23 12:03 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Ian Jackson, Wei Liu, Ian Campbell, Xen-devel

On Thu, Jul 23, 2015 at 12:42:25PM +0100, Andrew Cooper wrote:
> On 23/07/15 12:38, Wei Liu wrote:
> > On Thu, Jul 23, 2015 at 12:09:38PM +0100, Andrew Cooper wrote:
> >> During review of the libxl migration v2 series, I changes the task
> >> joining logic, but clearly didn't think the result through
> >> properly. This patch fixes several errors.
> >>
> >> 1) Do not call check_all_finished() in the success cases of
> >> libxl__xc_domain_{save,restore}_done().  It serves no specific purpose
> >> as the save helper state will report itself as inactive by this point,
> >> and avoids triggering a second stream->completion_callback() in the case
> >> that write_toolstack_record()/stream_continue() record errors
> >> synchronously themselves.
> >>
> >> 2) Only ever set stream->rc in stream_complete().  The first version of
> >> the migration v2 series had separate rc and joined_rc parameters, where
> >> this logic worked.  However when combining the two, the teardown path
> >> fails to trigger if stream_done() records stream->rc itself.  A side
> >> effect of this is that stream_done() needs to take an rc parameter.
> >>
> >> 3) Avoid stacking of check_all_finished() via synchronous teardown of
> >> tasks.  If the _abort() functions call back synchronously,
> >> stream->completion_callback() ends up getting called twice, as first and
> >> last check_all_finished() frames observe each task being finished.
> >>
> >> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> >> CC: Ian Campbell <Ian.Campbell@citrix.com>
> >> CC: Ian Jackson <Ian.Jackson@eu.citrix.com>
> >> CC: Wei Liu <wei.liu2@citrix.com>
> >>
> >> ---
> >> I found this while working to fix the toolstack record issue, but am
> >> posting this bugfix ahead of the other work as OSSTest has tripped over
> >> the issue.
> > This change itself doesn't seem to have anything to do with libxc. In
> > OSSTest the error that triggers this knock on effect is the failure of
> > xc_map_foreign_bulk. Does that mean this patch only fix half of the
> > problem seen in OSSTest?
> 
> Saving to image new xl format (info 0x3/0x0/1797)
> xc: info: In experimental xc_domain_save2
> xc: info: Saving domain 2, type x86 HVM
> xc: progress: Memory: 67584/1344513    5%
> xc: progress: Memory: 135168/1344513   10%
> xc: progress: Memory: 201728/1344513   15%
> xc: progress: Memory: 269312/1344513   20%
> xc: progress: Memory: 336896/1344513   25%
> xc: progress: Memory: 403456/1344513   30%
> xc: progress: Memory: 471040/1344513   35%
> xc: progress: Memory: 538624/1344513   40%
> xc: progress: Memory: 605184/1344513   45%
> xc: progress: Memory: 672768/1344513   50%
> xc: progress: Memory: 740352/1344513   55%
> xc: error: xc_map_foreign_bulk: mmap failed (12 = Cannot allocate
> memory): Internal error
> xc: error: Failed to map guest pages (12 = Cannot allocate memory):
> Internal error
> xc: error: Save failed (12 = Cannot allocate memory): Internal error
> libxl: error: libxl_stream_write.c:267:libxl__xc_domain_save_done:
> saving domain: domain responded to suspend request: Cannot allocate memory
> xl: libxl_event.c:1866: libxl__ao_inprogress_gc: Assertion
> `!ao->complete' failed.
> 
> 
> I don't know why xc_map_foreign_bulk() failed, but at a guess I would
> day dom0 is out of RAM.
> 

I don't think so. There is no OOM error in serial log and dmesg.

A similar failure is discovered in QEMU mainline test. That one is using
OVMF + QEMU upstream. So this failure is not related to firmware
or device model.

http://logs.test-lab.xenproject.org/osstest/logs/59819/test-amd64-i386-xl-qemuu-debianhvm-amd64-xsm/info.html

It should be a real bug manifests only on 32bit toolstack + guest with
large amount of ram (5000 in this case).

Looks like we have one more bug to hunt down.

Wei.

> This patch fixes the libxl assertion.
> 
> ~Andrew

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] tools/libxl: Fixes to stream v2 task joining logic
  2015-07-23 12:03     ` Wei Liu
@ 2015-07-23 13:07       ` Andrew Cooper
  2015-07-23 14:02         ` Ian Jackson
  0 siblings, 1 reply; 8+ messages in thread
From: Andrew Cooper @ 2015-07-23 13:07 UTC (permalink / raw)
  To: Wei Liu; +Cc: Ian Jackson, Ian Campbell, Xen-devel

On 23/07/15 13:03, Wei Liu wrote:
> On Thu, Jul 23, 2015 at 12:42:25PM +0100, Andrew Cooper wrote:
>> On 23/07/15 12:38, Wei Liu wrote:
>>> On Thu, Jul 23, 2015 at 12:09:38PM +0100, Andrew Cooper wrote:
>>>> During review of the libxl migration v2 series, I changes the task
>>>> joining logic, but clearly didn't think the result through
>>>> properly. This patch fixes several errors.
>>>>
>>>> 1) Do not call check_all_finished() in the success cases of
>>>> libxl__xc_domain_{save,restore}_done().  It serves no specific purpose
>>>> as the save helper state will report itself as inactive by this point,
>>>> and avoids triggering a second stream->completion_callback() in the case
>>>> that write_toolstack_record()/stream_continue() record errors
>>>> synchronously themselves.
>>>>
>>>> 2) Only ever set stream->rc in stream_complete().  The first version of
>>>> the migration v2 series had separate rc and joined_rc parameters, where
>>>> this logic worked.  However when combining the two, the teardown path
>>>> fails to trigger if stream_done() records stream->rc itself.  A side
>>>> effect of this is that stream_done() needs to take an rc parameter.
>>>>
>>>> 3) Avoid stacking of check_all_finished() via synchronous teardown of
>>>> tasks.  If the _abort() functions call back synchronously,
>>>> stream->completion_callback() ends up getting called twice, as first and
>>>> last check_all_finished() frames observe each task being finished.
>>>>
>>>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>>> CC: Ian Campbell <Ian.Campbell@citrix.com>
>>>> CC: Ian Jackson <Ian.Jackson@eu.citrix.com>
>>>> CC: Wei Liu <wei.liu2@citrix.com>
>>>>
>>>> ---
>>>> I found this while working to fix the toolstack record issue, but am
>>>> posting this bugfix ahead of the other work as OSSTest has tripped over
>>>> the issue.
>>> This change itself doesn't seem to have anything to do with libxc. In
>>> OSSTest the error that triggers this knock on effect is the failure of
>>> xc_map_foreign_bulk. Does that mean this patch only fix half of the
>>> problem seen in OSSTest?
>> Saving to image new xl format (info 0x3/0x0/1797)
>> xc: info: In experimental xc_domain_save2
>> xc: info: Saving domain 2, type x86 HVM
>> xc: progress: Memory: 67584/1344513    5%
>> xc: progress: Memory: 135168/1344513   10%
>> xc: progress: Memory: 201728/1344513   15%
>> xc: progress: Memory: 269312/1344513   20%
>> xc: progress: Memory: 336896/1344513   25%
>> xc: progress: Memory: 403456/1344513   30%
>> xc: progress: Memory: 471040/1344513   35%
>> xc: progress: Memory: 538624/1344513   40%
>> xc: progress: Memory: 605184/1344513   45%
>> xc: progress: Memory: 672768/1344513   50%
>> xc: progress: Memory: 740352/1344513   55%
>> xc: error: xc_map_foreign_bulk: mmap failed (12 = Cannot allocate
>> memory): Internal error
>> xc: error: Failed to map guest pages (12 = Cannot allocate memory):
>> Internal error
>> xc: error: Save failed (12 = Cannot allocate memory): Internal error
>> libxl: error: libxl_stream_write.c:267:libxl__xc_domain_save_done:
>> saving domain: domain responded to suspend request: Cannot allocate memory
>> xl: libxl_event.c:1866: libxl__ao_inprogress_gc: Assertion
>> `!ao->complete' failed.
>>
>>
>> I don't know why xc_map_foreign_bulk() failed, but at a guess I would
>> day dom0 is out of RAM.
>>
> I don't think so. There is no OOM error in serial log and dmesg.
>
> A similar failure is discovered in QEMU mainline test. That one is using
> OVMF + QEMU upstream. So this failure is not related to firmware
> or device model.
>
> http://logs.test-lab.xenproject.org/osstest/logs/59819/test-amd64-i386-xl-qemuu-debianhvm-amd64-xsm/info.html
>
> It should be a real bug manifests only on 32bit toolstack + guest with
> large amount of ram (5000 in this case).
>
> Looks like we have one more bug to hunt down.

>From linux_privcmd_map_foreign_bulk()

addr = mmap(NULL, (unsigned long)num << XC_PAGE_SHIFT, prot, MAP_SHARED,
            fd, 0);
if ( addr == MAP_FAILED )
{
    PERROR("xc_map_foreign_bulk: mmap failed");
    return NULL;
}

So the mmap() call did genuinely fail with ENOMEM.

Migration v2 only ever has 1024 guest pages ever mapped at once (see
xc_sr_save.c:write_batch() ), and doesn't leak the mapping, which means
that the kernel decided that it was out of memory.

~Andrew

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] tools/libxl: Fixes to stream v2 task joining logic
  2015-07-23 13:07       ` Andrew Cooper
@ 2015-07-23 14:02         ` Ian Jackson
  0 siblings, 0 replies; 8+ messages in thread
From: Ian Jackson @ 2015-07-23 14:02 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Wei Liu, Ian Campbell, Xen-devel

Andrew Cooper writes ("Re: [PATCH] tools/libxl: Fixes to stream v2 task joining logic"):

> Migration v2 only ever has 1024 guest pages ever mapped at once (see
> xc_sr_save.c:write_batch() ), and doesn't leak the mapping, which means
> that the kernel decided that it was out of memory.

Maybe it was being asked for too much.  I don't think this can really
be an OOM condition in the debianhvm test.

Ian.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] tools/libxl: Fixes to stream v2 task joining logic
  2015-07-23 11:09 [PATCH] tools/libxl: Fixes to stream v2 task joining logic Andrew Cooper
  2015-07-23 11:38 ` Wei Liu
@ 2015-07-24 11:41 ` Ian Jackson
  2015-07-27 14:30   ` Andrew Cooper
  1 sibling, 1 reply; 8+ messages in thread
From: Ian Jackson @ 2015-07-24 11:41 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Wei Liu, Ian Campbell, Xen-devel

Andrew Cooper writes ("[PATCH] tools/libxl: Fixes to stream v2 task joining logic"):
> During review of the libxl migration v2 series, I changes the task
> joining logic, but clearly didn't think the result through
> properly. This patch fixes several errors.

This would have been much easier to review if it had been split into 3
patches.  I have gone mostly by the commit message because it was hard
to see hunk belonged to what.

> 3) Avoid stacking of check_all_finished() via synchronous teardown of
> tasks.  If the _abort() functions call back synchronously,
> stream->completion_callback() ends up getting called twice, as first and
> last check_all_finished() frames observe each task being finished.

I think this part of the patch is fine.


> 1) Do not call check_all_finished() in the success cases of
> libxl__xc_domain_{save,restore}_done().  It serves no specific purpose
> as the save helper state will report itself as inactive by this point,
> and avoids triggering a second stream->completion_callback() in the case
> that write_toolstack_record()/stream_continue() record errors
> synchronously themselves.

"Serves no specific purpose" other than having a single exit path,
which makes matters much less confusing.

I think the problem may be that libxl__xc_domain_{save,restore}_done
fail to "return" after "write_toolstack_record" and "stream_continue".
That seems like simply a bug.  I'm sorry that I didn't notice it in
review.

In general each callback function should set up exactly one other
callback.  If it does anything else then reentrancy hazards arise.

Also, it is confusing and perhaps wrong that write_toolstack_record
calls stream_complete.  What if there are other threads of control
outstanding ?


> 2) Only ever set stream->rc in stream_complete().  The first version of
> the migration v2 series had separate rc and joined_rc parameters, where
> this logic worked.  However when combining the two, the teardown path
> fails to trigger if stream_done() records stream->rc itself.  A side
> effect of this is that stream_done() needs to take an rc parameter.

"the teardown path fails to trigger if stream_done() records
stream->rc itself" but in the code I am looking at neither of the
functions stream_done assign to rc.

Maybe I would understand this if I were looking at only this set of
diff hunks.


Ian.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] tools/libxl: Fixes to stream v2 task joining logic
  2015-07-24 11:41 ` Ian Jackson
@ 2015-07-27 14:30   ` Andrew Cooper
  0 siblings, 0 replies; 8+ messages in thread
From: Andrew Cooper @ 2015-07-27 14:30 UTC (permalink / raw)
  To: Ian Jackson; +Cc: Wei Liu, Ian Campbell, Xen-devel

On 24/07/15 12:41, Ian Jackson wrote:
> Andrew Cooper writes ("[PATCH] tools/libxl: Fixes to stream v2 task joining logic"):
>> During review of the libxl migration v2 series, I changes the task
>> joining logic, but clearly didn't think the result through
>> properly. This patch fixes several errors.
> This would have been much easier to review if it had been split into 3
> patches.  I have gone mostly by the commit message because it was hard
> to see hunk belonged to what.

I have split them up. 

>
>> 3) Avoid stacking of check_all_finished() via synchronous teardown of
>> tasks.  If the _abort() functions call back synchronously,
>> stream->completion_callback() ends up getting called twice, as first and
>> last check_all_finished() frames observe each task being finished.
> I think this part of the patch is fine.
>
>
>> 1) Do not call check_all_finished() in the success cases of
>> libxl__xc_domain_{save,restore}_done().  It serves no specific purpose
>> as the save helper state will report itself as inactive by this point,
>> and avoids triggering a second stream->completion_callback() in the case
>> that write_toolstack_record()/stream_continue() record errors
>> synchronously themselves.
> "Serves no specific purpose" other than having a single exit path,
> which makes matters much less confusing.

"Serves no specific purpose" in so far as what check_all_finished()
would do in the success case.

>
> I think the problem may be that libxl__xc_domain_{save,restore}_done
> fail to "return" after "write_toolstack_record" and "stream_continue".
> That seems like simply a bug.  I'm sorry that I didn't notice it in
> review.

After some more thought, I don't believe that my fix is necessarily
correct.  If a condition were to exist where the stream had recorded an
error and abort()'ed the save helper, but the save helper was already
exiting with a success condition, then the callback wouldn't be fired at
all.  I have a proposed alternate solution.

>
> In general each callback function should set up exactly one other
> callback.  If it does anything else then reentrancy hazards arise.

The entire point of this logic is that there are multiple operations
going on in parallel.  It is not guaranteed that a save helper will ever
be spawned on the read side (although this would be a very useless
stream).  The state of the libxl stream read/write object is
deliberately separate from the save helper.

>
> Also, it is confusing and perhaps wrong that write_toolstack_record
> calls stream_complete.  What if there are other threads of control
> outstanding ?

This is exactly the problem which check_all_finished() is supposed to
solve, but currently doesn't.

>
>
>> 2) Only ever set stream->rc in stream_complete().  The first version of
>> the migration v2 series had separate rc and joined_rc parameters, where
>> this logic worked.  However when combining the two, the teardown path
>> fails to trigger if stream_done() records stream->rc itself.  A side
>> effect of this is that stream_done() needs to take an rc parameter.
> "the teardown path fails to trigger if stream_done() records
> stream->rc itself" but in the code I am looking at neither of the
> functions stream_done assign to rc.

I have no idea why I wrote what I did.  The code was correct but the
description was wrong.  I have fixed it in the split version of this patch.

~Andrew

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2015-07-27 14:30 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-23 11:09 [PATCH] tools/libxl: Fixes to stream v2 task joining logic Andrew Cooper
2015-07-23 11:38 ` Wei Liu
2015-07-23 11:42   ` Andrew Cooper
2015-07-23 12:03     ` Wei Liu
2015-07-23 13:07       ` Andrew Cooper
2015-07-23 14:02         ` Ian Jackson
2015-07-24 11:41 ` Ian Jackson
2015-07-27 14:30   ` Andrew Cooper

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).