* [PATCH] multifd: Copy pages before compressing them with zlib
@ 2022-03-29 15:21 Ilya Leoshkevich
  2022-03-30 14:35 ` Christian Borntraeger
  2022-04-04 11:20 ` Dr. David Alan Gilbert
  0 siblings, 2 replies; 16+ messages in thread
From: Ilya Leoshkevich @ 2022-03-29 15:21 UTC (permalink / raw)
  To: Juan Quintela, Dr. David Alan Gilbert
  Cc: Peter Maydell, thuth, f.ebner, Daniel P . Berrangé,
	Ilya Leoshkevich, Alex Bennée, s.reiter, Cornelia Huck,
	qemu-devel, peterx, qemu-s390x, Philippe Mathieu-Daudé,
	hreitz, Christian Borntraeger, jinpu.wang

zlib_send_prepare() compresses pages of a running VM. zlib does not
make any thread-safety guarantees with respect to changing deflate()
input concurrently with deflate() [1].

One can observe problems due to this with the IBM zEnterprise Data
Compression accelerator capable zlib [2]. When the hardware
acceleration is enabled, migration/multifd/tcp/zlib test fails
intermittently [3] due to sliding window corruption.

At the moment this problem occurs only with this accelerator, since
its architecture explicitly discourages concurrent accesses [4]:

    Page 26-57, "Other Conditions":

    As observed by this CPU, other CPUs, and channel
    programs, references to the parameter block, first,
    second, and third operands may be multiple-access
    references, accesses to these storage locations are
    not necessarily block-concurrent, and the sequence
    of these accesses or references is undefined.

Still, it might affect other platforms due to a future zlib update.
Therefore, copy the page being compressed into a private buffer before
passing it to zlib.

[1] https://zlib.net/manual.html
[2] https://github.com/madler/zlib/pull/410
[3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
[4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
 migration/multifd-zlib.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 3a7ae44485..b6b22b7d1f 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -27,6 +27,8 @@ struct zlib_data {
     uint8_t *zbuff;
     /* size of compressed buffer */
     uint32_t zbuff_len;
+    /* uncompressed buffer */
+    uint8_t buf[];
 };
 
 /* Multifd zlib compression */
@@ -43,9 +45,18 @@ struct zlib_data {
  */
 static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
 {
-    struct zlib_data *z = g_new0(struct zlib_data, 1);
-    z_stream *zs = &z->zs;
+    /* This is the maximum size of the compressed buffer */
+    uint32_t zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
+    size_t buf_len = qemu_target_page_size();
+    struct zlib_data *z;
+    z_stream *zs;
 
+    z = g_try_malloc0(sizeof(struct zlib_data) + buf_len + zbuff_len);
+    if (!z) {
+        error_setg(errp, "multifd %u: out of memory for zlib_data", p->id);
+        return -1;
+    }
+    zs = &z->zs;
     zs->zalloc = Z_NULL;
     zs->zfree = Z_NULL;
     zs->opaque = Z_NULL;
@@ -54,15 +65,8 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
         error_setg(errp, "multifd %u: deflate init failed", p->id);
         return -1;
     }
-    /* This is the maxium size of the compressed buffer */
-    z->zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
-    z->zbuff = g_try_malloc(z->zbuff_len);
-    if (!z->zbuff) {
-        deflateEnd(&z->zs);
-        g_free(z);
-        error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
-        return -1;
-    }
+    z->zbuff_len = zbuff_len;
+    z->zbuff = z->buf + buf_len;
     p->data = z;
     return 0;
 }
@@ -80,7 +84,6 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
     struct zlib_data *z = p->data;
 
     deflateEnd(&z->zs);
-    g_free(z->zbuff);
     z->zbuff = NULL;
     g_free(p->data);
     p->data = NULL;
@@ -114,8 +117,14 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
             flush = Z_SYNC_FLUSH;
         }
 
+        /*
+         * Since the VM might be running, the page may be changing concurrently
+         * with compression. zlib does not guarantee that this is safe,
+         * therefore copy the page before calling deflate().
+         */
+        memcpy(z->buf, p->pages->block->host + p->normal[i], page_size);
         zs->avail_in = page_size;
-        zs->next_in = p->pages->block->host + p->normal[i];
+        zs->next_in = z->buf;
 
         zs->avail_out = available;
         zs->next_out = z->zbuff + out_size;
-- 
2.35.1
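
The single g_try_malloc0() above places the struct and both buffers in
one contiguous allocation; a sketch of the resulting layout, with sizes
and names as in the patch:

    /*
     * One allocation of sizeof(struct zlib_data) + buf_len + zbuff_len:
     *
     *   +------------------+-----------------+-------------------+
     *   | struct zlib_data | buf             | zbuff             |
     *   | (zs, zbuff, ...) | buf_len bytes   | zbuff_len bytes   |
     *   +------------------+-----------------+-------------------+
     *                      ^ z->buf          ^ z->zbuff == z->buf + buf_len
     */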




* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-03-29 15:21 [PATCH] multifd: Copy pages before compressing them with zlib Ilya Leoshkevich
@ 2022-03-30 14:35 ` Christian Borntraeger
  2022-04-04 11:20 ` Dr. David Alan Gilbert
  1 sibling, 0 replies; 16+ messages in thread
From: Christian Borntraeger @ 2022-03-30 14:35 UTC (permalink / raw)
  To: Ilya Leoshkevich, Juan Quintela, Dr. David Alan Gilbert,
	Peter Maydell, Alex Bennée
  Cc: thuth, Daniel P . Berrangé,
	s.reiter, Cornelia Huck, qemu-devel, peterx, qemu-s390x,
	Philippe Mathieu-Daudé,
	hreitz, f.ebner, jinpu.wang

Peter, Alex, this is the fallout of Ilya's analysis of the s390x migration issue that triggered the DFLTCC workaround.

On 29.03.22 at 17:21, Ilya Leoshkevich wrote:
> zlib_send_prepare() compresses pages of a running VM. zlib does not
> make any thread-safety guarantees with respect to changing deflate()
> input concurrently with deflate() [1].
> 
> One can observe problems due to this with the IBM zEnterprise Data
> Compression accelerator capable zlib [2]. When the hardware
> acceleration is enabled, migration/multifd/tcp/zlib test fails
> intermittently [3] due to sliding window corruption.
> 
> At the moment this problem occurs only with this accelerator, since
> its architecture explicitly discourages concurrent accesses [4]:
> 
>      Page 26-57, "Other Conditions":
> 
>      As observed by this CPU, other CPUs, and channel
>      programs, references to the parameter block, first,
>      second, and third operands may be multiple-access
>      references, accesses to these storage locations are
>      not necessarily block-concurrent, and the sequence
>      of these accesses or references is undefined.
> 
> Still, it might affect other platforms due to a future zlib update.
> Therefore, copy the page being compressed into a private buffer before
> passing it to zlib.
> 
> [1] https://zlib.net/manual.html
> [2] https://github.com/madler/zlib/pull/410
> [3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
> [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> 
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>   migration/multifd-zlib.c | 35 ++++++++++++++++++++++-------------
>   1 file changed, 22 insertions(+), 13 deletions(-)
> 
> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> index 3a7ae44485..b6b22b7d1f 100644
> --- a/migration/multifd-zlib.c
> +++ b/migration/multifd-zlib.c
> @@ -27,6 +27,8 @@ struct zlib_data {
>       uint8_t *zbuff;
>       /* size of compressed buffer */
>       uint32_t zbuff_len;
> +    /* uncompressed buffer */
> +    uint8_t buf[];
>   };
>   
>   /* Multifd zlib compression */
> @@ -43,9 +45,18 @@ struct zlib_data {
>    */
>   static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
>   {
> -    struct zlib_data *z = g_new0(struct zlib_data, 1);
> -    z_stream *zs = &z->zs;
> +    /* This is the maximum size of the compressed buffer */
> +    uint32_t zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
> +    size_t buf_len = qemu_target_page_size();
> +    struct zlib_data *z;
> +    z_stream *zs;
>   
> +    z = g_try_malloc0(sizeof(struct zlib_data) + buf_len + zbuff_len);
> +    if (!z) {
> +        error_setg(errp, "multifd %u: out of memory for zlib_data", p->id);
> +        return -1;
> +    }
> +    zs = &z->zs;
>       zs->zalloc = Z_NULL;
>       zs->zfree = Z_NULL;
>       zs->opaque = Z_NULL;
> @@ -54,15 +65,8 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
>           error_setg(errp, "multifd %u: deflate init failed", p->id);
>           return -1;
>       }
> -    /* This is the maxium size of the compressed buffer */
> -    z->zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
> -    z->zbuff = g_try_malloc(z->zbuff_len);
> -    if (!z->zbuff) {
> -        deflateEnd(&z->zs);
> -        g_free(z);
> -        error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
> -        return -1;
> -    }
> +    z->zbuff_len = zbuff_len;
> +    z->zbuff = z->buf + buf_len;
>       p->data = z;
>       return 0;
>   }
> @@ -80,7 +84,6 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
>       struct zlib_data *z = p->data;
>   
>       deflateEnd(&z->zs);
> -    g_free(z->zbuff);
>       z->zbuff = NULL;
>       g_free(p->data);
>       p->data = NULL;
> @@ -114,8 +117,14 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>               flush = Z_SYNC_FLUSH;
>           }
>   
> +        /*
> +         * Since the VM might be running, the page may be changing concurrently
> +         * with compression. zlib does not guarantee that this is safe,
> +         * therefore copy the page before calling deflate().
> +         */
> +        memcpy(z->buf, p->pages->block->host + p->normal[i], page_size);
>           zs->avail_in = page_size;
> -        zs->next_in = p->pages->block->host + p->normal[i];
> +        zs->next_in = z->buf;
>   
>           zs->avail_out = available;
>           zs->next_out = z->zbuff + out_size;



* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-03-29 15:21 [PATCH] multifd: Copy pages before compressing them with zlib Ilya Leoshkevich
  2022-03-30 14:35 ` Christian Borntraeger
@ 2022-04-04 11:20 ` Dr. David Alan Gilbert
  2022-04-04 12:09   ` Ilya Leoshkevich
  2022-04-04 12:45   ` Daniel P. Berrangé
  1 sibling, 2 replies; 16+ messages in thread
From: Dr. David Alan Gilbert @ 2022-04-04 11:20 UTC (permalink / raw)
  To: Ilya Leoshkevich
  Cc: Peter Maydell, thuth, f.ebner, Daniel P . Berrangé,
	Juan Quintela, Alex Bennée, s.reiter, Cornelia Huck,
	qemu-devel, peterx, qemu-s390x, Philippe Mathieu-Daudé,
	hreitz, Christian Borntraeger, jinpu.wang

* Ilya Leoshkevich (iii@linux.ibm.com) wrote:
> zlib_send_prepare() compresses pages of a running VM. zlib does not
> make any thread-safety guarantees with respect to changing deflate()
> input concurrently with deflate() [1].
> 
> One can observe problems due to this with the IBM zEnterprise Data
> Compression accelerator capable zlib [2]. When the hardware
> acceleration is enabled, migration/multifd/tcp/zlib test fails
> intermittently [3] due to sliding window corruption.
> 
> At the moment this problem occurs only with this accelerator, since
> its architecture explicitly discourages concurrent accesses [4]:
> 
>     Page 26-57, "Other Conditions":
> 
>     As observed by this CPU, other CPUs, and channel
>     programs, references to the parameter block, first,
>     second, and third operands may be multiple-access
>     references, accesses to these storage locations are
>     not necessarily block-concurrent, and the sequence
>     of these accesses or references is undefined.
> 
> Still, it might affect other platforms due to a future zlib update.
> Therefore, copy the page being compressed into a private buffer before
> passing it to zlib.

While this might work around the problem, your explanation doesn't quite
fit with the symptoms; or if it does, then you have a separate problem.

The live migration code relies on the fact that the source is running
and changing its memory as the data is transmitted; however it also
relies on the fact that if this happens the 'dirty' flag is set _after_
those changes, causing another round of migration and retransmission of
the (now stable) data.

We don't expect the load of the data for the first page write to be
correct, consistent etc - we just rely on the retransmission to be
correct when the page is stable.
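
A minimal sketch of that invariant (hypothetical types and helpers, not
QEMU's actual migration loop):

  #include <stdbool.h>
  #include <stddef.h>

  /* Hypothetical page descriptor and helpers for illustration only. */
  typedef struct { unsigned char *host; bool dirty; } Page;
  extern void send_page(const Page *page);         /* assumed transport      */
  extern bool guest_wrote_since(const Page *page); /* assumed dirty tracking */

  static void migrate_round(Page *pages, size_t n)
  {
      for (size_t i = 0; i < n; i++) {
          if (!pages[i].dirty) {
              continue;
          }
          pages[i].dirty = false;     /* clear before reading the data     */
          send_page(&pages[i]);       /* may observe a concurrent write... */
          if (guest_wrote_since(&pages[i])) {
              pages[i].dirty = true;  /* ...then it is dirty again and is
                                       * retransmitted in a later round    */
          }
      }
  }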

If your compressor hardware is doing something undefined during the
first case that's fine; as long as it works fine in the stable case
where the data isn't changing.

Adding the extra copy is going to slow everyone else down; and since
there's plenty of pthread locking in those multifd threads I'm expecting
them to get reasonably defined ordering and thus be safe from
multithreading problems (please correct us if we've actually done
something wrong in the locking there).

IMHO your accelerator when called from a zlib call needs to behave
the same as if it was the software implementation; i.e. if we've got
pthread calls in there that are enforcing ordering then that should be
fine; your accelerator implementation needs to add a barrier of some
type or an internal copy, not penalise everyone else.

Dave



> 
> [1] https://zlib.net/manual.html
> [2] https://github.com/madler/zlib/pull/410
> [3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
> [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> 
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>  migration/multifd-zlib.c | 35 ++++++++++++++++++++++-------------
>  1 file changed, 22 insertions(+), 13 deletions(-)
> 
> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> index 3a7ae44485..b6b22b7d1f 100644
> --- a/migration/multifd-zlib.c
> +++ b/migration/multifd-zlib.c
> @@ -27,6 +27,8 @@ struct zlib_data {
>      uint8_t *zbuff;
>      /* size of compressed buffer */
>      uint32_t zbuff_len;
> +    /* uncompressed buffer */
> +    uint8_t buf[];
>  };
>  
>  /* Multifd zlib compression */
> @@ -43,9 +45,18 @@ struct zlib_data {
>   */
>  static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
>  {
> -    struct zlib_data *z = g_new0(struct zlib_data, 1);
> -    z_stream *zs = &z->zs;
> +    /* This is the maximum size of the compressed buffer */
> +    uint32_t zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
> +    size_t buf_len = qemu_target_page_size();
> +    struct zlib_data *z;
> +    z_stream *zs;
>  
> +    z = g_try_malloc0(sizeof(struct zlib_data) + buf_len + zbuff_len);
> +    if (!z) {
> +        error_setg(errp, "multifd %u: out of memory for zlib_data", p->id);
> +        return -1;
> +    }
> +    zs = &z->zs;
>      zs->zalloc = Z_NULL;
>      zs->zfree = Z_NULL;
>      zs->opaque = Z_NULL;
> @@ -54,15 +65,8 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
>          error_setg(errp, "multifd %u: deflate init failed", p->id);
>          return -1;
>      }
> -    /* This is the maxium size of the compressed buffer */
> -    z->zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
> -    z->zbuff = g_try_malloc(z->zbuff_len);
> -    if (!z->zbuff) {
> -        deflateEnd(&z->zs);
> -        g_free(z);
> -        error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
> -        return -1;
> -    }
> +    z->zbuff_len = zbuff_len;
> +    z->zbuff = z->buf + buf_len;
>      p->data = z;
>      return 0;
>  }
> @@ -80,7 +84,6 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
>      struct zlib_data *z = p->data;
>  
>      deflateEnd(&z->zs);
> -    g_free(z->zbuff);
>      z->zbuff = NULL;
>      g_free(p->data);
>      p->data = NULL;
> @@ -114,8 +117,14 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>              flush = Z_SYNC_FLUSH;
>          }
>  
> +        /*
> +         * Since the VM might be running, the page may be changing concurrently
> +         * with compression. zlib does not guarantee that this is safe,
> +         * therefore copy the page before calling deflate().
> +         */
> +        memcpy(z->buf, p->pages->block->host + p->normal[i], page_size);
>          zs->avail_in = page_size;
> -        zs->next_in = p->pages->block->host + p->normal[i];
> +        zs->next_in = z->buf;
>  
>          zs->avail_out = available;
>          zs->next_out = z->zbuff + out_size;
> -- 
> 2.35.1
> 
-- 
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK




* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-04-04 11:20 ` Dr. David Alan Gilbert
@ 2022-04-04 12:09   ` Ilya Leoshkevich
  2022-04-04 17:11     ` Dr. David Alan Gilbert
  2022-04-04 12:45   ` Daniel P. Berrangé
  1 sibling, 1 reply; 16+ messages in thread
From: Ilya Leoshkevich @ 2022-04-04 12:09 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Peter Maydell, thuth, Christian Borntraeger,
	Daniel P . Berrangé,
	Juan Quintela, jinpu.wang, s.reiter, Cornelia Huck, qemu-devel,
	peterx, qemu-s390x, Philippe Mathieu-Daudé,
	hreitz, f.ebner, Alex Bennée

On Mon, 2022-04-04 at 12:20 +0100, Dr. David Alan Gilbert wrote:
> * Ilya Leoshkevich (iii@linux.ibm.com) wrote:
> > zlib_send_prepare() compresses pages of a running VM. zlib does not
> > make any thread-safety guarantees with respect to changing
> > deflate()
> > input concurrently with deflate() [1].
> > 
> > One can observe problems due to this with the IBM zEnterprise Data
> > Compression accelerator capable zlib [2]. When the hardware
> > acceleration is enabled, migration/multifd/tcp/zlib test fails
> > intermittently [3] due to sliding window corruption.
> > 
> > At the moment this problem occurs only with this accelerator, since
> > its architecture explicitly discourages concurrent accesses [4]:
> > 
> >     Page 26-57, "Other Conditions":
> > 
> >     As observed by this CPU, other CPUs, and channel
> >     programs, references to the parameter block, first,
> >     second, and third operands may be multiple-access
> >     references, accesses to these storage locations are
> >     not necessarily block-concurrent, and the sequence
> >     of these accesses or references is undefined.
> > 
> > Still, it might affect other platforms due to a future zlib update.
> > Therefore, copy the page being compressed into a private buffer
> > before
> > passing it to zlib.
> 
> While this might work around the problem, your explanation doesn't
> quite fit with the symptoms; or if it does, then you have a separate
> problem.
> 
> The live migration code relies on the fact that the source is running
> and changing its memory as the data is transmitted; however it also
> relies on the fact that if this happens the 'dirty' flag is set _after_
> those changes, causing another round of migration and retransmission of
> the (now stable) data.
> 
> We don't expect the load of the data for the first page write to be
> correct, consistent etc - we just rely on the retransmission to be
> correct when the page is stable.
> 
> If your compressor hardware is doing something undefined during the
> first case that's fine; as long as it works fine in the stable case
> where the data isn't changing.
> 
> Adding the extra copy is going to slow everyone else down; and since
> there's plenty of pthread locking in those multifd threads I'm expecting
> them to get reasonably defined ordering and thus be safe from
> multithreading problems (please correct us if we've actually done
> something wrong in the locking there).
> 
> IMHO your accelerator when called from a zlib call needs to behave
> the same as if it was the software implementation; i.e. if we've got
> pthread calls in there that are enforcing ordering then that should
> be
> fine; your accelerator implementation needs to add a barrier of some
> type or an internal copy, not penalise everyone else.
> 
> Dave

The problem with the accelerator is that during the first case the
internal state might end up being corrupted (in particular: what goes
into the deflate stream differs from what goes into the sliding
window). This may affect the data integrity in the second case later
on.
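
A minimal sketch of that failure mode, assuming a simplified
stored-block path (illustrative only, not actual zlib or DFLTCC code):
the input is read once to emit the bytes and once to seed the sliding
window, so a concurrent writer can make the two copies disagree.

  #include <string.h>

  /*
   * Illustrative simplification: "input" is guest memory that another
   * thread may be writing concurrently.
   */
  static void stored_block_double_read(unsigned char *stream_out,
                                       unsigned char *window,
                                       const unsigned char *input,
                                       size_t len)
  {
      memcpy(stream_out, input, len); /* read #1: bytes sent to the peer   */
      /* ... a concurrent guest write to "input" can land here ...         */
      memcpy(window, input, len);     /* read #2: bytes future matches use */
      /*
       * If the two reads saw different data, later matches against the
       * window refer to bytes the receiver never got, corrupting output
       * well after the modified page.
       */
  }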

I've been trying to think what to do with that, and of course doing an
internal copy is one option (a barrier won't suffice). However, I
realized that the zlib API as documented doesn't guarantee that it's safe
to change input data concurrently with compression. On the other hand,
today's zlib is implemented in a way that tolerates this.

So the open question for me is whether we should honor the zlib
documentation (in which case, I would argue, QEMU needs to be changed)
or say that the behavior of today's zlib implementation is more
important (in which case the accelerator code needs to change). I went with
the former for now, but the latter is of course doable as well.



* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-04-04 11:20 ` Dr. David Alan Gilbert
  2022-04-04 12:09   ` Ilya Leoshkevich
@ 2022-04-04 12:45   ` Daniel P. Berrangé
  2022-04-04 13:55     ` Juan Quintela
  1 sibling, 1 reply; 16+ messages in thread
From: Daniel P. Berrangé @ 2022-04-04 12:45 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Peter Maydell, thuth, f.ebner, Ilya Leoshkevich, Juan Quintela,
	Alex Bennée, s.reiter, Cornelia Huck, qemu-devel, peterx,
	qemu-s390x, Philippe Mathieu-Daudé,
	hreitz, Christian Borntraeger, jinpu.wang

On Mon, Apr 04, 2022 at 12:20:14PM +0100, Dr. David Alan Gilbert wrote:
> * Ilya Leoshkevich (iii@linux.ibm.com) wrote:
> > zlib_send_prepare() compresses pages of a running VM. zlib does not
> > make any thread-safety guarantees with respect to changing deflate()
> > input concurrently with deflate() [1].
> > 
> > One can observe problems due to this with the IBM zEnterprise Data
> > Compression accelerator capable zlib [2]. When the hardware
> > acceleration is enabled, migration/multifd/tcp/zlib test fails
> > intermittently [3] due to sliding window corruption.
> > 
> > At the moment this problem occurs only with this accelerator, since
> > its architecture explicitly discourages concurrent accesses [4]:
> > 
> >     Page 26-57, "Other Conditions":
> > 
> >     As observed by this CPU, other CPUs, and channel
> >     programs, references to the parameter block, first,
> >     second, and third operands may be multiple-access
> >     references, accesses to these storage locations are
> >     not necessarily block-concurrent, and the sequence
> >     of these accesses or references is undefined.
> > 
> > Still, it might affect other platforms due to a future zlib update.
> > Therefore, copy the page being compressed into a private buffer before
> > passing it to zlib.
> 
> While this might work around the problem, your explanation doesn't quite
> fit with the symptoms; or if it does, then you have a separate problem.
> 
> The live migration code relies on the fact that the source is running
> and changing its memory as the data is transmitted; however it also
> relies on the fact that if this happens the 'dirty' flag is set _after_
> those changes, causing another round of migration and retransmission of
> the (now stable) data.
> 
> We don't expect the load of the data for the first page write to be
> correct, consistent etc - we just rely on the retransmission to be
> correct when the page is stable.
> 
> If your compressor hardware is doing something undefined during the
> first case that's fine; as long as it works fine in the stable case
> where the data isn't changing.
> 
> Adding the extra copy is going to slow everyone else down; and since
> there's plenty of pthread locking in those multifd threads I'm expecting
> them to get reasonably defined ordering and thus be safe from
> multithreading problems (please correct us if we've actually done
> something wrong in the locking there).
> 
> IMHO your accelerator when called from a zlib call needs to behave
> the same as if it was the software implementation; i.e. if we've got
> pthread calls in there that are enforcing ordering then that should be
> fine; your accelerator implementation needs to add a barrier of some
> type or an internal copy, not penalise everyone else.

It is reasonable to argue that QEMU is relying on undefined behaviour
when invoking zlib in this case, so it isn't clear that the accelerator
impl should be changed, rather than QEMU be changed to follow the zlib
API requirements. 

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|




* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-04-04 12:45   ` Daniel P. Berrangé
@ 2022-04-04 13:55     ` Juan Quintela
  0 siblings, 0 replies; 16+ messages in thread
From: Juan Quintela @ 2022-04-04 13:55 UTC (permalink / raw)
  To: Daniel P. Berrangé
  Cc: Peter Maydell, thuth, f.ebner, Ilya Leoshkevich,
	Alex Bennée, s.reiter, Cornelia Huck, qemu-devel, peterx,
	Dr. David Alan Gilbert, qemu-s390x, Philippe Mathieu-Daudé,
	hreitz, Christian Borntraeger, jinpu.wang

Daniel P. Berrangé <berrange@redhat.com> wrote:
> On Mon, Apr 04, 2022 at 12:20:14PM +0100, Dr. David Alan Gilbert wrote:
>> * Ilya Leoshkevich (iii@linux.ibm.com) wrote:
>> > zlib_send_prepare() compresses pages of a running VM. zlib does not
>> > make any thread-safety guarantees with respect to changing deflate()
>> > input concurrently with deflate() [1].
>> > 
>> > One can observe problems due to this with the IBM zEnterprise Data
>> > Compression accelerator capable zlib [2]. When the hardware
>> > acceleration is enabled, migration/multifd/tcp/zlib test fails
>> > intermittently [3] due to sliding window corruption.
>> > 
>> > At the moment this problem occurs only with this accelerator, since
>> > its architecture explicitly discourages concurrent accesses [4]:
>> > 
>> >     Page 26-57, "Other Conditions":
>> > 
>> >     As observed by this CPU, other CPUs, and channel
>> >     programs, references to the parameter block, first,
>> >     second, and third operands may be multiple-access
>> >     references, accesses to these storage locations are
>> >     not necessarily block-concurrent, and the sequence
>> >     of these accesses or references is undefined.
>> > 
>> > Still, it might affect other platforms due to a future zlib update.
>> > Therefore, copy the page being compressed into a private buffer before
>> > passing it to zlib.
>> 
>> While this might work around the problem, your explanation doesn't quite
>> fit with the symptoms; or if it does, then you have a separate problem.
>> 
>> The live migration code relies on the fact that the source is running
>> and changing its memory as the data is transmitted; however it also
>> relies on the fact that if this happens the 'dirty' flag is set _after_
>> those changes, causing another round of migration and retransmission of
>> the (now stable) data.
>> 
>> We don't expect the load of the data for the first page write to be
>> correct, consistent etc - we just rely on the retransmission to be
>> correct when the page is stable.
>> 
>> If your compressor hardware is doing something undefined during the
>> first case that's fine; as long as it works fine in the stable case
>> where the data isn't changing.
>> 
>> Adding the extra copy is going to slow everyone else down; and since
>> there's plenty of pthread locking in those multifd threads I'm expecting
>> them to get reasonably defined ordering and thus be safe from
>> multithreading problems (please correct us if we've actually done
>> something wrong in the locking there).
>> 
>> IMHO your accelerator when called from a zlib call needs to behave
>> the same as if it was the software implementation; i.e. if we've got
>> pthread calls in there that are enforcing ordering then that should be
>> fine; your accelerator implementation needs to add a barrier of some
>> type or an internal copy, not penalise everyone else.
>
> It is reasonable to argue that QEMU is relying on undefined behaviour
> when invoking zlib in this case, so it isn't clear that the accelerator
> impl should be changed, rather than QEMU be changed to follow the zlib
> API requirements. 

It works in all the other cases.  My vote, if we need that, is that we
add a zlib-sync or similar method.  zlib already means doing a copy;
doing an extra copy will cost too much in my opinion.

While we are here, is there such a requirement for zstd?  In my
testing, zstd was basically always better than zlib (no, I don't
remember the details).

Later, Juan.




* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-04-04 12:09   ` Ilya Leoshkevich
@ 2022-04-04 17:11     ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 16+ messages in thread
From: Dr. David Alan Gilbert @ 2022-04-04 17:11 UTC (permalink / raw)
  To: Ilya Leoshkevich
  Cc: Peter Maydell, thuth, Christian Borntraeger,
	Daniel P . Berrangé,
	Juan Quintela, jinpu.wang, s.reiter, Cornelia Huck, qemu-devel,
	peterx, qemu-s390x, Philippe Mathieu-Daudé,
	hreitz, f.ebner, Alex Bennée

* Ilya Leoshkevich (iii@linux.ibm.com) wrote:
> On Mon, 2022-04-04 at 12:20 +0100, Dr. David Alan Gilbert wrote:
> > * Ilya Leoshkevich (iii@linux.ibm.com) wrote:
> > > zlib_send_prepare() compresses pages of a running VM. zlib does not
> > > make any thread-safety guarantees with respect to changing
> > > deflate()
> > > input concurrently with deflate() [1].
> > > 
> > > One can observe problems due to this with the IBM zEnterprise Data
> > > Compression accelerator capable zlib [2]. When the hardware
> > > acceleration is enabled, migration/multifd/tcp/zlib test fails
> > > intermittently [3] due to sliding window corruption.
> > > 
> > > At the moment this problem occurs only with this accelerator, since
> > > its architecture explicitly discourages concurrent accesses [4]:
> > > 
> > >     Page 26-57, "Other Conditions":
> > > 
> > >     As observed by this CPU, other CPUs, and channel
> > >     programs, references to the parameter block, first,
> > >     second, and third operands may be multiple-access
> > >     references, accesses to these storage locations are
> > >     not necessarily block-concurrent, and the sequence
> > >     of these accesses or references is undefined.
> > > 
> > > Still, it might affect other platforms due to a future zlib update.
> > > Therefore, copy the page being compressed into a private buffer
> > > before
> > > passing it to zlib.
> > 
> > While this might work around the problem, your explanation doesn't
> > quite fit with the symptoms; or if it does, then you have a separate
> > problem.
> > 
> > The live migration code relies on the fact that the source is running
> > and changing its memory as the data is transmitted; however it also
> > relies on the fact that if this happens the 'dirty' flag is set _after_
> > those changes, causing another round of migration and retransmission of
> > the (now stable) data.
> > 
> > We don't expect the load of the data for the first page write to be
> > correct, consistent etc - we just rely on the retransmission to be
> > correct when the page is stable.
> > 
> > If your compressor hardware is doing something undefined during the
> > first case that's fine; as long as it works fine in the stable case
> > where the data isn't changing.
> > 
> > Adding the extra copy is going to slow everyone else down; and since
> > there's plenty of pthread locking in those multifd threads I'm expecting
> > them to get reasonably defined ordering and thus be safe from
> > multithreading problems (please correct us if we've actually done
> > something wrong in the locking there).
> > 
> > IMHO your accelerator when called from a zlib call needs to behave
> > the same as if it was the software implementation; i.e. if we've got
> > pthread calls in there that are enforcing ordering then that should
> > be
> > fine; your accelerator implementation needs to add a barrier of some
> > type or an internal copy, not penalise everyone else.
> > 
> > Dave
> 
> The problem with the accelerator is that during the first case the
> internal state might end up being corrupted (in particular: what goes
> into the deflate stream differs from what goes into the sliding
> window). This may affect the data integrity in the second case later
> on.

Hmm, I hadn't expected the unpredictability to span multiple blocks.

> I've been trying to think what to do with that, and of course doing an
> internal copy is one option (a barrier won't suffice). However, I
> realized that the zlib API as documented doesn't guarantee that it's safe
> to change input data concurrently with compression. On the other hand,
> today's zlib is implemented in a way that tolerates this.
> 
> So the open question for me is whether we should honor the zlib
> documentation (in which case, I would argue, QEMU needs to be changed)
> or say that the behavior of today's zlib implementation is more
> important (in which case the accelerator code needs to change). I went with
> the former for now, but the latter is of course doable as well.

Well, I think you're saying that the current docs don't specify this,
and thus one should assume that there's a constraint.

I think the right people to answer this are the zlib community; so
can you send a mail to zlib-devel and ask?

Dave

-- 
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK




* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-07-05 17:22   ` Ilya Leoshkevich
@ 2022-07-05 17:32     ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 16+ messages in thread
From: Dr. David Alan Gilbert @ 2022-07-05 17:32 UTC (permalink / raw)
  To: Ilya Leoshkevich; +Cc: Juan Quintela, qemu-devel, Christian Borntraeger

* Ilya Leoshkevich (iii@linux.ibm.com) wrote:
> On Tue, 2022-07-05 at 16:27 +0100, Dr. David Alan Gilbert wrote:
> > * Ilya Leoshkevich (iii@linux.ibm.com) wrote:
> > > zlib_send_prepare() compresses pages of a running VM. zlib does not
> > > make any thread-safety guarantees with respect to changing
> > > deflate()
> > > input concurrently with deflate() [1].
> > > 
> > > One can observe problems due to this with the IBM zEnterprise Data
> > > Compression accelerator capable zlib [2]. When the hardware
> > > acceleration is enabled, migration/multifd/tcp/plain/zlib test
> > > fails
> > > intermittently [3] due to sliding window corruption. The
> > > accelerator's
> > > architecture explicitly discourages concurrent accesses [4]:
> > > 
> > >     Page 26-57, "Other Conditions":
> > > 
> > >     As observed by this CPU, other CPUs, and channel
> > >     programs, references to the parameter block, first,
> > >     second, and third operands may be multiple-access
> > >     references, accesses to these storage locations are
> > >     not necessarily block-concurrent, and the sequence
> > >     of these accesses or references is undefined.
> > > 
> > > Mark Adler pointed out that vanilla zlib performs double fetches
> > > under
> > > certain circumstances as well [5], therefore we need to copy data
> > > before passing it to deflate().
> > 
> > Thanks for fixing that!
> > 
> > > [1] https://zlib.net/manual.html
> > > [2] https://github.com/madler/zlib/pull/410
> > > [3]
> > > https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
> > > [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> > > [5] https://gitlab.com/qemu-project/qemu/-/issues/1099
> > > 
> > > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > > ---
> > > 
> > > v1:
> > > https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg06841.html
> > > v1 -> v2: Rebase, mention Mark Adler's reply in the commit message.
> > > 
> > >  migration/multifd-zlib.c | 35 ++++++++++++++++++++++-------------
> > >  1 file changed, 22 insertions(+), 13 deletions(-)
> > > 
> > > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > > index 3a7ae44485..b6b22b7d1f 100644
> > > --- a/migration/multifd-zlib.c
> > > +++ b/migration/multifd-zlib.c
> > > @@ -27,6 +27,8 @@ struct zlib_data {
> > >      uint8_t *zbuff;
> > >      /* size of compressed buffer */
> > >      uint32_t zbuff_len;
> > > +    /* uncompressed buffer */
> > > +    uint8_t buf[];
> > >  };
> > >  
> > >  /* Multifd zlib compression */
> > > @@ -43,9 +45,18 @@ struct zlib_data {
> > >   */
> > >  static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
> > >  {
> > > -    struct zlib_data *z = g_new0(struct zlib_data, 1);
> > > -    z_stream *zs = &z->zs;
> > > +    /* This is the maximum size of the compressed buffer */
> > > +    uint32_t zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
> > > +    size_t buf_len = qemu_target_page_size();
> > > +    struct zlib_data *z;
> > > +    z_stream *zs;
> > >  
> > > +    z = g_try_malloc0(sizeof(struct zlib_data) + buf_len +
> > > zbuff_len);
> > 
> > So I think this works; but wouldn't life be easier if you just used
> > separate mallocs for the buffers?  You've got a lot of hairy pointer
> > maths below that would go away if they were separate.
> > 
> > Dave
> 
> I was trying to avoid an (IMHO equally hairy) error handling sequence
> here. But I don't mind changing this if an alternative would be more
> maintainable.

It's probably worth trying; I bet it works out a lot simpler.
Remember that g_free(NULL) is safe; so if you want to do a cleanup
of a bunch of pointers you can do:
  g_free(a);
  g_free(b);
  g_free(c);

even if some combination of those hadn't been allocated yet.
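
Along those lines, a sketch of zlib_send_setup() with separate
allocations; this assumes struct zlib_data gains a plain "uint8_t *buf"
member instead of the flexible array, and the error messages are
illustrative:

  static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
  {
      struct zlib_data *z = g_new0(struct zlib_data, 1);
      z_stream *zs = &z->zs;

      zs->zalloc = Z_NULL;
      zs->zfree = Z_NULL;
      zs->opaque = Z_NULL;
      if (deflateInit(zs, migrate_multifd_zlib_level()) != Z_OK) {
          g_free(z);
          error_setg(errp, "multifd %u: deflate init failed", p->id);
          return -1;
      }
      z->zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
      z->zbuff = g_try_malloc(z->zbuff_len);
      z->buf = g_try_malloc(qemu_target_page_size());
      if (!z->zbuff || !z->buf) {
          deflateEnd(zs);
          g_free(z->zbuff);  /* g_free(NULL) is a no-op */
          g_free(z->buf);
          g_free(z);
          error_setg(errp, "multifd %u: out of memory", p->id);
          return -1;
      }
      p->data = z;
      return 0;
  }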

Dave

> Best regards,
> Ilya
> 
-- 
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK




* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-07-05 15:27 ` Dr. David Alan Gilbert
@ 2022-07-05 17:22   ` Ilya Leoshkevich
  2022-07-05 17:32     ` Dr. David Alan Gilbert
  0 siblings, 1 reply; 16+ messages in thread
From: Ilya Leoshkevich @ 2022-07-05 17:22 UTC (permalink / raw)
  To: Dr. David Alan Gilbert; +Cc: Juan Quintela, qemu-devel, Christian Borntraeger

On Tue, 2022-07-05 at 16:27 +0100, Dr. David Alan Gilbert wrote:
> * Ilya Leoshkevich (iii@linux.ibm.com) wrote:
> > zlib_send_prepare() compresses pages of a running VM. zlib does not
> > make any thread-safety guarantees with respect to changing
> > deflate()
> > input concurrently with deflate() [1].
> > 
> > One can observe problems due to this with the IBM zEnterprise Data
> > Compression accelerator capable zlib [2]. When the hardware
> > acceleration is enabled, migration/multifd/tcp/plain/zlib test
> > fails
> > intermittently [3] due to sliding window corruption. The
> > accelerator's
> > architecture explicitly discourages concurrent accesses [4]:
> > 
> >     Page 26-57, "Other Conditions":
> > 
> >     As observed by this CPU, other CPUs, and channel
> >     programs, references to the parameter block, first,
> >     second, and third operands may be multiple-access
> >     references, accesses to these storage locations are
> >     not necessarily block-concurrent, and the sequence
> >     of these accesses or references is undefined.
> > 
> > Mark Adler pointed out that vanilla zlib performs double fetches
> > under
> > certain circumstances as well [5], therefore we need to copy data
> > before passing it to deflate().
> 
> Thanks for fixing that!
> 
> > [1] https://zlib.net/manual.html
> > [2] https://github.com/madler/zlib/pull/410
> > [3]
> > https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
> > [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> > [5] https://gitlab.com/qemu-project/qemu/-/issues/1099
> > 
> > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > ---
> > 
> > v1:
> > https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg06841.html
> > v1 -> v2: Rebase, mention Mark Adler's reply in the commit message.
> > 
> >  migration/multifd-zlib.c | 35 ++++++++++++++++++++++-------------
> >  1 file changed, 22 insertions(+), 13 deletions(-)
> > 
> > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > index 3a7ae44485..b6b22b7d1f 100644
> > --- a/migration/multifd-zlib.c
> > +++ b/migration/multifd-zlib.c
> > @@ -27,6 +27,8 @@ struct zlib_data {
> >      uint8_t *zbuff;
> >      /* size of compressed buffer */
> >      uint32_t zbuff_len;
> > +    /* uncompressed buffer */
> > +    uint8_t buf[];
> >  };
> >  
> >  /* Multifd zlib compression */
> > @@ -43,9 +45,18 @@ struct zlib_data {
> >   */
> >  static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
> >  {
> > -    struct zlib_data *z = g_new0(struct zlib_data, 1);
> > -    z_stream *zs = &z->zs;
> > +    /* This is the maximum size of the compressed buffer */
> > +    uint32_t zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
> > +    size_t buf_len = qemu_target_page_size();
> > +    struct zlib_data *z;
> > +    z_stream *zs;
> >  
> > +    z = g_try_malloc0(sizeof(struct zlib_data) + buf_len +
> > zbuff_len);
> 
> So I think this works; but wouldn't life be easier if you just used
> separate mallocs for the buffers?  You've got a lot of hairy pointer
> maths below that would go away if they were separate.
> 
> Dave

I was trying to avoid an (IMHO equally hairy) error handling sequence
here. But I don't mind changing this if an alternative would be more
maintainable.

Best regards,
Ilya



* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-07-05 16:27     ` Christian Borntraeger
@ 2022-07-05 16:33       ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 16+ messages in thread
From: Dr. David Alan Gilbert @ 2022-07-05 16:33 UTC (permalink / raw)
  To: Christian Borntraeger, zlib
  Cc: Peter Maydell, Ilya Leoshkevich, Juan Quintela, qemu-devel

* Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> On 05.07.22 at 18:16, Dr. David Alan Gilbert wrote:
> > * Peter Maydell (peter.maydell@linaro.org) wrote:
> > > On Mon, 4 Jul 2022 at 17:43, Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> > > > 
> > > > zlib_send_prepare() compresses pages of a running VM. zlib does not
> > > > make any thread-safety guarantees with respect to changing deflate()
> > > > input concurrently with deflate() [1].
> > > > 
> > > > One can observe problems due to this with the IBM zEnterprise Data
> > > > Compression accelerator capable zlib [2]. When the hardware
> > > > acceleration is enabled, migration/multifd/tcp/plain/zlib test fails
> > > > intermittently [3] due to sliding window corruption. The accelerator's
> > > > architecture explicitly discourages concurrent accesses [4]:
> > > > 
> > > >      Page 26-57, "Other Conditions":
> > > > 
> > > >      As observed by this CPU, other CPUs, and channel
> > > >      programs, references to the parameter block, first,
> > > >      second, and third operands may be multiple-access
> > > >      references, accesses to these storage locations are
> > > >      not necessarily block-concurrent, and the sequence
> > > >      of these accesses or references is undefined.
> > > > 
> > > > Mark Adler pointed out that vanilla zlib performs double fetches under
> > > > certain circumstances as well [5], therefore we need to copy data
> > > > before passing it to deflate().
> > > > 
> > > > [1] https://zlib.net/manual.html
> > > > [2] https://github.com/madler/zlib/pull/410
> > > > [3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
> > > > [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> > > > [5] https://gitlab.com/qemu-project/qemu/-/issues/1099
> > > 
> > > Is this [5] the wrong link? It's to our issue tracker, not zlib's
> > > or a zlib mailing list thread, and it doesn't contain any messages
> > > from Mark Adler.
> > 
> > Looking at Mark's message, I'm not seeing that it was cc'd to the lists.
> > I did however ask him to update zlib's docs to describe the requirement.
> 
> 
> Can you maybe forward the message here?

Let's see, it looks OK from the content; here's a copy of my reply with
the thread in it.  I've added Mark to the cc here so he knows.

Dave

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
* Mark Adler (zlib@madler.net) wrote:
> Dave,
> 
> d), which should also result in an invalid check value (CRC-32 or Adler-32). I suppose you could call that b).
> 
> To get c), the input data would need to be read exactly once. However there is a case in deflate when writing a stored block where the input is accessed twice — once to copy to the output, and then a second time to fill in the sliding window. If the data changes between those two, then the sliding window does not reflect the data written, which can propagate to incorrect matches downstream of the modified data.
> 
> That is the only place I see that. The impact would usually be c), but if you are trying to compress incompressible data followed by compressible data, you will have stored blocks followed by dynamic blocks with matches to the incorrect data. Your testing would likely not expose that. (I tried to compile the linked test, but went down a rat hole to find include files and gave up.)

OK - thanks for your clarification!
I've created:
  https://gitlab.com/qemu-project/qemu/-/issues/1099

as a reminder we need to fix this in qemu somewhere.

Could you please add a note to the zlib docs somewhere to make this
explicit.

Thanks again,

Dave

> Mark
> 
> 
> > On Jun 30, 2022, at 9:26 AM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > 
> > * Mark Adler (zlib@madler.net <mailto:zlib@madler.net>) wrote:
> >> Ilya,
> >> 
> >> What exactly do you mean by “concurrently”? What is an example of this?
> > 
> > In qemu's live migration we have a thread that shuffles the contents of
> > guest memory out over the network. The guest is still
> > running at the time and changing the contents of the memory we're
> > saving.
> > Fortunately we keep a 'modified' flag so that if the guest does modify
> > it while we're saving, we know about it and will send the block again.
> > 
> > Zlib (and zstd) have recently been forcibly inserted into this; so zlib
> > may be compressing a page of memory that changes.
> > 
> >> If you mean modifying the data provided to deflate() before deflate() has returned, then that is certainly not safe.
> > 
> > So a question is what does 'not safe' mean:
> > a) It explodes and segs
> > b) It produces an invalid stream
> > c) It produces a valid stream but the data for the modified block may
> > be garbage
> > d) It produces a valid stream but the data for the modified block and
> > other blocks may be garbage.
> > 
> > The qemu live migration code is happy with (c) because it'll retransmit
> > a stable block later. So far with the software zlib libraries we've
> > seen that be fine; I think Ilya is finding something like (b) or (d) on
> > their compression hardware.
> > 
> > Dave
> > 
> >> 
> >> Mark
> >> 
> >> 
> >>> On Jun 22, 2022, at 2:04 AM, Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> >>> 
> >>> [resending with a smaller cc: list in order to pass the
> >>> zlib-devel mailing list moderation process]
> >>> 
> >>> Hello zlib developers,
> >>> 
> >>> I've been investigating a problem in the QEMU test suite on IBM Z [1]
> >>> [2] in connection with the IBM Z compression accelerator patch [3].
> >>> 
> >>> The problem is that a QEMU thread compresses data that is being
> >>> modified by another QEMU thread. The zlib manual [4] does not state
> >>> that this is safe; however, the current stable zlib in fact tolerates it.
> >>> 
> >>> The accelerator, however, does not: not only does what it compresses
> >>> end up being unpredictable - QEMU actually resolves this just fine -
> >>> but the accelerator's internal state also ends up being corrupted.
> >>> 
> >>> I have a design question in connection to this: does zlib guarantee
> >>> that modifying deflate() input concurrently with deflate() is safe?
> >>> Or does it reserve the right to change this in the future versions?
> >>> 
> >>> Cc:ing zlib-ng folks for their opinion as well.
> >>> 
> >>> [1] https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg06841.html
> >>> [2] https://lists.gnu.org/archive/html/qemu-devel/2022-04/msg00329.html
> >>> [3] https://github.com/madler/zlib/pull/410
> >>> [4] https://zlib.net/manual.html
> >>> 
> >>> Best regards,
> >>> Ilya
> >> 
> > -- 
> > Dr. David Alan Gilbert / dgilbert@redhat.com <mailto:dgilbert@redhat.com> / Manchester, UK
> 
<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-- 
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK




* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-07-05 16:16   ` Dr. David Alan Gilbert
@ 2022-07-05 16:27     ` Christian Borntraeger
  2022-07-05 16:33       ` Dr. David Alan Gilbert
  0 siblings, 1 reply; 16+ messages in thread
From: Christian Borntraeger @ 2022-07-05 16:27 UTC (permalink / raw)
  To: Dr. David Alan Gilbert, Peter Maydell
  Cc: Ilya Leoshkevich, Juan Quintela, qemu-devel

On 05.07.22 at 18:16, Dr. David Alan Gilbert wrote:
> * Peter Maydell (peter.maydell@linaro.org) wrote:
>> On Mon, 4 Jul 2022 at 17:43, Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>>>
>>> zlib_send_prepare() compresses pages of a running VM. zlib does not
>>> make any thread-safety guarantees with respect to changing deflate()
>>> input concurrently with deflate() [1].
>>>
>>> One can observe problems due to this with the IBM zEnterprise Data
>>> Compression accelerator capable zlib [2]. When the hardware
>>> acceleration is enabled, migration/multifd/tcp/plain/zlib test fails
>>> intermittently [3] due to sliding window corruption. The accelerator's
>>> architecture explicitly discourages concurrent accesses [4]:
>>>
>>>      Page 26-57, "Other Conditions":
>>>
>>>      As observed by this CPU, other CPUs, and channel
>>>      programs, references to the parameter block, first,
>>>      second, and third operands may be multiple-access
>>>      references, accesses to these storage locations are
>>>      not necessarily block-concurrent, and the sequence
>>>      of these accesses or references is undefined.
>>>
>>> Mark Adler pointed out that vanilla zlib performs double fetches under
>>> certain circumstances as well [5], therefore we need to copy data
>>> before passing it to deflate().
>>>
>>> [1] https://zlib.net/manual.html
>>> [2] https://github.com/madler/zlib/pull/410
>>> [3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
>>> [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
>>> [5] https://gitlab.com/qemu-project/qemu/-/issues/1099
>>
>> Is this [5] the wrong link? It's to our issue tracker, not zlib's
>> or a zlib mailing list thread, and it doesn't contain any messages
>> from Mark Adler.
> 
> Looking at Mark's message, I'm not seeing that it was cc'd to the lists.
> I did however ask him to update zlib's docs to describe the requirement.


Can you maybe forward the message here?



* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-07-05 16:00 ` Peter Maydell
@ 2022-07-05 16:16   ` Dr. David Alan Gilbert
  2022-07-05 16:27     ` Christian Borntraeger
  0 siblings, 1 reply; 16+ messages in thread
From: Dr. David Alan Gilbert @ 2022-07-05 16:16 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Ilya Leoshkevich, Juan Quintela, qemu-devel, Christian Borntraeger

* Peter Maydell (peter.maydell@linaro.org) wrote:
> On Mon, 4 Jul 2022 at 17:43, Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> >
> > zlib_send_prepare() compresses pages of a running VM. zlib does not
> > make any thread-safety guarantees with respect to changing deflate()
> > input concurrently with deflate() [1].
> >
> > One can observe problems due to this with the IBM zEnterprise Data
> > Compression accelerator capable zlib [2]. When the hardware
> > acceleration is enabled, migration/multifd/tcp/plain/zlib test fails
> > intermittently [3] due to sliding window corruption. The accelerator's
> > architecture explicitly discourages concurrent accesses [4]:
> >
> >     Page 26-57, "Other Conditions":
> >
> >     As observed by this CPU, other CPUs, and channel
> >     programs, references to the parameter block, first,
> >     second, and third operands may be multiple-access
> >     references, accesses to these storage locations are
> >     not necessarily block-concurrent, and the sequence
> >     of these accesses or references is undefined.
> >
> > Mark Adler pointed out that vanilla zlib performs double fetches under
> > certain circumstances as well [5], therefore we need to copy data
> > before passing it to deflate().
> >
> > [1] https://zlib.net/manual.html
> > [2] https://github.com/madler/zlib/pull/410
> > [3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
> > [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> > [5] https://gitlab.com/qemu-project/qemu/-/issues/1099
> 
> Is this [5] the wrong link? It's to our issue tracker, not zlib's
> or a zlib mailing list thread, and it doesn't contain any messages
> from Mark Adler.

Looking at Mark's message, I'm not seeing that it was cc'd to the lists.
I did however ask him to update zlib's docs to describe the requirement.

Dave

> thanks
> -- PMM
> 
-- 
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK




* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-07-04 16:41 Ilya Leoshkevich
  2022-07-04 16:51 ` Juan Quintela
  2022-07-05 15:27 ` Dr. David Alan Gilbert
@ 2022-07-05 16:00 ` Peter Maydell
  2022-07-05 16:16   ` Dr. David Alan Gilbert
  2 siblings, 1 reply; 16+ messages in thread
From: Peter Maydell @ 2022-07-05 16:00 UTC (permalink / raw)
  To: Ilya Leoshkevich
  Cc: Juan Quintela, Dr. David Alan Gilbert, qemu-devel, Christian Borntraeger

On Mon, 4 Jul 2022 at 17:43, Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>
> zlib_send_prepare() compresses pages of a running VM. zlib does not
> make any thread-safety guarantees with respect to changing deflate()
> input concurrently with deflate() [1].
>
> One can observe problems due to this with the IBM zEnterprise Data
> Compression accelerator capable zlib [2]. When the hardware
> acceleration is enabled, migration/multifd/tcp/plain/zlib test fails
> intermittently [3] due to sliding window corruption. The accelerator's
> architecture explicitly discourages concurrent accesses [4]:
>
>     Page 26-57, "Other Conditions":
>
>     As observed by this CPU, other CPUs, and channel
>     programs, references to the parameter block, first,
>     second, and third operands may be multiple-access
>     references, accesses to these storage locations are
>     not necessarily block-concurrent, and the sequence
>     of these accesses or references is undefined.
>
> Mark Adler pointed out that vanilla zlib performs double fetches under
> certain circumstances as well [5], therefore we need to copy data
> before passing it to deflate().
>
> [1] https://zlib.net/manual.html
> [2] https://github.com/madler/zlib/pull/410
> [3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
> [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> [5] https://gitlab.com/qemu-project/qemu/-/issues/1099

Is this [5] the wrong link? It's to our issue tracker, not zlib's
or a zlib mailing list thread, and it doesn't contain any messages
from Mark Adler.

thanks
-- PMM



* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-07-04 16:41 Ilya Leoshkevich
  2022-07-04 16:51 ` Juan Quintela
@ 2022-07-05 15:27 ` Dr. David Alan Gilbert
  2022-07-05 17:22   ` Ilya Leoshkevich
  2022-07-05 16:00 ` Peter Maydell
  2 siblings, 1 reply; 16+ messages in thread
From: Dr. David Alan Gilbert @ 2022-07-05 15:27 UTC (permalink / raw)
  To: Ilya Leoshkevich; +Cc: Juan Quintela, qemu-devel, Christian Borntraeger

* Ilya Leoshkevich (iii@linux.ibm.com) wrote:
> zlib_send_prepare() compresses pages of a running VM. zlib does not
> make any thread-safety guarantees with respect to changing deflate()
> input concurrently with deflate() [1].
> 
> One can observe problems due to this with the IBM zEnterprise Data
> Compression accelerator capable zlib [2]. When the hardware
> acceleration is enabled, migration/multifd/tcp/plain/zlib test fails
> intermittently [3] due to sliding window corruption. The accelerator's
> architecture explicitly discourages concurrent accesses [4]:
> 
>     Page 26-57, "Other Conditions":
> 
>     As observed by this CPU, other CPUs, and channel
>     programs, references to the parameter block, first,
>     second, and third operands may be multiple-access
>     references, accesses to these storage locations are
>     not necessarily block-concurrent, and the sequence
>     of these accesses or references is undefined.
> 
> Mark Adler pointed out that vanilla zlib performs double fetches under
> certain circumstances as well [5], therefore we need to copy data
> before passing it to deflate().

Thanks for fixing that!

> [1] https://zlib.net/manual.html
> [2] https://github.com/madler/zlib/pull/410
> [3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
> [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> [5] https://gitlab.com/qemu-project/qemu/-/issues/1099
> 
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
> 
> v1: https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg06841.html
> v1 -> v2: Rebase, mention Mark Adler's reply in the commit message.
> 
>  migration/multifd-zlib.c | 35 ++++++++++++++++++++++-------------
>  1 file changed, 22 insertions(+), 13 deletions(-)
> 
> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> index 3a7ae44485..b6b22b7d1f 100644
> --- a/migration/multifd-zlib.c
> +++ b/migration/multifd-zlib.c
> @@ -27,6 +27,8 @@ struct zlib_data {
>      uint8_t *zbuff;
>      /* size of compressed buffer */
>      uint32_t zbuff_len;
> +    /* uncompressed buffer */
> +    uint8_t buf[];
>  };
>  
>  /* Multifd zlib compression */
> @@ -43,9 +45,18 @@ struct zlib_data {
>   */
>  static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
>  {
> -    struct zlib_data *z = g_new0(struct zlib_data, 1);
> -    z_stream *zs = &z->zs;
> +    /* This is the maximum size of the compressed buffer */
> +    uint32_t zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
> +    size_t buf_len = qemu_target_page_size();
> +    struct zlib_data *z;
> +    z_stream *zs;
>  
> +    z = g_try_malloc0(sizeof(struct zlib_data) + buf_len + zbuff_len);

So I think this works; but wouldn't life be easier if you just used
separate mallocs for the buffers?  You've got a lot of hairy pointer
maths below that would go away if they were separate.
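
Just to illustrate what I mean (a rough sketch only, assuming z->buf
were declared as a plain "uint8_t *" member instead of a flexible
array member, and keeping the deflateInit() call above it as before):

    /* hypothetical: two independent allocations, no pointer arithmetic */
    z->zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
    z->zbuff = g_try_malloc(z->zbuff_len);
    z->buf = g_try_malloc(qemu_target_page_size());
    if (!z->zbuff || !z->buf) {
        deflateEnd(&z->zs);
        g_free(z->zbuff);   /* g_free(NULL) is a no-op */
        g_free(z->buf);
        g_free(z);
        error_setg(errp, "multifd %u: out of memory for buffers", p->id);
        return -1;
    }

The cleanup function would then just free both buffers explicitly,
much like the pre-patch code did for zbuff.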

Dave

> +    if (!z) {
> +        error_setg(errp, "multifd %u: out of memory for zlib_data", p->id);
> +        return -1;
> +    }
> +    zs = &z->zs;
>      zs->zalloc = Z_NULL;
>      zs->zfree = Z_NULL;
>      zs->opaque = Z_NULL;
> @@ -54,15 +65,8 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
>          error_setg(errp, "multifd %u: deflate init failed", p->id);
>          return -1;
>      }
> -    /* This is the maxium size of the compressed buffer */
> -    z->zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
> -    z->zbuff = g_try_malloc(z->zbuff_len);
> -    if (!z->zbuff) {
> -        deflateEnd(&z->zs);
> -        g_free(z);
> -        error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
> -        return -1;
> -    }
> +    z->zbuff_len = zbuff_len;
> +    z->zbuff = z->buf + buf_len;
>      p->data = z;
>      return 0;
>  }
> @@ -80,7 +84,6 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
>      struct zlib_data *z = p->data;
>  
>      deflateEnd(&z->zs);
> -    g_free(z->zbuff);
>      z->zbuff = NULL;
>      g_free(p->data);
>      p->data = NULL;
> @@ -114,8 +117,14 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>              flush = Z_SYNC_FLUSH;
>          }
>  
> +        /*
> +         * Since the VM might be running, the page may be changing concurrently
> +         * with compression. zlib does not guarantee that this is safe,
> +         * therefore copy the page before calling deflate().
> +         */
> +        memcpy(z->buf, p->pages->block->host + p->normal[i], page_size);
>          zs->avail_in = page_size;
> -        zs->next_in = p->pages->block->host + p->normal[i];
> +        zs->next_in = z->buf;
>  
>          zs->avail_out = available;
>          zs->next_out = z->zbuff + out_size;
> -- 
> 2.35.3
> 
-- 
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK




* Re: [PATCH] multifd: Copy pages before compressing them with zlib
  2022-07-04 16:41 Ilya Leoshkevich
@ 2022-07-04 16:51 ` Juan Quintela
  2022-07-05 15:27 ` Dr. David Alan Gilbert
  2022-07-05 16:00 ` Peter Maydell
  2 siblings, 0 replies; 16+ messages in thread
From: Juan Quintela @ 2022-07-04 16:51 UTC (permalink / raw)
  To: Ilya Leoshkevich
  Cc: Dr. David Alan Gilbert, qemu-devel, Christian Borntraeger

Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> zlib_send_prepare() compresses pages of a running VM. zlib does not
> make any thread-safety guarantees with respect to changing deflate()
> input concurrently with deflate() [1].
>
> One can observe problems due to this with the IBM zEnterprise Data
> Compression accelerator capable zlib [2]. When the hardware
> acceleration is enabled, migration/multifd/tcp/plain/zlib test fails
> intermittently [3] due to sliding window corruption. The accelerator's
> architecture explicitly discourages concurrent accesses [4]:
>
>     Page 26-57, "Other Conditions":
>
>     As observed by this CPU, other CPUs, and channel
>     programs, references to the parameter block, first,
>     second, and third operands may be multiple-access
>     references, accesses to these storage locations are
>     not necessarily block-concurrent, and the sequence
>     of these accesses or references is undefined.
>
> Mark Adler pointed out that vanilla zlib performs double fetches under
> certain circumstances as well [5], therefore we need to copy data
> before passing it to deflate().
>
> [1] https://zlib.net/manual.html
> [2] https://github.com/madler/zlib/pull/410
> [3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
> [4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
> [5] https://gitlab.com/qemu-project/qemu/-/issues/1099
>
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>

Reviewed-by: Juan Quintela <quintela@redhat.com>

And now I wonder if we need this for zstd as well.

That said, the (non-multifd) compression code has always operated the
other way, sniff.
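
If zstd does turn out to need the same treatment, the change would
presumably mirror this one. A hypothetical sketch for
zstd_send_prepare() in migration/multifd-zstd.c, assuming a page-sized
staging buffer z->buf were added to struct zstd_data and that its
ZSTD_inBuffer setup otherwise stays as is:

    /*
     * Hypothetical: stage the page so that ZSTD_compressStream2()
     * never reads guest memory that the running VM may be rewriting.
     */
    memcpy(z->buf, p->pages->block->host + p->normal[i], page_size);
    z->in.src = z->buf;
    z->in.size = page_size;
    z->in.pos = 0;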




* [PATCH] multifd: Copy pages before compressing them with zlib
@ 2022-07-04 16:41 Ilya Leoshkevich
  2022-07-04 16:51 ` Juan Quintela
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Ilya Leoshkevich @ 2022-07-04 16:41 UTC (permalink / raw)
  To: Juan Quintela, Dr. David Alan Gilbert
  Cc: qemu-devel, Christian Borntraeger, Ilya Leoshkevich

zlib_send_prepare() compresses pages of a running VM. zlib does not
make any thread-safety guarantees with respect to changing deflate()
input concurrently with deflate() [1].

One can observe problems due to this with the IBM zEnterprise Data
Compression accelerator capable zlib [2]. When the hardware
acceleration is enabled, migration/multifd/tcp/plain/zlib test fails
intermittently [3] due to sliding window corruption. The accelerator's
architecture explicitly discourages concurrent accesses [4]:

    Page 26-57, "Other Conditions":

    As observed by this CPU, other CPUs, and channel
    programs, references to the parameter block, first,
    second, and third operands may be multiple-access
    references, accesses to these storage locations are
    not necessarily block-concurrent, and the sequence
    of these accesses or references is undefined.

Mark Adler pointed out that vanilla zlib performs double fetches under
certain circumstances as well [5], therefore we need to copy data
before passing it to deflate().

[1] https://zlib.net/manual.html
[2] https://github.com/madler/zlib/pull/410
[3] https://lists.nongnu.org/archive/html/qemu-devel/2022-03/msg03988.html
[4] http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf
[5] https://gitlab.com/qemu-project/qemu/-/issues/1099

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---

v1: https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg06841.html
v1 -> v2: Rebase, mention Mark Adler's reply in the commit message.

 migration/multifd-zlib.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 3a7ae44485..b6b22b7d1f 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -27,6 +27,8 @@ struct zlib_data {
     uint8_t *zbuff;
     /* size of compressed buffer */
     uint32_t zbuff_len;
+    /* uncompressed buffer */
+    uint8_t buf[];
 };
 
 /* Multifd zlib compression */
@@ -43,9 +45,18 @@ struct zlib_data {
  */
 static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
 {
-    struct zlib_data *z = g_new0(struct zlib_data, 1);
-    z_stream *zs = &z->zs;
+    /* This is the maximum size of the compressed buffer */
+    uint32_t zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
+    size_t buf_len = qemu_target_page_size();
+    struct zlib_data *z;
+    z_stream *zs;
 
+    z = g_try_malloc0(sizeof(struct zlib_data) + buf_len + zbuff_len);
+    if (!z) {
+        error_setg(errp, "multifd %u: out of memory for zlib_data", p->id);
+        return -1;
+    }
+    zs = &z->zs;
     zs->zalloc = Z_NULL;
     zs->zfree = Z_NULL;
     zs->opaque = Z_NULL;
@@ -54,15 +65,8 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
         error_setg(errp, "multifd %u: deflate init failed", p->id);
         return -1;
     }
-    /* This is the maxium size of the compressed buffer */
-    z->zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
-    z->zbuff = g_try_malloc(z->zbuff_len);
-    if (!z->zbuff) {
-        deflateEnd(&z->zs);
-        g_free(z);
-        error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
-        return -1;
-    }
+    z->zbuff_len = zbuff_len;
+    z->zbuff = z->buf + buf_len;
     p->data = z;
     return 0;
 }
@@ -80,7 +84,6 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
     struct zlib_data *z = p->data;
 
     deflateEnd(&z->zs);
-    g_free(z->zbuff);
     z->zbuff = NULL;
     g_free(p->data);
     p->data = NULL;
@@ -114,8 +117,14 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
             flush = Z_SYNC_FLUSH;
         }
 
+        /*
+         * Since the VM might be running, the page may be changing concurrently
+         * with compression. zlib does not guarantee that this is safe,
+         * therefore copy the page before calling deflate().
+         */
+        memcpy(z->buf, p->pages->block->host + p->normal[i], page_size);
         zs->avail_in = page_size;
-        zs->next_in = p->pages->block->host + p->normal[i];
+        zs->next_in = z->buf;
 
         zs->avail_out = available;
         zs->next_out = z->zbuff + out_size;
-- 
2.35.3





Thread overview: 16+ messages
2022-03-29 15:21 [PATCH] multifd: Copy pages before compressing them with zlib Ilya Leoshkevich
2022-03-30 14:35 ` Christian Borntraeger
2022-04-04 11:20 ` Dr. David Alan Gilbert
2022-04-04 12:09   ` Ilya Leoshkevich
2022-04-04 17:11     ` Dr. David Alan Gilbert
2022-04-04 12:45   ` Daniel P. Berrangé
2022-04-04 13:55     ` Juan Quintela
2022-07-04 16:41 Ilya Leoshkevich
2022-07-04 16:51 ` Juan Quintela
2022-07-05 15:27 ` Dr. David Alan Gilbert
2022-07-05 17:22   ` Ilya Leoshkevich
2022-07-05 17:32     ` Dr. David Alan Gilbert
2022-07-05 16:00 ` Peter Maydell
2022-07-05 16:16   ` Dr. David Alan Gilbert
2022-07-05 16:27     ` Christian Borntraeger
2022-07-05 16:33       ` Dr. David Alan Gilbert
