Re: [PATCH v11 3/4] qcow2: add zstd cluster compression

From: Denis Plotnikov <dplotnikov@virtuozzo.com>
To: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>,
	qemu-devel@nongnu.org
Cc: kwolf@redhat.com, berto@igalia.com, qemu-block@nongnu.org,
	armbru@redhat.com, mreitz@redhat.com, den@openvz.org
Subject: Re: [PATCH v11 3/4] qcow2: add zstd cluster compression
Date: Tue, 31 Mar 2020 10:55:59 +0300	[thread overview]
Message-ID: <9bc60573-975e-2c75-6c65-fc923a845538@virtuozzo.com> (raw)
In-Reply-To: <08227565-8b4a-f496-0d66-af339d0bd667@virtuozzo.com>


On 31.03.2020 09:22, Vladimir Sementsov-Ogievskiy wrote:
> 30.03.2020 18:04, Denis Plotnikov wrote:
>>
>>
>> On 30.03.2020 16:14, Vladimir Sementsov-Ogievskiy wrote:
>>> 30.03.2020 12:54, Denis Plotnikov wrote:
>>>> zstd significantly reduces cluster compression time.
>>>> It provides better compression performance maintaining
>>>> the same level of the compression ratio in comparison with
>>>> zlib, which, at the moment, is the only compression
>>>> method available.
>>>>
>>>> The performance test results:
>>>> Test compresses and decompresses qemu qcow2 image with just
>>>> installed rhel-7.6 guest.
>>>> Image cluster size: 64K. Image on disk size: 2.2G
>>>>
>>>> The test was conducted with brd disk to reduce the influence
>>>> of disk subsystem to the test results.
>>>> The results is given in seconds.
>>>>
>>>> compress cmd:
>>>>    time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd]
>>>>                    src.img [zlib|zstd]_compressed.img
>>>> decompress cmd
>>>>    time ./qemu-img convert -O qcow2
>>>>                    [zlib|zstd]_compressed.img uncompressed.img
>>>>
>>>>             compression               decompression
>>>>           zlib       zstd           zlib         zstd
>>>> ------------------------------------------------------------
>>>> real     65.5       16.3 (-75 %)    1.9          1.6 (-16 %)
>>>> user     65.0       15.8            5.3          2.5
>>>> sys       3.3        0.2            2.0          2.0
>>>>
>>>> Both ZLIB and ZSTD gave the same compression ratio: 1.57
>>>> compressed image size in both cases: 1.4G
>>>>
>>>> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
>>>> QAPI part:
>>>> Acked-by: Markus Armbruster <armbru@redhat.com>
>>>> ---
>>>>   docs/interop/qcow2.txt |   1 +
>>>>   configure              |   2 +-
>>>>   qapi/block-core.json   |   3 +-
>>>>   block/qcow2-threads.c  | 138 
>>>> +++++++++++++++++++++++++++++++++++++++++
>>>>   block/qcow2.c          |   7 +++
>>>>   5 files changed, 149 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt
>>>> index 5597e24474..795dbb21dd 100644
>>>> --- a/docs/interop/qcow2.txt
>>>> +++ b/docs/interop/qcow2.txt
>>>> @@ -208,6 +208,7 @@ version 2.
>>>>                         Available compression type values:
>>>>                           0: zlib <https://www.zlib.net/>
>>>> +                        1: zstd <http://github.com/facebook/zstd>
>>>>       === Header padding ===
>>>> diff --git a/configure b/configure
>>>> index da09c35895..57cac2abe1 100755
>>>> --- a/configure
>>>> +++ b/configure
>>>> @@ -1861,7 +1861,7 @@ disabled with --disable-FEATURE, default is 
>>>> enabled if available:
>>>>     lzfse           support of lzfse compression library
>>>>                     (for reading lzfse-compressed dmg images)
>>>>     zstd            support for zstd compression library
>>>> -                  (for migration compression)
>>>> +                  (for migration compression and qcow2 cluster 
>>>> compression)
>>>>     seccomp         seccomp support
>>>>     coroutine-pool  coroutine freelist (better performance)
>>>>     glusterfs       GlusterFS backend
>>>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>>>> index d669ec0965..44690ed266 100644
>>>> --- a/qapi/block-core.json
>>>> +++ b/qapi/block-core.json
>>>> @@ -4293,11 +4293,12 @@
>>>>   # Compression type used in qcow2 image file
>>>>   #
>>>>   # @zlib: zlib compression, see <http://zlib.net/>
>>>> +# @zstd: zstd compression, see <http://github.com/facebook/zstd>
>>>>   #
>>>>   # Since: 5.0
>>>>   ##
>>>>   { 'enum': 'Qcow2CompressionType',
>>>> -  'data': [ 'zlib' ] }
>>>> +  'data': [ 'zlib', { 'name': 'zstd', 'if': 'defined(CONFIG_ZSTD)' 
>>>> } ] }
>>>>     ##
>>>>   # @BlockdevCreateOptionsQcow2:
>>>> diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c
>>>> index 7dbaf53489..b8ccd97009 100644
>>>> --- a/block/qcow2-threads.c
>>>> +++ b/block/qcow2-threads.c
>>>> @@ -28,6 +28,11 @@
>>>>   #define ZLIB_CONST
>>>>   #include <zlib.h>
>>>>   +#ifdef CONFIG_ZSTD
>>>> +#include <zstd.h>
>>>> +#include <zstd_errors.h>
>>>> +#endif
>>>> +
>>>>   #include "qcow2.h"
>>>>   #include "block/thread-pool.h"
>>>>   #include "crypto.h"
>>>> @@ -166,6 +171,129 @@ static ssize_t qcow2_zlib_decompress(void 
>>>> *dest, size_t dest_size,
>>>>       return ret;
>>>>   }
>>>>   +#ifdef CONFIG_ZSTD
>>>> +
>>>> +/*
>>>> + * qcow2_zstd_compress()
>>>> + *
>>>> + * Compress @src_size bytes of data using zstd compression method
>>>> + *
>>>> + * @dest - destination buffer, @dest_size bytes
>>>> + * @src - source buffer, @src_size bytes
>>>> + *
>>>> + * Returns: compressed size on success
>>>> + *          -ENOMEM destination buffer is not enough to store 
>>>> compressed data
>>>> + *          -EIO    on any other error
>>>> + */
>>>> +static ssize_t qcow2_zstd_compress(void *dest, size_t dest_size,
>>>> +                                   const void *src, size_t src_size)
>>>> +{
>>>> +    size_t ret;
>>>> +    ZSTD_outBuffer output = { dest, dest_size, 0 };
>>>> +    ZSTD_inBuffer input = { src, src_size, 0 };
>>>> +    ZSTD_CCtx *cctx = ZSTD_createCCtx();
>>>> +
>>>> +    if (!cctx) {
>>>> +        return -EIO;
>>>> +    }
>>>> +    /*
>>>> +     * ZSTD spec: "You must continue calling ZSTD_compressStream2()
>>>> +     * with ZSTD_e_end until it returns 0, at which point you are
>>>> +     * free to start a new frame".
>>>> +     * In the loop, we try to compress all the data into one zstd 
>>>> frame.
>>>> +     * ZSTD_compressStream2 can potentially finish a frame earlier
>>>> +     * than the full input data is consumed. That's why we are 
>>>> looping
>>>> +     * until all the input data is consumed.
>>>> +     */
>>>> +    while (input.pos < input.size) {
>>>> +        /*
>>>> +         * zstd simple interface requires the exact compressed size.
>>>
>>> on decompression you mean
>>>
>>>> +         * zstd stream interface reads the comressed size from
>>>> +         * the compressed stream frame.
>>>
>>> compressed size is not stored in the stream. I think, that streamed
>>> interface just decompress until it have something to decompress and
>>> have space to write output.
>>>
>>>> +         * Instruct zstd to compress the whole buffer and write
>>>> +         * the frame which includes the compressed size.
>>>
>>> I think this is wrong. compression size is not written.
>> Ok
>>>
>>>> +         * This allows as to use zstd streaming semantics and
>>>> +         * don't store the compressed size for the zstd 
>>>> decompression.
>>>> +         */
>>>
>>> Comment is just outdated. Accordingly to our off-list discussion, I'd
>>> rewrite it like this:
>>>
>>> We want to use streamed interface on decompression, as we will not know
>>> exact size of compression data. 
>> This one is better then mine.
>>> Use streamed interface for compression
>>> just for symmetry.
>> I think this one is exceeding. If we have stream family functions on 
>> both sides this won't rise any questions from the reader.
>>
>>>
>>>
>>>> +        ret = ZSTD_compressStream2(cctx, &output, &input, 
>>>> ZSTD_e_end);
>>>> +        if (ZSTD_isError(ret)) {
>>>> +            ret = -EIO;
>>>> +            goto out;
>>>> +        }
>>>> +        /* Dest buffer isn't big enough to store compressed 
>>>> content */
>>>> +        if (output.pos + ret > output.size) {
>>>> +            ret = -ENOMEM;
>>>> +            goto out;
>>>> +        }
>>>
>>> Here you use "@return provides a minimum amount of data remaining to 
>>> be flushed from internal buffers
>>>             or an error code, which can be tested using 
>>> ZSTD_isError()."
>>>
>>> I think we can safely drop this check
>> No, we can't.
>> we should check for if zstd managed to write the stream correctly. 
>> ZSTD_compressStream2 may consume all the input buffer but return ret 
>> > 0 meaning that it needs more space.
>
> Hmm, interesting thing. But your check doesn't protect us from it:
>
> Assume we have
>
> output.size = input.size = 64K
> output.pos = 64K
> input.pos = 10K
> ret = 10K
>
> - which means that all input is consumed, but we have some cached data 
> (at least 10K) to flush.
>
> 10K + 10K = 20K < 64K, so your check will no lead to an error, but 
> we'll exit the loop..
No, it does protect
you use incorrect formula: _input_.pos + ret < output.size

but the code is

         if (output.pos + ret > output.size) {
             ret = -ENOMEM;
             goto out;
         }

so, 64K + 10K = 74K -> 74K > 64K (true) -> ret = -ENOMEM

>
> So, actually, what we need it two things:
>
>   1. input.pos = input.size, which means that all input is consumed
>   2. ret = 0, which means that all cached data flushed
>
> So we need something like
>
> do {
>    ret = ZSTD_compressStream2(cctx, &output, &input, ZDTD_e_end)
>    if (ZSTD_isError(ret)) {
>        ret = -EIO.
>        goto out;
>    }
> } while (ret || input.pos < input.size);
>
>>
>> This is from my tests:
>>
>> (gdb) p ret
>> $1 = 11
>> (gdb) p input
>> $2 = {src = 0x562305536000, size = 65536, pos = 65536}
>> (gdb) p output
>> $3 = {dst = 0x562305546000, size = 65535, pos = 65535}
>>
>> Alternatively, we can replace the check with something like
>>
>> if (ret != 0) {
>>      ret = -ENOMEM;
>> }
>
> It's not correct either, it's not an error: it just means that we need 
> to flush a bit more data.
.. but we don't have space to do it, so it looks like -ENOMEM
>
>>
>> after the loop, but I think both are equivalent, so I would stick 
>> with my one :)))
>>> work anyway.
>>>
>>>> +    };
>>>> +
>>>> +    /* make sure we can safely return compressed buffer size with 
>>>> ssize_t */
>>>> +    assert(output.pos <= SSIZE_MAX);
>>>
>>> Hmm. I hope it's not possible for cluster. But taking the function 
>>> in separate, it _is_ possible.
>>> So may be better to assert at function start that
>>>
>>>   assert(dest_size <= SSIZE_MAX);
>>>
>>> To stress, that this limitation is part of qcow2_zstd_compress() 
>>> interface.
>> Agree, this is better
>>>
>>>> +    ret = output.pos;
>>>> +
>>>> +out:
>>>> +    ZSTD_freeCCtx(cctx);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +/*
>>>> + * qcow2_zstd_decompress()
>>>> + *
>>>> + * Decompress some data (not more than @src_size bytes) to produce 
>>>> exactly
>>>> + * @dest_size bytes using zstd compression method
>>>> + *
>>>> + * @dest - destination buffer, @dest_size bytes
>>>> + * @src - source buffer, @src_size bytes
>>>> + *
>>>> + * Returns: 0 on success
>>>> + *          -EIO on any error
>>>> + */
>>>> +static ssize_t qcow2_zstd_decompress(void *dest, size_t dest_size,
>>>> +                                     const void *src, size_t 
>>>> src_size)
>>>> +{
>>>> +    size_t ret = 0;
>>>> +    ZSTD_outBuffer output = { dest, dest_size, 0 };
>>>> +    ZSTD_inBuffer input = { src, src_size, 0 };
>>>> +    ZSTD_DCtx *dctx = ZSTD_createDCtx();
>>>> +
>>>> +    if (!dctx) {
>>>> +        return -EIO;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * The compressed stream from input buffer may consist from more
>>>> +     * than one zstd frames. So we iterate until we get a fully
>>>> +     * uncompressed cluster.
>>>> +     */
>>>> +    while (output.pos < output.size) {
>>>> +        ret = ZSTD_decompressStream(dctx, &output, &input);
>>>> +        /*
>>>> +         * if we don't fully populate the output but have read
>>>> +         * all the frames from the input, we end up with error
>>>> +         * here
>>>> +         */
>>>> +        if (ZSTD_isError(ret)) {
>>>> +            ret = -EIO;
>>>> +            break;
>>>> +        }
>>>> +        /*
>>>> +         * Dest buffer size is the image cluster size.
>>>> +         * It should be big enough to store uncompressed content.
>>>> +         * There shouldn't be any cases when the decompressed content
>>>> +         * size is greater then the cluster size, except cluster
>>>> +         * damaging.
>>>> +         */
>>>> +        if (output.pos + ret > output.size) {
>>>> +            ret = -EIO;
>>>> +            break;
>>>> +        }
>>>
>>> But here, you use
>>> "
>>> or any other value > 0, which means there is still some decoding or 
>>> flushing to do to complete current frame :
>>>                                 the return value is a suggested next 
>>> input size (just a hint for better latency)
>>>                                 that will never request more than 
>>> the remaining frame size.
>>> "
>>>
>>> I'm afraid that "just a hint" is not safe API to make a conclusion 
>>> from. So, I'd prefer to drop this optimization
>>> and just wait for an error from next ZSTD_decompressStream.
>>>
>>> And therefore, for symmetry drop similar optimization on compression 
>>> part..
>>>
>>> What do you think?
>> I'd add some kind of check that we have finished with ret==0 (after 
>> loop). It looks like this is the only case when we can be sure that 
>> everything went ok.
>
> I think, we should not check it. It is possible that compressed data 
> of another cluster is starting below input end. Is it guaranteed that 
> ZSTD_decompressStream will not start to decompress (or at least plan 
> to do it) the next frame, which is unrelated to our cluster? I'm 
> afraid it's not guaranteed, so we can get ret>0 when 
> output.pos=output.size in correct situation. So I think it's safer not 
> to add this final check for ret. Note that we are not protected from 
> wrong data anyway.
Looking at zlib_compress implementation, yes it seems to be, so.
Ok, I'll drop the check.

>
>>>
>>>
>>>> +    }
>>>> +
>>>> +    ZSTD_freeDCtx(dctx);
>>>> +    return ret;
>>>> +}
>>>> +#endif
>>>> +
>>>>   static int qcow2_compress_pool_func(void *opaque)
>>>>   {
>>>>       Qcow2CompressData *data = opaque;
>>>> @@ -217,6 +345,11 @@ qcow2_co_compress(BlockDriverState *bs, void 
>>>> *dest, size_t dest_size,
>>>>           fn = qcow2_zlib_compress;
>>>>           break;
>>>>   +#ifdef CONFIG_ZSTD
>>>> +    case QCOW2_COMPRESSION_TYPE_ZSTD:
>>>> +        fn = qcow2_zstd_compress;
>>>> +        break;
>>>> +#endif
>>>>       default:
>>>>           abort();
>>>>       }
>>>> @@ -249,6 +382,11 @@ qcow2_co_decompress(BlockDriverState *bs, void 
>>>> *dest, size_t dest_size,
>>>>           fn = qcow2_zlib_decompress;
>>>>           break;
>>>>   +#ifdef CONFIG_ZSTD
>>>> +    case QCOW2_COMPRESSION_TYPE_ZSTD:
>>>> +        fn = qcow2_zstd_decompress;
>>>> +        break;
>>>> +#endif
>>>>       default:
>>>>           abort();
>>>>       }
>>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>>> index 643698934d..6632daf50b 100644
>>>> --- a/block/qcow2.c
>>>> +++ b/block/qcow2.c
>>>> @@ -1246,6 +1246,9 @@ static int 
>>>> validate_compression_type(BDRVQcow2State *s, Error **errp)
>>>>   {
>>>>       switch (s->compression_type) {
>>>>       case QCOW2_COMPRESSION_TYPE_ZLIB:
>>>> +#ifdef CONFIG_ZSTD
>>>> +    case QCOW2_COMPRESSION_TYPE_ZSTD:
>>>> +#endif
>>>>           break;
>>>>         default:
>>>> @@ -3461,6 +3464,10 @@ qcow2_co_create(BlockdevCreateOptions 
>>>> *create_options, Error **errp)
>>>>           }
>>>>             switch (qcow2_opts->compression_type) {
>>>> +#ifdef CONFIG_ZSTD
>>>> +        case QCOW2_COMPRESSION_TYPE_ZSTD:
>>>> +            break;
>>>> +#endif
>>>>           default:
>>>>               error_setg(errp, "Unknown compression type");
>>>>               goto out;
>>>>
>>>
>>>
>>
>
>