[v2,3/4] btrfs: Add zstd support
diff mbox series

Message ID 20170629194108.1674498-4-terrelln@fb.com
State New, archived
Headers show
Series
  • Add xxhash and zstd modules
Related show

Commit Message

Nick Terrell June 29, 2017, 7:41 p.m. UTC
Add zstd compression and decompression support to BtrFS. zstd at its
fastest level compresses almost as well as zlib, while offering much
faster compression and decompression, approaching lzo speeds.

I benchmarked btrfs with zstd compression against no compression, lzo
compression, and zlib compression. I benchmarked two scenarios. Copying
a set of files to btrfs, and then reading the files. Copying a tarball
to btrfs, extracting it to btrfs, and then reading the extracted files.
After every operation, I call `sync` and include the sync time.
Between every pair of operations I unmount and remount the filesystem
to avoid caching. The benchmark files can be found in the upstream
zstd source repository under
`contrib/linux-kernel/{btrfs-benchmark.sh,btrfs-extract-benchmark.sh}`
[1] [2].

I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM.
The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor,
16 GB of RAM, and a SSD.

The first compression benchmark is copying 10 copies of the unzipped
Silesia corpus [3] into a BtrFS filesystem mounted with
`-o compress-force=Method`. The decompression benchmark times how long
it takes to `tar` all 10 copies into `/dev/null`. The compression ratio is
measured by comparing the output of `df` and `du`. See the benchmark file
[1] for details. I benchmarked multiple zstd compression levels, although
the patch uses zstd level 1.

| Method  | Ratio | Compression MB/s | Decompression speed |
|---------|-------|------------------|---------------------|
| None    |  0.99 |              504 |                 686 |
| lzo     |  1.66 |              398 |                 442 |
| zlib    |  2.58 |               65 |                 241 |
| zstd 1  |  2.57 |              260 |                 383 |
| zstd 3  |  2.71 |              174 |                 408 |
| zstd 6  |  2.87 |               70 |                 398 |
| zstd 9  |  2.92 |               43 |                 406 |
| zstd 12 |  2.93 |               21 |                 408 |
| zstd 15 |  3.01 |               11 |                 354 |

The next benchmark first copies `linux-4.11.6.tar` [4] to btrfs. Then it
measures the compression ratio, extracts the tar, and deletes the tar.
Then it measures the compression ratio again, and `tar`s the extracted
files into `/dev/null`. See the benchmark file [2] for details.

| Method | Tar Ratio | Extract Ratio | Copy (s) | Extract (s)| Read (s) |
|--------|-----------|---------------|----------|------------|----------|
| None   |      0.97 |          0.78 |    0.981 |      5.501 |    8.807 |
| lzo    |      2.06 |          1.38 |    1.631 |      8.458 |    8.585 |
| zlib   |      3.40 |          1.86 |    7.750 |     21.544 |   11.744 |
| zstd 1 |      3.57 |          1.85 |    2.579 |     11.479 |    9.389 |

[1] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-benchmark.sh
[2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-extract-benchmark.sh
[3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
[4] https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.11.6.tar.xz

zstd source repository: https://github.com/facebook/zstd

Signed-off-by: Nick Terrell <terrelln@fb.com>
---
 fs/btrfs/Kconfig           |   2 +
 fs/btrfs/Makefile          |   2 +-
 fs/btrfs/compression.c     |   1 +
 fs/btrfs/compression.h     |   6 +-
 fs/btrfs/ctree.h           |   1 +
 fs/btrfs/disk-io.c         |   2 +
 fs/btrfs/ioctl.c           |   6 +-
 fs/btrfs/props.c           |   6 +
 fs/btrfs/super.c           |  12 +-
 fs/btrfs/sysfs.c           |   2 +
 fs/btrfs/zstd.c            | 433 +++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/btrfs.h |   8 +-
 12 files changed, 469 insertions(+), 12 deletions(-)
 create mode 100644 fs/btrfs/zstd.c

--
2.9.3

Comments

Adam Borowski June 30, 2017, 3:24 a.m. UTC | #1
On Thu, Jun 29, 2017 at 12:41:07PM -0700, Nick Terrell wrote:
> Add zstd compression and decompression support to BtrFS. zstd at its
> fastest level compresses almost as well as zlib, while offering much
> faster compression and decompression, approaching lzo speeds.

> +static int zstd_decompress_bio(struct list_head *ws, struct page **pages_in,
> +		u64 disk_start,
> +		struct bio *orig_bio,
> +		size_t srclen)

Note that this fails to build if e1ddce71 is applied, and it'll be in
mainline in just a few days from now.  Fixing that is obvious, but
to avoid duplicating the effort:


diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 838741b3732c..e9403e569d8e 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -25,6 +25,7 @@
 #include <linux/pagemap.h>
 #include <linux/bio.h>
 #include <linux/zstd.h>
+#include <linux/refcount.h>
 #include "compression.h"
 
 #define ZSTD_BTRFS_MAX_WINDOWLOG 17
@@ -262,17 +263,18 @@ static int zstd_compress_pages(struct list_head *ws,
        return ret;
 }
 
-static int zstd_decompress_bio(struct list_head *ws, struct page **pages_in,
-               u64 disk_start,
-               struct bio *orig_bio,
-               size_t srclen)
+static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 {
        struct workspace *workspace = list_entry(ws, struct workspace, list);
        ZSTD_DStream *stream;
        int ret = 0;
        unsigned long page_in_index = 0;
+       size_t srclen = cb->compressed_len;
        unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
        unsigned long buf_start;
+       struct page **pages_in = cb->compressed_pages;
+       u64 disk_start = cb->start;
+       struct bio *orig_bio = cb->orig_bio;
        unsigned long total_out = 0;
        ZSTD_inBuffer in_buf = { NULL, 0, 0 };
        ZSTD_outBuffer out_buf = { NULL, 0, 0 };
Adam Borowski July 6, 2017, 4:32 p.m. UTC | #2
On Thu, Jun 29, 2017 at 12:41:07PM -0700, Nick Terrell wrote:
> Add zstd compression and decompression support to BtrFS. zstd at its
> fastest level compresses almost as well as zlib, while offering much
> faster compression and decompression, approaching lzo speeds.

Got a reproducible crash on amd64:

[98235.266511] BUG: unable to handle kernel paging request at ffffc90001251000
[98235.267485] IP: ZSTD_storeSeq.constprop.24+0x67/0xe0
[98235.269395] PGD 227034067 
[98235.269397] P4D 227034067 
[98235.271587] PUD 227035067 
[98235.273657] PMD 223323067 
[98235.275744] PTE 0

[98235.281545] Oops: 0002 [#1] SMP
[98235.283353] Modules linked in: loop veth tun fuse arc4 rtl8xxxu mac80211 cfg80211 cp210x pl2303 rfkill usbserial nouveau video mxm_wmi ttm
[98235.285203] CPU: 0 PID: 10850 Comm: kworker/u12:9 Not tainted 4.12.0+ #1
[98235.287070] Hardware name: System manufacturer System Product Name/M4A77T, BIOS 2401    05/18/2011
[98235.288964] Workqueue: btrfs-delalloc btrfs_delalloc_helper
[98235.290934] task: ffff880224984140 task.stack: ffffc90007e5c000
[98235.292731] RIP: 0010:ZSTD_storeSeq.constprop.24+0x67/0xe0
[98235.294579] RSP: 0018:ffffc90007e5fa68 EFLAGS: 00010282
[98235.296395] RAX: ffffc90001251001 RBX: 0000000000000094 RCX: ffffc9000118f930
[98235.298380] RDX: 0000000000000006 RSI: ffffc900011b06b0 RDI: ffffc9000118d1e0
[98235.300321] RBP: 000000000000009f R08: 1fffffffffffbe58 R09: 0000000000000000
[98235.302282] R10: ffffc9000118f970 R11: 0000000000000005 R12: ffffc9000118f878
[98235.304221] R13: 000000000000005b R14: ffffc9000118f915 R15: ffffc900011cfe88
[98235.306147] FS:  0000000000000000(0000) GS:ffff88022fc00000(0000) knlGS:0000000000000000
[98235.308162] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[98235.310129] CR2: ffffc90001251000 CR3: 000000021018d000 CR4: 00000000000006f0
[98235.312095] Call Trace:
[98235.314008]  ? ZSTD_compressBlock_fast+0x94b/0xb30
[98235.315975]  ? ZSTD_compressContinue_internal+0x1a0/0x580
[98235.317938]  ? ZSTD_compressStream_generic+0x248/0x2f0
[98235.319877]  ? ZSTD_compressStream+0x41/0x60
[98235.321821]  ? zstd_compress_pages+0x236/0x5d0
[98235.323724]  ? btrfs_compress_pages+0x5e/0x80
[98235.325684]  ? compress_file_range.constprop.79+0x1eb/0x750
[98235.327668]  ? async_cow_start+0x2e/0x50
[98235.329594]  ? btrfs_worker_helper+0x1b9/0x1d0
[98235.331486]  ? process_one_work+0x158/0x2f0
[98235.333361]  ? worker_thread+0x45/0x3a0
[98235.335253]  ? process_one_work+0x2f0/0x2f0
[98235.337189]  ? kthread+0x10e/0x130
[98235.339020]  ? kthread_park+0x60/0x60
[98235.340819]  ? ret_from_fork+0x22/0x30
[98235.342637] Code: 8b 4e d0 4c 89 48 d0 4c 8b 4e d8 4c 89 48 d8 4c 8b 4e e0 4c 89 48 e0 4c 8b 4e e8 4c 89 48 e8 4c 8b 4e f0 4c 89 48 f0 4c 8b 4e f8 <4c> 89 48 f8 48 39 f1 75 a2 4e 8d 04 c0 48 8b 31 48 83 c0 08 48 
[98235.346773] RIP: ZSTD_storeSeq.constprop.24+0x67/0xe0 RSP: ffffc90007e5fa68
[98235.348809] CR2: ffffc90001251000
[98235.363216] ---[ end trace 5fb3ad0f2aec0605 ]---
[98235.363218] BUG: unable to handle kernel paging request at ffffc9000393a000
[98235.363239] IP: ZSTD_storeSeq.constprop.24+0x67/0xe0
[98235.363241] PGD 227034067 
[98235.363242] P4D 227034067 
[98235.363243] PUD 227035067 
[98235.363244] PMD 21edec067 
[98235.363245] PTE 0
(More of the above follows.)

My reproducer copies an uncompressed tarball onto a fresh filesystem:
.----
#!/bin/sh
set -e

losetup -D; umount /mnt/vol1 ||:
dd if=/dev/zero of=/tmp/disk bs=2048 seek=1048575 count=1
mkfs.btrfs -msingle /tmp/disk
losetup -f /tmp/disk
sleep 1 # yay udev races
mount -onoatime,compress=$1 /dev/loop0 /mnt/vol1
time sh -c 'cp -p ~kilobyte/tmp/kernel.tar /mnt/vol1 && umount /mnt/vol1'
losetup -D
`----
(run it with arg of "zstd")

Kernel is 4.12.0 + btrfs-for-4.13 + v4 of Qu's chunk check + some unrelated
stuff + zstd; in case it matters I've pushed my tree to
https://github.com/kilobyte/linux/tree/zstd-crash

The payload is a tarball of the above, but, for debugging compression you
need the exact byte stream.  https://angband.pl/tmp/kernel.tar.xz --
without xz, I compressed it for transport.


Meow!
Nick Terrell July 7, 2017, 11:17 p.m. UTC | #3
On 7/6/17, 9:32 AM, "Adam Borowski" <kilobyte@angband.pl> wrote:
> On Thu, Jun 29, 2017 at 12:41:07PM -0700, Nick Terrell wrote:
>> Add zstd compression and decompression support to BtrFS. zstd at its
>> fastest level compresses almost as well as zlib, while offering much
>> faster compression and decompression, approaching lzo speeds.
>
> Got a reproducible crash on amd64:
>
> [98235.266511] BUG: unable to handle kernel paging request at ffffc90001251000
> [98235.267485] IP: ZSTD_storeSeq.constprop.24+0x67/0xe0
> [98235.269395] PGD 227034067 
> [98235.269397] P4D 227034067 
> [98235.271587] PUD 227035067 
> [98235.273657] PMD 223323067 
> [98235.275744] PTE 0
>
> [98235.281545] Oops: 0002 [#1] SMP
> [98235.283353] Modules linked in: loop veth tun fuse arc4 rtl8xxxu mac80211 cfg80211 cp210x pl2303 rfkill usbserial nouveau video mxm_wmi ttm
> [98235.285203] CPU: 0 PID: 10850 Comm: kworker/u12:9 Not tainted 4.12.0+ #1
> [98235.287070] Hardware name: System manufacturer System Product Name/M4A77T, BIOS 2401    05/18/2011
> [98235.288964] Workqueue: btrfs-delalloc btrfs_delalloc_helper
> [98235.290934] task: ffff880224984140 task.stack: ffffc90007e5c000
> [98235.292731] RIP: 0010:ZSTD_storeSeq.constprop.24+0x67/0xe0
> [98235.294579] RSP: 0018:ffffc90007e5fa68 EFLAGS: 00010282
> [98235.296395] RAX: ffffc90001251001 RBX: 0000000000000094 RCX: ffffc9000118f930
> [98235.298380] RDX: 0000000000000006 RSI: ffffc900011b06b0 RDI: ffffc9000118d1e0
> [98235.300321] RBP: 000000000000009f R08: 1fffffffffffbe58 R09: 0000000000000000
> [98235.302282] R10: ffffc9000118f970 R11: 0000000000000005 R12: ffffc9000118f878
> [98235.304221] R13: 000000000000005b R14: ffffc9000118f915 R15: ffffc900011cfe88
> [98235.306147] FS:  0000000000000000(0000) GS:ffff88022fc00000(0000) knlGS:0000000000000000
> [98235.308162] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [98235.310129] CR2: ffffc90001251000 CR3: 000000021018d000 CR4: 00000000000006f0
> [98235.312095] Call Trace:
> [98235.314008]  ? ZSTD_compressBlock_fast+0x94b/0xb30
> [98235.315975]  ? ZSTD_compressContinue_internal+0x1a0/0x580
> [98235.317938]  ? ZSTD_compressStream_generic+0x248/0x2f0
> [98235.319877]  ? ZSTD_compressStream+0x41/0x60
> [98235.321821]  ? zstd_compress_pages+0x236/0x5d0
> [98235.323724]  ? btrfs_compress_pages+0x5e/0x80
> [98235.325684]  ? compress_file_range.constprop.79+0x1eb/0x750
> [98235.327668]  ? async_cow_start+0x2e/0x50
> [98235.329594]  ? btrfs_worker_helper+0x1b9/0x1d0
> [98235.331486]  ? process_one_work+0x158/0x2f0
> [98235.333361]  ? worker_thread+0x45/0x3a0
> [98235.335253]  ? process_one_work+0x2f0/0x2f0
> [98235.337189]  ? kthread+0x10e/0x130
> [98235.339020]  ? kthread_park+0x60/0x60
> [98235.340819]  ? ret_from_fork+0x22/0x30
> [98235.342637] Code: 8b 4e d0 4c 89 48 d0 4c 8b 4e d8 4c 89 48 d8 4c 8b 4e e0 4c 89 48 e0 4c 8b 4e e8 4c 89 48 e8 4c 8b 4e f0 4c 89 48 f0 4c 8b 4e f8 <4c> 89 48 f8 48 39 f1 75 a2 4e 8d 04 c0 48 8b 31 48 83 c0 08 48 
> [98235.346773] RIP: ZSTD_storeSeq.constprop.24+0x67/0xe0 RSP: ffffc90007e5fa68
> [98235.348809] CR2: ffffc90001251000
> [98235.363216] ---[ end trace 5fb3ad0f2aec0605 ]---
> [98235.363218] BUG: unable to handle kernel paging request at ffffc9000393a000
> [98235.363239] IP: ZSTD_storeSeq.constprop.24+0x67/0xe0
> [98235.363241] PGD 227034067 
> [98235.363242] P4D 227034067 
> [98235.363243] PUD 227035067 
> [98235.363244] PMD 21edec067 
> [98235.363245] PTE 0
> (More of the above follows.)
>
> My reproducer copies an uncompressed tarball onto a fresh filesystem:
> .----
> #!/bin/sh
> set -e
>
> losetup -D; umount /mnt/vol1 ||:
> dd if=/dev/zero of=/tmp/disk bs=2048 seek=1048575 count=1
> mkfs.btrfs -msingle /tmp/disk
> losetup -f /tmp/disk
> sleep 1 # yay udev races
> mount -onoatime,compress=$1 /dev/loop0 /mnt/vol1
> time sh -c 'cp -p ~kilobyte/tmp/kernel.tar /mnt/vol1 && umount /mnt/vol1'
> losetup -D
> `----
> (run it with arg of "zstd")
>
> Kernel is 4.12.0 + btrfs-for-4.13 + v4 of Qu's chunk check + some unrelated
> stuff + zstd; in case it matters I've pushed my tree to
> https://github.com/kilobyte/linux/tree/zstd-crash
>
> The payload is a tarball of the above, but, for debugging compression you
> need the exact byte stream.  https://angband.pl/tmp/kernel.tar.xz  --
> without xz, I compressed it for transport.

Thanks for the bug report Adam! I'm looking into the failure, and haven't
been able to reproduce it yet. I've built my kernel from your tree, and
I ran your script with the kernel.tar tarball 100 times, but haven't gotten
a failure yet. I have a few questions to guide my debugging.

- How many cores are you running with? I’ve run the script with 1, 2, and 4 cores.
- Which version of gcc are you using to compile the kernel? I’m using gcc-6.2.0-5ubuntu12.
- Are the failures always in exactly the same place, and does it fail 100%
  of the time or just regularly?
Adam Borowski July 7, 2017, 11:40 p.m. UTC | #4
On Fri, Jul 07, 2017 at 11:17:49PM +0000, Nick Terrell wrote:
> On 7/6/17, 9:32 AM, "Adam Borowski" <kilobyte@angband.pl> wrote:
> > Got a reproducible crash on amd64:
> >
> > [98235.266511] BUG: unable to handle kernel paging request at ffffc90001251000

> > [98235.314008]  ? ZSTD_compressBlock_fast+0x94b/0xb30
> > [98235.315975]  ? ZSTD_compressContinue_internal+0x1a0/0x580
> > [98235.317938]  ? ZSTD_compressStream_generic+0x248/0x2f0
> > [98235.319877]  ? ZSTD_compressStream+0x41/0x60
> > [98235.321821]  ? zstd_compress_pages+0x236/0x5d0
> > [98235.323724]  ? btrfs_compress_pages+0x5e/0x80
> > [98235.325684]  ? compress_file_range.constprop.79+0x1eb/0x750
> 
> Thanks for the bug report Adam! I'm looking into the failure, and haven't
> been able to reproduce it yet. I've built my kernel from your tree, and
> I ran your script with the kernel.tar tarball 100 times, but haven't gotten
> a failure yet.

Crashed the same way 4 out of 4 tries for me.

> I have a few questions to guide my debugging.
> 
> - How many cores are you running with? I’ve run the script with 1, 2, and 4 cores.
> - Which version of gcc are you using to compile the kernel? I’m using gcc-6.2.0-5ubuntu12.
> - Are the failures always in exactly the same place, and does it fail 100%
>   of the time or just regularly?

6 cores -- all on bare metal.  gcc-7.1.0-9.

Lemme try with gcc-6, a different config or in a VM.
Adam Borowski July 8, 2017, 3:07 a.m. UTC | #5
On Sat, Jul 08, 2017 at 01:40:18AM +0200, Adam Borowski wrote:
> On Fri, Jul 07, 2017 at 11:17:49PM +0000, Nick Terrell wrote:
> > On 7/6/17, 9:32 AM, "Adam Borowski" <kilobyte@angband.pl> wrote:
> > > Got a reproducible crash on amd64:
>
> > Thanks for the bug report Adam! I'm looking into the failure, and haven't
> > been able to reproduce it yet. I've built my kernel from your tree, and
> > I ran your script with the kernel.tar tarball 100 times, but haven't gotten
> > a failure yet.
> 
> > I have a few questions to guide my debugging.
> > 
> > - How many cores are you running with? I’ve run the script with 1, 2, and 4 cores.
> > - Which version of gcc are you using to compile the kernel? I’m using gcc-6.2.0-5ubuntu12.
> > - Are the failures always in exactly the same place, and does it fail 100%
> >   of the time or just regularly?
> 
> 6 cores -- all on bare metal.  gcc-7.1.0-9.
> Lemme try with gcc-6, a different config or in a VM.

I've tried the following:
* gcc-6, defconfig (+btrfs obviously)
* gcc-7, defconfig
* gcc-6, my regular config
* gcc-7, my regular config
* gcc-7, debug + UBSAN + etc
* gcc-7, defconfig, qemu-kvm with only 1 core

Every build with gcc-7 reproduces the crash, every with gcc-6 does not.
Austin S. Hemmelgarn July 10, 2017, 12:36 p.m. UTC | #6
On 2017-07-07 23:07, Adam Borowski wrote:
> On Sat, Jul 08, 2017 at 01:40:18AM +0200, Adam Borowski wrote:
>> On Fri, Jul 07, 2017 at 11:17:49PM +0000, Nick Terrell wrote:
>>> On 7/6/17, 9:32 AM, "Adam Borowski" <kilobyte@angband.pl> wrote:
>>>> Got a reproducible crash on amd64:
>>
>>> Thanks for the bug report Adam! I'm looking into the failure, and haven't
>>> been able to reproduce it yet. I've built my kernel from your tree, and
>>> I ran your script with the kernel.tar tarball 100 times, but haven't gotten
>>> a failure yet.
>>
>>> I have a few questions to guide my debugging.
>>>
>>> - How many cores are you running with? I’ve run the script with 1, 2, and 4 cores.
>>> - Which version of gcc are you using to compile the kernel? I’m using gcc-6.2.0-5ubuntu12.
>>> - Are the failures always in exactly the same place, and does it fail 100%
>>>    of the time or just regularly?
>>
>> 6 cores -- all on bare metal.  gcc-7.1.0-9.
>> Lemme try with gcc-6, a different config or in a VM.
> 
> I've tried the following:
> * gcc-6, defconfig (+btrfs obviously)
> * gcc-7, defconfig
> * gcc-6, my regular config
> * gcc-7, my regular config
> * gcc-7, debug + UBSAN + etc
> * gcc-7, defconfig, qemu-kvm with only 1 core
> 
> Every build with gcc-7 reproduces the crash, every with gcc-6 does not.
> 
Got a GCC7 tool-chain built, and I can confirm this here too, tested 
with various numbers of cores ranging from 1-32 in a QEMU+KVM VM, with 
various combinations of debug options and other config switches.
Nick Terrell July 10, 2017, 8:57 p.m. UTC | #7
On 7/10/17, 5:36 AM, "Austin S. Hemmelgarn" <ahferroin7@gmail.com> wrote:
> On 2017-07-07 23:07, Adam Borowski wrote:
>> On Sat, Jul 08, 2017 at 01:40:18AM +0200, Adam Borowski wrote:
>>> On Fri, Jul 07, 2017 at 11:17:49PM +0000, Nick Terrell wrote:
>>>> On 7/6/17, 9:32 AM, "Adam Borowski" <kilobyte@angband.pl> wrote:
>>>>> Got a reproducible crash on amd64:
>>>
>>>> Thanks for the bug report Adam! I'm looking into the failure, and haven't
>>>> been able to reproduce it yet. I've built my kernel from your tree, and
>>>> I ran your script with the kernel.tar tarball 100 times, but haven't gotten
>>>> a failure yet.
>>>
>>>> I have a few questions to guide my debugging.
>>>>
>>>> - How many cores are you running with? I’ve run the script with 1, 2, and 4 cores.
>>>> - Which version of gcc are you using to compile the kernel? I’m using gcc-6.2.0-5ubuntu12.
>>>> - Are the failures always in exactly the same place, and does it fail 100%
>>>>    of the time or just regularly?
>>>
>>> 6 cores -- all on bare metal.  gcc-7.1.0-9.
>>> Lemme try with gcc-6, a different config or in a VM.
>> 
>> I've tried the following:
>> * gcc-6, defconfig (+btrfs obviously)
>> * gcc-7, defconfig
>> * gcc-6, my regular config
>> * gcc-7, my regular config
>> * gcc-7, debug + UBSAN + etc
>> * gcc-7, defconfig, qemu-kvm with only 1 core
>>
>> Every build with gcc-7 reproduces the crash, every with gcc-6 does not.
>>
> Got a GCC7 tool-chain built, and I can confirm this here too, tested 
> with various numbers of cores ranging from 1-32 in a QEMU+KVM VM, with 
> various combinations of debug options and other config switches.

I was running in an Ubuntu 16.10 VM on a MacBook Pro. I built with gcc-6.2
with KASAN, and couldn't trigger it, as expected. I built with gcc-7.1.0
built from source, and couldn't reproduce it. However, when I set up
qemu-kvm on another device, and compiled with gcc-7.1.0 built from source,
I was able to reproduce the bug. Now that I can reproduce it, I'll look
into a fix. Thanks Adam and Austin for finding, reproducing, and verifying
the bug.
Nick Terrell July 11, 2017, 4:57 a.m. UTC | #8
On 7/10/17, 5:36 AM, "Austin S. Hemmelgarn" <ahferroin7@gmail.com> wrote:
> On 2017-07-07 23:07, Adam Borowski wrote:
>> On Sat, Jul 08, 2017 at 01:40:18AM +0200, Adam Borowski wrote:
>>> On Fri, Jul 07, 2017 at 11:17:49PM +0000, Nick Terrell wrote:
>>>> On 7/6/17, 9:32 AM, "Adam Borowski" <kilobyte@angband.pl> wrote:
>>>>> Got a reproducible crash on amd64:
>>>
>>>> Thanks for the bug report Adam! I'm looking into the failure, and haven't
>>>> been able to reproduce it yet. I've built my kernel from your tree, and
>>>> I ran your script with the kernel.tar tarball 100 times, but haven't gotten
>>>> a failure yet.
>>>
>>>> I have a few questions to guide my debugging.
>>>>
>>>> - How many cores are you running with? I’ve run the script with 1, 2, and 4 cores.
>>>> - Which version of gcc are you using to compile the kernel? I’m using gcc-6.2.0-5ubuntu12.
>>>> - Are the failures always in exactly the same place, and does it fail 100%
>>>>    of the time or just regularly?
>>>
>>> 6 cores -- all on bare metal.  gcc-7.1.0-9.
>>> Lemme try with gcc-6, a different config or in a VM.
>>
>> I've tried the following:
>> * gcc-6, defconfig (+btrfs obviously)
>> * gcc-7, defconfig
>> * gcc-6, my regular config
>> * gcc-7, my regular config
>> * gcc-7, debug + UBSAN + etc
>> * gcc-7, defconfig, qemu-kvm with only 1 core
>>
>> Every build with gcc-7 reproduces the crash, every with gcc-6 does not.
>>
> Got a GCC7 tool-chain built, and I can confirm this here too, tested 
> with various numbers of cores ranging from 1-32 in a QEMU+KVM VM, with 
> various combinations of debug options and other config switches.

The problem is caused by a gcc-7 bug [1]. It miscompiles
ZSTD_wildcopy(void *dst, void const *src, ptrdiff_t len) when len is 0.
It only happens when it can't analyze ZSTD_copy8(), which is the case in
the kernel, because memcpy() is implemented with inline assembly. The
generated code is slow anyways, so I propose this workaround, which will
be included in the next patch set. I've confirmed that it fixes the bug for
me. This alternative implementation is also 10-20x faster, and compiles to
the same x86 assembly as the original ZSTD_wildcopy() with the userland
memcpy() implementation [2].

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388#add_comment
[2] https://godbolt.org/g/q5YpLx

Signed-off-by: Nick Terrell <terrelln@fb.com>
---
 lib/zstd/zstd_internal.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/zstd/zstd_internal.h b/lib/zstd/zstd_internal.h
index 6748719..ade0365 100644
--- a/lib/zstd/zstd_internal.h
+++ b/lib/zstd/zstd_internal.h
@@ -126,7 +126,9 @@ static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
 /*-*******************************************
 *  Shared functions to include for inlining
 *********************************************/
-static void ZSTD_copy8(void *dst, const void *src) { memcpy(dst, src, 8); }
+static void ZSTD_copy8(void *dst, const void *src) {
+       ZSTD_write64(dst, ZSTD_read64(src));
+}
 #define COPY8(d, s)               \
        {                         \
                ZSTD_copy8(d, s); \
--
2.9.3
Nick Terrell July 11, 2017, 6:01 a.m. UTC | #9
On 7/10/17, 9:57 PM, "Nick Terrell" <terrelln@fb.com> wrote:
> The problem is caused by a gcc-7 bug [1]. It miscompiles
> ZSTD_wildcopy(void *dst, void const *src, ptrdiff_t len) when len is 0.
> It only happens when it can't analyze ZSTD_copy8(), which is the case in
> the kernel, because memcpy() is implemented with inline assembly. The
> generated code is slow anyways, so I propose this workaround, which will
> be included in the next patch set. I've confirmed that it fixes the bug for
> me. This alternative implementation is also 10-20x faster, and compiles to
> the same x86 assembly as the original ZSTD_wildcopy() with the userland
> memcpy() implementation [2].
>
> [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388#add_comment
> [2] https://godbolt.org/g/q5YpLx
>
> Signed-off-by: Nick Terrell <terrelln@fb.com>
> ---
>  lib/zstd/zstd_internal.h | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/lib/zstd/zstd_internal.h b/lib/zstd/zstd_internal.h
> index 6748719..ade0365 100644
> --- a/lib/zstd/zstd_internal.h
> +++ b/lib/zstd/zstd_internal.h
> @@ -126,7 +126,9 @@ static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
>  /*-*******************************************
>  *  Shared functions to include for inlining
>  *********************************************/
> -static void ZSTD_copy8(void *dst, const void *src) { memcpy(dst, src, 8); }
> +static void ZSTD_copy8(void *dst, const void *src) {
> +       ZSTD_write64(dst, ZSTD_read64(src));
> +}

Sorry, my patch still triggered the gcc bug, I used the wrong compiler.
This patch works, and runs about the same speed as before the patch for
small inputs, and slightly faster for larger inputs (100+ bytes). I'll
look for a faster workaround if benchmarks show it matters.

Signed-off-by: Nick Terrell <terrelln@fb.com>
---
 lib/zstd/zstd_internal.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/lib/zstd/zstd_internal.h b/lib/zstd/zstd_internal.h
index 6748719..839014d 100644
--- a/lib/zstd/zstd_internal.h
+++ b/lib/zstd/zstd_internal.h
@@ -139,12 +139,8 @@ static void ZSTD_copy8(void *dst, const void *src) { memcpy(dst, src, 8); }
 #define WILDCOPY_OVERLENGTH 8
 ZSTD_STATIC void ZSTD_wildcopy(void *dst, const void *src, ptrdiff_t length)
 {
-	const BYTE *ip = (const BYTE *)src;
-	BYTE *op = (BYTE *)dst;
-	BYTE *const oend = op + length;
-	do
-		COPY8(op, ip)
-	while (op < oend);
+	if (length > 0)
+		memcpy(dst, src, length);
 }
 
 ZSTD_STATIC void ZSTD_wildcopy_e(void *dst, const void *src, void *dstEnd) /* should be faster for decoding, but strangely, not verified on all platform */
Adam Borowski July 12, 2017, 3:38 a.m. UTC | #10
On Tue, Jul 11, 2017 at 06:01:38AM +0000, Nick Terrell wrote:
> On 7/10/17, 9:57 PM, "Nick Terrell" <terrelln@fb.com> wrote:
> > The problem is caused by a gcc-7 bug [1]. It miscompiles
> > ZSTD_wildcopy(void *dst, void const *src, ptrdiff_t len) when len is 0.
>
> Sorry, my patch still triggered the gcc bug, I used the wrong compiler.
> This patch works, and runs about the same speed as before the patch for
> small inputs, and slightly faster for larger inputs (100+ bytes). I'll
> look for a faster workaround if benchmarks show it matters.

Confirmed, the fix stops the crash for me.  Yay!

> --- a/lib/zstd/zstd_internal.h
> +++ b/lib/zstd/zstd_internal.h
> @@ -139,12 +139,8 @@ static void ZSTD_copy8(void *dst, const void *src) { memcpy(dst, src, 8); }
>  #define WILDCOPY_OVERLENGTH 8
>  ZSTD_STATIC void ZSTD_wildcopy(void *dst, const void *src, ptrdiff_t length)
>  {
> -	const BYTE *ip = (const BYTE *)src;
> -	BYTE *op = (BYTE *)dst;
> -	BYTE *const oend = op + length;
> -	do
> -		COPY8(op, ip)
> -	while (op < oend);
> +	if (length > 0)
> +		memcpy(dst, src, length);
>  }
>  
>  ZSTD_STATIC void ZSTD_wildcopy_e(void *dst, const void *src, void *dstEnd) /* should be faster for decoding, but strangely, not verified on all platform */
David Sterba July 18, 2017, 6:21 p.m. UTC | #11
On Thu, Jun 29, 2017 at 12:41:07PM -0700, Nick Terrell wrote:
> +static void zstd_free_workspace(struct list_head *ws)
> +{
> +	struct workspace *workspace = list_entry(ws, struct workspace, list);
> +
> +	vfree(workspace->mem);
> +	kfree(workspace->buf);
> +	kfree(workspace);
> +}
> +
> +static struct list_head *zstd_alloc_workspace(void)
> +{
> +	ZSTD_parameters params =
> +			zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
> +	struct workspace *workspace;
> +
> +	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
> +	if (!workspace)
> +		return ERR_PTR(-ENOMEM);
> +
> +	workspace->size = max_t(size_t,
> +			ZSTD_CStreamWorkspaceBound(params.cParams),
> +			ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
> +	workspace->mem = vmalloc(workspace->size);
> +	workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS);
> +	if (!workspace->mem || !workspace->buf)
> +		goto fail;
> +
> +	INIT_LIST_HEAD(&workspace->list);
> +
> +	return &workspace->list;
> +fail:
> +	zstd_free_workspace(&workspace->list);
> +	return ERR_PTR(-ENOMEM);
> +}

In the next iteration, please update the workspace allocations so that
they use kvmalloc/kvfree and GFP_KERNEL (eg. 6acafd1eff426).

Patch
diff mbox series

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 80e9c18..a26c63b 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -6,6 +6,8 @@  config BTRFS_FS
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select ZSTD_COMPRESS
+	select ZSTD_DECOMPRESS
 	select RAID6_PQ
 	select XOR_BLOCKS
 	select SRCU
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 128ce17..962a95a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@  btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
+	   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
 	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
 	   uuid-tree.o props.o hash.o free-space-tree.o
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 10e6b28..3beb0d0 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -761,6 +761,7 @@  static struct {
 static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
 	&btrfs_lzo_compress,
+	&btrfs_zstd_compress,
 };

 void __init btrfs_init_compress(void)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 39ec43a..d99fc21 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -60,8 +60,9 @@  enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
 	BTRFS_COMPRESS_LZO   = 2,
-	BTRFS_COMPRESS_TYPES = 2,
-	BTRFS_COMPRESS_LAST  = 3,
+	BTRFS_COMPRESS_ZSTD  = 3,
+	BTRFS_COMPRESS_TYPES = 3,
+	BTRFS_COMPRESS_LAST  = 4,
 };

 struct btrfs_compress_op {
@@ -92,5 +93,6 @@  struct btrfs_compress_op {

 extern const struct btrfs_compress_op btrfs_zlib_compress;
 extern const struct btrfs_compress_op btrfs_lzo_compress;
+extern const struct btrfs_compress_op btrfs_zstd_compress;

 #endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4f8f75d..61dd3dd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -271,6 +271,7 @@  struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD |		\
 	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
 	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5f678dc..49c0e91 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2831,6 +2831,8 @@  int open_ctree(struct super_block *sb,
 	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
 	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
 		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;

 	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
 		btrfs_info(fs_info, "has skinny extents");
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e176375..f732cfd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -327,8 +327,10 @@  static int btrfs_ioctl_setflags(struct file *file, void __user *arg)

 		if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
 			comp = "lzo";
-		else
+		else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB)
 			comp = "zlib";
+		else
+			comp = "zstd";
 		ret = btrfs_set_prop(inode, "btrfs.compression",
 				     comp, strlen(comp), 0);
 		if (ret)
@@ -1463,6 +1465,8 @@  int btrfs_defrag_file(struct inode *inode, struct file *file,

 	if (range->compress_type == BTRFS_COMPRESS_LZO) {
 		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+	} else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
+		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
 	}

 	ret = defrag_count;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index d6cb155..162105f 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -383,6 +383,8 @@  static int prop_compression_validate(const char *value, size_t len)
 		return 0;
 	else if (!strncmp("zlib", value, len))
 		return 0;
+	else if (!strncmp("zstd", value, len))
+		return 0;

 	return -EINVAL;
 }
@@ -405,6 +407,8 @@  static int prop_compression_apply(struct inode *inode,
 		type = BTRFS_COMPRESS_LZO;
 	else if (!strncmp("zlib", value, len))
 		type = BTRFS_COMPRESS_ZLIB;
+	else if (!strncmp("zstd", value, len))
+		type = BTRFS_COMPRESS_ZSTD;
 	else
 		return -EINVAL;

@@ -422,6 +426,8 @@  static const char *prop_compression_extract(struct inode *inode)
 		return "zlib";
 	case BTRFS_COMPRESS_LZO:
 		return "lzo";
+	case BTRFS_COMPRESS_ZSTD:
+		return "zstd";
 	}

 	return NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4f1cdd5..4f792d5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -513,6 +513,14 @@  int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 				btrfs_clear_opt(info->mount_opt, NODATASUM);
 				btrfs_set_fs_incompat(info, COMPRESS_LZO);
 				no_compress = 0;
+			} else if (strcmp(args[0].from, "zstd") == 0) {
+				compress_type = "zstd";
+				info->compress_type = BTRFS_COMPRESS_ZSTD;
+				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
+				btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
+				no_compress = 0;
 			} else if (strncmp(args[0].from, "no", 2) == 0) {
 				compress_type = "no";
 				btrfs_clear_opt(info->mount_opt, COMPRESS);
@@ -1240,8 +1248,10 @@  static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 	if (btrfs_test_opt(info, COMPRESS)) {
 		if (info->compress_type == BTRFS_COMPRESS_ZLIB)
 			compress_type = "zlib";
-		else
+		else if (info->compress_type == BTRFS_COMPRESS_LZO)
 			compress_type = "lzo";
+		else
+			compress_type = "zstd";
 		if (btrfs_test_opt(info, FORCE_COMPRESS))
 			seq_printf(seq, ",compress-force=%s", compress_type);
 		else
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1f157fb..b0dec90 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -200,6 +200,7 @@  BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
 BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
 BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
 BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
+BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD);
 BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
 BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
 BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
@@ -212,6 +213,7 @@  static struct attribute *btrfs_supported_feature_attrs[] = {
 	BTRFS_FEAT_ATTR_PTR(default_subvol),
 	BTRFS_FEAT_ATTR_PTR(mixed_groups),
 	BTRFS_FEAT_ATTR_PTR(compress_lzo),
+	BTRFS_FEAT_ATTR_PTR(compress_zstd),
 	BTRFS_FEAT_ATTR_PTR(big_metadata),
 	BTRFS_FEAT_ATTR_PTR(extended_iref),
 	BTRFS_FEAT_ATTR_PTR(raid56),
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
new file mode 100644
index 0000000..838741b
--- /dev/null
+++ b/fs/btrfs/zstd.c
@@ -0,0 +1,433 @@ 
+/*
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/zstd.h>
+#include "compression.h"
+
+#define ZSTD_BTRFS_MAX_WINDOWLOG 17
+#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+
+static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len)
+{
+	ZSTD_parameters params = ZSTD_getParams(1, src_len, 0);
+
+	if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
+		params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
+	WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT);
+	return params;
+}
+
+struct workspace {
+	void *mem;
+	size_t size;
+	char *buf;
+	struct list_head list;
+};
+
+static void zstd_free_workspace(struct list_head *ws)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+	vfree(workspace->mem);
+	kfree(workspace->buf);
+	kfree(workspace);
+}
+
+static struct list_head *zstd_alloc_workspace(void)
+{
+	ZSTD_parameters params =
+			zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
+	struct workspace *workspace;
+
+	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+	if (!workspace)
+		return ERR_PTR(-ENOMEM);
+
+	workspace->size = max_t(size_t,
+			ZSTD_CStreamWorkspaceBound(params.cParams),
+			ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+	workspace->mem = vmalloc(workspace->size);
+	workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS);
+	if (!workspace->mem || !workspace->buf)
+		goto fail;
+
+	INIT_LIST_HEAD(&workspace->list);
+
+	return &workspace->list;
+fail:
+	zstd_free_workspace(&workspace->list);
+	return ERR_PTR(-ENOMEM);
+}
+
+static int zstd_compress_pages(struct list_head *ws,
+		struct address_space *mapping,
+		u64 start,
+		struct page **pages,
+		unsigned long *out_pages,
+		unsigned long *total_in,
+		unsigned long *total_out)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	ZSTD_CStream *stream;
+	int ret = 0;
+	int nr_pages = 0;
+	struct page *in_page = NULL;  /* The current page to read */
+	struct page *out_page = NULL; /* The current page to write to */
+	ZSTD_inBuffer in_buf = { NULL, 0, 0 };
+	ZSTD_outBuffer out_buf = { NULL, 0, 0 };
+	unsigned long tot_in = 0;
+	unsigned long tot_out = 0;
+	unsigned long len = *total_out;
+	const unsigned long nr_dest_pages = *out_pages;
+	unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+	ZSTD_parameters params = zstd_get_btrfs_parameters(len);
+
+	*out_pages = 0;
+	*total_out = 0;
+	*total_in = 0;
+
+	/* Initialize the stream */
+	stream = ZSTD_initCStream(params, len, workspace->mem,
+			workspace->size);
+	if (!stream) {
+		pr_warn("BTRFS: ZSTD_initCStream failed\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	/* map in the first page of input data */
+	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
+	in_buf.src = kmap(in_page);
+	in_buf.pos = 0;
+	in_buf.size = min_t(size_t, len, PAGE_SIZE);
+
+
+	/* Allocate and map in the output buffer */
+	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (out_page == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	pages[nr_pages++] = out_page;
+	out_buf.dst = kmap(out_page);
+	out_buf.pos = 0;
+	out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+
+	while (1) {
+		size_t ret2;
+
+		ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf);
+		if (ZSTD_isError(ret2)) {
+			pr_debug("BTRFS: ZSTD_compressStream returned %d\n",
+					ZSTD_getErrorCode(ret2));
+			ret = -EIO;
+			goto out;
+		}
+
+		/* Check to see if we are making it bigger */
+		if (tot_in + in_buf.pos > 8192 &&
+				tot_in + in_buf.pos <
+				tot_out + out_buf.pos) {
+			ret = -E2BIG;
+			goto out;
+		}
+
+		/* We've reached the end of our output range */
+		if (out_buf.pos >= max_out) {
+			tot_out += out_buf.pos;
+			ret = -E2BIG;
+			goto out;
+		}
+
+		/* Check if we need more output space */
+		if (out_buf.pos == out_buf.size) {
+			tot_out += PAGE_SIZE;
+			max_out -= PAGE_SIZE;
+			kunmap(out_page);
+			if (nr_pages == nr_dest_pages) {
+				out_page = NULL;
+				ret = -E2BIG;
+				goto out;
+			}
+			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			if (out_page == NULL) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			pages[nr_pages++] = out_page;
+			out_buf.dst = kmap(out_page);
+			out_buf.pos = 0;
+			out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+		}
+
+		/* We've reached the end of the input */
+		if (in_buf.pos >= len) {
+			tot_in += in_buf.pos;
+			break;
+		}
+
+		/* Check if we need more input */
+		if (in_buf.pos == in_buf.size) {
+			tot_in += PAGE_SIZE;
+			kunmap(in_page);
+			put_page(in_page);
+
+			start += PAGE_SIZE;
+			len -= PAGE_SIZE;
+			in_page = find_get_page(mapping, start >> PAGE_SHIFT);
+			in_buf.src = kmap(in_page);
+			in_buf.pos = 0;
+			in_buf.size = min_t(size_t, len, PAGE_SIZE);
+		}
+	}
+	while (1) {
+		size_t ret2;
+
+		ret2 = ZSTD_endStream(stream, &out_buf);
+		if (ZSTD_isError(ret2)) {
+			pr_debug("BTRFS: ZSTD_endStream returned %d\n",
+					ZSTD_getErrorCode(ret2));
+			ret = -EIO;
+			goto out;
+		}
+		if (ret2 == 0) {
+			tot_out += out_buf.pos;
+			break;
+		}
+		if (out_buf.pos >= max_out) {
+			tot_out += out_buf.pos;
+			ret = -E2BIG;
+			goto out;
+		}
+
+		tot_out += PAGE_SIZE;
+		max_out -= PAGE_SIZE;
+		kunmap(out_page);
+		if (nr_pages == nr_dest_pages) {
+			out_page = NULL;
+			ret = -E2BIG;
+			goto out;
+		}
+		out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (out_page == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		pages[nr_pages++] = out_page;
+		out_buf.dst = kmap(out_page);
+		out_buf.pos = 0;
+		out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+	}
+
+	if (tot_out >= tot_in) {
+		ret = -E2BIG;
+		goto out;
+	}
+
+	ret = 0;
+	*total_in = tot_in;
+	*total_out = tot_out;
+out:
+	*out_pages = nr_pages;
+	/* Cleanup */
+	if (in_page) {
+		kunmap(in_page);
+		put_page(in_page);
+	}
+	if (out_page)
+		kunmap(out_page);
+	return ret;
+}
+
+static int zstd_decompress_bio(struct list_head *ws, struct page **pages_in,
+		u64 disk_start,
+		struct bio *orig_bio,
+		size_t srclen)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	ZSTD_DStream *stream;
+	int ret = 0;
+	unsigned long page_in_index = 0;
+	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+	unsigned long buf_start;
+	unsigned long total_out = 0;
+	ZSTD_inBuffer in_buf = { NULL, 0, 0 };
+	ZSTD_outBuffer out_buf = { NULL, 0, 0 };
+
+	stream = ZSTD_initDStream(
+			ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
+	if (!stream) {
+		pr_debug("BTRFS: ZSTD_initDStream failed\n");
+		ret = -EIO;
+		goto done;
+	}
+
+	in_buf.src = kmap(pages_in[page_in_index]);
+	in_buf.pos = 0;
+	in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+
+	out_buf.dst = workspace->buf;
+	out_buf.pos = 0;
+	out_buf.size = PAGE_SIZE;
+
+	while (1) {
+		size_t ret2;
+
+		ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+		if (ZSTD_isError(ret2)) {
+			pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
+					ZSTD_getErrorCode(ret2));
+			ret = -EIO;
+			goto done;
+		}
+		buf_start = total_out;
+		total_out += out_buf.pos;
+		out_buf.pos = 0;
+
+		ret = btrfs_decompress_buf2page(out_buf.dst, buf_start,
+				total_out, disk_start, orig_bio);
+		if (ret == 0)
+			break;
+
+		if (in_buf.pos >= srclen)
+			break;
+
+		/* Check if we've hit the end of a frame */
+		if (ret2 == 0)
+			break;
+
+		if (in_buf.pos == in_buf.size) {
+			kunmap(pages_in[page_in_index++]);
+			if (page_in_index >= total_pages_in) {
+				in_buf.src = NULL;
+				ret = -EIO;
+				goto done;
+			}
+			srclen -= PAGE_SIZE;
+			in_buf.src = kmap(pages_in[page_in_index]);
+			in_buf.pos = 0;
+			in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+		}
+	}
+	ret = 0;
+	zero_fill_bio(orig_bio);
+done:
+	if (in_buf.src)
+		kunmap(pages_in[page_in_index]);
+	return ret;
+}
+
+static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+		struct page *dest_page,
+		unsigned long start_byte,
+		size_t srclen, size_t destlen)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	ZSTD_DStream *stream;
+	int ret = 0;
+	size_t ret2;
+	ZSTD_inBuffer in_buf = { NULL, 0, 0 };
+	ZSTD_outBuffer out_buf = { NULL, 0, 0 };
+	unsigned long total_out = 0;
+	unsigned long pg_offset = 0;
+	char *kaddr;
+
+	stream = ZSTD_initDStream(
+			ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
+	if (!stream) {
+		pr_warn("BTRFS: ZSTD_initDStream failed\n");
+		ret = -EIO;
+		goto finish;
+	}
+
+	destlen = min_t(size_t, destlen, PAGE_SIZE);
+
+	in_buf.src = data_in;
+	in_buf.pos = 0;
+	in_buf.size = srclen;
+
+	out_buf.dst = workspace->buf;
+	out_buf.pos = 0;
+	out_buf.size = PAGE_SIZE;
+
+	ret2 = 1;
+	while (pg_offset < destlen && in_buf.pos < in_buf.size) {
+		unsigned long buf_start;
+		unsigned long buf_offset;
+		unsigned long bytes;
+
+		/* Check if the frame is over and we still need more input */
+		if (ret2 == 0) {
+			pr_debug("BTRFS: ZSTD_decompressStream ended early\n");
+			ret = -EIO;
+			goto finish;
+		}
+		ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+		if (ZSTD_isError(ret2)) {
+			pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
+					ZSTD_getErrorCode(ret2));
+			ret = -EIO;
+			goto finish;
+		}
+
+		buf_start = total_out;
+		total_out += out_buf.pos;
+		out_buf.pos = 0;
+
+		if (total_out <= start_byte)
+			continue;
+
+		if (total_out > start_byte && buf_start < start_byte)
+			buf_offset = start_byte - buf_start;
+		else
+			buf_offset = 0;
+
+		bytes = min_t(unsigned long, destlen - pg_offset,
+				out_buf.size - buf_offset);
+
+		kaddr = kmap_atomic(dest_page);
+		memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes);
+		kunmap_atomic(kaddr);
+
+		pg_offset += bytes;
+	}
+	ret = 0;
+finish:
+	if (pg_offset < destlen) {
+		kaddr = kmap_atomic(dest_page);
+		memset(kaddr + pg_offset, 0, destlen - pg_offset);
+		kunmap_atomic(kaddr);
+	}
+	return ret;
+}
+
+const struct btrfs_compress_op btrfs_zstd_compress = {
+	.alloc_workspace = zstd_alloc_workspace,
+	.free_workspace = zstd_free_workspace,
+	.compress_pages = zstd_compress_pages,
+	.decompress_bio = zstd_decompress_bio,
+	.decompress = zstd_decompress,
+};
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index a456e53..992c150 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -255,13 +255,7 @@  struct btrfs_ioctl_fs_info_args {
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
 #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)
-/*
- * some patches floated around with a second compression method
- * lets save that incompat here for when they do get in
- * Note we don't actually support it, we're just reserving the
- * number
- */
-#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2	(1ULL << 4)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD	(1ULL << 4)

 /*
  * older kernels tried to do bigger metadata blocks, but the