All of lore.kernel.org
 help / color / mirror / Atom feed
* fiemap is slow on btrfs on files with multiple extents
@ 2022-08-04 16:30 Pavel Tikhomirov
  2022-08-04 18:49 ` Josef Bacik
  2022-08-05  7:38 ` Dominique MARTINET
  0 siblings, 2 replies; 9+ messages in thread
From: Pavel Tikhomirov @ 2022-08-04 16:30 UTC (permalink / raw)
  To: Chris Mason, Josef Bacik, David Sterba
  Cc: linux-btrfs, lkml, Chen Liang-Chun, Alexander Mikhalitsyn,
	kernel, Dominique MARTINET, Yu Kuai, Theodore Ts'o

I ran the below test on Fedora 36 (the test basically creates "very" 
sparse file, with 4k data followed by 4k hole again and again for the 
specified length and uses fiemap to count extents in this file) and faced 
the problem that fiemap hangs for too long (for instance compared to 
the ext4 version). Fiemap with 32768 extents takes ~37264 us and with 65536 
extents it takes ~34123954 us, which is ~1000 times more when the file only 
doubled in size:

256Mb:

./fiemap-reproduce /testfile $((1<<28))
size: 268435456
actual size: 134217728
fiemap: fm_mapped_extents = 32768
time = 37264 us

./fiemap-reproduce /testfile $((1<<28))
size: 268435456
actual size: 134217728
fiemap: fm_mapped_extents = 32768
time = 37285 us

512Mb:

./fiemap-reproduce /testfile $((1<<29))
size: 536870912
actual size: 268435456
fiemap: fm_mapped_extents = 65536
time = 34123954 us

./fiemap-reproduce /testfile $((1<<29))
size: 536870912
actual size: 268435456
fiemap: fm_mapped_extents = 65536
time = 60404334 us

1Gb (the whole Fedora hangs sometimes when I measure it):

./fiemap-reproduce /testfile $((1<<30))
size: 1073741824
actual size: 536870912
fiemap: fm_mapped_extents = 131072
time = 231194793 us

./fiemap-reproduce /testfile $((1<<30))
size: 1073741824
actual size: 536870912
fiemap: fm_mapped_extents = 131072
time = 347867789 us

I see a similar problem here 
https://lore.kernel.org/linux-btrfs/Yr4nEoNLkXPKcOBi@atmark-techno.com/#r , 
but in my case I have "5.18.6-200.fc36.x86_64" fedora kernel which does 
not have 5ccc944dce3d ("filemap: Correct the conditions for marking a 
folio as accessed") commit, so it should be something else.

Some more info:

cat /proc/self/mountinfo | grep btrfs
106 1 0:47 /root / rw,relatime shared:1 - btrfs /dev/nvme0n1p3 
rw,compress=zstd:1,ssd,space_cache,subvolid=257,subvol=/root

perf top -ag
Samples: 268K of event 'cycles', 4000 Hz, Event count (approx.): 
77250404934 lost: 0/0 drop: 0/0
   Children      Self  Shared Object                       Symbol
+   74,25%     1,16%  [kernel]                            [k] 
entry_SYSCALL_64_after_hwframe
+   73,14%     0,65%  [kernel]                            [k] do_syscall_64
+   53,05%     3,30%  libc.so.6                           [.] __poll
+   39,53%     0,76%  [kernel]                            [k] __x64_sys_poll
+   34,91%     6,44%  [kernel]                            [k] do_sys_poll
+   29,37%     0,00%  [kernel]                            [k] 
__x64_sys_ioctl
+   29,08%     7,65%  [kernel]                            [k] 
count_range_bits
+   28,44%     0,00%  [kernel]                            [k] do_vfs_ioctl
+   28,43%     0,00%  [kernel]                            [k] extent_fiemap
+   28,43%     0,00%  [kernel]                            [k] 
btrfs_get_extent_fiemap
+   27,87%     0,00%  libc.so.6                           [.] __GI___ioctl
+   25,89%     0,00%  [kernel]                            [k] 
get_extent_skip_holes
+   21,76%    21,29%  [kernel]                            [k] rb_next
+    9,50%     0,48%  [kernel]                            [k] perf_poll
+    8,04%     0,00%  libc.so.6                           [.] 
__libc_start_call_main
+    6,93%     3,26%  [kernel]                            [k] 
select_estimate_accuracy
+    6,69%     2,15%  [kernel]                            [k] ktime_get_ts64
+    5,60%     3,99%  [kernel]                            [k] 
_raw_spin_lock_irqsave
+    5,16%     0,40%  [kernel]                            [k] poll_freewait

Here is a fiemap-reproduce.c code:

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>

#include <sys/stat.h>
#include <sys/time.h>
#include <sys/ioctl.h>

#include <linux/fs.h>
#include <linux/fiemap.h>

#define FILE_INTERVAL (1<<13) /* 8Kb */

/*
 * Return the elapsed time from t1 to t2 in microseconds.
 *
 * The seconds delta is widened to long long *before* the multiplication:
 * on platforms with a 32-bit time_t the product (delta * 1000 * 1000)
 * would otherwise be computed in 32 bits and overflow for intervals
 * longer than ~35 minutes.
 */
long long interval(struct timeval t1, struct timeval t2)
{
         long long val = 0;
         val += (t2.tv_usec - t1.tv_usec);
         val += (long long)(t2.tv_sec - t1.tv_sec) * 1000 * 1000;
         return val;
}

/*
 * Reproducer: create a sparse file (1 data byte every FILE_INTERVAL bytes,
 * leaving a hole in between), then time a single FS_IOC_FIEMAP ioctl with
 * fm_extent_count == 0, i.e. only counting extents (fm_mapped_extents).
 *
 * usage: prog <path> [size]   (size defaults to FILE_INTERVAL, is clamped
 * to at least FILE_INTERVAL and rounded down to a multiple of it)
 */
int main(int argc, char **argv) {
         struct fiemap fiemap = {};
         struct timeval t1, t2;
         char data = 'a';
         struct stat st;
         int fd;
         /*
          * Use long long for sizes/offsets: the test is meant to be run
          * with multi-gigabyte sizes (e.g. $((1<<31)) and up), which would
          * overflow an int both in atoi() and in the write-offset loop.
          */
         long long off, file_size = FILE_INTERVAL;

         if (argc != 3 && argc != 2) {
                 printf("usage: %s <path> [size]\n", argv[0]);
                 return 1;
         }

         if (argc == 3)
                 file_size = atoll(argv[2]);
         if (file_size < FILE_INTERVAL)
                 file_size = FILE_INTERVAL;
         file_size -= file_size % FILE_INTERVAL;

         fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, 0644);
         if (fd < 0) {
                 perror("open");
                 return 1;
         }

         /* One byte of data per interval; the rest of each interval is a hole. */
         for (off = 0; off < file_size; off += FILE_INTERVAL) {
                 if (pwrite(fd, &data, 1, off) != 1) {
                         perror("pwrite");
                         close(fd);
                         return 1;
                 }
         }

         /* Extend the file so the final interval's hole is included. */
         if (ftruncate(fd, file_size)) {
                 perror("ftruncate");
                 close(fd);
                 return 1;
         }

         if (fstat(fd, &st) < 0) {
                 perror("fstat");
                 close(fd);
                 return 1;
         }

         /* Cast explicitly: off_t/blkcnt_t width varies across platforms. */
         printf("size: %lld\n", (long long)st.st_size);
         printf("actual size: %lld\n", (long long)st.st_blocks * 512);

         /* fm_extent_count stays 0: the kernel only counts extents. */
         fiemap.fm_length = FIEMAP_MAX_OFFSET;
         gettimeofday(&t1, NULL);
         if (ioctl(fd, FS_IOC_FIEMAP, &fiemap) < 0) {
                 perror("fiemap");
                 close(fd);
                 return 1;
         }
         gettimeofday(&t2, NULL);

         printf("fiemap: fm_mapped_extents = %d\n", 
fiemap.fm_mapped_extents);
         printf("time = %lld us\n", interval(t1, t2));

         close(fd);
         return 0;
}

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: fiemap is slow on btrfs on files with multiple extents
  2022-08-04 16:30 fiemap is slow on btrfs on files with multiple extents Pavel Tikhomirov
@ 2022-08-04 18:49 ` Josef Bacik
  2022-08-05  4:52   ` Wang Yugui
  2022-08-05  7:38 ` Dominique MARTINET
  1 sibling, 1 reply; 9+ messages in thread
From: Josef Bacik @ 2022-08-04 18:49 UTC (permalink / raw)
  To: Pavel Tikhomirov
  Cc: Chris Mason, David Sterba, linux-btrfs, lkml, Chen Liang-Chun,
	Alexander Mikhalitsyn, kernel, Dominique MARTINET, Yu Kuai,
	Theodore Ts'o

On Thu, Aug 04, 2022 at 07:30:52PM +0300, Pavel Tikhomirov wrote:
> I ran the below test on Fedora 36 (the test basically creates "very" sparse
> file, with 4k data followed by 4k hole again and again for the specified
> length and uses fiemap to count extents in this file) and face the problem
> that fiemap hangs for too long (for instance comparing to ext4 version).
> Fiemap with 32768 extents takes ~37264 us and with 65536 extents it takes
> ~34123954 us, which is x1000 times more when file only increased twice the
> size:
>

Ah that was helpful, thank you.  I think I've spotted the problem, please give
this a whirl to make sure we're seeing the same thing.  Thanks,

Josef
 
From 1133d5ebf952ebf334bc7be21a575b1f52eb71d4 Mon Sep 17 00:00:00 2001
Message-Id: <1133d5ebf952ebf334bc7be21a575b1f52eb71d4.1659638886.git.josef@toxicpanda.com>
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 4 Aug 2022 14:45:53 -0400
Subject: [PATCH] btrfs: don't search entire range for delalloc with fiemap

For the case where we have

[EXTENT1][HOLE][EXTENT2]

If we fiemap from [HOLE] we will search to len (which could be -1) to
see if there's any delalloc extents in the range, however in the above
case btrfs_get_extent() returns a hole em for just the range of the
hole, as it will find EXTENT2, so all we need to do is search for
delalloc in the hole range, not the entire rest of the requested fiemap
range.

This fixes the extremely bad fiemap performance with very large sparse
files.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8fc1e3b6e00c..b7ad8f7a7b53 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7095,7 +7095,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 		hole_em = em;
 
 	/* check to see if we've wrapped (len == -1 or similar) */
-	end = start + len;
+	end = em->start + em->len;
 	if (end < start)
 		end = (u64)-1;
 	else
-- 
2.36.1

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: fiemap is slow on btrfs on files with multiple extents
  2022-08-04 18:49 ` Josef Bacik
@ 2022-08-05  4:52   ` Wang Yugui
  0 siblings, 0 replies; 9+ messages in thread
From: Wang Yugui @ 2022-08-05  4:52 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Pavel Tikhomirov, Chris Mason, David Sterba, linux-btrfs, lkml,
	Chen Liang-Chun, Alexander Mikhalitsyn, kernel,
	Dominique MARTINET, Yu Kuai, Theodore Ts'o

Hi,

> On Thu, Aug 04, 2022 at 07:30:52PM +0300, Pavel Tikhomirov wrote:
> > I ran the below test on Fedora 36 (the test basically creates "very" sparse
> > file, with 4k data followed by 4k hole again and again for the specified
> > length and uses fiemap to count extents in this file) and face the problem
> > that fiemap hangs for too long (for instance comparing to ext4 version).
> > Fiemap with 32768 extents takes ~37264 us and with 65536 extents it takes
> > ~34123954 us, which is x1000 times more when file only increased twice the
> > size:
> >
> 
> Ah that was helpful, thank you.  I think I've spotted the problem, please give
> this a whirl to make sure we're seeing the same thing.  Thanks,
> 
> Josef

This patch improves the performance very well, but it seems to break
xfstest generic/285.

xfstest generic/285:
06.11 SEEK_HOLE expected 8192 or 16384, got 8191.                 FAIL
06.12 SEEK_DATA expected 8191 or 8191, got 12288.                 FAIL
06.23 SEEK_HOLE expected 16384 or 16384, got 16383.               FAIL
06.24 SEEK_DATA expected 16383 or 16383, got -1.                  FAIL

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2022/08/05


>  
> From 1133d5ebf952ebf334bc7be21a575b1f52eb71d4 Mon Sep 17 00:00:00 2001
> Message-Id: <1133d5ebf952ebf334bc7be21a575b1f52eb71d4.1659638886.git.josef@toxicpanda.com>
> From: Josef Bacik <josef@toxicpanda.com>
> Date: Thu, 4 Aug 2022 14:45:53 -0400
> Subject: [PATCH] btrfs: don't search entire range for delalloc with fiemap
> 
> For the case where we have
> 
> [EXTENT1][HOLE][EXTENT2]
> 
> If we fiemap from [HOLE] we will search to len (which could be -1) to
> see if there's any delalloc extents in the range, however in the above
> case btrfs_get_extent() returns a hole em for just the range of the
> hole, as it will find EXTENT2, so all we need to do is search for
> delalloc in the hole range, not the entire rest of the requested fiemap
> range.
> 
> This fixes the extremely bad fiemap performance with very large sparse
> files.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/inode.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 8fc1e3b6e00c..b7ad8f7a7b53 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -7095,7 +7095,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
>  		hole_em = em;
>  
>  	/* check to see if we've wrapped (len == -1 or similar) */
> -	end = start + len;
> +	end = em->start + em->len;
>  	if (end < start)
>  		end = (u64)-1;
>  	else
> -- 
> 2.36.1



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: fiemap is slow on btrfs on files with multiple extents
  2022-08-04 16:30 fiemap is slow on btrfs on files with multiple extents Pavel Tikhomirov
  2022-08-04 18:49 ` Josef Bacik
@ 2022-08-05  7:38 ` Dominique MARTINET
  2022-08-05  9:54   ` Filipe Manana
  1 sibling, 1 reply; 9+ messages in thread
From: Dominique MARTINET @ 2022-08-05  7:38 UTC (permalink / raw)
  To: Pavel Tikhomirov, Josef Bacik
  Cc: Chris Mason, David Sterba, linux-btrfs, lkml, Chen Liang-Chun,
	Alexander Mikhalitsyn, kernel, Yu Kuai, Theodore Ts'o

Pavel Tikhomirov wrote on Thu, Aug 04, 2022 at 07:30:52PM +0300:
> I see a similar problem here
> https://lore.kernel.org/linux-btrfs/Yr4nEoNLkXPKcOBi@atmark-techno.com/#r ,
> but in my case I have "5.18.6-200.fc36.x86_64" fedora kernel which does not
> have 5ccc944dce3d ("filemap: Correct the conditions for marking a folio as
> accessed") commit, so it should be something else.

The root cause might be different but I guess they're related enough: if
fiemap gets faster enough even when the whole file is in cache I guess
that works for me :)

Josef Bacik wrote on Thu, Aug 04, 2022 at 02:49:39PM -0400:
> On Thu, Aug 04, 2022 at 07:30:52PM +0300, Pavel Tikhomirov wrote:
> > I ran the below test on Fedora 36 (the test basically creates "very" sparse
> > file, with 4k data followed by 4k hole again and again for the specified
> > length and uses fiemap to count extents in this file) and face the problem
> > that fiemap hangs for too long (for instance comparing to ext4 version).
> > Fiemap with 32768 extents takes ~37264 us and with 65536 extents it takes
> > ~34123954 us, which is x1000 times more when file only increased twice the
> > size:
> >
> 
> Ah that was helpful, thank you.  I think I've spotted the problem, please give
> this a whirl to make sure we're seeing the same thing.  Thanks,

FWIW this patch does help a tiny bit, but I'm still seeing a huge
slowdown: with patch cp goes from ~600MB/s (55s) to 136MB/s (3m55s) on
the second run; and without the patch I'm getting 47s and 5m35
respectively so this has gotten a bit better but these must still be
cases running through the whole list (e.g. when not hitting a hole?)


My reproducer is just running 'cp file /dev/null' twice on a file with
194955 extents (same file with mixed compressed & non-compressed extents
as last time), so should be close enough to what Pavel was describing in
just much worse.

-- 
Dominique

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: fiemap is slow on btrfs on files with multiple extents
  2022-08-05  7:38 ` Dominique MARTINET
@ 2022-08-05  9:54   ` Filipe Manana
  2022-09-01 13:25     ` Filipe Manana
  0 siblings, 1 reply; 9+ messages in thread
From: Filipe Manana @ 2022-08-05  9:54 UTC (permalink / raw)
  To: Dominique MARTINET
  Cc: Pavel Tikhomirov, Josef Bacik, Chris Mason, David Sterba,
	linux-btrfs, lkml, Chen Liang-Chun, Alexander Mikhalitsyn,
	kernel, Yu Kuai, Theodore Ts'o

On Fri, Aug 05, 2022 at 04:38:21PM +0900, Dominique MARTINET wrote:
> Pavel Tikhomirov wrote on Thu, Aug 04, 2022 at 07:30:52PM +0300:
> > I see a similar problem here
> > https://lore.kernel.org/linux-btrfs/Yr4nEoNLkXPKcOBi@atmark-techno.com/#r ,
> > but in my case I have "5.18.6-200.fc36.x86_64" fedora kernel which does not
> > have 5ccc944dce3d ("filemap: Correct the conditions for marking a folio as
> > accessed") commit, so it should be something else.
> 
> The root cause might be different but I guess they're related enough: if
> fiemap gets faster enough even when the whole file is in cache I guess
> that works for me :)
> 
> Josef Bacik wrote on Thu, Aug 04, 2022 at 02:49:39PM -0400:
> > On Thu, Aug 04, 2022 at 07:30:52PM +0300, Pavel Tikhomirov wrote:
> > > I ran the below test on Fedora 36 (the test basically creates "very" sparse
> > > file, with 4k data followed by 4k hole again and again for the specified
> > > length and uses fiemap to count extents in this file) and face the problem
> > > that fiemap hangs for too long (for instance comparing to ext4 version).
> > > Fiemap with 32768 extents takes ~37264 us and with 65536 extents it takes
> > > ~34123954 us, which is x1000 times more when file only increased twice the
> > > size:
> > >
> > 
> > Ah that was helpful, thank you.  I think I've spotted the problem, please give
> > this a whirl to make sure we're seeing the same thing.  Thanks,
> 
> FWIW this patch does help a tiny bit, but I'm still seeing a huge
> slowdown: with patch cp goes from ~600MB/s (55s) to 136MB/s (3m55s) on
> the second run; and without the patch I'm getting 47s and 5m35
> respectively so this has gotten a bit better but these must still be
> cases running through the whole list (e.g. when not hitting a hole?)
> 
> 
> My reproducer is just running 'cp file /dev/null' twice on a file with
> 194955 extents (same file with mixed compressed & non-compressed extents
> as last time), so should be close enough to what Pavel was describing in
> just much worse.

I remember your original report Dominique, it came along with the short
reads issue when using using io_uring with qemu.

I had a quick look before going on vacations. In your post at:

https://lore.kernel.org/linux-btrfs/Ysace25wh5BbLd5f@atmark-techno.com/

you mentioned a lot of time spent on count_range_bits(), and I quickly
came with a testing patch for that specific area:

https://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux.git/commit/?h=fiemap_speedup&id=6bdc02edbb52786df2d8c2405d790390d9a9443c

Basically whenever we call that, we start searching from the root of the
extent states rbtree - if the rbtree is large, that takes a lot of time.
The idea is to start the search from the last record instead.

I haven't actually made any performance tests, as vacations came in and
I noticed that such change will very likely make little or no difference
because algorithmically btrfs' fiemap implementation is very ineficient
for several reasons. It basically works like this:

1) We start the search for the first extent. First we go search the inode's
   extent map rbtree - if we can't find it, then we will search in the
   fs b+tree - after this we create an extent map based on the file extent
   item we found in the b+tree and add it to the extent map rbtree.

   We then pass to fiemap extent information based on the extent map
   (there's a few extra minor details, like merging, etc);

2) Then we search for the next extent, with a start offset based on the
   end offset of the previous one +1.

   Again, if we can't find it in the extent map rbtree, we go search the
   fs b+tree, then create an extent map based on the file extent item we
   found there and add it to extent map rbtree.

   This is silly. On each iteration the extent maps rbtree gets bigger and
   bigger, and we always search from the root node. We are spending time
   searching there and then allocating memory for the extent map and adding
   it to the rbtree, which is yet more cpu time spent.

   We should only create extent maps when we are doing IO against the range,
   i.e. for a data write or read operation - here we are just spending a lot of time on
   this and consuming memory too.

   Then it's silly again because we will search the fs b+tree again, starting
   from the root. So we end up visiting the same leaves over and over;

3) Whenever we find a hole, or a prealloc/unwritten extent, we have to check
   if there's pending delalloc for that region. That's where count_range_bits()
   is used - everytime it's called it starts from the root node of the extent
   states rbtree.

My idea to address this is to basically rewrite fiemap so that it works like
this:

1) Go over each leaf in the fs b+tree and for each file extent item emit the
   extent information for fiemap - like this we don't do many repeated b+tree
   searches to end up in the same leaf;

2) Never create extent maps, so that we don't grow the extent maps rbtree
   unnecessarily, saving cpu time and avoiding memory allocations;

3) Whenever we find a hole or prealloc/unwritten extent, then check if there's
   pending delalloc in the range by using count_range_bits() like we currently
   do (and maybe add that patch to avoid always starting the search from the
   root).

   If there's delalloc, then look up the corresponding extent maps and use
   their info to emit extent information for fiemap. And keep using rb_next()
   while an extent map ends before the hole/unwritten range;

4) Because emitting all the extent information for fiemap and doing other things
   like checking if an extent is shared, calling count_range_bits(), etc can
   take some time, to avoid holding a read lock for too long on the fs b+tree
   leaf and block other tasks, clone the leaf, release the lock on the leaf and
   use the private clone. This is fine since when we start fiemap we lock the file
   range, so no one else can go and create or drop extents in the range before
   fiemap finishes.

That's the high level idea.

There's another factor that can slowdown fiemap a lot, which is figuring out if
an extent is shared or not (reflinks, snapshots), but in your case you don't
have shared extents IIRC. I would have to look at that separately, we probably
have some room for improvement there as well.

I haven't had the time to work on that, as I've been working on other stuff
unrelated to fiemap, but maybe in a week or two I may start it.

> 
> -- 
> Dominique

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: fiemap is slow on btrfs on files with multiple extents
  2022-08-05  9:54   ` Filipe Manana
@ 2022-09-01 13:25     ` Filipe Manana
  2022-09-01 15:06       ` Pavel Tikhomirov
  2022-09-21  7:30       ` Dominique MARTINET
  0 siblings, 2 replies; 9+ messages in thread
From: Filipe Manana @ 2022-09-01 13:25 UTC (permalink / raw)
  To: Dominique MARTINET
  Cc: Pavel Tikhomirov, Josef Bacik, Chris Mason, David Sterba,
	linux-btrfs, lkml, Chen Liang-Chun, Alexander Mikhalitsyn,
	kernel, Yu Kuai, Theodore Ts'o

On Fri, Aug 05, 2022 at 10:54:07AM +0100, Filipe Manana wrote:
> On Fri, Aug 05, 2022 at 04:38:21PM +0900, Dominique MARTINET wrote:
> > Pavel Tikhomirov wrote on Thu, Aug 04, 2022 at 07:30:52PM +0300:
> > > I see a similar problem here
> > > https://lore.kernel.org/linux-btrfs/Yr4nEoNLkXPKcOBi@atmark-techno.com/#r ,
> > > but in my case I have "5.18.6-200.fc36.x86_64" fedora kernel which does not
> > > have 5ccc944dce3d ("filemap: Correct the conditions for marking a folio as
> > > accessed") commit, so it should be something else.
> > 
> > The root cause might be different but I guess they're related enough: if
> > fiemap gets faster enough even when the whole file is in cache I guess
> > that works for me :)
> > 
> > Josef Bacik wrote on Thu, Aug 04, 2022 at 02:49:39PM -0400:
> > > On Thu, Aug 04, 2022 at 07:30:52PM +0300, Pavel Tikhomirov wrote:
> > > > I ran the below test on Fedora 36 (the test basically creates "very" sparse
> > > > file, with 4k data followed by 4k hole again and again for the specified
> > > > length and uses fiemap to count extents in this file) and face the problem
> > > > that fiemap hangs for too long (for instance comparing to ext4 version).
> > > > Fiemap with 32768 extents takes ~37264 us and with 65536 extents it takes
> > > > ~34123954 us, which is x1000 times more when file only increased twice the
> > > > size:
> > > >
> > > 
> > > Ah that was helpful, thank you.  I think I've spotted the problem, please give
> > > this a whirl to make sure we're seeing the same thing.  Thanks,
> > 
> > FWIW this patch does help a tiny bit, but I'm still seeing a huge
> > slowdown: with patch cp goes from ~600MB/s (55s) to 136MB/s (3m55s) on
> > the second run; and without the patch I'm getting 47s and 5m35
> > respectively so this has gotten a bit better but these must still be
> > cases running through the whole list (e.g. when not hitting a hole?)
> > 
> > 
> > My reproducer is just running 'cp file /dev/null' twice on a file with
> > 194955 extents (same file with mixed compressed & non-compressed extents
> > as last time), so should be close enough to what Pavel was describing in
> > just much worse.
> 
> I remember your original report Dominique, it came along with the short
> reads issue when using using io_uring with qemu.
> 
> I had a quick look before going on vacations. In your post at:
> 
> https://lore.kernel.org/linux-btrfs/Ysace25wh5BbLd5f@atmark-techno.com/
> 
> you mentioned a lot of time spent on count_range_bits(), and I quickly
> came with a testing patch for that specific area:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux.git/commit/?h=fiemap_speedup&id=6bdc02edbb52786df2d8c2405d790390d9a9443c
> 
> Basically whenever we call that, we start searching from the root of the
> extent states rbtree - if the rbtree is large, that takes a lot of time.
> The idea is to start the search from the last record instead.
> 
> I haven't actually made any performance tests, as vacations came in and
> I noticed that such change will very likely make little or no difference
> because algorithmically btrfs' fiemap implementation is very ineficient
> for several reasons. It basically works like this:
> 
> 1) We start the search for the first extent. First we go search the inode's
>    extent map rbtree - if we can't find it, then we will search in the
>    fs b+tree - after this we create an extent map based on the file extent
>    item we found in the b+tree and add it to the extent map rbtree.
> 
>    We then pass to fiemap extent information based on the extent map
>    (there's a few extra minor details, like merging, etc);
> 
> 2) Then we search for the next extent, with a start offset based on the
>    end offset of the previous one +1.
> 
>    Again, if we can't find it in the extent map rbtree, we go search the
>    fs b+tree, then create an extent map based on the file extent item we
>    found there and add it to extent map rbtree.
> 
>    This is silly. On each iteration the extent maps rbtree gets bigger and
>    bigger, and we always search from the root node. We are spending time
>    searching there and then allocating memory for the extent map and adding
>    it to the rbtree, which is yet more cpu time spent.
> 
>    We should only create extent maps when we are doing IO against, for a
>    data write or read operation, we are just spending a lot of time on
>    this and consuming memory too.
> 
>    Then it's silly again because we will search the fs b+tree again, starting
>    from the root. So we end up visting the same leaves over and over;
> 
> 3) Whenever we find a hole, or a prealloc/unwritten extent, we have to check
>    if there's pending dealloc for that region. That's where count_range_bits()
>    is used - everytime it's called it starts from the root node of the extent
>    states rbtree.
> 
> My idea to address this is to basically rewrite fiemap so that it works like
> this:
> 
> 1) Go over each leaf in the fs b+tree and for each file extent item emit the
>    extent information for fiemap - like this we don't do many repeated b+tree
>    searches to end up in the same leaf;
> 
> 2) Never create extent maps, so that we don't grow the extent maps rbtree
>    unnecessarily, saving cpu time and avoiding memory allocations;
> 
> 3) Whenever we find a hole or prealloc/unwritten extent, then check if there's
>    pending delalloc in the range by using count_range_bits() like we currently
>    do (and maybe add that patch to avoid always starting the search from the
>    root).
> 
>    If there's delalloc, then lookup for the correspond extent maps and use
>    their info to emit extent information for fiemap. And keep using rb_next()
>    while an extent map ends before the hole/unwritten range;
> 
> 4) Because emitting all the extent information for fiemap and doing other things
>    like checking if an extent is shared, calling count_range_bits(), etc can
>    take some time, to avoid holding a read lock for too long on the fs b+tree
>    leaf and block other tasks, clone the leaf, release the lock on the leaf and
>    use the private clone. This is fine since we start fiemap we lock the file
>    range, so no one else can go and create or drop extents in the range before
>    fiemap finishes.
> 
> That's the high level idea.
> 
> There's another factor that can slowdown fiemap a lot, which is figuring out if
> an extent is shared or not (reflinks, snapshots), but in your case you don't
> have shared extents IIRC. I would have to look at that separetely, we probably
> have some room for improvement there as well.
> 
> I haven't had the time to work on that, as I've been working on other stuff
> unrelated to fiemap, but maybe in a week or two I may start it.

It took me a bit more than I expected, but here is the patchset to make fiemap
(and lseek) much more efficient on btrfs:

https://lore.kernel.org/linux-btrfs/cover.1662022922.git.fdmanana@suse.com/

And also available in this git branch:

https://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux.git/log/?h=lseek_fiemap_scalability

Running Pavel's test before applying the patchset:

    *********** 256M ***********

    size: 268435456
    actual size: 134217728
    fiemap: fm_mapped_extents = 32768
    time = 4003133 us

    size: 268435456
    actual size: 134217728
    fiemap: fm_mapped_extents = 32768
    time = 4895330 us

    *********** 512M ***********

    size: 536870912
    actual size: 268435456
    fiemap: fm_mapped_extents = 65536
    time = 30123675 us

    size: 536870912
    actual size: 268435456
    fiemap: fm_mapped_extents = 65536
    time = 33450934 us

    *********** 1G ***********

    size: 1073741824
    actual size: 536870912
    fiemap: fm_mapped_extents = 131072
    time = 224924074 us

    size: 1073741824
    actual size: 536870912
    fiemap: fm_mapped_extents = 131072
    time = 217239242 us

And running it after applying the patchset:

    *********** 256M ***********

    size: 268435456
    actual size: 134217728
    fiemap: fm_mapped_extents = 32768
    time = 29475 us

    size: 268435456
    actual size: 134217728
    fiemap: fm_mapped_extents = 32768
    time = 29307 us

    *********** 512M ***********

    size: 536870912
    actual size: 268435456
    fiemap: fm_mapped_extents = 65536
    time = 58996 us

    size: 536870912
    actual size: 268435456
    fiemap: fm_mapped_extents = 65536
    time = 59115 us

    *********** 1G ***********

    size: 1073741824
    actual size: 536870912
    fiemap: fm_mapped_extents = 116251
    time = 124141 us

    size: 1073741824
    actual size: 536870912
    fiemap: fm_mapped_extents = 131072
    time = 119387 us

There's a huge difference, so after it fiemap is a lot more usable on
btrfs.

It's still not as fast as ext4, but it's getting close to. On ext4 I
get:

    *********** 256M ***********

    size: 268435456
    actual size: 134217728
    fiemap: fm_mapped_extents = 32768
    time = 16877 us

    size: 268435456
    actual size: 134217728
    fiemap: fm_mapped_extents = 32768
    time = 17014 us

    *********** 512M ***********

    size: 536870912
    actual size: 268435456
    fiemap: fm_mapped_extents = 65536
    time = 33764 us

    size: 536870912
    actual size: 268435456
    fiemap: fm_mapped_extents = 65536
    time = 33849 us

    *********** 1G ***********

    size: 1073741824
    actual size: 536870912
    fiemap: fm_mapped_extents = 131072
    time = 69085 us

    size: 1073741824
    actual size: 536870912
    fiemap: fm_mapped_extents = 131072
    time = 68101 us

However we do have extra work to do on btrfs because we have reflinks
and snapshots, so it needs to check if extents are shared, while ext4
does not have those features, thus less work to do for fiemap.

Thanks for the report.

> 
> > 
> > -- 
> > Dominique

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: fiemap is slow on btrfs on files with multiple extents
  2022-09-01 13:25     ` Filipe Manana
@ 2022-09-01 15:06       ` Pavel Tikhomirov
  2022-09-21  7:30       ` Dominique MARTINET
  1 sibling, 0 replies; 9+ messages in thread
From: Pavel Tikhomirov @ 2022-09-01 15:06 UTC (permalink / raw)
  To: Filipe Manana
  Cc: Josef Bacik, Chris Mason, David Sterba, linux-btrfs, lkml,
	Chen Liang-Chun, Alexander Mikhalitsyn, kernel, Yu Kuai,
	Theodore Ts'o, Dominique MARTINET



On 01.09.2022 16:25, Filipe Manana wrote:
> On Fri, Aug 05, 2022 at 10:54:07AM +0100, Filipe Manana wrote:
>> On Fri, Aug 05, 2022 at 04:38:21PM +0900, Dominique MARTINET wrote:
>>> Pavel Tikhomirov wrote on Thu, Aug 04, 2022 at 07:30:52PM +0300:
>>>> I see a similar problem here
>>>> https://lore.kernel.org/linux-btrfs/Yr4nEoNLkXPKcOBi@atmark-techno.com/#r ,
>>>> but in my case I have "5.18.6-200.fc36.x86_64" fedora kernel which does not
>>>> have 5ccc944dce3d ("filemap: Correct the conditions for marking a folio as
>>>> accessed") commit, so it should be something else.
>>>
>>> The root cause might be different but I guess they're related enough: if
>>> fiemap gets faster enough even when the whole file is in cache I guess
>>> that works for me :)
>>>
>>> Josef Bacik wrote on Thu, Aug 04, 2022 at 02:49:39PM -0400:
>>>> On Thu, Aug 04, 2022 at 07:30:52PM +0300, Pavel Tikhomirov wrote:
>>>>> I ran the below test on Fedora 36 (the test basically creates "very" sparse
>>>>> file, with 4k data followed by 4k hole again and again for the specified
>>>>> length and uses fiemap to count extents in this file) and face the problem
>>>>> that fiemap hangs for too long (for instance comparing to ext4 version).
>>>>> Fiemap with 32768 extents takes ~37264 us and with 65536 extents it takes
>>>>> ~34123954 us, which is x1000 times more when file only increased twice the
>>>>> size:
>>>>>
>>>>
>>>> Ah that was helpful, thank you.  I think I've spotted the problem, please give
>>>> this a whirl to make sure we're seeing the same thing.  Thanks,
>>>
>>> FWIW this patch does help a tiny bit, but I'm still seeing a huge
>>> slowdown: with patch cp goes from ~600MB/s (55s) to 136MB/s (3m55s) on
>>> the second run; and without the patch I'm getting 47s and 5m35
>>> respectively so this has gotten a bit better but these must still be
>>> cases running through the whole list (e.g. when not hitting a hole?)
>>>
>>>
>>> My reproducer is just running 'cp file /dev/null' twice on a file with
>>> 194955 extents (same file with mixed compressed & non-compressed extents
>>> as last time), so should be close enough to what Pavel was describing in
>>> just much worse.
>>
>> I remember your original report Dominique, it came along with the short
>> reads issue when using io_uring with qemu.
>>
>> I had a quick look before going on vacations. In your post at:
>>
>> https://lore.kernel.org/linux-btrfs/Ysace25wh5BbLd5f@atmark-techno.com/
>>
>> you mentioned a lot of time spent on count_range_bits(), and I quickly
>> came with a testing patch for that specific area:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux.git/commit/?h=fiemap_speedup&id=6bdc02edbb52786df2d8c2405d790390d9a9443c
>>
>> Basically whenever we call that, we start searching from the root of the
>> extent states rbtree - if the rbtree is large, that takes a lot of time.
>> The idea is to start the search from the last record instead.
>>
>> I haven't actually made any performance tests, as vacations came in and
>> I noticed that such change will very likely make little or no difference
>> because algorithmically btrfs' fiemap implementation is very inefficient
>> for several reasons. It basically works like this:
>>
>> 1) We start the search for the first extent. First we go search the inode's
>>     extent map rbtree - if we can't find it, then we will search in the
>>     fs b+tree - after this we create an extent map based on the file extent
>>     item we found in the b+tree and add it to the extent map rbtree.
>>
>>     We then pass to fiemap extent information based on the extent map
>>     (there's a few extra minor details, like merging, etc);
>>
>> 2) Then we search for the next extent, with a start offset based on the
>>     end offset of the previous one +1.
>>
>>     Again, if we can't find it in the extent map rbtree, we go search the
>>     fs b+tree, then create an extent map based on the file extent item we
>>     found there and add it to extent map rbtree.
>>
>>     This is silly. On each iteration the extent maps rbtree gets bigger and
>>     bigger, and we always search from the root node. We are spending time
>>     searching there and then allocating memory for the extent map and adding
>>     it to the rbtree, which is yet more cpu time spent.
>>
>>     We should only create extent maps when we are doing IO against, for a
>>     data write or read operation, we are just spending a lot of time on
>>     this and consuming memory too.
>>
>>     Then it's silly again because we will search the fs b+tree again, starting
>>     from the root. So we end up visiting the same leaves over and over;
>>
>> 3) Whenever we find a hole, or a prealloc/unwritten extent, we have to check
>>     if there's pending delalloc for that region. That's where count_range_bits()
>>     is used - every time it's called it starts from the root node of the extent
>>     states rbtree.
>>
>> My idea to address this is to basically rewrite fiemap so that it works like
>> this:
>>
>> 1) Go over each leaf in the fs b+tree and for each file extent item emit the
>>     extent information for fiemap - like this we don't do many repeated b+tree
>>     searches to end up in the same leaf;
>>
>> 2) Never create extent maps, so that we don't grow the extent maps rbtree
>>     unnecessarily, saving cpu time and avoiding memory allocations;
>>
>> 3) Whenever we find a hole or prealloc/unwritten extent, then check if there's
>>     pending delalloc in the range by using count_range_bits() like we currently
>>     do (and maybe add that patch to avoid always starting the search from the
>>     root).
>>
>>     If there's delalloc, then look up the corresponding extent maps and use
>>     their info to emit extent information for fiemap. And keep using rb_next()
>>     while an extent map ends before the hole/unwritten range;
>>
>> 4) Because emitting all the extent information for fiemap and doing other things
>>     like checking if an extent is shared, calling count_range_bits(), etc can
>>     take some time, to avoid holding a read lock for too long on the fs b+tree
>>     leaf and block other tasks, clone the leaf, release the lock on the leaf and
>>     use the private clone. This is fine since when we start fiemap we lock the file
>>     range, so no one else can go and create or drop extents in the range before
>>     fiemap finishes.
>>
>> That's the high level idea.
>>
>> There's another factor that can slowdown fiemap a lot, which is figuring out if
>> an extent is shared or not (reflinks, snapshots), but in your case you don't
>> have shared extents IIRC. I would have to look at that separately, we probably
>> have some room for improvement there as well.
>>
>> I haven't had the time to work on that, as I've been working on other stuff
>> unrelated to fiemap, but maybe in a week or two I may start it.
> 
> It took me a bit more than I expected, but here is the patchset to make fiemap
> (and lseek) much more efficient on btrfs:
> 
> https://lore.kernel.org/linux-btrfs/cover.1662022922.git.fdmanana@suse.com/
> 
> And also available in this git branch:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux.git/log/?h=lseek_fiemap_scalability
> 
> Running Pavel's test before applying the patchset:
> 
>      *********** 256M ***********
> 
>      size: 268435456
>      actual size: 134217728
>      fiemap: fm_mapped_extents = 32768
>      time = 4003133 us
> 
>      size: 268435456
>      actual size: 134217728
>      fiemap: fm_mapped_extents = 32768
>      time = 4895330 us
> 
>      *********** 512M ***********
> 
>      size: 536870912
>      actual size: 268435456
>      fiemap: fm_mapped_extents = 65536
>      time = 30123675 us
> 
>      size: 536870912
>      actual size: 268435456
>      fiemap: fm_mapped_extents = 65536
>      time = 33450934 us
> 
>      *********** 1G ***********
> 
>      size: 1073741824
>      actual size: 536870912
>      fiemap: fm_mapped_extents = 131072
>      time = 224924074 us
> 
>      size: 1073741824
>      actual size: 536870912
>      fiemap: fm_mapped_extents = 131072
>      time = 217239242 us
> 
> And running it after applying the patchset:
> 
>      *********** 256M ***********
> 
>      size: 268435456
>      actual size: 134217728
>      fiemap: fm_mapped_extents = 32768
>      time = 29475 us
> 
>      size: 268435456
>      actual size: 134217728
>      fiemap: fm_mapped_extents = 32768
>      time = 29307 us
> 
>      *********** 512M ***********
> 
>      size: 536870912
>      actual size: 268435456
>      fiemap: fm_mapped_extents = 65536
>      time = 58996 us
> 
>      size: 536870912
>      actual size: 268435456
>      fiemap: fm_mapped_extents = 65536
>      time = 59115 us
> 
>      *********** 1G ***********
> 
>      size: 1073741824
>      actual size: 536870912
>      fiemap: fm_mapped_extents = 116251
>      time = 124141 us
> 
>      size: 1073741824
>      actual size: 536870912
>      fiemap: fm_mapped_extents = 131072
>      time = 119387 us
> 
> There's a huge difference, so after it fiemap is a lot more usable on
> btrfs.
> 
> It's still not as fast as ext4, but it's getting close to it. On ext4 I
> get:
> 
>      *********** 256M ***********
> 
>      size: 268435456
>      actual size: 134217728
>      fiemap: fm_mapped_extents = 32768
>      time = 16877 us
> 
>      size: 268435456
>      actual size: 134217728
>      fiemap: fm_mapped_extents = 32768
>      time = 17014 us
> 
>      *********** 512M ***********
> 
>      size: 536870912
>      actual size: 268435456
>      fiemap: fm_mapped_extents = 65536
>      time = 33764 us
> 
>      size: 536870912
>      actual size: 268435456
>      fiemap: fm_mapped_extents = 65536
>      time = 33849 us
> 
>      *********** 1G ***********
> 
>      size: 1073741824
>      actual size: 536870912
>      fiemap: fm_mapped_extents = 131072
>      time = 69085 us
> 
>      size: 1073741824
>      actual size: 536870912
>      fiemap: fm_mapped_extents = 131072
>      time = 68101 us
> 
> However we do have extra work to do on btrfs because we have reflinks
> and snapshots, so it needs to check if extents are shared, while ext4
> does not have those features, thus less work to do for fiemap.
> 
> Thanks for the report.

The results are amazing, I will try it on my system. Thanks a lot for 
the fixes!

> 
>>
>>>
>>> -- 
>>> Dominique

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: fiemap is slow on btrfs on files with multiple extents
  2022-09-01 13:25     ` Filipe Manana
  2022-09-01 15:06       ` Pavel Tikhomirov
@ 2022-09-21  7:30       ` Dominique MARTINET
  2022-09-21  9:00         ` Filipe Manana
  1 sibling, 1 reply; 9+ messages in thread
From: Dominique MARTINET @ 2022-09-21  7:30 UTC (permalink / raw)
  To: Filipe Manana
  Cc: Pavel Tikhomirov, Josef Bacik, Chris Mason, David Sterba,
	linux-btrfs, lkml, Chen Liang-Chun, Alexander Mikhalitsyn,
	kernel, Yu Kuai, Theodore Ts'o

Filipe Manana wrote on Thu, Sep 01, 2022 at 02:25:12PM +0100:
> It took me a bit more than I expected, but here is the patchset to make fiemap
> (and lseek) much more efficient on btrfs:
> 
> https://lore.kernel.org/linux-btrfs/cover.1662022922.git.fdmanana@suse.com/
> 
> And also available in this git branch:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux.git/log/?h=lseek_fiemap_scalability

Thanks a lot!
Sorry for the slow reply, it took me a while to find time to get back to
my test setup.

There's still this weird behaviour that later calls to cp are slower
than the first, but the improvement is so good that it doesn't matter
quite as much -- I haven't been able to reproduce the rcu stalls in qemu
so I can't say for sure but they probably won't be a problem anymore.

From a quick look with perf record/report the difference still seems to
stem from fiemap (time spent there goes from 4.13 to 45.20%), so there
is still more processing once the file is (at least partially) in cache,
but it has gotten much better.


(tests run on a laptop so assume some inconsistency with thermal
throttling etc)

/mnt/t/t # compsize bigfile
Processed 1 file, 194955 regular extents (199583 refs), 0 inline.
Type       Perc     Disk Usage   Uncompressed Referenced  
TOTAL       15%      3.7G          23G          23G       
none       100%      477M         477M         514M       
zstd        14%      3.2G          23G          23G       
/mnt/t/t # time cp bigfile /dev/null
real	0m 44.52s
user	0m 0.49s
sys	0m 32.91s
/mnt/t/t # time cp bigfile /dev/null
real	0m 46.81s
user	0m 0.55s
sys	0m 35.63s
/mnt/t/t # time cp bigfile /dev/null
real	1m 13.63s
user	0m 0.55s
sys	1m 1.89s
/mnt/t/t # time cp bigfile /dev/null
real	1m 13.44s
user	0m 0.53s
sys	1m 2.08s


For comparison here's how it was on 6.0-rc2 your branch is based on:
/mnt/t/t # time cp atde-test /dev/null
real	0m 46.17s
user	0m 0.60s
sys	0m 33.21s
/mnt/t/t # time cp atde-test /dev/null
real	5m 35.92s
user	0m 0.57s
sys	5m 24.20s



If you're curious the report blames set_extent_bit and
clear_state_bit as follow; get_extent_skip_holes is completely gone; but
I wouldn't necessarily say this needs much more time spent on it.

45.20%--extent_fiemap
|
|--31.02%--lock_extent_bits
|          |          
|           --30.78%--set_extent_bit
|                     |          
|                     |--6.93%--insert_state
|                     |          |          
|                     |           --0.70%--set_state_bits
|                     |          
|                     |--4.25%--alloc_extent_state
|                     |          |          
|                     |           --3.86%--kmem_cache_alloc
|                     |          
|                     |--2.77%--_raw_spin_lock
|                     |          |          
|                     |           --1.23%--preempt_count_add
|                     |          
|                     |--2.48%--rb_next
|                     |          
|                     |--1.13%--_raw_spin_unlock
|                     |          |          
|                     |           --0.55%--preempt_count_sub
|                     |          
|                      --0.92%--set_state_bits
|          
 --13.80%--__clear_extent_bit
           |          
            --13.30%--clear_state_bit
                      |          
                      |           --3.48%--_raw_spin_unlock_irqrestore
                      |          
                      |--2.45%--merge_state.part.0
                      |          |          
                      |           --1.57%--rb_next
                      |          
                      |--2.14%--__slab_free
                      |          |          
                      |           --1.26%--cmpxchg_double_slab.constprop.0.isra.0
                      |          
                      |--0.74%--free_extent_state
                      |          
                      |--0.70%--kmem_cache_free
                      |          
                      |--0.69%--btrfs_clear_delalloc_extent
                      |          
                       --0.52%--rb_next



Thanks!
-- 
Dominique

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: fiemap is slow on btrfs on files with multiple extents
  2022-09-21  7:30       ` Dominique MARTINET
@ 2022-09-21  9:00         ` Filipe Manana
  0 siblings, 0 replies; 9+ messages in thread
From: Filipe Manana @ 2022-09-21  9:00 UTC (permalink / raw)
  To: Dominique MARTINET
  Cc: Pavel Tikhomirov, Josef Bacik, Chris Mason, David Sterba,
	linux-btrfs, lkml, Chen Liang-Chun, Alexander Mikhalitsyn,
	kernel, Yu Kuai, Theodore Ts'o

On Wed, Sep 21, 2022 at 8:30 AM Dominique MARTINET
<dominique.martinet@atmark-techno.com> wrote:
>
> Filipe Manana wrote on Thu, Sep 01, 2022 at 02:25:12PM +0100:
> > It took me a bit more than I expected, but here is the patchset to make fiemap
> > (and lseek) much more efficient on btrfs:
> >
> > https://lore.kernel.org/linux-btrfs/cover.1662022922.git.fdmanana@suse.com/
> >
> > And also available in this git branch:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux.git/log/?h=lseek_fiemap_scalability
>
> Thanks a lot!
> Sorry for the slow reply, it took me a while to find time to get back to
> my test setup.
>
> There's still this weird behaviour that later calls to cp are slower
> than the first, but the improvement is so good that it doesn't matter
> quite as much -- I haven't been able to reproduce the rcu stalls in qemu
> so I can't say for sure but they probably won't be a problem anymore.
>
> From a quick look with perf record/report the difference still seems to
> stem from fiemap (time spent there goes from 4.13 to 45.20%), so there
> is still more processing once the file is (at least partially) in cache,
> but it has gotten much better.
>
>
> (tests run on a laptop so assume some inconsistency with thermal
> throttling etc)
>
> /mnt/t/t # compsize bigfile
> Processed 1 file, 194955 regular extents (199583 refs), 0 inline.
> Type       Perc     Disk Usage   Uncompressed Referenced
> TOTAL       15%      3.7G          23G          23G
> none       100%      477M         477M         514M
> zstd        14%      3.2G          23G          23G
> /mnt/t/t # time cp bigfile /dev/null
> real    0m 44.52s
> user    0m 0.49s
> sys     0m 32.91s
> /mnt/t/t # time cp bigfile /dev/null
> real    0m 46.81s
> user    0m 0.55s
> sys     0m 35.63s
> /mnt/t/t # time cp bigfile /dev/null
> real    1m 13.63s
> user    0m 0.55s
> sys     1m 1.89s
> /mnt/t/t # time cp bigfile /dev/null
> real    1m 13.44s
> user    0m 0.53s
> sys     1m 2.08s
>
>
> For comparison here's how it was on 6.0-rc2 your branch is based on:
> /mnt/t/t # time cp atde-test /dev/null
> real    0m 46.17s
> user    0m 0.60s
> sys     0m 33.21s
> /mnt/t/t # time cp atde-test /dev/null
> real    5m 35.92s
> user    0m 0.57s
> sys     5m 24.20s
>
>
>
> If you're curious the report blames set_extent_bit and
> clear_state_bit as follow; get_extent_skip_holes is completely gone; but
> I wouldn't necessarily say this needs much more time spent on it.

get_extent_skip_holes() no longer exists, so 0% of time spent there :)

Yes, I know. The reason you see so much time spent on
lock_extent_bits() is basically
because cp does too many fiemap calls with a very small extent buffer size.
I pointed that out here:

https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/

Making it use a larger buffer (say 500 or 1000 extents), would make it
a lot better.
But as I pointed out there, last year cp was changed to not use fiemap
to detect holes anymore,
now it uses lseek with SEEK_HOLE mode. So with time, everyone will get
a cp version that does
not use fiemap anymore.

Also, for the cp case, since it does many read and fiemap calls to the
source file, the following
patch probably helps too:

https://lore.kernel.org/linux-btrfs/20220819024408.9714-1-ethanlien@synology.com/

Because it will make the io tree smaller. That should land on 6.1 too.

Thanks for testing and the report.

>
> 45.20%--extent_fiemap
> |
> |--31.02%--lock_extent_bits
> |          |
> |           --30.78%--set_extent_bit
> |                     |
> |                     |--6.93%--insert_state
> |                     |          |
> |                     |           --0.70%--set_state_bits
> |                     |
> |                     |--4.25%--alloc_extent_state
> |                     |          |
> |                     |           --3.86%--kmem_cache_alloc
> |                     |
> |                     |--2.77%--_raw_spin_lock
> |                     |          |
> |                     |           --1.23%--preempt_count_add
> |                     |
> |                     |--2.48%--rb_next
> |                     |
> |                     |--1.13%--_raw_spin_unlock
> |                     |          |
> |                     |           --0.55%--preempt_count_sub
> |                     |
> |                      --0.92%--set_state_bits
> |
>  --13.80%--__clear_extent_bit
>            |
>             --13.30%--clear_state_bit
>                       |
>                       |           --3.48%--_raw_spin_unlock_irqrestore
>                       |
>                       |--2.45%--merge_state.part.0
>                       |          |
>                       |           --1.57%--rb_next
>                       |
>                       |--2.14%--__slab_free
>                       |          |
>                       |           --1.26%--cmpxchg_double_slab.constprop.0.isra.0
>                       |
>                       |--0.74%--free_extent_state
>                       |
>                       |--0.70%--kmem_cache_free
>                       |
>                       |--0.69%--btrfs_clear_delalloc_extent
>                       |
>                        --0.52%--rb_next
>
>
>
> Thanks!
> --
> Dominique

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-09-21  9:01 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-04 16:30 fiemap is slow on btrfs on files with multiple extents Pavel Tikhomirov
2022-08-04 18:49 ` Josef Bacik
2022-08-05  4:52   ` Wang Yugui
2022-08-05  7:38 ` Dominique MARTINET
2022-08-05  9:54   ` Filipe Manana
2022-09-01 13:25     ` Filipe Manana
2022-09-01 15:06       ` Pavel Tikhomirov
2022-09-21  7:30       ` Dominique MARTINET
2022-09-21  9:00         ` Filipe Manana

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.