* [PATCH] btrfs: cache reported zone during mount
@ 2021-11-11  5:14 Naohiro Aota
  2021-11-11 12:51 ` David Sterba
  2021-11-11 12:55 ` David Sterba
  0 siblings, 2 replies; 5+ messages in thread
From: Naohiro Aota @ 2021-11-11  5:14 UTC (permalink / raw)
  To: linux-btrfs; +Cc: David Sterba, Naohiro Aota

When mounting a device, we report the zones twice: once for checking
the zone attributes in btrfs_get_dev_zone_info() and once for loading
the block groups' zone info in btrfs_load_block_group_zone_info(). With
a lot of block groups, that leads to a lot of REPORT ZONE commands and
slows down the mount process.

This patch introduces a zone info cache in struct
btrfs_zoned_device_info. The cache is populated in
btrfs_get_dev_zone_info() and used by
btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE
commands. The zone cache is then released after loading the block
groups, as it will not be very effective at run time.

Benchmark: Mount an HDD with 57,007 block groups
Before patch: 171.368 seconds
After patch: 64.064 seconds

While it still takes about a minute due to the slowness of loading all
the block groups, the patch reduces the mount time to roughly 1/3 of
what it was.

Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/
Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices")
CC: stable@vger.kernel.org
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/dev-replace.c |  2 +-
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/volumes.c     |  2 +-
 fs/btrfs/zoned.c       | 78 +++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/zoned.h       |  8 +++--
 5 files changed, 84 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index a39987e020e3..1c91f2203da4 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -323,7 +323,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 	device->fs_devices = fs_info->fs_devices;
 
-	ret = btrfs_get_dev_zone_info(device);
+	ret = btrfs_get_dev_zone_info(device, false);
 	if (ret)
 		goto error;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 847aabb30676..369f84ff6bd3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3563,6 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_sysfs;
 	}
 
+	btrfs_free_zone_cache(fs_info);
+
 	if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
 	    !btrfs_check_rw_degradable(fs_info, NULL)) {
 		btrfs_warn(fs_info,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 45c91a2f172c..dd1cbbb73ef0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2667,7 +2667,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	device->fs_info = fs_info;
 	device->bdev = bdev;
 
-	ret = btrfs_get_dev_zone_info(device);
+	ret = btrfs_get_dev_zone_info(device, false);
 	if (ret)
 		goto error_free_device;
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 67d932d70798..2300d9eff69a 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -213,6 +213,9 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos,
 static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 			       struct blk_zone *zones, unsigned int *nr_zones)
 {
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	struct blk_zone *zone_info;
+	u32 zno;
 	int ret;
 
 	if (!*nr_zones)
@@ -224,6 +227,32 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 		return 0;
 	}
 
+	if (zinfo->zone_cache) {
+		/* Check cache */
+		unsigned int i;
+
+		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+		zno = pos >> zinfo->zone_size_shift;
+		/*
+	 * We cannot report zones beyond the zone end. So, it
+	 * is OK to cap *nr_zones at the zone end.
+		 */
+		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
+
+		for (i = 0; i < *nr_zones; i++) {
+			zone_info = &zinfo->zone_cache[zno + i];
+			if (!zone_info->len)
+				break;
+		}
+
+		if (i == *nr_zones) {
+			/* Cache hit on all the zones */
+			memcpy(zones, zinfo->zone_cache + zno,
+			       sizeof(*zinfo->zone_cache) * *nr_zones);
+			return 0;
+		}
+	}
+
 	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
 				  copy_zone_info_cb, zones);
 	if (ret < 0) {
@@ -237,6 +266,11 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 	if (!ret)
 		return -EIO;
 
+	/* Populate cache */
+	if (zinfo->zone_cache)
+		memcpy(zinfo->zone_cache + zno, zones,
+		       sizeof(*zinfo->zone_cache) * *nr_zones);
+
 	return 0;
 }
 
@@ -300,7 +334,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 		if (!device->bdev)
 			continue;
 
-		ret = btrfs_get_dev_zone_info(device);
+		ret = btrfs_get_dev_zone_info(device, true);
 		if (ret)
 			break;
 	}
@@ -309,7 +343,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
-int btrfs_get_dev_zone_info(struct btrfs_device *device)
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	struct btrfs_zoned_device_info *zone_info = NULL;
@@ -407,6 +441,25 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 		goto out;
 	}
 
+	/*
+	 * Enable zone cache only for a zoned device. On a non-zoned
+	 * device, we fill the zone info with emulated CONVENTIONAL
+	 * zones, so no need to use the cache.
+	 */
+	if (populate_cache && bdev_is_zoned(device->bdev)) {
+		zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
+						zone_info->nr_zones);
+		if (!zone_info->zone_cache) {
+			btrfs_err_in_rcu(device->fs_info,
+					 "zoned: failed to allocate zone cache for %s",
+					 rcu_str_deref(device->name));
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	device->zone_info = zone_info;
+
 	/* Get zones type */
 	nactive = 0;
 	while (sector < nr_sectors) {
@@ -505,8 +558,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 
 	kfree(zones);
 
-	device->zone_info = zone_info;
-
 	switch (bdev_zoned_model(bdev)) {
 	case BLK_ZONED_HM:
 		model = "host-managed zoned";
@@ -542,6 +593,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 	bitmap_free(zone_info->active_zones);
 	bitmap_free(zone_info->empty_zones);
 	bitmap_free(zone_info->seq_zones);
+	vfree(zone_info->zone_cache);
 	kfree(zone_info);
 	device->zone_info = NULL;
 
@@ -1973,3 +2025,21 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
 		fs_info->data_reloc_bg = 0;
 	spin_unlock(&fs_info->relocation_bg_lock);
 }
+
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+
+	if (!btrfs_is_zoned(fs_info))
+		return;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		if (device->zone_info) {
+			vfree(device->zone_info->zone_cache);
+			device->zone_info->zone_cache = NULL;
+		}
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index e53ab7b96437..4344f4818389 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -28,6 +28,7 @@ struct btrfs_zoned_device_info {
 	unsigned long *seq_zones;
 	unsigned long *empty_zones;
 	unsigned long *active_zones;
+	struct blk_zone *zone_cache;
 	struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
 };
 
@@ -35,7 +36,7 @@ struct btrfs_zoned_device_info {
 int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 		       struct blk_zone *zone);
 int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
-int btrfs_get_dev_zone_info(struct btrfs_device *device);
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
 void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
 int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
@@ -76,6 +77,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
 void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
 			     u64 length);
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 				     struct blk_zone *zone)
@@ -88,7 +90,8 @@ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_i
 	return 0;
 }
 
-static inline int btrfs_get_dev_zone_info(struct btrfs_device *device)
+static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
+					  bool populate_cache)
 {
 	return 0;
 }
@@ -232,6 +235,7 @@ static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
 
 static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
 
+static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
-- 
2.33.1



* Re: [PATCH] btrfs: cache reported zone during mount
  2021-11-11  5:14 [PATCH] btrfs: cache reported zone during mount Naohiro Aota
@ 2021-11-11 12:51 ` David Sterba
  2021-11-12  7:55   ` Naohiro Aota
  2021-11-11 12:55 ` David Sterba
  1 sibling, 1 reply; 5+ messages in thread
From: David Sterba @ 2021-11-11 12:51 UTC (permalink / raw)
  To: Naohiro Aota; +Cc: linux-btrfs, David Sterba

On Thu, Nov 11, 2021 at 02:14:38PM +0900, Naohiro Aota wrote:
> This patch introduces a zone info cache in struct
> btrfs_zoned_device_info. The cache is populated while in
> btrfs_get_dev_zone_info() and used for
> btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE
> commands. The zone cache is then released after loading the block
> groups, as it will not be much effective during the run time.
> 
> Benchmark: Mount an HDD with 57,007 block groups
> Before patch: 171.368 seconds
> After patch: 64.064 seconds
> 
> While it still takes a minute due to the slowness of loading all the
> block groups, the patch reduces the mount time by 1/3.

That's a good improvement.

> Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/
> Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices")
> CC: stable@vger.kernel.org
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
> +	/*
> +	 * Enable zone cache only for a zoned device. On a non-zoned
> +	 * device, we fill the zone info with emulated CONVENTIONAL
> +	 * zones, so no need to use the cache.
> +	 */
> +	if (populate_cache && bdev_is_zoned(device->bdev)) {
> +		zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
> +						zone_info->nr_zones);

So the zone cache is a huge array of struct blk_zone. In the example
device with 57k zones and sizeof(blk_zone) = 64, it's about 3.5M. As the
cache lifetime is relatively short I think it's acceptable to do the
virtual allocation.
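
Spelling that out, with the ~57k zones of the benchmark device taken as
an estimate:

	57,000 zones * 64 bytes = 3,648,000 bytes, i.e. about 3.5 MiB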

This is the simplest way. What got me thinking a bit is whether we need
to cache the entire blk_zone.

struct blk_zone {
	__u64	start;		/* Zone start sector */
	__u64	len;		/* Zone length in number of sectors */
	__u64	wp;		/* Zone write pointer position */
	__u8	type;		/* Zone type */
	__u8	cond;		/* Zone condition */
	__u8	non_seq;	/* Non-sequential write resources active */
	__u8	reset;		/* Reset write pointer recommended */
	__u8	resv[4];
	__u64	capacity;	/* Zone capacity in number of sectors */
	__u8	reserved[24];
};

The reserved fields add up to 28 bytes, and some other state information
may not be necessary at load time. So could we have a cached_blk_zone
structure in the array, with only the interesting blk_zone members
copied?

This could reduce the array size, eg. if the cached_blk_zone is
something like:

struct cached_blk_zone {
	__u64	start;		/* Zone start sector */
	__u64	len;		/* Zone length in number of sectors */
	__u64	wp;		/* Zone write pointer position */
	__u8	type;		/* Zone type */
	__u8	cond;		/* Zone condition */
	__u64	capacity;	/* Zone capacity in number of sectors */
};

There's still some padding needed between the u8 and u64 members, so we
may not be able to do better than the space of 5 * u64, which is 40
bytes. This would result in a cached array of about 2.2M.
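
Spelled out (again taking the ~57k zones as an estimate):

	3 * 8 (start/len/wp) + 8 (type/cond + padding) + 8 (capacity) = 40 bytes
	57,000 zones * 40 bytes = 2,280,000 bytes, i.e. about 2.2 MiB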

This may not look like much, but kernel memory always comes at a cost,
and if there's more than one device the allocated memory grows. Using
virtual memory avoids the problem of fragmented memory, though I think
we'd have to use it anyway if we want a contiguous array and not some
dynamic structure.

For a first implementation the array of blk_zone is fine, but please
also give it some thought to see if you can spot something that could
be made more efficient.


* Re: [PATCH] btrfs: cache reported zone during mount
  2021-11-11  5:14 [PATCH] btrfs: cache reported zone during mount Naohiro Aota
  2021-11-11 12:51 ` David Sterba
@ 2021-11-11 12:55 ` David Sterba
  2021-11-12  8:09   ` Naohiro Aota
  1 sibling, 1 reply; 5+ messages in thread
From: David Sterba @ 2021-11-11 12:55 UTC (permalink / raw)
  To: Naohiro Aota; +Cc: linux-btrfs, David Sterba

On Thu, Nov 11, 2021 at 02:14:38PM +0900, Naohiro Aota wrote:
> When mounting a device, we are reporting the zones twice: once for
> checking the zone attributes in btrfs_get_dev_zone_info and once for
> loading block groups' zone info in
> btrfs_load_block_group_zone_info(). With a lot of block groups, that
> leads to a lot of REPORT ZONE commands and slows down the mount
> process.
> 
> This patch introduces a zone info cache in struct
> btrfs_zoned_device_info. The cache is populated while in
> btrfs_get_dev_zone_info() and used for
> btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE
> commands. The zone cache is then released after loading the block
> groups, as it will not be much effective during the run time.
> 
> Benchmark: Mount an HDD with 57,007 block groups
> Before patch: 171.368 seconds
> After patch: 64.064 seconds
> 
> While it still takes a minute due to the slowness of loading all the
> block groups, the patch reduces the mount time by 1/3.
> 
> Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/
> Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices")
> CC: stable@vger.kernel.org
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>  fs/btrfs/dev-replace.c |  2 +-
>  fs/btrfs/disk-io.c     |  2 ++
>  fs/btrfs/volumes.c     |  2 +-
>  fs/btrfs/zoned.c       | 78 +++++++++++++++++++++++++++++++++++++++---
>  fs/btrfs/zoned.h       |  8 +++--
>  5 files changed, 84 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index a39987e020e3..1c91f2203da4 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -323,7 +323,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>  	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
>  	device->fs_devices = fs_info->fs_devices;
>  
> -	ret = btrfs_get_dev_zone_info(device);
> +	ret = btrfs_get_dev_zone_info(device, false);
>  	if (ret)
>  		goto error;
>  
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 847aabb30676..369f84ff6bd3 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3563,6 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
>  		goto fail_sysfs;
>  	}
>  
> +	btrfs_free_zone_cache(fs_info);
> +
>  	if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
>  	    !btrfs_check_rw_degradable(fs_info, NULL)) {
>  		btrfs_warn(fs_info,
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 45c91a2f172c..dd1cbbb73ef0 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -2667,7 +2667,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
>  	device->fs_info = fs_info;
>  	device->bdev = bdev;
>  
> -	ret = btrfs_get_dev_zone_info(device);
> +	ret = btrfs_get_dev_zone_info(device, false);
>  	if (ret)
>  		goto error_free_device;
>  
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index 67d932d70798..2300d9eff69a 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -213,6 +213,9 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos,
>  static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
>  			       struct blk_zone *zones, unsigned int *nr_zones)
>  {
> +	struct btrfs_zoned_device_info *zinfo = device->zone_info;
> +	struct blk_zone *zone_info;

Variables should be declared in the innermost scope in which they're
used, so zone_info goes in the for loop

> +	u32 zno;
>  	int ret;
>  
>  	if (!*nr_zones)
> @@ -224,6 +227,32 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
>  		return 0;
>  	}
>  
> +	if (zinfo->zone_cache) {
> +		/* Check cache */
> +		unsigned int i;

and u32 zno should be declared there as well.

> +
> +		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
> +		zno = pos >> zinfo->zone_size_shift;
> +		/*
> +		 * We cannot report zones beyond the zone end. So, it
> +		 * is OK to cap *nr_zones to at the end.
> +		 */
> +		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
> +
> +		for (i = 0; i < *nr_zones; i++) {
> +			zone_info = &zinfo->zone_cache[zno + i];

Creating a temporary variable to capture some unwieldy array expression
is fine and preferred. When the variable is not declared here, it looks
like it could be needed elsewhere, so it may take some scrolling around
to make sure that's not the case.

Both fixed in the committed version.
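
Roughly, the idea is something like this (just a sketch, the committed
hunk may differ in the details):

	if (zinfo->zone_cache) {
		unsigned int i;
		u32 zno;

		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
		zno = pos >> zinfo->zone_size_shift;
		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);

		for (i = 0; i < *nr_zones; i++) {
			struct blk_zone *zone_info = &zinfo->zone_cache[zno + i];

			if (!zone_info->len)
				break;
		}

		if (i == *nr_zones) {
			/* Cache hit on all the zones */
			memcpy(zones, zinfo->zone_cache + zno,
			       sizeof(*zinfo->zone_cache) * *nr_zones);
			return 0;
		}
	}

(the later "populate cache" memcpy would then derive the zone number in
its own scope)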

> +			if (!zone_info->len)
> +				break;
> +		}
> +
> +		if (i == *nr_zones) {
> +			/* Cache hit on all the zones */
> +			memcpy(zones, zinfo->zone_cache + zno,
> +			       sizeof(*zinfo->zone_cache) * *nr_zones);
> +			return 0;
> +		}
> +	}
> +
>  	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
>  				  copy_zone_info_cb, zones);
>  	if (ret < 0) {
> +}


* Re: [PATCH] btrfs: cache reported zone during mount
  2021-11-11 12:51 ` David Sterba
@ 2021-11-12  7:55   ` Naohiro Aota
  0 siblings, 0 replies; 5+ messages in thread
From: Naohiro Aota @ 2021-11-12  7:55 UTC (permalink / raw)
  To: dsterba, linux-btrfs, David Sterba

On Thu, Nov 11, 2021 at 01:51:00PM +0100, David Sterba wrote:
> On Thu, Nov 11, 2021 at 02:14:38PM +0900, Naohiro Aota wrote:
> > This patch introduces a zone info cache in struct
> > btrfs_zoned_device_info. The cache is populated while in
> > btrfs_get_dev_zone_info() and used for
> > btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE
> > commands. The zone cache is then released after loading the block
> > groups, as it will not be much effective during the run time.
> > 
> > Benchmark: Mount an HDD with 57,007 block groups
> > Before patch: 171.368 seconds
> > After patch: 64.064 seconds
> > 
> > While it still takes a minute due to the slowness of loading all the
> > block groups, the patch reduces the mount time by 1/3.
> 
> That's a good improvement.
> 
> > Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/
> > Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices")
> > CC: stable@vger.kernel.org
> > Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> > ---
> > +	/*
> > +	 * Enable zone cache only for a zoned device. On a non-zoned
> > +	 * device, we fill the zone info with emulated CONVENTIONAL
> > +	 * zones, so no need to use the cache.
> > +	 */
> > +	if (populate_cache && bdev_is_zoned(device->bdev)) {
> > +		zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
> > +						zone_info->nr_zones);
> 
> So the zone cache is a huge array of struct blk_zone. In the example
> device with 57k zones and sizeof(blk_zone) = 64, it's about 3.5M. As the
> cache lifetime is relatively short I think it's acceptable to do the
> virtual allocation.
>
> This is the simplest way. What got me thinking a bit is if we need to
> cache entire blk_zone.
> 
> struct blk_zone {
> 	__u64	start;		/* Zone start sector */
> 	__u64	len;		/* Zone length in number of sectors */
> 	__u64	wp;		/* Zone write pointer position */
> 	__u8	type;		/* Zone type */
> 	__u8	cond;		/* Zone condition */
> 	__u8	non_seq;	/* Non-sequential write resources active */
> 	__u8	reset;		/* Reset write pointer recommended */
> 	__u8	resv[4];
> 	__u64	capacity;	/* Zone capacity in number of sectors */
> 	__u8	reserved[24];
> };
> 
> Reserved is 28 bytes, and some other state information may not be
> necessary at the load time. So could we have a cached_blk_zone structure
> in the array, with only the interesting blk_zone members copied?

Yes. I'm thinking about that improvement.

> This could reduce the array size, eg. if the cached_blk_zone is
> something like:
> 
> struct cached_blk_zone {
> 	__u64	start;		/* Zone start sector */
> 	__u64	len;		/* Zone length in number of sectors */
> 	__u64	wp;		/* Zone write pointer position */
> 	__u8	type;		/* Zone type */
> 	__u8	cond;		/* Zone condition */
> 	__u64	capacity;	/* Zone capacity in number of sectors */
> };
> 
> There's still some padding needed between u8 and u64, so we may not be
> able to do better than space of 5*u64, which is 40 bytes. This would
> result in cached array of about 2.2M.

We might even drop some other fields: "start" can be inferred from
the array index, "len" should be the zone size, and "type" and "cond"
are already encoded in btrfs_zoned_device_info->seq_zones and
->empty_zones. So we only need "wp" and "capacity". Then the cache
array will become less than 1 MB.
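
A minimal sketch of what such a reduced entry could look like (just an
idea at this point, not part of the posted patch, and the struct name
is made up):

	/*
	 * Hypothetical reduced cache entry: keep only the per-zone state
	 * that cannot be derived from the zone number, the zone_size, or
	 * the seq_zones/empty_zones bitmaps.
	 */
	struct btrfs_zone_wp_cache {
		u64 wp;		/* Zone write pointer position (sectors) */
		u64 capacity;	/* Zone capacity (sectors) */
	};

	/* 57,000 zones * 16 bytes = 912,000 bytes, i.e. under 1 MiB */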

> This may not look much but kernel memory always comes at a cost and if
> there's more than one device needed the allocated memory grows.
> That it's virtual memory avoids the problem with fragmented memory,
> though I think we'd have to use it anyway in case we want to use a
> contiguous array and not some dynamic structure.
> 
> For first implementation the array of blk_zone is fine, but please give
> it a thought too if you could spot something that can be made more
> effective.

Sure. I'll write the improvement patch.


* Re: [PATCH] btrfs: cache reported zone during mount
  2021-11-11 12:55 ` David Sterba
@ 2021-11-12  8:09   ` Naohiro Aota
  0 siblings, 0 replies; 5+ messages in thread
From: Naohiro Aota @ 2021-11-12  8:09 UTC (permalink / raw)
  To: dsterba, linux-btrfs, David Sterba

On Thu, Nov 11, 2021 at 01:55:14PM +0100, David Sterba wrote:
> On Thu, Nov 11, 2021 at 02:14:38PM +0900, Naohiro Aota wrote:
> > When mounting a device, we are reporting the zones twice: once for
> > checking the zone attributes in btrfs_get_dev_zone_info and once for
> > loading block groups' zone info in
> > btrfs_load_block_group_zone_info(). With a lot of block groups, that
> > leads to a lot of REPORT ZONE commands and slows down the mount
> > process.
> > 
> > This patch introduces a zone info cache in struct
> > btrfs_zoned_device_info. The cache is populated while in
> > btrfs_get_dev_zone_info() and used for
> > btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE
> > commands. The zone cache is then released after loading the block
> > groups, as it will not be much effective during the run time.
> > 
> > Benchmark: Mount an HDD with 57,007 block groups
> > Before patch: 171.368 seconds
> > After patch: 64.064 seconds
> > 
> > While it still takes a minute due to the slowness of loading all the
> > block groups, the patch reduces the mount time by 1/3.
> > 
> > Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/
> > Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices")
> > CC: stable@vger.kernel.org
> > Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> > ---
> >  fs/btrfs/dev-replace.c |  2 +-
> >  fs/btrfs/disk-io.c     |  2 ++
> >  fs/btrfs/volumes.c     |  2 +-
> >  fs/btrfs/zoned.c       | 78 +++++++++++++++++++++++++++++++++++++++---
> >  fs/btrfs/zoned.h       |  8 +++--
> >  5 files changed, 84 insertions(+), 8 deletions(-)
> > 
> > diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> > index a39987e020e3..1c91f2203da4 100644
> > --- a/fs/btrfs/dev-replace.c
> > +++ b/fs/btrfs/dev-replace.c
> > @@ -323,7 +323,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
> >  	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
> >  	device->fs_devices = fs_info->fs_devices;
> >  
> > -	ret = btrfs_get_dev_zone_info(device);
> > +	ret = btrfs_get_dev_zone_info(device, false);
> >  	if (ret)
> >  		goto error;
> >  
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 847aabb30676..369f84ff6bd3 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -3563,6 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
> >  		goto fail_sysfs;
> >  	}
> >  
> > +	btrfs_free_zone_cache(fs_info);
> > +
> >  	if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
> >  	    !btrfs_check_rw_degradable(fs_info, NULL)) {
> >  		btrfs_warn(fs_info,
> > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> > index 45c91a2f172c..dd1cbbb73ef0 100644
> > --- a/fs/btrfs/volumes.c
> > +++ b/fs/btrfs/volumes.c
> > @@ -2667,7 +2667,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
> >  	device->fs_info = fs_info;
> >  	device->bdev = bdev;
> >  
> > -	ret = btrfs_get_dev_zone_info(device);
> > +	ret = btrfs_get_dev_zone_info(device, false);
> >  	if (ret)
> >  		goto error_free_device;
> >  
> > diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> > index 67d932d70798..2300d9eff69a 100644
> > --- a/fs/btrfs/zoned.c
> > +++ b/fs/btrfs/zoned.c
> > @@ -213,6 +213,9 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos,
> >  static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
> >  			       struct blk_zone *zones, unsigned int *nr_zones)
> >  {
> > +	struct btrfs_zoned_device_info *zinfo = device->zone_info;
> > +	struct blk_zone *zone_info;
> 
> Variables should be declared in the inner-most scope they're used, so
> zone_info is in the for loop
> 
> > +	u32 zno;
> >  	int ret;
> >  
> >  	if (!*nr_zones)
> > @@ -224,6 +227,32 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
> >  		return 0;
> >  	}
> >  
> > +	if (zinfo->zone_cache) {
> > +		/* Check cache */
> > +		unsigned int i;
> 
> and u32 zno should be there.
> 
> > +
> > +		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
> > +		zno = pos >> zinfo->zone_size_shift;
> > +		/*
> > +		 * We cannot report zones beyond the zone end. So, it
> > +		 * is OK to cap *nr_zones to at the end.
> > +		 */
> > +		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
> > +
> > +		for (i = 0; i < *nr_zones; i++) {
> > +			zone_info = &zinfo->zone_cache[zno + i];
> 
> Creating a temporary variable to capture some weird array expresion is
> fine and preferred. When the variable is not declared here it looks like
> it could be needed elsewehre so it may take some scrolling around to
> make sure it's not so.
> 
> Both fixed in the committed version.

Thank you. I think I messed up the location while I was moving the
code around. I'll take care of it.

> > +			if (!zone_info->len)
> > +				break;
> > +		}
> > +
> > +		if (i == *nr_zones) {
> > +			/* Cache hit on all the zones */
> > +			memcpy(zones, zinfo->zone_cache + zno,
> > +			       sizeof(*zinfo->zone_cache) * *nr_zones);
> > +			return 0;
> > +		}
> > +	}
> > +
> >  	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
> >  				  copy_zone_info_cb, zones);
> >  	if (ret < 0) {
> > +}

