From: Nikolay Borisov <nborisov@suse.com>
To: Qu Wenruo <wqu@suse.com>, linux-btrfs@vger.kernel.org
Cc: Josef Bacik <josef@toxicpanda.com>
Subject: Re: [PATCH v7 1/5] btrfs: Introduce per-profile available space facility
Date: Thu, 20 Feb 2020 14:49:34 +0200	[thread overview]
Message-ID: <329cf703-f3f2-4583-73bf-c90c9e001956@suse.com> (raw)
In-Reply-To: <20200211051153.19466-2-wqu@suse.com>

<snip>

> 
> Suggested-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/volumes.c | 216 ++++++++++++++++++++++++++++++++++++++++-----
>  fs/btrfs/volumes.h |  11 +++
>  2 files changed, 207 insertions(+), 20 deletions(-)
> 

<snip>

> +
> +/*
> + * Return 0 if we allocated any virtual(*) chunk, and restore the size to
> + * @allocated_size
> + * Return -ENOSPC if we have no more space to allocate virtual chunk
> + *
> + * *: virtual chunk is a space holder for per-profile available space
> + *    calculator.
> + *    Such virtual chunks won't take on-disk space, thus called virtual, and
> + *    only affects per-profile available space calulation.
> + */

Document that the last parameter is an output value which contains the
size of the allocated virtual chunk.
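
E.g. something along these lines in the comment block above (exact
wording is only a suggestion):

 * @allocated:	output parameter, set to the size of the virtual chunk
 *		that was allocated, in bytes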

> +static int alloc_virtual_chunk(struct btrfs_fs_info *fs_info,
> +			       struct btrfs_device_info *devices_info,
> +			       enum btrfs_raid_types type,
> +			       u64 *allocated)
> +{
> +	const struct btrfs_raid_attr *raid_attr = &btrfs_raid_array[type];
> +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> +	struct btrfs_device *device;
> +	u64 stripe_size;
> +	int i;
> +	int ndevs = 0;
> +
> +	lockdep_assert_held(&fs_info->chunk_mutex);
> +
> +	/* Go through devices to collect their unallocated space */
> +	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
> +		u64 avail;
> +		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
> +					&device->dev_state) ||
> +		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
> +			continue;
> +
> +		if (device->total_bytes > device->bytes_used +
> +				device->virtual_allocated)
> +			avail = device->total_bytes - device->bytes_used -
> +				device->virtual_allocated;
> +		else
> +			avail = 0;
> +
> +		/* And exclude the [0, 1M) reserved space */
> +		if (avail > SZ_1M)
> +			avail -= SZ_1M;
> +		else
> +			avail = 0;
> +
> +		if (avail < fs_info->sectorsize)
> +			continue;
> +		/*
> +		 * Unlike chunk allocator, we don't care about stripe or hole
> +		 * size, so here we use @avail directly
> +		 */
> +		devices_info[ndevs].dev_offset = 0;
> +		devices_info[ndevs].total_avail = avail;
> +		devices_info[ndevs].max_avail = avail;
> +		devices_info[ndevs].dev = device;
> +		++ndevs;
> +	}
> +	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
> +	     btrfs_cmp_device_info, NULL);
> +	ndevs -= ndevs % raid_attr->devs_increment;

nit: ndevs = rounddown(ndevs, raid_attr->devs_increment);
makes it clearer what's going on. Since you are working with at most an
int, this is not a problem on 32-bit either.


> +	if (ndevs < raid_attr->devs_min)
> +		return -ENOSPC;
> +	if (raid_attr->devs_max)
> +		ndevs = min(ndevs, (int)raid_attr->devs_max);
> +	else
> +		ndevs = min(ndevs, (int)BTRFS_MAX_DEVS(fs_info));

Instead of casting, simply use min_t(int, ndevs, BTRFS_MAX_DEVS...)
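
I.e. (untested):

	if (raid_attr->devs_max)
		ndevs = min_t(int, ndevs, raid_attr->devs_max);
	else
		ndevs = min_t(int, ndevs, BTRFS_MAX_DEVS(fs_info));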

> +
> +	/*
> +	 * Now allocate a virtual chunk using the unallocate space of the

nit: missing d after 'unallocate'

> +	 * device with the least unallocated space.
> +	 */
> +	stripe_size = round_down(devices_info[ndevs - 1].total_avail,
> +				 fs_info->sectorsize);
> +	if (stripe_size == 0)
> +		return -ENOSPC;

Isn't this check redundant? In the loop where you iterate the devices
you always ensure total_avail is at least a sector size, which
guarantees that stripe_size cannot be 0 at this point.
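
For reference, the check in the loop I mean:

	if (avail < fs_info->sectorsize)
		continue;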

> +
> +	for (i = 0; i < ndevs; i++)
> +		devices_info[i].dev->virtual_allocated += stripe_size;
> +	*allocated = stripe_size * (ndevs - raid_attr->nparity) /
> +		     raid_attr->ncopies;
> +	return 0;
> +}
> +
> +static int calc_one_profile_avail(struct btrfs_fs_info *fs_info,
> +				  enum btrfs_raid_types type,
> +				  u64 *result_ret)
> +{
> +	struct btrfs_device_info *devices_info = NULL;
> +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> +	struct btrfs_device *device;
> +	u64 allocated;
> +	u64 result = 0;
> +	int ret = 0;
> +

Add a lockdep assertion that chunk_mutex is held, since you access
alloc_list here.
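
I.e. the same assertion alloc_virtual_chunk() already has:

	lockdep_assert_held(&fs_info->chunk_mutex);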

> +	ASSERT(type >= 0 && type < BTRFS_NR_RAID_TYPES);
> +
> +	/* Not enough devices, quick exit, just update the result */
> +	if (fs_devices->rw_devices < btrfs_raid_array[type].devs_min)
> +		goto out;

You can directly return.
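
E.g. (untested, keeping the current semantics of the 'out' label for
this path):

	if (fs_devices->rw_devices < btrfs_raid_array[type].devs_min) {
		*result_ret = 0;
		return 0;
	}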

> +
> +	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
> +			       GFP_NOFS);
> +	if (!devices_info) {
> +		ret = -ENOMEM;
> +		goto out;

Same here.

> +	}
> +	/* Clear virtual chunk used space for each device */
> +	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list)
> +		device->virtual_allocated = 0;
> +	while (ret == 0) {
> +		ret = alloc_virtual_chunk(fs_info, devices_info, type,
> +					  &allocated);

The 'allocated' variable is used only in this loop, so declare it inside
the loop (see the snippet after the quoted lines below). Ideally we want
variables to have the shortest possible lifetime.

> +		if (ret == 0)
> +			result += allocated;
> +	}
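
For the 'allocated' scoping mentioned above, e.g. (untested):

	while (ret == 0) {
		u64 allocated;

		ret = alloc_virtual_chunk(fs_info, devices_info, type,
					  &allocated);
		if (ret == 0)
			result += allocated;
	}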

Why do you need to call this in a loop? Calling alloc_virtual_chunk()
once should be enough to calculate the available space for the given
raid type.

> +	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list)
> +		device->virtual_allocated = 0;
> +out:
> +	kfree(devices_info);
> +	if (ret < 0 && ret != -ENOSPC)
> +		return ret;
> +	*result_ret = result;
> +	return 0;
> +}

<snip>

> @@ -259,6 +266,10 @@ struct btrfs_fs_devices {
>  	struct kobject fsid_kobj;
>  	struct kobject *devices_kobj;
>  	struct completion kobj_unregister;
> +
> +	/* Records per-type available space */
> +	spinlock_t per_profile_lock;
> +	u64 per_profile_avail[BTRFS_NR_RAID_TYPES];

Since every avail quantity is independent of any other, can you turn
this into an array of atomic64 values and get rid of the spinlock? My
head spins at how many locks we have in btrfs.
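
Roughly (untested, just to sketch the idea; 'avail' stands for whatever
value the calculator produced):

	/* in struct btrfs_fs_devices */
	atomic64_t per_profile_avail[BTRFS_NR_RAID_TYPES];

	/* update side */
	atomic64_set(&fs_devices->per_profile_avail[type], avail);

	/* read side */
	avail = atomic64_read(&fs_devices->per_profile_avail[type]);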

>  };
>  
>  #define BTRFS_BIO_INLINE_CSUM_SIZE	64
> 

Thread overview: 11+ messages
2020-02-11  5:11 [PATCH v7 0/5] Introduce per-profile available space array to avoid over-confident can_overcommit() Qu Wenruo
2020-02-11  5:11 ` [PATCH v7 1/5] btrfs: Introduce per-profile available space facility Qu Wenruo
2020-02-13 17:12   ` kbuild test robot
2020-02-20 12:49   ` Nikolay Borisov [this message]
2020-02-20 12:59     ` Qu Wenruo
2020-02-20 13:19       ` Nikolay Borisov
2020-02-21  5:16     ` Qu Wenruo
2020-02-11  5:11 ` [PATCH v7 2/5] btrfs: space-info: Use per-profile available space in can_overcommit() Qu Wenruo
2020-02-11  5:11 ` [PATCH v7 3/5] btrfs: statfs: Use pre-calculated per-profile available space Qu Wenruo
2020-02-11  5:11 ` [PATCH v7 4/5] btrfs: Reset device size when btrfs_update_device() failed in btrfs_grow_device() Qu Wenruo
2020-02-11  5:11 ` [PATCH v7 5/5] btrfs: volumes: Revert device used bytes when calc_per_profile_avail() failed Qu Wenruo
