All of lore.kernel.org
 help / color / mirror / Atom feed
From: Anand Jain <anand.jain@oracle.com>
To: dsterba@suse.cz, linux-btrfs@vger.kernel.org
Subject: Re: [PATCH 1/2] btrfs: keep device type in the struct btrfs_device
Date: Sun, 30 Jan 2022 00:24:15 +0800	[thread overview]
Message-ID: <e412efb6-7ea2-cfd2-5c5e-cbe4fdde5c53@oracle.com> (raw)
In-Reply-To: <20220126165302.GC14046@twin.jikos.cz>



On 27/01/2022 00:53, David Sterba wrote:
> On Tue, Jan 18, 2022 at 11:18:01PM +0800, Anand Jain wrote:
>> Preparation to make data/metadata chunks allocations based on the device
>> types- keep the identified device type in the struct btrfs_device.
>>
>> This patch adds a member 'dev_type' to hold the defined device types in
>> the struct btrfs_devices.
>>
>> Also, add a helper function and a struct btrfs_fs_devices member
>> 'mixed_dev_type' to know if the filesystem contains the mixed device
>> types.
>>
>> Struct btrfs_device has an existing member 'type' that stages and writes
>> back to the on-disk format. This patch does not use it. As just an
>> in-memory only data will suffice the requirement here.
>>
>> Signed-off-by: Anand Jain <anand.jain@oracle.com>
>> ---
>>   fs/btrfs/dev-replace.c |  2 ++
>>   fs/btrfs/volumes.c     | 45 ++++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/volumes.h     | 26 +++++++++++++++++++++++-
>>   3 files changed, 72 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
>> index 71fd99b48283..3731c7d1c6b7 100644
>> --- a/fs/btrfs/dev-replace.c
>> +++ b/fs/btrfs/dev-replace.c
>> @@ -325,6 +325,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>>   	device->dev_stats_valid = 1;
>>   	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
>>   	device->fs_devices = fs_devices;
>> +	device->dev_type = btrfs_get_device_type(device);
>>   
>>   	ret = btrfs_get_dev_zone_info(device, false);
>>   	if (ret)
>> @@ -334,6 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>>   	list_add(&device->dev_list, &fs_devices->devices);
>>   	fs_devices->num_devices++;
>>   	fs_devices->open_devices++;
>> +	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
>>   	mutex_unlock(&fs_devices->device_list_mutex);
>>   
>>   	*device_out = device;
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 9d50e035e61d..da3d6d0f5bc3 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -1041,6 +1041,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
>>   			     device->generation > (*latest_dev)->generation)) {
>>   				*latest_dev = device;
>>   			}
>> +			device->dev_type = btrfs_get_device_type(device);
>>   			continue;
>>   		}
>>   
>> @@ -1084,6 +1085,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
>>   		__btrfs_free_extra_devids(seed_dev, &latest_dev);
>>   
>>   	fs_devices->latest_dev = latest_dev;
>> +	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
>>   
>>   	mutex_unlock(&uuid_mutex);
>>   }
>> @@ -2183,6 +2185,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
>>   
>>   	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
>>   	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
>> +
>> +	cur_devices->mixed_dev_types = btrfs_has_mixed_dev_types(cur_devices);
>> +
>>   	mutex_unlock(&fs_devices->device_list_mutex);
>>   
>>   	/*
>> @@ -2584,6 +2589,44 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
>>   	return ret;
>>   }
>>   
>> +bool btrfs_has_mixed_dev_types(struct btrfs_fs_devices *fs_devices)
>> +{
>> +	struct btrfs_device *device;
>> +	int type_rot = 0;
>> +	int type_nonrot = 0;
>> +
>> +	list_for_each_entry(device, &fs_devices->devices, dev_list) {
>> +
>> +		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>> +			continue;
>> +
>> +		switch (device->dev_type) {
>> +		case BTRFS_DEV_TYPE_ROT:
>> +			type_rot++;
>> +			break;
>> +		case BTRFS_DEV_TYPE_NONROT:
>> +		default:
>> +			type_nonrot++;
>> +		}
>> +	}
>> +
>> +	if (type_rot && type_nonrot)
>> +		return true;
>> +	else
>> +		return false;
>> +}
>> +
>> +enum btrfs_dev_types btrfs_get_device_type(struct btrfs_device *device)
>> +{
>> +	if (bdev_is_zoned(device->bdev))
>> +		return BTRFS_DEV_TYPE_ZONED;
>> +
>> +	if (blk_queue_nonrot(bdev_get_queue(device->bdev)))
>> +		return BTRFS_DEV_TYPE_NONROT;
>> +
>> +	return BTRFS_DEV_TYPE_ROT;
>> +}
>> +
>>   int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
>>   {
>>   	struct btrfs_root *root = fs_info->dev_root;
>> @@ -2675,6 +2718,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
>>   	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
>>   	device->mode = FMODE_EXCL;
>>   	device->dev_stats_valid = 1;
>> +	device->dev_type = btrfs_get_device_type(device);
>>   	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
>>   
>>   	if (seeding_dev) {
>> @@ -2710,6 +2754,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
>>   	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
>>   
>>   	fs_devices->rotating = !blk_queue_nonrot(bdev_get_queue(bdev));
>> +	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
>>   
>>   	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
>>   	btrfs_set_super_total_bytes(fs_info->super_copy,
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index 6a790b85edd8..5be31161601d 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -52,6 +52,16 @@ struct btrfs_io_geometry {
>>   #define BTRFS_DEV_STATE_FLUSH_SENT	(4)
>>   #define BTRFS_DEV_STATE_NO_READA	(5)
>>   
>> +/*
>> + * Device class types arranged by their IO latency from low to high.
>> + */
>> +enum btrfs_dev_types {
>> +	BTRFS_DEV_TYPE_MEM = 1,
>> +	BTRFS_DEV_TYPE_NONROT,
>> +	BTRFS_DEV_TYPE_ROT,
>> +	BTRFS_DEV_TYPE_ZONED,
> 
> I think this should be separate, in one list define all sensible device
> types and then add arrays sorted by latency, or other factors.
>
> The zoned devices are mostly HDD but ther are also SSD-like using ZNS,
> so that can't be both under BTRFS_DEV_TYPE_ZONED and behind
> BTRFS_DEV_TYPE_ROT as if it had worse latency.

I still do not have a complete idea of its implantation using an array. 
Do you mean something like as below,

enum btrfs_dev_types {
	::
};

struct btrfs_dev_type {
	enum btrfs_dev_types dev_type;
	int priority_latency;
};


I think we can identify a ZNS and set its relative latency to a value 
lower (better) than rotational.


> I'm not sure how much we should try to guess the device types, the ones
> you have so far are almost all we can get without peeking too much into
> the devices/queues internals.

Agree.

> Also here's the terminology question if we should still consider
> rotational device status as the main criteria, because then SSD, NVMe,
> RAM-backed are all non-rotational but with different latency
> characteristics.

Right. The expected performance also depends on the interconnect type
which is undetectable.

IMO we shouldn't worry about the non-rational's interconnect types
(M2/PCIe/NVMe/SATA/SAS) even though they have different performances.

Because some of these interconnects are compatible with each-other 
medium might change its interconnect during the lifecycle of the data.

Then left with are the types of mediums- rotational, non-rotational and
zoned which, we can identify safely.

Use-cases plan to mix these types of mediums to achieve the 
cost-per-byte advantage. As of now, I think our target can be to help these
kind of planned mixing.

>> +};
>> +
>>   struct btrfs_zoned_device_info;
>>   
>>   struct btrfs_device {
>> @@ -101,9 +111,16 @@ struct btrfs_device {
>>   
>>   	/* optimal io width for this device */
>>   	u32 io_width;
>> -	/* type and info about this device */
>> +
>> +	/* Type and info about this device, on-disk (currently unused) */
>>   	u64 type;
>>   
>> +	/*
>> +	 * Device type (in memory only) at some point, merge to the on-disk
>> +	 * member 'type' above.
>> +	 */
>> +	enum btrfs_dev_types dev_type;
> 
> So at some point this is planned to be user configurable? We should be
> always able to detect the device type at mount type so I wonder if this
> needs to be stored on disk.

I didn't find any planned configs (white papers) discussing the
advantages of mixing non-rotational drives with different interconnect
types. If any then, we may have to provide the manual configuration.
(If the mixing is across the medium types like non-rotational with
either zoned or HDD, then the users don't have to configure as we can
optimize the allocation automatically for performance.)

Also, hot cache device or device grouping (when implemented) need the
user to configure.

And so maybe part of in-memory 'dev_type' would be in-sync with on-disk
'type' at some point. IMO.

Thanks, Anand


  reply	other threads:[~2022-01-29 16:24 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-18 15:18 [PATCH 0/2] device type and create chunk Anand Jain
2022-01-18 15:18 ` [PATCH 1/2] btrfs: keep device type in the struct btrfs_device Anand Jain
2022-01-26 16:53   ` David Sterba
2022-01-29 16:24     ` Anand Jain [this message]
2022-02-01 17:06       ` David Sterba
2022-02-03 12:56         ` Anand Jain
2022-01-18 15:18 ` [PATCH 2/2] btrfs: create chunk device type aware Anand Jain
2022-01-26 17:01   ` David Sterba
2022-01-29 16:24     ` Anand Jain
2022-01-26 17:38   ` David Sterba
2022-01-29 16:46     ` Anand Jain
2022-01-30 22:15       ` Goffredo Baroncelli
2022-01-30 22:28     ` Goffredo Baroncelli

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=e412efb6-7ea2-cfd2-5c5e-cbe4fdde5c53@oracle.com \
    --to=anand.jain@oracle.com \
    --cc=dsterba@suse.cz \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.