All of lore.kernel.org
 help / color / mirror / Atom feed
From: Goffredo Baroncelli <kreijack@libero.it>
To: linux-btrfs@vger.kernel.org
Cc: Michael <mclaud@roznica.com.ua>, Hugo Mills <hugo@carfax.org.uk>,
	Martin Svec <martin.svec@zoner.cz>,
	Goffredo Baroncelli <kreijack@inwind.it>
Subject: [PATCH] btrfs: add ssd_metadata mode
Date: Sun,  5 Apr 2020 09:19:43 +0200	[thread overview]
Message-ID: <20200405071943.6902-2-kreijack@libero.it> (raw)
In-Reply-To: <20200405071943.6902-1-kreijack@libero.it>

From: Goffredo Baroncelli <kreijack@inwind.it>

When this mode is enabled, the allocation policy of the chunk
is so modified:
- allocation of metadata chunk: priority is given to ssd disk.
- allocation of data chunk: priority is given to a rotational disk.

When a striped profile is involved (like RAID0,5,6), the logic
is a bit more complex. If there are enough disks, the data profiles
are stored on the rotational disks only; instead the metadata profiles
are stored on the non rotational disk only.
If the disks are not enough, then the profiles is stored on all
the disks.

Example: assuming that sda, sdb, sdc are ssd disks, and sde, sdf are
rotational ones.
A data profile raid6, will be stored on sda, sdb, sdc, sde, sdf (sde
and sdf are not enough to host a raid5 profile).
A metadata profile raid6, will be stored on sda, sdb, sdc (these
are enough to host a raid6 profile).

To enable this mode pass -o ssd_metadata at mount time.

Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/super.c   |  8 +++++
 fs/btrfs/volumes.c | 89 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h |  1 +
 4 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 36df977b64d9..c2e8b1b6eae7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1236,6 +1236,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
 #define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
 #define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
 #define BTRFS_MOUNT_DISCARD_ASYNC	(1 << 29)
+#define BTRFS_MOUNT_SSD_METADATA	(1 << 29)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(2048)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67c63858812a..4ad14b0a57b3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -350,6 +350,7 @@ enum {
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	Opt_ref_verify,
 #endif
+	Opt_ssd_metadata,
 	Opt_err,
 };
 
@@ -421,6 +422,7 @@ static const match_table_t tokens = {
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	{Opt_ref_verify, "ref_verify"},
 #endif
+	{Opt_ssd_metadata, "ssd_metadata"},
 	{Opt_err, NULL},
 };
 
@@ -872,6 +874,10 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			btrfs_set_opt(info->mount_opt, REF_VERIFY);
 			break;
 #endif
+		case Opt_ssd_metadata:
+			btrfs_set_and_info(info, SSD_METADATA,
+					"enabling ssd_metadata");
+			break;
 		case Opt_err:
 			btrfs_info(info, "unrecognized mount option '%s'", p);
 			ret = -EINVAL;
@@ -1390,6 +1396,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 #endif
 	if (btrfs_test_opt(info, REF_VERIFY))
 		seq_puts(seq, ",ref_verify");
+	if (btrfs_test_opt(info, SSD_METADATA))
+		seq_puts(seq, ",ssd_metadata");
 	seq_printf(seq, ",subvolid=%llu",
 		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
 	seq_puts(seq, ",subvol=");
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9cfc668f91f4..ffb2bc912c43 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4761,6 +4761,58 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }
 
+/*
+ * sort the devices in descending order by rotational,
+ * max_avail, total_avail
+ */
+static int btrfs_cmp_device_info_metadata(const void *a, const void *b)
+{
+	const struct btrfs_device_info *di_a = a;
+	const struct btrfs_device_info *di_b = b;
+
+	/* metadata -> non rotational first */
+	if (!di_a->rotational && di_b->rotational)
+		return -1;
+	if (di_a->rotational && !di_b->rotational)
+		return 1;
+	if (di_a->max_avail > di_b->max_avail)
+		return -1;
+	if (di_a->max_avail < di_b->max_avail)
+		return 1;
+	if (di_a->total_avail > di_b->total_avail)
+		return -1;
+	if (di_a->total_avail < di_b->total_avail)
+		return 1;
+	return 0;
+}
+
+/*
+ * sort the devices in descending order by !rotational,
+ * max_avail, total_avail
+ */
+static int btrfs_cmp_device_info_data(const void *a, const void *b)
+{
+	const struct btrfs_device_info *di_a = a;
+	const struct btrfs_device_info *di_b = b;
+
+	/* data -> non rotational last */
+	if (!di_a->rotational && di_b->rotational)
+		return 1;
+	if (di_a->rotational && !di_b->rotational)
+		return -1;
+	if (di_a->max_avail > di_b->max_avail)
+		return -1;
+	if (di_a->max_avail < di_b->max_avail)
+		return 1;
+	if (di_a->total_avail > di_b->total_avail)
+		return -1;
+	if (di_a->total_avail < di_b->total_avail)
+		return 1;
+	return 0;
+}
+
+
+
 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 {
 	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
@@ -4808,6 +4860,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int i;
 	int j;
 	int index;
+	int nr_rotational;
 
 	BUG_ON(!alloc_profile_is_valid(type, 0));
 
@@ -4863,6 +4916,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 * about the available holes on each device.
 	 */
 	ndevs = 0;
+	nr_rotational = 0;
 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 		u64 max_avail;
 		u64 dev_offset;
@@ -4914,14 +4968,45 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
+		devices_info[ndevs].rotational = !test_bit(QUEUE_FLAG_NONROT,
+				&(bdev_get_queue(device->bdev)->queue_flags));
+		if (devices_info[ndevs].rotational)
+			nr_rotational++;
 		++ndevs;
 	}
 
+	BUG_ON(nr_rotational > ndevs);
 	/*
 	 * now sort the devices by hole size / available space
 	 */
-	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
-	     btrfs_cmp_device_info, NULL);
+	if (((type & BTRFS_BLOCK_GROUP_DATA) &&
+	     (type & BTRFS_BLOCK_GROUP_METADATA)) ||
+	    !btrfs_test_opt(info, SSD_METADATA)) {
+		/* mixed bg or SSD_METADATA not set */
+		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+			     btrfs_cmp_device_info, NULL);
+	} else {
+		/*
+		 * if SSD_METADATA is set, sort the device considering also the
+		 * kind (ssd or not). Limit the availables devices to the ones
+		 * of the same kind, to avoid that a striped profile like raid5
+		 * spread to all kind of devices (ssd and rotational).
+		 * It is allowed to use different kinds of devices if the ones
+		 * of the same kind are not enough alone.
+		 */
+		if (type & BTRFS_BLOCK_GROUP_DATA) {
+			sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+				     btrfs_cmp_device_info_data, NULL);
+			if (nr_rotational >= devs_min)
+				ndevs = nr_rotational;
+		} else {
+			int nr_norot = ndevs - nr_rotational;
+			sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+				     btrfs_cmp_device_info_metadata, NULL);
+			if (nr_norot >= devs_min)
+				ndevs = nr_norot;
+		}
+	}
 
 	/*
 	 * Round down to number of usable stripes, devs_increment can be any
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f01552a0785e..285d71d54a03 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -343,6 +343,7 @@ struct btrfs_device_info {
 	u64 dev_offset;
 	u64 max_avail;
 	u64 total_avail;
+	int rotational:1;
 };
 
 struct btrfs_raid_attr {
-- 
2.26.0


  reply	other threads:[~2020-04-05  7:19 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-04-05  7:19 [RFC][PATCH v2] btrfs: ssd_metadata: storing metadata on SSD Goffredo Baroncelli
2020-04-05  7:19 ` Goffredo Baroncelli [this message]
2020-04-05  8:22 ` Achim Gratz
2020-04-05 19:03   ` Goffredo Baroncelli
2020-04-06  0:57     ` Paul Jones
  -- strict thread matches above, loose matches on Subject: below --
2020-04-05  8:26 [RFC][PATCH V3] " Goffredo Baroncelli
2020-04-05  8:26 ` [PATCH] btrfs: add ssd_metadata mode Goffredo Baroncelli
2020-04-14  5:24   ` Paul Jones
2020-10-23  7:23   ` Wang Yugui
2020-10-23 10:11     ` Adam Borowski
2020-10-23 11:25       ` Qu Wenruo
2020-10-23 12:37         ` Wang Yugui
2020-10-23 12:45           ` Qu Wenruo
2020-10-23 13:10           ` Steven Davies
2020-10-23 13:49             ` Wang Yugui
2020-10-23 18:03           ` Goffredo Baroncelli
2020-10-24  3:26             ` Paul Jones
2020-04-01 20:03 [RFC] btrfs: ssd_metadata: storing metadata on SSD Goffredo Baroncelli
2020-04-01 20:03 ` [PATCH] btrfs: add ssd_metadata mode Goffredo Baroncelli
2020-04-01 20:53   ` Goffredo Baroncelli
2020-04-02  9:33   ` Steven Davies
2020-04-02 16:39     ` Goffredo Baroncelli
2020-04-03  8:43     ` Michael
2020-04-03 10:08       ` Steven Davies
2020-04-03 16:19       ` Goffredo Baroncelli
2020-04-03 16:28         ` Hugo Mills
2020-04-03 16:36           ` Hans van Kranenburg
2020-04-02 18:01   ` Martin Svec

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200405071943.6902-2-kreijack@libero.it \
    --to=kreijack@libero.it \
    --cc=hugo@carfax.org.uk \
    --cc=kreijack@inwind.it \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=martin.svec@zoner.cz \
    --cc=mclaud@roznica.com.ua \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.