All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yang Yang <yang.yang@vivo.com>
To: Alasdair Kergon <agk@redhat.com>,
	Mike Snitzer <snitzer@kernel.org>,
	Mikulas Patocka <mpatocka@redhat.com>,
	dm-devel@lists.linux.dev
Cc: Yang Yang <yang.yang@vivo.com>
Subject: [PATCH] dm: Avoid sending redundant empty flush bios to the same block device
Date: Wed, 17 Apr 2024 16:03:31 +0800	[thread overview]
Message-ID: <20240417080332.9288-1-yang.yang@vivo.com> (raw)

__send_empty_flush() sends empty flush bios to every target in the
dm_table. However, if the num_targets exceeds the number of block
devices in the dm_table's device list, it could lead to multiple
invocations of __send_duplicate_bios() for the same block device.
Typically, a single thread sending numerous empty flush bios to one
block device is redundant, as these bios are likely to be merged by the
flush state machine. In scenarios where num_targets significantly
outweighs the number of block devices, such behavior may result in a
noteworthy decrease in performance.

This issue can be reproduced using this command line:
  for i in {0..1023}; do
    echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
  done | dmsetup create example

With this fix, a random write with fsync workload executed with the
following fio command:

  fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
      --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
      --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1

results in an increase from 857 KB/s to 30.8 MB/s of the write
throughput (3580% increase).

Signed-off-by: Yang Yang <yang.yang@vivo.com>
---
 drivers/md/dm-core.h          |  1 +
 drivers/md/dm-table.c         |  7 ++++
 drivers/md/dm.c               | 60 +++++++++++++++++++++++++++++++++++
 include/linux/device-mapper.h |  6 ++++
 4 files changed, 74 insertions(+)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index e6757a30dcca..7e3f2168289f 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -217,6 +217,7 @@ struct dm_table {
 	/* a list of devices used by this table */
 	struct list_head devices;
 	struct rw_semaphore devices_lock;
+	unsigned short num_devices;
 
 	/* events get handed up using this callback */
 	void (*event_fn)(void *data);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5a..ddc60e498afb 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
 
 int dm_table_resume_targets(struct dm_table *t)
 {
+	struct list_head *devices = dm_table_get_devices(t);
+	struct dm_dev_internal *dd;
 	unsigned int i;
 	int r = 0;
 
@@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
 			ti->type->resume(ti);
 	}
 
+	t->num_devices = 0;
+
+	list_for_each_entry(dd, devices, list)
+		dd->dm_dev->index = ++(t->num_devices);
+
 	return 0;
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56aa2a8b9d71..e46bafa13601 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -48,6 +48,8 @@
  */
 #define REQ_DM_POLL_LIST	REQ_DRV
 
+#define DM_MAX_TABLE_DEVICES	1024
+
 static const char *_name = DM_NAME;
 
 static unsigned int major;
@@ -1543,10 +1545,39 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
 	return ret;
 }
 
+static inline bool has_redundant_flush(struct dm_table *t,
+		unsigned long **bitmap)
+{
+	/* Add a limit here to prevent excessive memory usage for bitmaps */
+	if (t->num_devices >= DM_MAX_TABLE_DEVICES)
+		return false;
+
+	if (t->num_devices < t->num_targets) {
+		/* dm_dev's index starts from 1, so need plus 1 here */
+		*bitmap = bitmap_zalloc(min(t->num_devices + 1, DM_MAX_TABLE_DEVICES),
+						GFP_KERNEL);
+		if (*bitmap)
+			return true;
+	}
+
+	return false;
+}
+
+static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
+				     sector_t start, sector_t len, void *data)
+{
+	unsigned short *index = data;
+	*index = dev->index;
+	return 0;
+}
+
 static void __send_empty_flush(struct clone_info *ci)
 {
 	struct dm_table *t = ci->map;
 	struct bio flush_bio;
+	unsigned long *handled_map;
+	unsigned int nr_handled = 0;
+	bool check = has_redundant_flush(t, &handled_map);
 
 	/*
 	 * Use an on-stack bio for this, it's safe since we don't
@@ -1562,17 +1593,46 @@ static void __send_empty_flush(struct clone_info *ci)
 
 	for (unsigned int i = 0; i < t->num_targets; i++) {
 		unsigned int bios;
+		unsigned short index = 0;
 		struct dm_target *ti = dm_table_get_target(t, i);
 
 		if (unlikely(ti->num_flush_bios == 0))
 			continue;
 
+		/*
+		 * If the num_targets is greater than the number of block devices
+		 * in the dm_table's devices list, __send_empty_flush() might
+		 * invoke __send_duplicate_bios() multiple times for the same
+		 * block device. This could lead to a substantial decrease in
+		 * performance when num_targets significantly exceeds the number
+		 * of block devices.
+		 * Ensure that __send_duplicate_bios() is only called once for
+		 * each block device.
+		 */
+		if (check) {
+			if (nr_handled == t->num_devices)
+				break;
+
+			if (ti->type->iterate_devices)
+				ti->type->iterate_devices(ti, dm_get_dev_index, &index);
+
+			if (index > 0) {
+				if (__test_and_set_bit(index, handled_map))
+					continue;
+				else
+					nr_handled++;
+			}
+		}
+
 		atomic_add(ti->num_flush_bios, &ci->io->io_count);
 		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
 					     NULL, GFP_NOWAIT);
 		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
 	}
 
+	if (check)
+		bitmap_free(handled_map);
+
 	/*
 	 * alloc_io() takes one extra reference for submission, so the
 	 * reference won't reach 0 without the following subtraction
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 82b2195efaca..4a54b4f0a609 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -169,6 +169,12 @@ struct dm_dev {
 	struct dax_device *dax_dev;
 	blk_mode_t mode;
 	char name[16];
+
+	/*
+	 * sequential number for each dm_dev in dm_table's devices list,
+	 * start from 1
+	 */
+	unsigned short index;
 };
 
 /*
-- 
2.34.1


                 reply	other threads:[~2024-04-17  8:04 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240417080332.9288-1-yang.yang@vivo.com \
    --to=yang.yang@vivo.com \
    --cc=agk@redhat.com \
    --cc=dm-devel@lists.linux.dev \
    --cc=mpatocka@redhat.com \
    --cc=snitzer@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.