dm-devel.redhat.com archive mirror
* [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device
From: Yang Yang @ 2024-04-22 10:05 UTC
  To: Alasdair Kergon, Mike Snitzer, Mikulas Patocka, dm-devel, linux-kernel
  Cc: Yang Yang

__send_empty_flush() sends empty flush bios to every target in the
dm_table. However, if num_targets exceeds the number of block
devices in the dm_table's device list, this can lead to multiple
invocations of __send_duplicate_bios() for the same block device.
Typically, a single thread sending numerous empty flush bios to one
block device is redundant, as these bios are likely to be merged by the
flush state machine. In scenarios where num_targets far exceeds the
number of block devices, this behavior can cause a significant
decrease in performance.

This issue can be reproduced using this command line:
  for i in {0..1023}; do
    echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
  done | dmsetup create example

With this fix, a random write with fsync workload executed with the
following fio command:

  fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
      --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
      --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1

results in the write throughput increasing from 857 KB/s to 30.8 MB/s
(a 3580% increase).

Signed-off-by: Yang Yang <yang.yang@vivo.com>
---
 drivers/md/dm-core.h          |  1 +
 drivers/md/dm-table.c         |  7 +++++
 drivers/md/dm.c               | 59 +++++++++++++++++++++++++++++++++++
 include/linux/device-mapper.h |  6 ++++
 4 files changed, 73 insertions(+)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index e6757a30dcca..7e3f2168289f 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -217,6 +217,7 @@ struct dm_table {
 	/* a list of devices used by this table */
 	struct list_head devices;
 	struct rw_semaphore devices_lock;
+	unsigned short num_devices;
 
 	/* events get handed up using this callback */
 	void (*event_fn)(void *data);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5a..ddc60e498afb 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
 
 int dm_table_resume_targets(struct dm_table *t)
 {
+	struct list_head *devices = dm_table_get_devices(t);
+	struct dm_dev_internal *dd;
 	unsigned int i;
 	int r = 0;
 
@@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
 			ti->type->resume(ti);
 	}
 
+	t->num_devices = 0;
+
+	list_for_each_entry(dd, devices, list)
+		dd->dm_dev->index = ++(t->num_devices);
+
 	return 0;
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56aa2a8b9d71..7297235291f6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -48,6 +48,8 @@
  */
 #define REQ_DM_POLL_LIST	REQ_DRV
 
+#define DM_MAX_TABLE_DEVICES	1024
+
 static const char *_name = DM_NAME;
 
 static unsigned int major;
@@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
 	return ret;
 }
 
+static inline bool has_redundant_flush(struct dm_table *t,
+		unsigned long **bitmap)
+{
+	if (t->num_devices < t->num_targets) {
+		/* Add a limit here to prevent excessive memory usage for bitmaps */
+		if (t->num_devices >= DM_MAX_TABLE_DEVICES)
+			return false;
+
+		/* dm_dev's index starts from 1, so need plus 1 here */
+		*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
+		if (*bitmap)
+			return true;
+	}
+
+	return false;
+}
+
+static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
+				     sector_t start, sector_t len, void *data)
+{
+	unsigned short *index = data;
+	*index = dev->index;
+	return 0;
+}
+
 static void __send_empty_flush(struct clone_info *ci)
 {
 	struct dm_table *t = ci->map;
 	struct bio flush_bio;
+	unsigned long *handled_map;
+	unsigned int nr_handled = 0;
+	bool check = has_redundant_flush(t, &handled_map);
 
 	/*
 	 * Use an on-stack bio for this, it's safe since we don't
@@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
 
 	for (unsigned int i = 0; i < t->num_targets; i++) {
 		unsigned int bios;
+		unsigned short index = 0;
 		struct dm_target *ti = dm_table_get_target(t, i);
 
 		if (unlikely(ti->num_flush_bios == 0))
 			continue;
 
+		/*
+		 * If the num_targets is greater than the number of block devices
+		 * in the dm_table's devices list, __send_empty_flush() might
+		 * invoke __send_duplicate_bios() multiple times for the same
+		 * block device. This could lead to a substantial decrease in
+		 * performance when num_targets significantly exceeds the number
+		 * of block devices.
+		 * Ensure that __send_duplicate_bios() is only called once for
+		 * each block device.
+		 */
+		if (check) {
+			if (nr_handled == t->num_devices)
+				break;
+
+			if (ti->type->iterate_devices)
+				ti->type->iterate_devices(ti, dm_get_dev_index, &index);
+
+			if (index > 0) {
+				if (__test_and_set_bit(index, handled_map))
+					continue;
+				else
+					nr_handled++;
+			}
+		}
+
 		atomic_add(ti->num_flush_bios, &ci->io->io_count);
 		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
 					     NULL, GFP_NOWAIT);
 		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
 	}
 
+	if (check)
+		bitmap_free(handled_map);
+
 	/*
 	 * alloc_io() takes one extra reference for submission, so the
 	 * reference won't reach 0 without the following subtraction
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 82b2195efaca..4a54b4f0a609 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -169,6 +169,12 @@ struct dm_dev {
 	struct dax_device *dax_dev;
 	blk_mode_t mode;
 	char name[16];
+
+	/*
+	 * sequential number for each dm_dev in dm_table's devices list,
+	 * start from 1
+	 */
+	unsigned short index;
 };
 
 /*
-- 
2.34.1



* Re: [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device
From: YangYang @ 2024-04-25  2:54 UTC
  To: Alasdair Kergon, Mike Snitzer, Mikulas Patocka; +Cc: linux-kernel, dm-devel

On 2024/4/22 18:05, Yang Yang wrote:
> __send_empty_flush() sends empty flush bios to every target in the
> dm_table. However, if the num_targets exceeds the number of block
> devices in the dm_table's device list, it could lead to multiple
> invocations of __send_duplicate_bios() for the same block device.
> Typically, a single thread sending numerous empty flush bios to one
> block device is redundant, as these bios are likely to be merged by the
> flush state machine. In scenarios where num_targets significantly
> outweighs the number of block devices, such behavior may result in a
> noteworthy decrease in performance.
> 
> This issue can be reproduced using this command line:
>    for i in {0..1023}; do
>      echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
>    done | dmsetup create example
> 
> With this fix, a random write with fsync workload executed with the
> following fio command:
> 
>    fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
>        --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
>        --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
> 
> results in an increase from 857 KB/s to 30.8 MB/s of the write
> throughput (3580% increase).
> 
> Signed-off-by: Yang Yang <yang.yang@vivo.com>
> ---
>   drivers/md/dm-core.h          |  1 +
>   drivers/md/dm-table.c         |  7 +++++
>   drivers/md/dm.c               | 59 +++++++++++++++++++++++++++++++++++
>   include/linux/device-mapper.h |  6 ++++
>   4 files changed, 73 insertions(+)
> 
> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
> index e6757a30dcca..7e3f2168289f 100644
> --- a/drivers/md/dm-core.h
> +++ b/drivers/md/dm-core.h
> @@ -217,6 +217,7 @@ struct dm_table {
>   	/* a list of devices used by this table */
>   	struct list_head devices;
>   	struct rw_semaphore devices_lock;
> +	unsigned short num_devices;
>   
>   	/* events get handed up using this callback */
>   	void (*event_fn)(void *data);
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index 41f1d731ae5a..ddc60e498afb 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>   
>   int dm_table_resume_targets(struct dm_table *t)
>   {
> +	struct list_head *devices = dm_table_get_devices(t);
> +	struct dm_dev_internal *dd;
>   	unsigned int i;
>   	int r = 0;
>   
> @@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
>   			ti->type->resume(ti);
>   	}
>   
> +	t->num_devices = 0;
> +
> +	list_for_each_entry(dd, devices, list)
> +		dd->dm_dev->index = ++(t->num_devices);
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 56aa2a8b9d71..7297235291f6 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -48,6 +48,8 @@
>    */
>   #define REQ_DM_POLL_LIST	REQ_DRV
>   
> +#define DM_MAX_TABLE_DEVICES	1024
> +
>   static const char *_name = DM_NAME;
>   
>   static unsigned int major;
> @@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
>   	return ret;
>   }
>   
> +static inline bool has_redundant_flush(struct dm_table *t,
> +		unsigned long **bitmap)
> +{
> +	if (t->num_devices < t->num_targets) {
> +		/* Add a limit here to prevent excessive memory usage for bitmaps */
> +		if (t->num_devices >= DM_MAX_TABLE_DEVICES)
> +			return false;
> +
> +		/* dm_dev's index starts from 1, so need plus 1 here */
> +		*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
> +		if (*bitmap)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
> +				     sector_t start, sector_t len, void *data)
> +{
> +	unsigned short *index = data;
> +	*index = dev->index;
> +	return 0;
> +}
> +
>   static void __send_empty_flush(struct clone_info *ci)
>   {
>   	struct dm_table *t = ci->map;
>   	struct bio flush_bio;
> +	unsigned long *handled_map;
> +	unsigned int nr_handled = 0;
> +	bool check = has_redundant_flush(t, &handled_map);
>   
>   	/*
>   	 * Use an on-stack bio for this, it's safe since we don't
> @@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
>   
>   	for (unsigned int i = 0; i < t->num_targets; i++) {
>   		unsigned int bios;
> +		unsigned short index = 0;
>   		struct dm_target *ti = dm_table_get_target(t, i);
>   
>   		if (unlikely(ti->num_flush_bios == 0))
>   			continue;
>   
> +		/*
> +		 * If the num_targets is greater than the number of block devices
> +		 * in the dm_table's devices list, __send_empty_flush() might
> +		 * invoke __send_duplicate_bios() multiple times for the same
> +		 * block device. This could lead to a substantial decrease in
> +		 * performance when num_targets significantly exceeds the number
> +		 * of block devices.
> +		 * Ensure that __send_duplicate_bios() is only called once for
> +		 * each block device.
> +		 */
> +		if (check) {
> +			if (nr_handled == t->num_devices)
> +				break;
> +
> +			if (ti->type->iterate_devices)
> +				ti->type->iterate_devices(ti, dm_get_dev_index, &index);
> +
> +			if (index > 0) {
> +				if (__test_and_set_bit(index, handled_map))
> +					continue;
> +				else
> +					nr_handled++;
> +			}
> +		}
> +
>   		atomic_add(ti->num_flush_bios, &ci->io->io_count);
>   		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
>   					     NULL, GFP_NOWAIT);
>   		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
>   	}
>   
> +	if (check)
> +		bitmap_free(handled_map);
> +
>   	/*
>   	 * alloc_io() takes one extra reference for submission, so the
>   	 * reference won't reach 0 without the following subtraction
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 82b2195efaca..4a54b4f0a609 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -169,6 +169,12 @@ struct dm_dev {
>   	struct dax_device *dax_dev;
>   	blk_mode_t mode;
>   	char name[16];
> +
> +	/*
> +	 * sequential number for each dm_dev in dm_table's devices list,
> +	 * start from 1
> +	 */
> +	unsigned short index;
>   };
>   
>   /*

Hello experts,
Any comments are welcome!

Thanks


* Re: [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device
From: Mike Snitzer @ 2024-05-09 14:20 UTC
  To: Yang Yang; +Cc: Alasdair Kergon, Mikulas Patocka, dm-devel, linux-kernel

On Mon, Apr 22, 2024 at 06:05:40PM +0800, Yang Yang wrote:
> __send_empty_flush() sends empty flush bios to every target in the
> dm_table. However, if the num_targets exceeds the number of block
> devices in the dm_table's device list, it could lead to multiple
> invocations of __send_duplicate_bios() for the same block device.
> Typically, a single thread sending numerous empty flush bios to one
> block device is redundant, as these bios are likely to be merged by the
> flush state machine. In scenarios where num_targets significantly
> outweighs the number of block devices, such behavior may result in a
> noteworthy decrease in performance.
> 
> This issue can be reproduced using this command line:
>   for i in {0..1023}; do
>     echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
>   done | dmsetup create example

Only _one_ dm_dev will be created for this example: /dev/sda2
So your bitmap is looking at a single bit.

> With this fix, a random write with fsync workload executed with the
> following fio command:
> 
>   fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
>       --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
>       --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
> 
> results in an increase from 857 KB/s to 30.8 MB/s of the write
> throughput (3580% increase).
> 
> Signed-off-by: Yang Yang <yang.yang@vivo.com>

I'm including my original fine-grained review comments inlined below
BUT, having wasted time reviewing this patch I'm left with the
conclusions:

1) this patch has serious issues.
2) it fixes an issue with a toy 'example' but ignores that not all
   targets are linear; each disparate target could have its own
   reason(s) for actually _needing_ the redundant flushes.

I'm inclined to never take this type of change.

> ---
>  drivers/md/dm-core.h          |  1 +
>  drivers/md/dm-table.c         |  7 +++++
>  drivers/md/dm.c               | 59 +++++++++++++++++++++++++++++++++++
>  include/linux/device-mapper.h |  6 ++++
>  4 files changed, 73 insertions(+)
> 
> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
> index e6757a30dcca..7e3f2168289f 100644
> --- a/drivers/md/dm-core.h
> +++ b/drivers/md/dm-core.h
> @@ -217,6 +217,7 @@ struct dm_table {
>  	/* a list of devices used by this table */
>  	struct list_head devices;
>  	struct rw_semaphore devices_lock;
> +	unsigned short num_devices;
>  
>  	/* events get handed up using this callback */
>  	void (*event_fn)(void *data);
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index 41f1d731ae5a..ddc60e498afb 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>  
>  int dm_table_resume_targets(struct dm_table *t)
>  {
> +	struct list_head *devices = dm_table_get_devices(t);
> +	struct dm_dev_internal *dd;
>  	unsigned int i;
>  	int r = 0;
>  
> @@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
>  			ti->type->resume(ti);
>  	}
>  
> +	t->num_devices = 0;
> +
> +	list_for_each_entry(dd, devices, list)
> +		dd->dm_dev->index = ++(t->num_devices);
> +
>  	return 0;
>  }
>  
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 56aa2a8b9d71..7297235291f6 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -48,6 +48,8 @@
>   */
>  #define REQ_DM_POLL_LIST	REQ_DRV
>  
> +#define DM_MAX_TABLE_DEVICES	1024
> +

This name is too general. This limit isn't imposed for anything other
than bounding the size of the bitmap used to avoid redundant flushes.

So maybe rename to: DM_MAX_REDUNDANT_FLUSH_BITMAP_DEVICES

>  static const char *_name = DM_NAME;
>  
>  static unsigned int major;
> @@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
>  	return ret;
>  }
>  
> +static inline bool has_redundant_flush(struct dm_table *t,
> +		unsigned long **bitmap)
> +{
> +	if (t->num_devices < t->num_targets) {
> +		/* Add a limit here to prevent excessive memory usage for bitmaps */
> +		if (t->num_devices >= DM_MAX_TABLE_DEVICES)
> +			return false;

OK, in practice I'm not aware of tables that require such an excessive
amount of devices.

> +		/* dm_dev's index starts from 1, so need plus 1 here */
> +		*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
> +		if (*bitmap)
> +			return true;
> +	}

This method is being used in the IO path, so you cannot use
GFP_KERNEL.  Please switch to GFP_NOIO.
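
i.e., something like:

	*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_NOIO);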

> +
> +	return false;
> +}
> +
> +static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
> +				     sector_t start, sector_t len, void *data)
> +{
> +	unsigned short *index = data;
> +	*index = dev->index;
> +	return 0;
> +}
> +
>  static void __send_empty_flush(struct clone_info *ci)
>  {
>  	struct dm_table *t = ci->map;
>  	struct bio flush_bio;
> +	unsigned long *handled_map;

Please rename, e.g.: handled_devices_bitmap

> +	unsigned int nr_handled = 0;
> +	bool check = has_redundant_flush(t, &handled_map);
>  
>  	/*
>  	 * Use an on-stack bio for this, it's safe since we don't
> @@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
>  
>  	for (unsigned int i = 0; i < t->num_targets; i++) {
>  		unsigned int bios;
> +		unsigned short index = 0;
>  		struct dm_target *ti = dm_table_get_target(t, i);
>  
>  		if (unlikely(ti->num_flush_bios == 0))
>  			continue;
>  
> +		/*
> +		 * If the num_targets is greater than the number of block devices
> +		 * in the dm_table's devices list, __send_empty_flush() might
> +		 * invoke __send_duplicate_bios() multiple times for the same
> +		 * block device. This could lead to a substantial decrease in
> +		 * performance when num_targets significantly exceeds the number
> +		 * of block devices.
> +		 * Ensure that __send_duplicate_bios() is only called once for
> +		 * each block device.
> +		 */
> +		if (check) {
> +			if (nr_handled == t->num_devices)
> +				break;
> +
> +			if (ti->type->iterate_devices)
> +				ti->type->iterate_devices(ti, dm_get_dev_index, &index);

You're looping through all data devices in a target, so you end up
with only the index of the last device in the target.  That seems very
broken.

But each target in your test 'example' device (mentioned in the patch
header) only has a single device so you wouldn't have noticed this.

> +
> +			if (index > 0) {
> +				if (__test_and_set_bit(index, handled_map))
> +					continue;
> +				else
> +					nr_handled++;

Think you really mean to set bits in the bitmap within the
iterate_devices callout.  So it should be renamed accordingly.

Why not count the first time a device is handled in nr_handled?
Also, it strikes me as strange that you're break'ing out of this loop
early based on nr_handled... not seeing the point (and also it seems
broken, because it implies you aren't issuing flushes to targets at
the end).
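
Something along those lines (a rough sketch only; flush_dev_data and
dm_mark_dev_handled are names made up for illustration) would move the
bitmap handling into the iterate_devices callback, so every underlying
device of a target gets marked, not just the last one:

struct flush_dev_data {
	unsigned long *handled_devices_bitmap;
	unsigned int nr_handled;
};

static int dm_mark_dev_handled(struct dm_target *ti, struct dm_dev *dev,
			       sector_t start, sector_t len, void *data)
{
	struct flush_dev_data *fdd = data;

	/* index == 0 means this dm_dev was never assigned an index */
	if (dev->index &&
	    !__test_and_set_bit(dev->index, fdd->handled_devices_bitmap))
		fdd->nr_handled++;

	return 0;
}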

> +			}
> +		}
> +
>  		atomic_add(ti->num_flush_bios, &ci->io->io_count);
>  		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
>  					     NULL, GFP_NOWAIT);
>  		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
>  	}
>  
> +	if (check)
> +		bitmap_free(handled_map);
> +
>  	/*
>  	 * alloc_io() takes one extra reference for submission, so the
>  	 * reference won't reach 0 without the following subtraction
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 82b2195efaca..4a54b4f0a609 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -169,6 +169,12 @@ struct dm_dev {
>  	struct dax_device *dax_dev;
>  	blk_mode_t mode;
>  	char name[16];
> +
> +	/*
> +	 * sequential number for each dm_dev in dm_table's devices list,
> +	 * start from 1
> +	 */
> +	unsigned short index;

Please update this comment to indicate that index=0 is (ab)used as a
flag in __send_empty_flush().


* Re: [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device
From: Mikulas Patocka @ 2024-05-10 14:08 UTC
  To: Yang Yang; +Cc: Alasdair Kergon, Mike Snitzer, dm-devel, linux-kernel

Hi

Regarding *bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL); - you 
can't allocate memory in the I/O processing path, because it may deadlock. 
Think of a case when the system is out of memory and it needs to swap out 
some pages and the swap out operation attempts to allocate more memory.

Anyway, the patch is too complicated. I suggest trying this:

* introduce a per-target bit "flush_pass_around" that is set for dm-linear 
  and dm-stripe and that means that the target supports flush optimization

* set a per-table "flush_pass_around" bit if all the targets in the table 
  have "flush_pass_around" set

* in __send_empty_flush, test the table's "flush_pass_around" bit and if 
  it is set, bypass this loop "for (unsigned int i = 0; i < 
  t->num_targets; i++) {" and iterate over dm_table->devices and send the 
  flush to each of them.

Hopefully, these changes will make the patch smaller and more acceptable.
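
For illustration only, a minimal sketch of the bypass (the
flush_pass_around bit and the __send_flush_to_dev() helper are assumed
names for this sketch, not existing dm code):

static void __send_empty_flush_pass_around(struct clone_info *ci)
{
	struct dm_table *t = ci->map;
	struct dm_dev_internal *dd;

	/*
	 * Every target in this table supports the optimization
	 * (e.g. dm-linear, dm-stripe), so one empty flush per
	 * underlying device is enough.
	 */
	list_for_each_entry(dd, &t->devices, list)
		__send_flush_to_dev(ci, dd->dm_dev);
}

__send_empty_flush() would test t's "flush_pass_around" bit and call
this helper instead of walking the per-target loop.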

Mikulas



On Mon, 22 Apr 2024, Yang Yang wrote:

> __send_empty_flush() sends empty flush bios to every target in the
> dm_table. However, if the num_targets exceeds the number of block
> devices in the dm_table's device list, it could lead to multiple
> invocations of __send_duplicate_bios() for the same block device.
> Typically, a single thread sending numerous empty flush bios to one
> block device is redundant, as these bios are likely to be merged by the
> flush state machine. In scenarios where num_targets significantly
> outweighs the number of block devices, such behavior may result in a
> noteworthy decrease in performance.
> 
> This issue can be reproduced using this command line:
>   for i in {0..1023}; do
>     echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
>   done | dmsetup create example
> 
> With this fix, a random write with fsync workload executed with the
> following fio command:
> 
>   fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
>       --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
>       --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
> 
> results in an increase from 857 KB/s to 30.8 MB/s of the write
> throughput (3580% increase).
> 
> Signed-off-by: Yang Yang <yang.yang@vivo.com>
> ---
>  drivers/md/dm-core.h          |  1 +
>  drivers/md/dm-table.c         |  7 +++++
>  drivers/md/dm.c               | 59 +++++++++++++++++++++++++++++++++++
>  include/linux/device-mapper.h |  6 ++++
>  4 files changed, 73 insertions(+)
> 
> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
> index e6757a30dcca..7e3f2168289f 100644
> --- a/drivers/md/dm-core.h
> +++ b/drivers/md/dm-core.h
> @@ -217,6 +217,7 @@ struct dm_table {
>  	/* a list of devices used by this table */
>  	struct list_head devices;
>  	struct rw_semaphore devices_lock;
> +	unsigned short num_devices;
>  
>  	/* events get handed up using this callback */
>  	void (*event_fn)(void *data);
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index 41f1d731ae5a..ddc60e498afb 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>  
>  int dm_table_resume_targets(struct dm_table *t)
>  {
> +	struct list_head *devices = dm_table_get_devices(t);
> +	struct dm_dev_internal *dd;
>  	unsigned int i;
>  	int r = 0;
>  
> @@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
>  			ti->type->resume(ti);
>  	}
>  
> +	t->num_devices = 0;
> +
> +	list_for_each_entry(dd, devices, list)
> +		dd->dm_dev->index = ++(t->num_devices);
> +
>  	return 0;
>  }
>  
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 56aa2a8b9d71..7297235291f6 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -48,6 +48,8 @@
>   */
>  #define REQ_DM_POLL_LIST	REQ_DRV
>  
> +#define DM_MAX_TABLE_DEVICES	1024
> +
>  static const char *_name = DM_NAME;
>  
>  static unsigned int major;
> @@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
>  	return ret;
>  }
>  
> +static inline bool has_redundant_flush(struct dm_table *t,
> +		unsigned long **bitmap)
> +{
> +	if (t->num_devices < t->num_targets) {
> +		/* Add a limit here to prevent excessive memory usage for bitmaps */
> +		if (t->num_devices >= DM_MAX_TABLE_DEVICES)
> +			return false;
> +
> +		/* dm_dev's index starts from 1, so need plus 1 here */
> +		*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
> +		if (*bitmap)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
> +				     sector_t start, sector_t len, void *data)
> +{
> +	unsigned short *index = data;
> +	*index = dev->index;
> +	return 0;
> +}
> +
>  static void __send_empty_flush(struct clone_info *ci)
>  {
>  	struct dm_table *t = ci->map;
>  	struct bio flush_bio;
> +	unsigned long *handled_map;
> +	unsigned int nr_handled = 0;
> +	bool check = has_redundant_flush(t, &handled_map);
>  
>  	/*
>  	 * Use an on-stack bio for this, it's safe since we don't
> @@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
>  
>  	for (unsigned int i = 0; i < t->num_targets; i++) {
>  		unsigned int bios;
> +		unsigned short index = 0;
>  		struct dm_target *ti = dm_table_get_target(t, i);
>  
>  		if (unlikely(ti->num_flush_bios == 0))
>  			continue;
>  
> +		/*
> +		 * If the num_targets is greater than the number of block devices
> +		 * in the dm_table's devices list, __send_empty_flush() might
> +		 * invoke __send_duplicate_bios() multiple times for the same
> +		 * block device. This could lead to a substantial decrease in
> +		 * performance when num_targets significantly exceeds the number
> +		 * of block devices.
> +		 * Ensure that __send_duplicate_bios() is only called once for
> +		 * each block device.
> +		 */
> +		if (check) {
> +			if (nr_handled == t->num_devices)
> +				break;
> +
> +			if (ti->type->iterate_devices)
> +				ti->type->iterate_devices(ti, dm_get_dev_index, &index);
> +
> +			if (index > 0) {
> +				if (__test_and_set_bit(index, handled_map))
> +					continue;
> +				else
> +					nr_handled++;
> +			}
> +		}
> +
>  		atomic_add(ti->num_flush_bios, &ci->io->io_count);
>  		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
>  					     NULL, GFP_NOWAIT);
>  		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
>  	}
>  
> +	if (check)
> +		bitmap_free(handled_map);
> +
>  	/*
>  	 * alloc_io() takes one extra reference for submission, so the
>  	 * reference won't reach 0 without the following subtraction
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 82b2195efaca..4a54b4f0a609 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -169,6 +169,12 @@ struct dm_dev {
>  	struct dax_device *dax_dev;
>  	blk_mode_t mode;
>  	char name[16];
> +
> +	/*
> +	 * sequential number for each dm_dev in dm_table's devices list,
> +	 * start from 1
> +	 */
> +	unsigned short index;
>  };
>  
>  /*
> -- 
> 2.34.1
> 



* Re: [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device
From: YangYang @ 2024-05-11 11:05 UTC
  To: Mike Snitzer; +Cc: Alasdair Kergon, Mikulas Patocka, dm-devel, linux-kernel

On 2024/5/9 22:20, Mike Snitzer wrote:
> On Mon, Apr 22, 2024 at 06:05:40PM +0800, Yang Yang wrote:
>> __send_empty_flush() sends empty flush bios to every target in the
>> dm_table. However, if the num_targets exceeds the number of block
>> devices in the dm_table's device list, it could lead to multiple
>> invocations of __send_duplicate_bios() for the same block device.
>> Typically, a single thread sending numerous empty flush bios to one
>> block device is redundant, as these bios are likely to be merged by the
>> flush state machine. In scenarios where num_targets significantly
>> outweighs the number of block devices, such behavior may result in a
>> noteworthy decrease in performance.
>>
>> This issue can be reproduced using this command line:
>>    for i in {0..1023}; do
>>      echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
>>    done | dmsetup create example
> 
> Only _one_ dm_dev will be created for this example: /dev/sda2
> So your bitmap is looking at a single bit.

Yes. And this patch has also been tested in a scenario with multiple
dm_devs, using the following command line:
   for i in {0..1023}; do
     if [[ $i -gt 511 ]]; then
       echo $((8000*$i)) 8000 linear /dev/sda3 $((16384*$i))
     else
       echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
     fi
   done | sudo dmsetup create example

>> With this fix, a random write with fsync workload executed with the
>> following fio command:
>>
>>    fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
>>        --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
>>        --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
>>
>> results in an increase from 857 KB/s to 30.8 MB/s of the write
>> throughput (3580% increase).
>>
>> Signed-off-by: Yang Yang <yang.yang@vivo.com>
> 
> I'm including my original fine-grained review comments inlined below
> BUT, having wasted time reviewing this patch I'm left with the
> conclusions:
> 
> 1) this patch has serious issues.
> 2) it fixes an issue with a toy 'example' but ignores that not all
>     targets are linear; each disparate target could have its own
>     reason(s) for actually _needing_ the redundant flushes.

I chose this specific 'example' because the constructed scenario closely
resembles real-world use cases.
This is a real-life scenario that we have encountered:
1) Call fallocate(file_fd, 0, 0, SZ_8G)
2) Call ioctl(file_fd, FS_IOC_FIEMAP, fiemap) (see the sketch after
this list). In situations of severe file system fragmentation,
fiemap->fm_mapped_extents may exceed 1000.
3) Create a dm-linear device based on fiemap->fm_extents
4) Create a snapshot-cow device based on the dm-linear device
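
A minimal userspace sketch of step 2 (the file path is an example and
error handling is trimmed):

#include <fcntl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

int main(void)
{
	struct fiemap fm;
	int fd = open("/data/blob.img", O_RDONLY);	/* example path */

	memset(&fm, 0, sizeof(fm));
	fm.fm_start = 0;
	fm.fm_length = FIEMAP_MAX_OFFSET;
	fm.fm_extent_count = 0;	/* 0 means: only count mapped extents */

	if (fd < 0 || ioctl(fd, FS_IOC_FIEMAP, &fm) < 0) {
		perror("fiemap");
		return 1;
	}

	/* each mapped extent becomes one dm-linear target line */
	printf("mapped extents: %u\n", fm.fm_mapped_extents);
	return 0;
}

On a badly fragmented file system this count (and with it the number
of dm-linear targets) easily exceeds 1000, while the table still only
references one or two underlying block devices.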

I guess the targets needing the redundant flushes you mentioned are
targets with ti->num_flush_bios > 1. I think they won't be affected;
duplicate flush bios are still sent to the dm_dev by
__send_duplicate_bios().
The main purpose of this patch is to ensure that __send_duplicate_bios()
is called only once for each dm_dev.

Anyway, I will modify this patch to apply only to dm-linear and dm-stripe.

> I'm inclined to never take this type of change.
> 
>> ---
>>   drivers/md/dm-core.h          |  1 +
>>   drivers/md/dm-table.c         |  7 +++++
>>   drivers/md/dm.c               | 59 +++++++++++++++++++++++++++++++++++
>>   include/linux/device-mapper.h |  6 ++++
>>   4 files changed, 73 insertions(+)
>>
>> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
>> index e6757a30dcca..7e3f2168289f 100644
>> --- a/drivers/md/dm-core.h
>> +++ b/drivers/md/dm-core.h
>> @@ -217,6 +217,7 @@ struct dm_table {
>>   	/* a list of devices used by this table */
>>   	struct list_head devices;
>>   	struct rw_semaphore devices_lock;
>> +	unsigned short num_devices;
>>   
>>   	/* events get handed up using this callback */
>>   	void (*event_fn)(void *data);
>> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
>> index 41f1d731ae5a..ddc60e498afb 100644
>> --- a/drivers/md/dm-table.c
>> +++ b/drivers/md/dm-table.c
>> @@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>>   
>>   int dm_table_resume_targets(struct dm_table *t)
>>   {
>> +	struct list_head *devices = dm_table_get_devices(t);
>> +	struct dm_dev_internal *dd;
>>   	unsigned int i;
>>   	int r = 0;
>>   
>> @@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
>>   			ti->type->resume(ti);
>>   	}
>>   
>> +	t->num_devices = 0;
>> +
>> +	list_for_each_entry(dd, devices, list)
>> +		dd->dm_dev->index = ++(t->num_devices);
>> +
>>   	return 0;
>>   }
>>   
>> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
>> index 56aa2a8b9d71..7297235291f6 100644
>> --- a/drivers/md/dm.c
>> +++ b/drivers/md/dm.c
>> @@ -48,6 +48,8 @@
>>    */
>>   #define REQ_DM_POLL_LIST	REQ_DRV
>>   
>> +#define DM_MAX_TABLE_DEVICES	1024
>> +
> 
> This name is too general. This limit isn't imposed for anything other
> than bounding the size of the bitmap used to avoid redundant flushes.
> 
> So maybe rename to: DM_MAX_REDUNDANT_FLUSH_BITMAP_DEVICES

Got it!

>>   static const char *_name = DM_NAME;
>>   
>>   static unsigned int major;
>> @@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
>>   	return ret;
>>   }
>>   
>> +static inline bool has_redundant_flush(struct dm_table *t,
>> +		unsigned long **bitmap)
>> +{
>> +	if (t->num_devices < t->num_targets) {
>> +		/* Add a limit here to prevent excessive memory usage for bitmaps */
>> +		if (t->num_devices >= DM_MAX_TABLE_DEVICES)
>> +			return false;
> 
> OK, in practice I'm not aware of tables that require such an excessive
> amount of devices.

Noted!

>> +		/* dm_dev's index starts from 1, so need plus 1 here */
>> +		*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
>> +		if (*bitmap)
>> +			return true;
>> +	}
> 
> This method is being used in the IO path, so you cannot use
> GFP_KERNEL.  Please switch to GFP_NOIO.

Got it!

>> +
>> +	return false;
>> +}
>> +
>> +static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
>> +				     sector_t start, sector_t len, void *data)
>> +{
>> +	unsigned short *index = data;
>> +	*index = dev->index;
>> +	return 0;
>> +}
>> +
>>   static void __send_empty_flush(struct clone_info *ci)
>>   {
>>   	struct dm_table *t = ci->map;
>>   	struct bio flush_bio;
>> +	unsigned long *handled_map;
> 
> Please rename, e.g.: handled_devices_bitmap

Got it!

>> +	unsigned int nr_handled = 0;
>> +	bool check = has_redundant_flush(t, &handled_map);
>>   
>>   	/*
>>   	 * Use an on-stack bio for this, it's safe since we don't
>> @@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
>>   
>>   	for (unsigned int i = 0; i < t->num_targets; i++) {
>>   		unsigned int bios;
>> +		unsigned short index = 0;
>>   		struct dm_target *ti = dm_table_get_target(t, i);
>>   
>>   		if (unlikely(ti->num_flush_bios == 0))
>>   			continue;
>>   
>> +		/*
>> +		 * If the num_targets is greater than the number of block devices
>> +		 * in the dm_table's devices list, __send_empty_flush() might
>> +		 * invoke __send_duplicate_bios() multiple times for the same
>> +		 * block device. This could lead to a substantial decrease in
>> +		 * performance when num_targets significantly exceeds the number
>> +		 * of block devices.
>> +		 * Ensure that __send_duplicate_bios() is only called once for
>> +		 * each block device.
>> +		 */
>> +		if (check) {
>> +			if (nr_handled == t->num_devices)
>> +				break;
>> +
>> +			if (ti->type->iterate_devices)
>> +				ti->type->iterate_devices(ti, dm_get_dev_index, &index);
> 
> You're looping through all data devices in a target, so you end up
> with only the index of the last device in the target.  That seems very
> broken.
> 
> But each target in your test 'example' device (mentioned in the patch
> header) only has a single device so you wouldn't have noticed this.

Perhaps the code comments are not detailed enough, so allow me to walk
through the execution flow of this code in detail.

For a device table that only has a single device:
example: 0 8000 linear 8:2 0            -> dm_dev(8:2).index = 1
example: 8000 8000 linear 8:2 16384     -> dm_dev(8:2).index = 1
example: 16000 8000 linear 8:2 32768    -> dm_dev(8:2).index = 1
example: 24000 8000 linear 8:2 49152    -> dm_dev(8:2).index = 1
example: 32000 8000 linear 8:2 65536    -> dm_dev(8:2).index = 1
example: 40000 8000 linear 8:2 81920    -> dm_dev(8:2).index = 1
example: 48000 8000 linear 8:2 98304    -> dm_dev(8:2).index = 1
example: 56000 8000 linear 8:2 114688   -> dm_dev(8:2).index = 1
example: 64000 8000 linear 8:2 131072   -> dm_dev(8:2).index = 1

Before entering the loop:
     nr_handled = 0
     t->num_devices = 1

When i equals 0:
for (unsigned int i = 0; i < t->num_targets; i++) {
     if (nr_handled == t->num_devices) //nr_handled=0, t->num_devices=1
         break;

     if (ti->type->iterate_devices)
         ti->type->iterate_devices(ti, dm_get_dev_index, &index) //index is set to 1

     if (index > 0) {
         if (__test_and_set_bit(index, handled_map)) //__test_and_set_bit() return 0
             continue;
         else
             nr_handled++;    //nr_handled = 1
     }

     bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
                      NULL, GFP_NOWAIT);  //__send_duplicate_bios() is called
}

When i equals 1:
for (unsigned int i = 0; i < t->num_targets; i++) {
     if (nr_handled == t->num_devices) //nr_handled=1, t->num_devices=1, break the loop
         break;
}

__send_duplicate_bios() is called for device (8:2) only once.

======================================================================

For a device table that has two devices:
example: 0 8000 linear 8:2 0            -> dm_dev(8:2).index = 1
example: 8000 8000 linear 8:2 16384     -> dm_dev(8:2).index = 1
example: 16000 8000 linear 8:2 32768    -> dm_dev(8:2).index = 1
example: 24000 8000 linear 8:2 49152    -> dm_dev(8:2).index = 1
example: 32000 8000 linear 8:2 65536    -> dm_dev(8:2).index = 1
example: 40000 8000 linear 8:3 81920    -> dm_dev(8:3).index = 2
example: 48000 8000 linear 8:3 98304    -> dm_dev(8:3).index = 2
example: 56000 8000 linear 8:3 114688   -> dm_dev(8:3).index = 2
example: 64000 8000 linear 8:3 131072   -> dm_dev(8:3).index = 2

Before entering the loop:
     nr_handled = 0
     t->num_devices = 2

When i equals 0:
for (unsigned int i = 0; i < t->num_targets; i++) {
     if (nr_handled == t->num_devices) //nr_handled=0, t->num_devices=2
         break;

     if (ti->type->iterate_devices)
         ti->type->iterate_devices(ti, dm_get_dev_index, &index) //index is set to 1

     if (index > 0) {
         if (__test_and_set_bit(index, handled_map)) //__test_and_set_bit() return 0
             continue;
         else
             nr_handled++;    //nr_handled = 1
     }

     bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
                      NULL, GFP_NOWAIT);    //__send_duplicate_bios() is called
}

When i equals 1, 2, 3, or 4:
for (unsigned int i = 0; i < t->num_targets; i++) {
     if (nr_handled == t->num_devices) //nr_handled=1, t->num_devices=2
         break;

     if (ti->type->iterate_devices)
         ti->type->iterate_devices(ti, dm_get_dev_index, &index) //index is set to 1

     if (index > 0) {
         if (__test_and_set_bit(index, handled_map)) //__test_and_set_bit() return 1
             continue;
         else
             nr_handled++;
     }
}

When i equals 5:
for (unsigned int i = 0; i < t->num_targets; i++) {
     if (nr_handled == t->num_devices) //nr_handled=1, t->num_devices=2
         break;

     if (ti->type->iterate_devices)
         ti->type->iterate_devices(ti, dm_get_dev_index, &index) //index is set to 2

     if (index > 0) {
         if (__test_and_set_bit(index, handled_map)) //__test_and_set_bit() return 0
             continue;
         else
             nr_handled++;    //nr_handled = 2
     }

     bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
                      NULL, GFP_NOWAIT); //__send_duplicate_bios() is called for the second time
}

When i equals 6:
for (unsigned int i = 0; i < t->num_targets; i++) {
     if (nr_handled == t->num_devices) //nr_handled=2, t->num_devices=2, break the loop
         break;
}

__send_duplicate_bios() is called only once for each of devices (8:2) and (8:3).

>> +
>> +			if (index > 0) {
>> +				if (__test_and_set_bit(index, handled_map))
>> +					continue;
>> +				else
>> +					nr_handled++;
> 
> Think you really mean to set bits in the bitmap within the
> iterate_devices callout.  So it should be renamed accordingly.
> 
> Why not count the first time a device is handled in nr_handled?
> Also, it strikes me as strange that you're break'ing out of this loop
> early based on nr_handled... not seeing the point (and also it seems
> broken, because it implies you aren't issuing flushes to targets at
> the end).
> 
>> +			}
>> +		}
>> +
>>   		atomic_add(ti->num_flush_bios, &ci->io->io_count);
>>   		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
>>   					     NULL, GFP_NOWAIT);
>>   		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
>>   	}
>>   
>> +	if (check)
>> +		bitmap_free(handled_map);
>> +
>>   	/*
>>   	 * alloc_io() takes one extra reference for submission, so the
>>   	 * reference won't reach 0 without the following subtraction
>> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
>> index 82b2195efaca..4a54b4f0a609 100644
>> --- a/include/linux/device-mapper.h
>> +++ b/include/linux/device-mapper.h
>> @@ -169,6 +169,12 @@ struct dm_dev {
>>   	struct dax_device *dax_dev;
>>   	blk_mode_t mode;
>>   	char name[16];
>> +
>> +	/*
>> +	 * sequential number for each dm_dev in dm_table's devices list,
>> +	 * start from 1
>> +	 */
>> +	unsigned short index;
> 
> Please update this comment to indicate that index=0 is (ab)used as a
> flag in __send_empty_flush().

Noted!

Thanks for your comments.


* Re: [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device
From: YangYang @ 2024-05-11 11:08 UTC
  To: Mikulas Patocka; +Cc: Alasdair Kergon, Mike Snitzer, dm-devel, linux-kernel

On 2024/5/10 22:08, Mikulas Patocka wrote:
> Hi
> 
> Regarding *bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL); - you
> can't allocate memory in the I/O processing path, because it may deadlock.
> Think of a case when the system is out of memory and it needs to swap out
> some pages and the swap out operation attempts to allocate more memory.

Got it!

> 
> Anyway, the patch is too complicated. I suggest trying this:
> 
> * introduce a per-target bit "flush_pass_around" that is set for dm-linear
>    and dm-stripe and that means that the target supports flush optimization
> 
> * set a per-table "flush_pass_around" bit if all the targets in the table
>    have "flush_pass_around" set
> 
> * in __send_empty_flush, test the table's "flush_pass_around" bit and if
>    it is set, bypass this loop "for (unsigned int i = 0; i <
>    t->num_targets; i++) {" and iterate over dm_table->devices and send the
>    flush to each of them.

OK, I will do this in the next version.

Thanks for your comments.

> 
> Hopefully, these changes will make the patch smaller and more acceptable.
> 
> Mikulas
> 
> 
> 
> On Mon, 22 Apr 2024, Yang Yang wrote:
> 
>> __send_empty_flush() sends empty flush bios to every target in the
>> dm_table. However, if the num_targets exceeds the number of block
>> devices in the dm_table's device list, it could lead to multiple
>> invocations of __send_duplicate_bios() for the same block device.
>> Typically, a single thread sending numerous empty flush bios to one
>> block device is redundant, as these bios are likely to be merged by the
>> flush state machine. In scenarios where num_targets significantly
>> outweighs the number of block devices, such behavior may result in a
>> noteworthy decrease in performance.
>>
>> This issue can be reproduced using this command line:
>>    for i in {0..1023}; do
>>      echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
>>    done | dmsetup create example
>>
>> With this fix, a random write with fsync workload executed with the
>> following fio command:
>>
>>    fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
>>        --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
>>        --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
>>
>> results in an increase from 857 KB/s to 30.8 MB/s of the write
>> throughput (3580% increase).
>>
>> Signed-off-by: Yang Yang <yang.yang@vivo.com>
>> ---
>>   drivers/md/dm-core.h          |  1 +
>>   drivers/md/dm-table.c         |  7 +++++
>>   drivers/md/dm.c               | 59 +++++++++++++++++++++++++++++++++++
>>   include/linux/device-mapper.h |  6 ++++
>>   4 files changed, 73 insertions(+)
>>
>> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
>> index e6757a30dcca..7e3f2168289f 100644
>> --- a/drivers/md/dm-core.h
>> +++ b/drivers/md/dm-core.h
>> @@ -217,6 +217,7 @@ struct dm_table {
>>   	/* a list of devices used by this table */
>>   	struct list_head devices;
>>   	struct rw_semaphore devices_lock;
>> +	unsigned short num_devices;
>>   
>>   	/* events get handed up using this callback */
>>   	void (*event_fn)(void *data);
>> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
>> index 41f1d731ae5a..ddc60e498afb 100644
>> --- a/drivers/md/dm-table.c
>> +++ b/drivers/md/dm-table.c
>> @@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>>   
>>   int dm_table_resume_targets(struct dm_table *t)
>>   {
>> +	struct list_head *devices = dm_table_get_devices(t);
>> +	struct dm_dev_internal *dd;
>>   	unsigned int i;
>>   	int r = 0;
>>   
>> @@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
>>   			ti->type->resume(ti);
>>   	}
>>   
>> +	t->num_devices = 0;
>> +
>> +	list_for_each_entry(dd, devices, list)
>> +		dd->dm_dev->index = ++(t->num_devices);
>> +
>>   	return 0;
>>   }
>>   
>> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
>> index 56aa2a8b9d71..7297235291f6 100644
>> --- a/drivers/md/dm.c
>> +++ b/drivers/md/dm.c
>> @@ -48,6 +48,8 @@
>>    */
>>   #define REQ_DM_POLL_LIST	REQ_DRV
>>   
>> +#define DM_MAX_TABLE_DEVICES	1024
>> +
>>   static const char *_name = DM_NAME;
>>   
>>   static unsigned int major;
>> @@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
>>   	return ret;
>>   }
>>   
>> +static inline bool has_redundant_flush(struct dm_table *t,
>> +		unsigned long **bitmap)
>> +{
>> +	if (t->num_devices < t->num_targets) {
>> +		/* Add a limit here to prevent excessive memory usage for bitmaps */
>> +		if (t->num_devices >= DM_MAX_TABLE_DEVICES)
>> +			return false;
>> +
>> +		/* dm_dev's index starts from 1, so need plus 1 here */
>> +		*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
>> +		if (*bitmap)
>> +			return true;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>> +static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
>> +				     sector_t start, sector_t len, void *data)
>> +{
>> +	unsigned short *index = data;
>> +	*index = dev->index;
>> +	return 0;
>> +}
>> +
>>   static void __send_empty_flush(struct clone_info *ci)
>>   {
>>   	struct dm_table *t = ci->map;
>>   	struct bio flush_bio;
>> +	unsigned long *handled_map;
>> +	unsigned int nr_handled = 0;
>> +	bool check = has_redundant_flush(t, &handled_map);
>>   
>>   	/*
>>   	 * Use an on-stack bio for this, it's safe since we don't
>> @@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
>>   
>>   	for (unsigned int i = 0; i < t->num_targets; i++) {
>>   		unsigned int bios;
>> +		unsigned short index = 0;
>>   		struct dm_target *ti = dm_table_get_target(t, i);
>>   
>>   		if (unlikely(ti->num_flush_bios == 0))
>>   			continue;
>>   
>> +		/*
>> +		 * If the num_targets is greater than the number of block devices
>> +		 * in the dm_table's devices list, __send_empty_flush() might
>> +		 * invoke __send_duplicate_bios() multiple times for the same
>> +		 * block device. This could lead to a substantial decrease in
>> +		 * performance when num_targets significantly exceeds the number
>> +		 * of block devices.
>> +		 * Ensure that __send_duplicate_bios() is only called once for
>> +		 * each block device.
>> +		 */
>> +		if (check) {
>> +			if (nr_handled == t->num_devices)
>> +				break;
>> +
>> +			if (ti->type->iterate_devices)
>> +				ti->type->iterate_devices(ti, dm_get_dev_index, &index);
>> +
>> +			if (index > 0) {
>> +				if (__test_and_set_bit(index, handled_map))
>> +					continue;
>> +				else
>> +					nr_handled++;
>> +			}
>> +		}
>> +
>>   		atomic_add(ti->num_flush_bios, &ci->io->io_count);
>>   		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
>>   					     NULL, GFP_NOWAIT);
>>   		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
>>   	}
>>   
>> +	if (check)
>> +		bitmap_free(handled_map);
>> +
>>   	/*
>>   	 * alloc_io() takes one extra reference for submission, so the
>>   	 * reference won't reach 0 without the following subtraction
>> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
>> index 82b2195efaca..4a54b4f0a609 100644
>> --- a/include/linux/device-mapper.h
>> +++ b/include/linux/device-mapper.h
>> @@ -169,6 +169,12 @@ struct dm_dev {
>>   	struct dax_device *dax_dev;
>>   	blk_mode_t mode;
>>   	char name[16];
>> +
>> +	/*
>> +	 * sequential number for each dm_dev in dm_table's devices list,
>> +	 * start from 1
>> +	 */
>> +	unsigned short index;
>>   };
>>   
>>   /*
>> -- 
>> 2.34.1
>>
> 
> 


