linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] dm: delay: use kthread instead of timers and wq
@ 2023-10-19 11:21 Christian Loehle
  2023-10-19 19:51 ` Mike Snitzer
  0 siblings, 1 reply; 3+ messages in thread
From: Christian Loehle @ 2023-10-19 11:21 UTC (permalink / raw)
  To: agk, snitzer, dm-devel, linux-kernel

The current design of timers and wq to realize the delays
is insufficient especially for delays below ~50ms.
The design is replaced with a kthread to flush the expired delays,
trading some CPU time (in some cases) for better delay accuracy and
delays closer to what the user requested for smaller delays.

Since bios can't be completed in interrupt context, using a kthread
is probably the most reasonable way to approach this.

Testing with
echo "0 2097152 zero" | dmsetup create dm-zeros
for i in $(seq 0 20);
do
  echo "0 2097152 delay /dev/mapper/dm-zeros 0 $i" | dmsetup create dm-delay-${i}ms;
done

Some performance numbers for comparison
beaglebone black (single core) CONFIG_HZ_1000=y:
fio --name=1msread --rw=randread --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-1ms
Theoretical maximum: 1000 IOPS
Previous: 250 IOPS
Kthread: 500 IOPS
fio --name=10msread --rw=randread --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms
Theoretical maximum: 100 IOPS
Previous: 45 IOPS
Kthread: 50 IOPS
fio --name=1mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-1ms
Theoretical maximum: 1000 IOPS
Previous: 498 IOPS
Kthread: 1000 IOPS
fio --name=10mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms
Theoretical maximum: 100 IOPS
Previous: 90 IOPS
Kthread: 100 IOPS
fio --name=10mswriteasync --rw=randwrite --direct=1 --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms --numjobs=32 --iodepth=64 --ioengine=libaio --group_reporting
Previous: 13.3k IOPS
Kthread: 13.3k IOPS
(This one is just to prove the new design isn't impacting throughput,
not really about delays)

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 drivers/md/dm-delay.c | 83 ++++++++++++-------------------------------
 1 file changed, 22 insertions(+), 61 deletions(-)

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 7433525e5985..a37f28294f7e 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -13,6 +13,7 @@
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>
 
 #include <linux/device-mapper.h>
 
@@ -26,11 +27,9 @@ struct delay_class {
 };
 
 struct delay_c {
-	struct timer_list delay_timer;
-	struct mutex timer_lock;
-	struct workqueue_struct *kdelayd_wq;
 	struct work_struct flush_expired_bios;
 	struct list_head delayed_bios;
+	struct task_struct *worker;
 	atomic_t may_delay;
 
 	struct delay_class read;
@@ -48,42 +47,25 @@ struct dm_delay_info {
 };
 
 static DEFINE_MUTEX(delayed_bios_lock);
+static void flush_delayed_bios(struct delay_c *dc, int flush_all);
 
-static void handle_delayed_timer(struct timer_list *t)
+static int flush_worker_fn(void *data)
 {
-	struct delay_c *dc = from_timer(dc, t, delay_timer);
+	struct delay_c *dc = (struct delay_c *)data;
 
-	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
-}
-
-static void queue_timeout(struct delay_c *dc, unsigned long expires)
-{
-	mutex_lock(&dc->timer_lock);
-
-	if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
-		mod_timer(&dc->delay_timer, expires);
-
-	mutex_unlock(&dc->timer_lock);
-}
-
-static void flush_bios(struct bio *bio)
-{
-	struct bio *n;
-
-	while (bio) {
-		n = bio->bi_next;
-		bio->bi_next = NULL;
-		dm_submit_bio_remap(bio, NULL);
-		bio = n;
+	while (1) {
+		flush_delayed_bios(dc, 0);
+		if (unlikely(list_empty(&dc->delayed_bios)))
+			set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
 	}
+
+	return 0;
 }
 
-static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
+static void flush_delayed_bios(struct delay_c *dc, int flush_all)
 {
 	struct dm_delay_info *delayed, *next;
-	unsigned long next_expires = 0;
-	unsigned long start_timer = 0;
-	struct bio_list flush_bios = { };
 
 	mutex_lock(&delayed_bios_lock);
 	list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
@@ -91,23 +73,12 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
 			struct bio *bio = dm_bio_from_per_bio_data(delayed,
 						sizeof(struct dm_delay_info));
 			list_del(&delayed->list);
-			bio_list_add(&flush_bios, bio);
+			dm_submit_bio_remap(bio, NULL);
 			delayed->class->ops--;
-			continue;
 		}
-
-		if (!start_timer) {
-			start_timer = 1;
-			next_expires = delayed->expires;
-		} else
-			next_expires = min(next_expires, delayed->expires);
 	}
 	mutex_unlock(&delayed_bios_lock);
 
-	if (start_timer)
-		queue_timeout(dc, next_expires);
-
-	return bio_list_get(&flush_bios);
 }
 
 static void flush_expired_bios(struct work_struct *work)
@@ -115,24 +86,21 @@ static void flush_expired_bios(struct work_struct *work)
 	struct delay_c *dc;
 
 	dc = container_of(work, struct delay_c, flush_expired_bios);
-	flush_bios(flush_delayed_bios(dc, 0));
+	flush_delayed_bios(dc, 0);
 }
 
 static void delay_dtr(struct dm_target *ti)
 {
 	struct delay_c *dc = ti->private;
 
-	if (dc->kdelayd_wq)
-		destroy_workqueue(dc->kdelayd_wq);
-
 	if (dc->read.dev)
 		dm_put_device(ti, dc->read.dev);
 	if (dc->write.dev)
 		dm_put_device(ti, dc->write.dev);
 	if (dc->flush.dev)
 		dm_put_device(ti, dc->flush.dev);
-
-	mutex_destroy(&dc->timer_lock);
+	if (dc->worker)
+		kthread_stop(dc->worker);
 
 	kfree(dc);
 }
@@ -188,10 +156,11 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->private = dc;
-	timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
 	INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
 	INIT_LIST_HEAD(&dc->delayed_bios);
-	mutex_init(&dc->timer_lock);
+	dc->worker = kthread_create(&flush_worker_fn, dc, "dm-delay-flush-worker");
+	if (!dc->worker)
+		goto bad;
 	atomic_set(&dc->may_delay, 1);
 	dc->argc = argc;
 
@@ -224,13 +193,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 
 out:
-	dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
-	if (!dc->kdelayd_wq) {
-		ret = -EINVAL;
-		DMERR("Couldn't start kdelayd");
-		goto bad;
-	}
-
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->accounts_remapped_io = true;
@@ -260,7 +222,7 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
 	list_add_tail(&delayed->list, &dc->delayed_bios);
 	mutex_unlock(&delayed_bios_lock);
 
-	queue_timeout(dc, expires);
+	wake_up_process(dc->worker);
 
 	return DM_MAPIO_SUBMITTED;
 }
@@ -270,8 +232,7 @@ static void delay_presuspend(struct dm_target *ti)
 	struct delay_c *dc = ti->private;
 
 	atomic_set(&dc->may_delay, 0);
-	del_timer_sync(&dc->delay_timer);
-	flush_bios(flush_delayed_bios(dc, 1));
+	flush_delayed_bios(dc, 1);
 }
 
 static void delay_resume(struct dm_target *ti)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: dm: delay: use kthread instead of timers and wq
  2023-10-19 11:21 [PATCH] dm: delay: use kthread instead of timers and wq Christian Loehle
@ 2023-10-19 19:51 ` Mike Snitzer
  0 siblings, 0 replies; 3+ messages in thread
From: Mike Snitzer @ 2023-10-19 19:51 UTC (permalink / raw)
  To: Christian Loehle; +Cc: agk, dm-devel, linux-kernel

On Thu, Oct 19 2023 at  7:21P -0400,
Christian Loehle <christian.loehle@arm.com> wrote:

> The current design of timers and wq to realize the delays
> is insufficient especially for delays below ~50ms.
> The design is replaced with a kthread to flush the expired delays,
> trading some CPU time (in some cases) for better delay accuracy and
> delays closer to what the user requested for smaller delays.
> 
> Since bios can't be completed in interrupt context, using a kthread
> is probably the most reasonable way to approach this.

Thanks for working on this.  The tradeoff to burn cpu to poll for bios
that hit their expire time is reasonable for very short delays (which
is the focus of your improvement) _but_ for longer delays it is
definitely wasteful.

Your benchmarks (below) don't quantify the cpu time difference.

Might you use a hybrid approach where, over a certain delay threshold,
it does make sense to avoid waking the polling thread?  Otherwise, if
the delay is below the threshold, the busy loop is queued immediately?

Basically: still use workqueue, but bypass jitter associated with
using timer when appropriate (delay is short).

There are some inline comments below on your patch.

Thanks,
Mike

> Testing with
> echo "0 2097152 zero" | dmsetup create dm-zeros
> for i in $(seq 0 20);
> do
>   echo "0 2097152 delay /dev/mapper/dm-zeros 0 $i" | dmsetup create dm-delay-${i}ms;
> done
> 
> Some performance numbers for comparison
> beaglebone black (single core) CONFIG_HZ_1000=y:
> fio --name=1msread --rw=randread --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-1ms
> Theoretical maximum: 1000 IOPS
> Previous: 250 IOPS
> Kthread: 500 IOPS
> fio --name=10msread --rw=randread --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms
> Theoretical maximum: 100 IOPS
> Previous: 45 IOPS
> Kthread: 50 IOPS
> fio --name=1mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-1ms
> Theoretical maximum: 1000 IOPS
> Previous: 498 IOPS
> Kthread: 1000 IOPS
> fio --name=10mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms
> Theoretical maximum: 100 IOPS
> Previous: 90 IOPS
> Kthread: 100 IOPS
> fio --name=10mswriteasync --rw=randwrite --direct=1 --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms --numjobs=32 --iodepth=64 --ioengine=libaio --group_reporting
> Previous: 13.3k IOPS
> Kthread: 13.3k IOPS
> (This one is just to prove the new design isn't impacting throughput,
> not really about delays)
> 
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
> ---
>  drivers/md/dm-delay.c | 83 ++++++++++++-------------------------------
>  1 file changed, 22 insertions(+), 61 deletions(-)
> 
> diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
> index 7433525e5985..a37f28294f7e 100644
> --- a/drivers/md/dm-delay.c
> +++ b/drivers/md/dm-delay.c
> @@ -13,6 +13,7 @@
>  #include <linux/blkdev.h>
>  #include <linux/bio.h>
>  #include <linux/slab.h>
> +#include <linux/kthread.h>
>  
>  #include <linux/device-mapper.h>
>  
> @@ -26,11 +27,9 @@ struct delay_class {
>  };
>  
>  struct delay_c {
> -	struct timer_list delay_timer;
> -	struct mutex timer_lock;
> -	struct workqueue_struct *kdelayd_wq;
>  	struct work_struct flush_expired_bios;
>  	struct list_head delayed_bios;
> +	struct task_struct *worker;
>  	atomic_t may_delay;
>  
>  	struct delay_class read;
> @@ -48,42 +47,25 @@ struct dm_delay_info {
>  };
>  
>  static DEFINE_MUTEX(delayed_bios_lock);
> +static void flush_delayed_bios(struct delay_c *dc, int flush_all);
>  
> -static void handle_delayed_timer(struct timer_list *t)
> +static int flush_worker_fn(void *data)
>  {
> -	struct delay_c *dc = from_timer(dc, t, delay_timer);
> +	struct delay_c *dc = (struct delay_c *)data;

Cast shouldn't be needed.

> -	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
> -}
> -
> -static void queue_timeout(struct delay_c *dc, unsigned long expires)
> -{
> -	mutex_lock(&dc->timer_lock);
> -
> -	if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
> -		mod_timer(&dc->delay_timer, expires);
> -
> -	mutex_unlock(&dc->timer_lock);
> -}
> -
> -static void flush_bios(struct bio *bio)
> -{
> -	struct bio *n;
> -
> -	while (bio) {
> -		n = bio->bi_next;
> -		bio->bi_next = NULL;
> -		dm_submit_bio_remap(bio, NULL);
> -		bio = n;
> +	while (1) {
> +		flush_delayed_bios(dc, 0);
> +		if (unlikely(list_empty(&dc->delayed_bios)))
> +			set_current_state(TASK_INTERRUPTIBLE);
> +		schedule();

cond_resched() likely better?

>  	}
> +
> +	return 0;
>  }
>  
> -static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
> +static void flush_delayed_bios(struct delay_c *dc, int flush_all)

Feel free to switch flush_all to a bool.

>  {
>  	struct dm_delay_info *delayed, *next;
> -	unsigned long next_expires = 0;
> -	unsigned long start_timer = 0;
> -	struct bio_list flush_bios = { };
>  
>  	mutex_lock(&delayed_bios_lock);
>  	list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
> @@ -91,23 +73,12 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
>  			struct bio *bio = dm_bio_from_per_bio_data(delayed,
>  						sizeof(struct dm_delay_info));
>  			list_del(&delayed->list);
> -			bio_list_add(&flush_bios, bio);
> +			dm_submit_bio_remap(bio, NULL);
>  			delayed->class->ops--;
> -			continue;
>  		}
> -
> -		if (!start_timer) {
> -			start_timer = 1;
> -			next_expires = delayed->expires;
> -		} else
> -			next_expires = min(next_expires, delayed->expires);
>  	}
>  	mutex_unlock(&delayed_bios_lock);
>  
> -	if (start_timer)
> -		queue_timeout(dc, next_expires);
> -
> -	return bio_list_get(&flush_bios);
>  }
>  
>  static void flush_expired_bios(struct work_struct *work)
> @@ -115,24 +86,21 @@ static void flush_expired_bios(struct work_struct *work)
>  	struct delay_c *dc;
>  
>  	dc = container_of(work, struct delay_c, flush_expired_bios);
> -	flush_bios(flush_delayed_bios(dc, 0));
> +	flush_delayed_bios(dc, 0);
>  }
>  
>  static void delay_dtr(struct dm_target *ti)
>  {
>  	struct delay_c *dc = ti->private;
>  
> -	if (dc->kdelayd_wq)
> -		destroy_workqueue(dc->kdelayd_wq);
> -
>  	if (dc->read.dev)
>  		dm_put_device(ti, dc->read.dev);
>  	if (dc->write.dev)
>  		dm_put_device(ti, dc->write.dev);
>  	if (dc->flush.dev)
>  		dm_put_device(ti, dc->flush.dev);
> -
> -	mutex_destroy(&dc->timer_lock);
> +	if (dc->worker)
> +		kthread_stop(dc->worker);
>  
>  	kfree(dc);
>  }
> @@ -188,10 +156,11 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  	}
>  
>  	ti->private = dc;
> -	timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
>  	INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
>  	INIT_LIST_HEAD(&dc->delayed_bios);
> -	mutex_init(&dc->timer_lock);
> +	dc->worker = kthread_create(&flush_worker_fn, dc, "dm-delay-flush-worker");
> +	if (!dc->worker)
> +		goto bad;
>  	atomic_set(&dc->may_delay, 1);
>  	dc->argc = argc;
>  
> @@ -224,13 +193,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  		goto bad;
>  
>  out:
> -	dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
> -	if (!dc->kdelayd_wq) {
> -		ret = -EINVAL;
> -		DMERR("Couldn't start kdelayd");
> -		goto bad;
> -	}
> -
>  	ti->num_flush_bios = 1;
>  	ti->num_discard_bios = 1;
>  	ti->accounts_remapped_io = true;
> @@ -260,7 +222,7 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
>  	list_add_tail(&delayed->list, &dc->delayed_bios);
>  	mutex_unlock(&delayed_bios_lock);
>  
> -	queue_timeout(dc, expires);
> +	wake_up_process(dc->worker);
>  
>  	return DM_MAPIO_SUBMITTED;
>  }
> @@ -270,8 +232,7 @@ static void delay_presuspend(struct dm_target *ti)
>  	struct delay_c *dc = ti->private;
>  
>  	atomic_set(&dc->may_delay, 0);
> -	del_timer_sync(&dc->delay_timer);
> -	flush_bios(flush_delayed_bios(dc, 1));
> +	flush_delayed_bios(dc, 1);
>  }
>  
>  static void delay_resume(struct dm_target *ti)
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: dm: delay: use kthread instead of timers and wq
  2023-10-20 11:46 [PATCHv2] " Christian Loehle
@ 2023-10-27 19:17 ` Mike Snitzer
  0 siblings, 0 replies; 3+ messages in thread
From: Mike Snitzer @ 2023-10-27 19:17 UTC (permalink / raw)
  To: Christian Loehle; +Cc: dm-devel, agk, linux-kernel

On Fri, Oct 20 2023 at  7:46P -0400,
Christian Loehle <christian.loehle@arm.com> wrote:

> The current design of timers and wq to realize the delays
> is insufficient especially for delays below ~50ms.
> The design is enhanced with a kthread to flush the expired delays,
> trading some CPU time (in some cases) for better delay accuracy and
> delays closer to what the user requested for smaller delays.
> The new design is chosen as long as all the delays are below 50ms.
> 
> Since bios can't be completed in interrupt context using a kthread
> is probably the most reasonable way to approach this.
> 
> Testing with
> echo "0 2097152 zero" | dmsetup create dm-zeros
> for i in $(seq 0 20);
> do
>   echo "0 2097152 delay /dev/mapper/dm-zeros 0 $i" | dmsetup create dm-delay-${i}ms;
> done
> 
> Some performance numbers for comparison
> beaglebone black (single core) CONFIG_HZ_1000=y:
> fio --name=1msread --rw=randread --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-1ms
> Theoretical maximum: 1000 IOPS
> Previous: 250 IOPS
> Kthread: 500 IOPS
> fio --name=10msread --rw=randread --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms
> Theoretical maximum: 100 IOPS
> Previous: 45 IOPS
> Kthread: 50 IOPS
> fio --name=1mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-1ms
> Theoretical maximum: 1000 IOPS
> Previous: 498 IOPS
> Kthread: 1000 IOPS
> fio --name=10mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms
> Theoretical maximum: 100 IOPS
> Previous: 90 IOPS
> Kthread: 100 IOPS
> fio --name=10mswriteasync --rw=randwrite --direct=1 --bs=4k --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms --numjobs=32 --iodepth=64 --ioengine=libaio --group_reporting
> Previous: 13.3k IOPS
> Kthread: 13.3k IOPS
> (This one is just to prove the new design isn't impacting throughput,
> not really about delays)
> 
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
> ---
> v2:
> 	- Keep the timer wq and delay design for longer delays
> 	- Address the rest of Mike's minor comments


Hi,

I've picked this up.  But I fixed various issues along the way.

Please see the staged commit here:
https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-6.7&id=c1fce71d29b2a48fd6788f9555561fda0f0c1863

Issues ranged from whitespace, to removing needless forward
declaration, to removing stray changes (restoring 'continue;' in
flush_delayed_bios), actually using a bool for bool params, fixed
missing mutex_init, and use max() rather than opencode comparisons in
the ctr.  I might be forgetting some other code change. ;)

I also tweaked the commit header for whitespace (line wrapping, etc).

Thanks,
Mike

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-10-27 19:18 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-19 11:21 [PATCH] dm: delay: use kthread instead of timers and wq Christian Loehle
2023-10-19 19:51 ` Mike Snitzer
2023-10-20 11:46 [PATCHv2] " Christian Loehle
2023-10-27 19:17 ` Mike Snitzer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).