linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios
@ 2021-07-16  6:22 brookxu
  2021-07-16 16:09 ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: brookxu @ 2021-07-16  6:22 UTC (permalink / raw)
  To: axboe, tj; +Cc: cgroups, linux-block, linux-kernel

From: Chunguang Xu <brookxu@tencent.com>

After patch 54efd50 (block: make generic_make_request handle
arbitrarily sized bios), the IO through io-throttle may be larger,
and these IOs may be further split into more small IOs. However,
IOPS throttle does not seem to be aware of this change, which
makes the calculation of IOPS of large IOs incomplete, resulting
in disk-side IOPS that does not meet expectations. Maybe we should
fix this problem.

We can reproduce it by setting max_sectors_kb of the disk to 128, setting
blkio.throttle.write_iops_device to 100, running a dd instance inside the
blkio cgroup and using iostat to watch IOPS:

dd if=/dev/zero of=/dev/sdb bs=1M count=1000 oflag=direct

As a result, without this change the average IOPS is 1995, with
this change the IOPS is 98.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
---
 block/blk-merge.c    |  2 ++
 block/blk-throttle.c | 34 ++++++++++++++++++++++++++++++++++
 block/blk.h          |  2 ++
 3 files changed, 38 insertions(+)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index a11b3b5..86ff943 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -348,6 +348,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
 		trace_block_split(split, (*bio)->bi_iter.bi_sector);
 		submit_bio_noacct(*bio);
 		*bio = split;
+
+		blk_throtl_recharge_bio(*bio);
 	}
 }
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b1b22d8..1967438 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2176,6 +2176,40 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 }
 #endif
 
+void blk_throtl_recharge_bio(struct bio *bio)
+{
+	bool rw = bio_data_dir(bio);
+	struct blkcg_gq *blkg = bio->bi_blkg;
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+	u32 iops_limit = tg_iops_limit(tg, rw);
+
+	if (iops_limit == UINT_MAX)
+		return;
+
+	/*
+	 * If previous slice expired, start a new one otherwise renew/extend
+	 * existing slice to make sure it is at least throtl_slice interval
+	 * long since now. New slice is started only for empty throttle group.
+	 * If there is queued bio, that means there should be an active
+	 * slice and it should be extended instead.
+	 */
+	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
+		throtl_start_new_slice(tg, rw);
+	else {
+		if (time_before(tg->slice_end[rw],
+		    jiffies + tg->td->throtl_slice))
+			throtl_extend_slice(tg, rw,
+				jiffies + tg->td->throtl_slice);
+	}
+
+	/* Recharge the bio to the group, as some BIOs will be further split
+	 * after passing through the throttle, causing the actual IOPS to
+	 * be greater than the expected value.
+	 */
+	tg->last_io_disp[rw]++;
+	tg->io_disp[rw]++;
+}
+
 bool blk_throtl_bio(struct bio *bio)
 {
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
diff --git a/block/blk.h b/block/blk.h
index 4b885c0..067634a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -293,12 +293,14 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
 extern void blk_throtl_register_queue(struct request_queue *q);
+extern void blk_throtl_recharge_bio(struct bio *bio);
 bool blk_throtl_bio(struct bio *bio);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
 static inline void blk_throtl_register_queue(struct request_queue *q) { }
 static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+static inline void blk_throtl_recharge_bio(struct bio *bio) { }
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios
  2021-07-16  6:22 [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios brookxu
@ 2021-07-16 16:09 ` Tejun Heo
  2021-07-16 23:07   ` brookxu
  2021-07-19 16:35   ` brookxu
  0 siblings, 2 replies; 9+ messages in thread
From: Tejun Heo @ 2021-07-16 16:09 UTC (permalink / raw)
  To: brookxu; +Cc: axboe, cgroups, linux-block, linux-kernel

Hello,

On Fri, Jul 16, 2021 at 02:22:49PM +0800, brookxu wrote:
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index a11b3b5..86ff943 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -348,6 +348,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
>  		trace_block_split(split, (*bio)->bi_iter.bi_sector);
>  		submit_bio_noacct(*bio);
>  		*bio = split;
> +
> +		blk_throtl_recharge_bio(*bio);

I don't think we're holding the queue lock here.

>  	}
>  }
>  
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index b1b22d8..1967438 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -2176,6 +2176,40 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
>  }
>  #endif
>  
> +void blk_throtl_recharge_bio(struct bio *bio)
> +{
> +	bool rw = bio_data_dir(bio);
> +	struct blkcg_gq *blkg = bio->bi_blkg;
> +	struct throtl_grp *tg = blkg_to_tg(blkg);
> +	u32 iops_limit = tg_iops_limit(tg, rw);
> +
> +	if (iops_limit == UINT_MAX)
> +		return;
> +
> +	/*
> +	 * If previous slice expired, start a new one otherwise renew/extend
> +	 * existing slice to make sure it is at least throtl_slice interval
> +	 * long since now. New slice is started only for empty throttle group.
> +	 * If there is queued bio, that means there should be an active
> +	 * slice and it should be extended instead.
> +	 */
> +	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
> +		throtl_start_new_slice(tg, rw);
> +	else {
> +		if (time_before(tg->slice_end[rw],
> +		    jiffies + tg->td->throtl_slice))
> +			throtl_extend_slice(tg, rw,
> +				jiffies + tg->td->throtl_slice);
> +	}
> +
> +	/* Recharge the bio to the group, as some BIOs will be further split
> +	 * after passing through the throttle, causing the actual IOPS to
> +	 * be greater than the expected value.
> +	 */
> +	tg->last_io_disp[rw]++;
> +	tg->io_disp[rw]++;
> +}

But blk-throtl expects queue lock to be held.

How about doing something simpler? Just estimate how many bios a given bio
is gonna be and charge it outright? The calculation will be duplicated
between the split path but that seems like the path of least resistance
here.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios
  2021-07-16 16:09 ` Tejun Heo
@ 2021-07-16 23:07   ` brookxu
  2021-07-19 16:35   ` brookxu
  1 sibling, 0 replies; 9+ messages in thread
From: brookxu @ 2021-07-16 23:07 UTC (permalink / raw)
  To: Tejun Heo; +Cc: axboe, cgroups, linux-block, linux-kernel



Tejun Heo wrote on 2021/7/17 0:09:
> Hello,
> 
> On Fri, Jul 16, 2021 at 02:22:49PM +0800, brookxu wrote:
>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>> index a11b3b5..86ff943 100644
>> --- a/block/blk-merge.c
>> +++ b/block/blk-merge.c
>> @@ -348,6 +348,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
>>  		trace_block_split(split, (*bio)->bi_iter.bi_sector);
>>  		submit_bio_noacct(*bio);
>>  		*bio = split;
>> +
>> +		blk_throtl_recharge_bio(*bio);
> 
> I don't think we're holding the queue lock here.

Sorry, some kind of synchronization mechanism is really needed here. But the use of queue_lock
here may be unsafe, since it is difficult for us to control the lock on the split path.

>>  	}
>>  }
>>  
>> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
>> index b1b22d8..1967438 100644
>> --- a/block/blk-throttle.c
>> +++ b/block/blk-throttle.c
>> @@ -2176,6 +2176,40 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
>>  }
>>  #endif
>>  
>> +void blk_throtl_recharge_bio(struct bio *bio)
>> +{
>> +	bool rw = bio_data_dir(bio);
>> +	struct blkcg_gq *blkg = bio->bi_blkg;
>> +	struct throtl_grp *tg = blkg_to_tg(blkg);
>> +	u32 iops_limit = tg_iops_limit(tg, rw);
>> +
>> +	if (iops_limit == UINT_MAX)
>> +		return;
>> +
>> +	/*
>> +	 * If previous slice expired, start a new one otherwise renew/extend
>> +	 * existing slice to make sure it is at least throtl_slice interval
>> +	 * long since now. New slice is started only for empty throttle group.
>> +	 * If there is queued bio, that means there should be an active
>> +	 * slice and it should be extended instead.
>> +	 */
>> +	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
>> +		throtl_start_new_slice(tg, rw);
>> +	else {
>> +		if (time_before(tg->slice_end[rw],
>> +		    jiffies + tg->td->throtl_slice))
>> +			throtl_extend_slice(tg, rw,
>> +				jiffies + tg->td->throtl_slice);
>> +	}
>> +
>> +	/* Recharge the bio to the group, as some BIOs will be further split
>> +	 * after passing through the throttle, causing the actual IOPS to
>> +	 * be greater than the expected value.
>> +	 */
>> +	tg->last_io_disp[rw]++;
>> +	tg->io_disp[rw]++;
>> +}
> 
> But blk-throtl expects queue lock to be held.
> 
> How about doing something simpler? Just estimate how many bios a given bio
> is gonna be and charge it outright? The calculation will be duplicated
> between the split path but that seems like the path of least resistance
> here.

I have tried this method; the code redundancy is indeed a bit high, and it may not be
very convenient for code maintenance. In addition to this problem, since we add
a large value at a time, the fluctuation of IOPS will be relatively large. Since
blk_throtl_recharge_bio() does not need to participate in the maintenance of the
state machine, we only need to protect some fields of tg, so can we add a new
spin_lock to tg instead of queue_lock to solve the synchronization problem? Just
an idea. Thanks.

> Thanks.
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios
  2021-07-16 16:09 ` Tejun Heo
  2021-07-16 23:07   ` brookxu
@ 2021-07-19 16:35   ` brookxu
  2021-07-26 21:46     ` Tejun Heo
  1 sibling, 1 reply; 9+ messages in thread
From: brookxu @ 2021-07-19 16:35 UTC (permalink / raw)
  To: Tejun Heo; +Cc: axboe, cgroups, linux-block, linux-kernel

Hi Tejun:

In order to avoid code duplication and IOPS stability problems caused by estimating
the equivalent number of IOs, and to avoid potential deadlock problems caused by
synchronization through queue_lock, I tried to count the number of split IOs in
the current window through two atomic counters. Adding the value of the atomic variable
when calculating io_disp[rw] can also avoid the problem of inaccurate IOPS in
large IO scenarios. What do you think of this approach? Thanks for your time.

The following are related changes. If that is ok, I will send the v2 version later.

---
 block/blk-merge.c    |  2 ++
 block/blk-throttle.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++-----
 block/blk.h          |  2 ++
 3 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index a11b3b5..86ff943 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -348,6 +348,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
 		trace_block_split(split, (*bio)->bi_iter.bi_sector);
 		submit_bio_noacct(*bio);
 		*bio = split;
+
+		blk_throtl_recharge_bio(*bio);
 	}
 }
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b1b22d8..a0daa15 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -178,6 +178,9 @@ struct throtl_grp {
 	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
 	unsigned long bio_cnt_reset_time;
 
+	atomic_t io_split_cnt[2];
+	atomic_t last_io_split_cnt[2];
+
 	struct blkg_rwstat stat_bytes;
 	struct blkg_rwstat stat_ios;
 };
@@ -293,6 +296,16 @@ static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
 	return low + (low >> 1) * td->scale;
 }
 
+static inline unsigned int tg_io_disp(struct throtl_grp *tg, int rw)
+{
+	return tg->io_disp[rw] + atomic_read(&tg->io_split_cnt[rw]);
+}
+
+static inline unsigned int tg_last_io_disp(struct throtl_grp *tg, int rw)
+{
+	return tg->last_io_disp[rw] + atomic_read(&tg->last_io_split_cnt[rw]);
+}
+
 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
 {
 	struct blkcg_gq *blkg = tg_to_blkg(tg);
@@ -524,6 +537,11 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
 	tg->idletime_threshold = DFL_IDLE_THRESHOLD;
 	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
 
+	atomic_set(&tg->io_split_cnt[0], 0);
+	atomic_set(&tg->io_split_cnt[1], 0);
+	atomic_set(&tg->last_io_split_cnt[0], 0);
+	atomic_set(&tg->last_io_split_cnt[1], 0);
+
 	return &tg->pd;
 
 err_exit_stat_bytes:
@@ -777,6 +795,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
 	tg->bytes_disp[rw] = 0;
 	tg->io_disp[rw] = 0;
 
+	atomic_set(&tg->io_split_cnt[rw], 0);
+
 	/*
 	 * Previous slice has expired. We must have trimmed it after last
 	 * bio dispatch. That means since start of last slice, we never used
@@ -799,6 +819,9 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 	tg->io_disp[rw] = 0;
 	tg->slice_start[rw] = jiffies;
 	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
+
+	atomic_set(&tg->io_split_cnt[rw], 0);
+
 	throtl_log(&tg->service_queue,
 		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -877,10 +900,19 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 	else
 		tg->bytes_disp[rw] = 0;
 
-	if (tg->io_disp[rw] >= io_trim)
+	if (tg_io_disp(tg, rw) >= io_trim) {
+		int cnt = atomic_read(&tg->io_split_cnt[rw]);
+
+		if (cnt) {
+			atomic_set(&tg->io_split_cnt[rw], 0);
+			tg->io_disp[rw] += cnt;
+		}
+
 		tg->io_disp[rw] -= io_trim;
-	else
+	} else {
+		atomic_set(&tg->io_split_cnt[rw], 0);
 		tg->io_disp[rw] = 0;
+	}
 
 	tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
 
@@ -924,7 +956,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 	else
 		io_allowed = tmp;
 
-	if (tg->io_disp[rw] + 1 <= io_allowed) {
+	if (tg_io_disp(tg, rw) + 1 <= io_allowed) {
 		if (wait)
 			*wait = 0;
 		return true;
@@ -2052,13 +2084,13 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
 	}
 
 	if (tg->iops[READ][LIMIT_LOW]) {
-		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
+		iops = tg_last_io_disp(tg, READ) * HZ / elapsed_time;
 		if (iops >= tg->iops[READ][LIMIT_LOW])
 			tg->last_low_overflow_time[READ] = now;
 	}
 
 	if (tg->iops[WRITE][LIMIT_LOW]) {
-		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+		iops = tg_last_io_disp(tg, WRITE) * HZ / elapsed_time;
 		if (iops >= tg->iops[WRITE][LIMIT_LOW])
 			tg->last_low_overflow_time[WRITE] = now;
 	}
@@ -2074,6 +2106,9 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
 	tg->last_bytes_disp[WRITE] = 0;
 	tg->last_io_disp[READ] = 0;
 	tg->last_io_disp[WRITE] = 0;
+
+	atomic_set(&tg->last_io_split_cnt[READ], 0);
+	atomic_set(&tg->last_io_split_cnt[WRITE], 0);
 }
 
 static void blk_throtl_update_idletime(struct throtl_grp *tg)
@@ -2176,6 +2211,25 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 }
 #endif
 
+void blk_throtl_recharge_bio(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+	struct throtl_grp *parent = blkg_to_tg(blkg);
+	struct throtl_service_queue *parent_sq;
+	bool rw = bio_data_dir(bio);
+
+	if (!parent->has_rules[rw])
+		return;
+
+	do {
+		atomic_inc(&parent->io_split_cnt[rw]);
+		atomic_inc(&parent->last_io_split_cnt[rw]);
+
+		parent_sq = parent->service_queue.parent_sq;
+		parent = sq_to_tg(parent_sq);
+	} while (parent);
+}
+
 bool blk_throtl_bio(struct bio *bio)
 {
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
@@ -2263,7 +2317,7 @@ bool blk_throtl_bio(struct bio *bio)
 		   rw == READ ? 'R' : 'W',
 		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
 		   tg_bps_limit(tg, rw),
-		   tg->io_disp[rw], tg_iops_limit(tg, rw),
+		   tg_io_disp(tg, rw), tg_iops_limit(tg, rw),
 		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
 	tg->last_low_overflow_time[rw] = jiffies;
diff --git a/block/blk.h b/block/blk.h
index 4b885c0..9e925bf 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -293,11 +293,13 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
 extern void blk_throtl_register_queue(struct request_queue *q);
+extern void blk_throtl_recharge_bio(struct bio *bio);
 bool blk_throtl_bio(struct bio *bio);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
 static inline void blk_throtl_register_queue(struct request_queue *q) { }
+static inline void blk_throtl_recharge_bio(struct bio *bio) { }
 static inline bool blk_throtl_bio(struct bio *bio) { return false; }
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-- 
1.8.3.1



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios
  2021-07-19 16:35   ` brookxu
@ 2021-07-26 21:46     ` Tejun Heo
  2021-07-27  3:06       ` brookxu
  0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2021-07-26 21:46 UTC (permalink / raw)
  To: brookxu; +Cc: axboe, cgroups, linux-block, linux-kernel

Hello,

On Tue, Jul 20, 2021 at 12:35:54AM +0800, brookxu wrote:
> In order to avoid code duplication and IOPS stability problems caused by estimating
> the equivalent number of IOs, and to avoid potential deadlock problems caused by
> synchronization through queue_lock. I tried to count the number of splited IOs in
> the current window through two atomic counters. Add the value of the atomic variable
> when calculating io_disp[rw], which can also avoid the problem of inaccurate IOPS in
> large IO scenarios. How do you think of this approach? Thanks for your time.

I guess it's okay but am still not a big fan of adding another hook. This is
primarily because blk-throtl is sitting too early in the stack - e.g. rq_qos
is doing the same thing but sits after the split path - and it's a bit nasty
to add an additional hook for it.

Do you think it can be an option to relocate the blk-throtl hooks to the
same spots as rq-qos or, even better, make it use rq-qos?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios
  2021-07-26 21:46     ` Tejun Heo
@ 2021-07-27  3:06       ` brookxu
  2021-07-27 16:21         ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: brookxu @ 2021-07-27  3:06 UTC (permalink / raw)
  To: Tejun Heo; +Cc: axboe, cgroups, linux-block, linux-kernel



Tejun Heo wrote on 2021/7/27 5:46:
> Hello,
> 
> On Tue, Jul 20, 2021 at 12:35:54AM +0800, brookxu wrote:
>> In order to avoid code duplication and IOPS stability problems caused by estimating
>> the equivalent number of IOs, and to avoid potential deadlock problems caused by
>> synchronization through queue_lock. I tried to count the number of splited IOs in
>> the current window through two atomic counters. Add the value of the atomic variable
>> when calculating io_disp[rw], which can also avoid the problem of inaccurate IOPS in
>> large IO scenarios. How do you think of this approach? Thanks for your time.
> 
> I guess it's okay but am still not a big fan of adding another hook. This is
> primarily because blk-throtl is sitting too early in the stack - e.g. rq_qos
> is doing the same thing but sits after the split path - and it's a bit nasty
> to add an additional hook for it.
> 
> Do you think it can be an option to relocate the blk-throtl hooks to the
> same spots as rq-qos or, even better, make it use rq-qos?

Making blk-throttle use rq-qos may be more elegant. But I found that there may be at least
one problem that is difficult to solve. blk-throttle supports separate throttle for read
and write IOs, which means that we cannot suspend tasks during throttle, but rq-qos
throttle IOs by suspending tasks.

We may be able to relocate the blk-throttle hooks to the rq-qos hooks. Since we may not
be able to replace the throttle hook, in this case, if we register a rq-qos to the system,
part of the blk-throttle hooks is in rq-qos and part hooks not, which feels a bit confusing.
In addition, we may need to implement more hooks, such as IO merge hook.

Thanks for your time.

> Thanks.
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios
  2021-07-27  3:06       ` brookxu
@ 2021-07-27 16:21         ` Tejun Heo
  2021-07-28  2:33           ` brookxu
  0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2021-07-27 16:21 UTC (permalink / raw)
  To: brookxu; +Cc: axboe, cgroups, linux-block, linux-kernel

Hello,

On Tue, Jul 27, 2021 at 11:06:18AM +0800, brookxu wrote:
> Make blk-throttle use rq-qos may be more elegant. But I found that there may be at least
> one problem that is difficult to solve. blk-throttle supports separate throttle for read
> and write IOs, which means that we cannot suspend tasks during throttle, but rq-qos
> throttle IOs by suspending tasks.

Ah, right, I forgot about that.

> We may be able to relocate the blk-throttle hooks to the rq-qos hooks. Since we may not
> be able to replace the throttle hook, in this case, if we register a rq-qos to the system,
> part of the blk-throttle hooks is in rq-qos and part hooks not, which feels a bit confusing.
> In addition, we may need to implement more hooks, such as IO merge hook.

Would it be possible to just move the blk-throtl throttling hook right next
to the rq-qos hook so that it gets throttled after splitting?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios
  2021-07-27 16:21         ` Tejun Heo
@ 2021-07-28  2:33           ` brookxu
  2021-07-28  7:48             ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: brookxu @ 2021-07-28  2:33 UTC (permalink / raw)
  To: Tejun Heo; +Cc: axboe, cgroups, linux-block, linux-kernel

Thanks for your time.

Tejun Heo wrote on 2021/7/28 12:21 AM:
> Hello,
> 
> On Tue, Jul 27, 2021 at 11:06:18AM +0800, brookxu wrote:
>> Make blk-throttle use rq-qos may be more elegant. But I found that there may be at least
>> one problem that is difficult to solve. blk-throttle supports separate throttle for read
>> and write IOs, which means that we cannot suspend tasks during throttle, but rq-qos
>> throttle IOs by suspending tasks.
> 
> Ah, right, I forgot about that.
> 
>> We may be able to relocate the blk-throttle hooks to the rq-qos hooks. Since we may not
>> be able to replace the throttle hook, in this case, if we register a rq-qos to the system,
>> part of the blk-throttle hooks is in rq-qos and part hooks not, which feels a bit confusing.
>> In addition, we may need to implement more hooks, such as IO merge hook.
> 
> Would it be possible to just move the blk-throtl throttling hook right next
> to the rq-qos hook so that it gets throttled after splitting?

If we do this, I think we will encounter some problems, as follows:
1. blk-throttle is now at the top of the IO stack. Changing the position of the throttle hook
   will make this mechanism invalid for some devices.
2. We may also need to add a new hook to handle back-merge, otherwise I think the isolation
   effect will be worse in the sequential IO scenario.
3. Since bio has entered the IO stack, if it is suspended and resubmitted by blk-throttle,
   then there is an IO stack reentry problem, which I think may cause many small problems.

> Thanks.
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios
  2021-07-28  2:33           ` brookxu
@ 2021-07-28  7:48             ` Tejun Heo
  0 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2021-07-28  7:48 UTC (permalink / raw)
  To: brookxu; +Cc: axboe, cgroups, linux-block, linux-kernel

Hello,

On Wed, Jul 28, 2021 at 10:33:20AM +0800, brookxu wrote:
> If we do this, I think we will encounter some problems, as follows:
> 1. blk-throttle is now at the top of the IO stack. Changing the position of the throttle hook
>    will make this mechanism invalid for some devices.
> 2. We may also need to add a new hook to handle back-merge, otherwise I think the isolation
>    effect will be worse in the sequential IO scene.
> 3. Since bio has entered the IO stack, if it is suspended and resubmitted by blk-throttle,
>    then there is an IO stack reentry problem, which I think may cause many small problems.

Yeah, you're right. I don't think it's worth rocking the boat with
blk-throtl at this point. An extra callback it is then.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-07-28  7:48 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-16  6:22 [PATCH] blk-throtl: optimize IOPS throttle for large IO scenarios brookxu
2021-07-16 16:09 ` Tejun Heo
2021-07-16 23:07   ` brookxu
2021-07-19 16:35   ` brookxu
2021-07-26 21:46     ` Tejun Heo
2021-07-27  3:06       ` brookxu
2021-07-27 16:21         ` Tejun Heo
2021-07-28  2:33           ` brookxu
2021-07-28  7:48             ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).