linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] blk-throtl: make latency= absolute
@ 2017-11-09 22:19 Tejun Heo
  2017-11-09 22:20 ` [PATCH 2/2] blk-throtl: add relative percentage support to latency= Tejun Heo
  2017-11-09 23:12 ` [PATCH 1/2] blk-throtl: make latency= absolute Shaohua Li
  0 siblings, 2 replies; 12+ messages in thread
From: Tejun Heo @ 2017-11-09 22:19 UTC (permalink / raw)
  To: Jens Axboe, Shaohua Li; +Cc: linux-kernel, kernel-team

Currently, the latency= parameter specifies the extra latency on top
of the estimated baseline latency.  This doesn't seem ideal for the
following two reasons.

1. Sometimes we want to use absolute target latencies, especially as
   the baseline latency estimation isn't always reliable.

2. If we want to set a latency target relative to the estimated
   baseline latency, it makes a lot more sense to express the target
   as a percentage of the baseline (say, 120% of the expected latency)
   rather than the baseline latency + an absolute offset.

This patch makes the existing latency= parameter to be interpreted as
an absolute latency target instead of an offset to the baseline.  The
next patch will add support for relative latency target expressed in
terms of percentage.

While this is a userland visible change, io.low support is still
evolving and highly experimental.  This isn't expected to break any
actual usages.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
---
 block/blk-throttle.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2299,8 +2299,7 @@ void blk_throtl_bio_endio(struct bio *bi
 
 		bucket = request_bucket_index(
 			blk_stat_size(&bio->bi_issue_stat));
-		threshold = tg->td->avg_buckets[bucket].latency +
-			tg->latency_target;
+		threshold = tg->latency_target;
 		if (lat > threshold)
 			tg->bad_bio_cnt++;
 		/*

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2/2] blk-throtl: add relative percentage support to latency=
  2017-11-09 22:19 [PATCH 1/2] blk-throtl: make latency= absolute Tejun Heo
@ 2017-11-09 22:20 ` Tejun Heo
  2017-11-14 22:06   ` Shaohua Li
  2017-11-09 23:12 ` [PATCH 1/2] blk-throtl: make latency= absolute Shaohua Li
  1 sibling, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2017-11-09 22:20 UTC (permalink / raw)
  To: Jens Axboe, Shaohua Li; +Cc: linux-kernel, kernel-team

This patch updates latency= handling so that the latency target can
also be specified as a percentage.  This allows, in addition to the
default absolute latency target, to specify the latency target as a
percentage of the baseline (say, 120% of the expected latency).

A given blkg can only have either absolute or percentage latency
target.  The propagation is updated so that we always consider both
targets and follow whatever is the least protecting on the path to the
root.

The percentage latency target is specified and presented with the '%'
suffix.

 $ echo 8:16 rbps=$((100<<20)) riops=100 wbps=$((100<<20)) wiops=100 \
	idle=$((1000*1000)) latency=120% > io.low
 $ cat io.low
 8:16 rbps=104857600 wbps=104857600 riops=100 wiops=100 idle=1000000 latency=120%

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
---
 block/blk-throttle.c |   66 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 15 deletions(-)

--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -27,6 +27,7 @@ static int throtl_quantum = 32;
 #define MIN_THROTL_BPS (320 * 1024)
 #define MIN_THROTL_IOPS (10)
 #define DFL_LATENCY_TARGET (-1L)
+#define DFL_LATENCY_TARGET_PCT (-1L)
 #define DFL_IDLE_THRESHOLD (0)
 #define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
 #define LATENCY_FILTERED_SSD (0)
@@ -164,8 +165,11 @@ struct throtl_grp {
 
 	unsigned long last_check_time;
 
+	/* Either both target and target_pct are DFL or neither is */
 	unsigned long latency_target; /* us */
 	unsigned long latency_target_conf; /* us */
+	unsigned long latency_target_pct; /* % */
+	unsigned long latency_target_pct_conf; /* % */
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
@@ -511,6 +515,8 @@ static struct blkg_policy_data *throtl_p
 
 	tg->latency_target = DFL_LATENCY_TARGET;
 	tg->latency_target_conf = DFL_LATENCY_TARGET;
+	tg->latency_target_pct = DFL_LATENCY_TARGET_PCT;
+	tg->latency_target_pct_conf = DFL_LATENCY_TARGET_PCT;
 	tg->idletime_threshold = DFL_IDLE_THRESHOLD;
 	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
 
@@ -1417,6 +1423,8 @@ static void tg_conf_updated(struct throt
 				parent_tg->idletime_threshold);
 		this_tg->latency_target = max(this_tg->latency_target,
 				parent_tg->latency_target);
+		this_tg->latency_target_pct = max(this_tg->latency_target_pct,
+				parent_tg->latency_target_pct);
 	}
 
 	/*
@@ -1528,7 +1536,7 @@ static u64 tg_prfill_limit(struct seq_fi
 	u64 bps_dft;
 	unsigned int iops_dft;
 	char idle_time[26] = "";
-	char latency_time[26] = "";
+	char latency_time[27] = "";	/* +1 for the optional '%' */
 
 	if (!dname)
 		return 0;
@@ -1569,8 +1577,11 @@ static u64 tg_prfill_limit(struct seq_fi
 			snprintf(idle_time, sizeof(idle_time), " idle=%lu",
 				tg->idletime_threshold_conf);
 
-		if (tg->latency_target_conf == ULONG_MAX)
+		if (tg->latency_target_conf == DFL_LATENCY_TARGET)
 			strcpy(latency_time, " latency=max");
+		else if (tg->latency_target_pct_conf)
+			snprintf(latency_time, sizeof(latency_time),
+				" latency=%lu%%", tg->latency_target_pct_conf);
 		else
 			snprintf(latency_time, sizeof(latency_time),
 				" latency=%lu", tg->latency_target_conf);
@@ -1597,7 +1608,7 @@ static ssize_t tg_set_limit(struct kernf
 	struct throtl_grp *tg;
 	u64 v[4];
 	unsigned long idle_time;
-	unsigned long latency_time;
+	unsigned long latency_time, latency_pct;
 	int ret;
 	int index = of_cft(of)->private;
 
@@ -1614,8 +1625,10 @@ static ssize_t tg_set_limit(struct kernf
 
 	idle_time = tg->idletime_threshold_conf;
 	latency_time = tg->latency_target_conf;
+	latency_pct = tg->latency_target_pct_conf;
 	while (true) {
 		char tok[27];	/* wiops=18446744073709551616 */
+		char is_pct = 0;
 		char *p;
 		u64 val = U64_MAX;
 		int len;
@@ -1629,7 +1642,11 @@ static ssize_t tg_set_limit(struct kernf
 		ret = -EINVAL;
 		p = tok;
 		strsep(&p, "=");
-		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+		if (!p || (sscanf(p, "%llu%c", &val, &is_pct) < 1 &&
+			   strcmp(p, "max")))
+			goto out_finish;
+
+		if (is_pct && (is_pct != '%' || strcmp(tok, "latency")))
 			goto out_finish;
 
 		ret = -ERANGE;
@@ -1637,20 +1654,33 @@ static ssize_t tg_set_limit(struct kernf
 			goto out_finish;
 
 		ret = -EINVAL;
-		if (!strcmp(tok, "rbps"))
+		if (!strcmp(tok, "rbps")) {
 			v[0] = val;
-		else if (!strcmp(tok, "wbps"))
+		} else if (!strcmp(tok, "wbps")) {
 			v[1] = val;
-		else if (!strcmp(tok, "riops"))
+		} else if (!strcmp(tok, "riops")) {
 			v[2] = min_t(u64, val, UINT_MAX);
-		else if (!strcmp(tok, "wiops"))
+		} else if (!strcmp(tok, "wiops")) {
 			v[3] = min_t(u64, val, UINT_MAX);
-		else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
+		} else if (off == LIMIT_LOW && !strcmp(tok, "idle")) {
 			idle_time = val;
-		else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
-			latency_time = val;
-		else
+		} else if (off == LIMIT_LOW && !strcmp(tok, "latency")) {
+			/* gonna use max of the two, set the other one to 0 */
+			if (val != U64_MAX) {
+				if (is_pct) {
+					latency_time = 0;
+					latency_pct = val;
+				} else {
+					latency_time = val;
+					latency_pct = 0;
+				}
+			} else {
+				latency_time = DFL_LATENCY_TARGET;
+				latency_pct = DFL_LATENCY_TARGET_PCT;
+			}
+		} else {
 			goto out_finish;
+		}
 	}
 
 	tg->bps_conf[READ][index] = v[0];
@@ -1674,6 +1704,7 @@ static ssize_t tg_set_limit(struct kernf
 		tg->iops_conf[WRITE][LIMIT_MAX]);
 	tg->idletime_threshold_conf = idle_time;
 	tg->latency_target_conf = latency_time;
+	tg->latency_target_pct_conf = latency_pct;
 
 	/* force user to configure all settings for low limit  */
 	if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
@@ -1686,9 +1717,11 @@ static ssize_t tg_set_limit(struct kernf
 		tg->iops[WRITE][LIMIT_LOW] = 0;
 		tg->idletime_threshold = DFL_IDLE_THRESHOLD;
 		tg->latency_target = DFL_LATENCY_TARGET;
+		tg->latency_target_pct = DFL_LATENCY_TARGET_PCT;
 	} else if (index == LIMIT_LOW) {
 		tg->idletime_threshold = tg->idletime_threshold_conf;
 		tg->latency_target = tg->latency_target_conf;
+		tg->latency_target_pct = tg->latency_target_pct_conf;
 	}
 
 	blk_throtl_update_limit_valid(tg->td);
@@ -1799,7 +1832,7 @@ static bool throtl_tg_is_idle(struct thr
 	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
 	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
 	      tg->avg_idletime > tg->idletime_threshold ||
-	      (tg->latency_target && tg->bio_cnt &&
+	      ((tg->latency_target || tg->latency_target_pct) && tg->bio_cnt &&
 		tg->bad_bio_cnt * 5 < tg->bio_cnt);
 	throtl_log(&tg->service_queue,
 		"avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
@@ -2293,13 +2326,16 @@ void blk_throtl_bio_endio(struct bio *bi
 		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
 			bio_op(bio), lat);
 
-	if (tg->latency_target && lat >= tg->td->filtered_latency) {
+	if ((tg->latency_target || tg->latency_target_pct) &&
+	    lat >= tg->td->filtered_latency) {
 		int bucket;
 		unsigned int threshold;
 
 		bucket = request_bucket_index(
 			blk_stat_size(&bio->bi_issue_stat));
-		threshold = tg->latency_target;
+		threshold = max(tg->latency_target,
+				tg->latency_target_pct *
+				tg->td->avg_buckets[bucket].latency / 100);
 		if (lat > threshold)
 			tg->bad_bio_cnt++;
 		/*

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] blk-throtl: make latency= absolute
  2017-11-09 22:19 [PATCH 1/2] blk-throtl: make latency= absolute Tejun Heo
  2017-11-09 22:20 ` [PATCH 2/2] blk-throtl: add relative percentage support to latency= Tejun Heo
@ 2017-11-09 23:12 ` Shaohua Li
  2017-11-09 23:42   ` Tejun Heo
  1 sibling, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2017-11-09 23:12 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, linux-kernel, kernel-team

On Thu, Nov 09, 2017 at 02:19:24PM -0800, Tejun Heo wrote:
> Currently, the latency= parameter specifies the extra latency on top
> of the estimated baseline latency.  This doesn't seem ideal for the
> following two reasons.
> 
> 1. Sometimes we want to use absolute target latencies, especially as
>    the baseline latency estimation isn't always reliable.
> 
> 2. If we want to set a latency target relative to the estimated
>    baseline latency, it makes a lot more sense to express the target
>    as a percentage of the baseline (say, 120% of the expected latency)
>    rather than the baseline latency + an absolute offset.
> 
> This patch makes the existing latency= parameter to be interpreted as
> an absolute latency target instead of an offset to the baseline.  The
> next patch will add support for relative latency target expressed in
> terms of percentage.
> 
> While this is a userland visible change, io.low support is still
> evoling and highly experimental.  This isn't expected to break any
> actual usages.

The percentage latency makes sense, but the absolute latency doesn't to me. A
4k IO latency could be much smaller than 1M IO latency. If we don't add
baseline latency, we can't specify a latency target which works for both 4k and
1M IO.

Thanks,
Shaohua
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Shaohua Li <shli@kernel.org>
> ---
>  block/blk-throttle.c |    3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -2299,8 +2299,7 @@ void blk_throtl_bio_endio(struct bio *bi
>  
>  		bucket = request_bucket_index(
>  			blk_stat_size(&bio->bi_issue_stat));
> -		threshold = tg->td->avg_buckets[bucket].latency +
> -			tg->latency_target;
> +		threshold = tg->latency_target;
>  		if (lat > threshold)
>  			tg->bad_bio_cnt++;
>  		/*

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] blk-throtl: make latency= absolute
  2017-11-09 23:12 ` [PATCH 1/2] blk-throtl: make latency= absolute Shaohua Li
@ 2017-11-09 23:42   ` Tejun Heo
  2017-11-10  4:27     ` Shaohua Li
  0 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2017-11-09 23:42 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Jens Axboe, linux-kernel, kernel-team

Hello, Shaohua.

On Thu, Nov 09, 2017 at 03:12:12PM -0800, Shaohua Li wrote:
> The percentage latency makes sense, but the absolute latency doesn't to me. A
> 4k IO latency could be much smaller than 1M IO latency. If we don't add
> baseline latency, we can't specify a latency target which works for both 4k and
> 1M IO.

It isn't adaptive for sure.  I think it's still useful for the
following reasons.

1. The absolute latency target is by nature both workload and device
   dependent.  For a lot of use cases, coming up with a decent number
   should be possible.

2. There are many use cases which aren't sensitive to the level where
   they care much about the different between small and large
   requests.  e.g. protecting a managerial job so that it doesn't
   completely stall doesn't require tuning things to that level.  A
   value which is comfortably higher than usually expected latencies
   would often be enough (say 100ms).

3. It's also useful for verification / testing.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] blk-throtl: make latency= absolute
  2017-11-09 23:42   ` Tejun Heo
@ 2017-11-10  4:27     ` Shaohua Li
  2017-11-10 15:43       ` Tejun Heo
  0 siblings, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2017-11-10  4:27 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, linux-kernel, kernel-team

On Thu, Nov 09, 2017 at 03:42:58PM -0800, Tejun Heo wrote:
> Hello, Shaohua.
> 
> On Thu, Nov 09, 2017 at 03:12:12PM -0800, Shaohua Li wrote:
> > The percentage latency makes sense, but the absolute latency doesn't to me. A
> > 4k IO latency could be much smaller than 1M IO latency. If we don't add
> > baseline latency, we can't specify a latency target which works for both 4k and
> > 1M IO.
> 
> It isn't adaptive for sure.  I think it's still useful for the
> following reasons.
> 
> 1. The absolute latency target is by nature both workload and device
>    dependent.  For a lot of use cases, coming up with a decent number
>    should be possible.
> 
> 2. There are many use cases which aren't sensitive to the level where
>    they care much about the different between small and large
>    requests.  e.g. protecting a managerial job so that it doesn't
>    completely stall doesn't require tuning things to that level.  A
>    value which is comfortably higher than usually expected latencies
>    would often be enough (say 100ms).
> 
> 3. It's also useful for verification / testing.

I think the absolute latency would only work for HD. For a SSD, a 4k latency
probably is 60us and 1M latency is 500us. The disk must be very contended to
make 4k latency reach 500us. Not sensitive doesn't mean no protection. If the
use case sets rough latency, say 1ms, there will be no protection for 4k IO at
all. The baseline latency is pretty reliable for SSD actually. So I'd rather
keeping the baseline latency for SSD but using absolute latency for HD, which
can be done easily by setting DFL_HD_BASELINE_LATENCY to 0.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] blk-throtl: make latency= absolute
  2017-11-10  4:27     ` Shaohua Li
@ 2017-11-10 15:43       ` Tejun Heo
  2017-11-13  4:29         ` Shaohua Li
  0 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2017-11-10 15:43 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Jens Axboe, linux-kernel, kernel-team

Hello, Shaohua.

On Thu, Nov 09, 2017 at 08:27:13PM -0800, Shaohua Li wrote:
> I think the absolute latency would only work for HD. For a SSD, a 4k latency
> probably is 60us and 1M latency is 500us. The disk must be very contended to
> make 4k latency reach 500us. Not sensitive doesn't mean no protection. If the
> use case sets rough latency, say 1ms, there will be no protection for 4k IO at
> all. The baseline latency is pretty reliable for SSD actually. So I'd rather

I don't understand how that would mean no protection.  The latency
naturally includes the queueing time on the host side and, even for a
fast SSD device, it isn't too difficult to saturate the device to the
point where the host-side waiting time becomes pretty long.  All
that's necessary is IOs being issued faster than completed and we can
almost always do that.

> keeping the baseline latency for SSD but using absolute latency for HD, which
> can be done easily by setting DFL_HD_BASELINE_LATENCY to 0.

I don't think that'd be a good interface choice.  It's too misleading.
If we actually need to specify baseline + margin, it'd probably be
better to add another notation - say, "+N" - than overloading the
meaning of "N".

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] blk-throtl: make latency= absolute
  2017-11-10 15:43       ` Tejun Heo
@ 2017-11-13  4:29         ` Shaohua Li
  2017-11-13 11:27           ` Tejun Heo
  0 siblings, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2017-11-13  4:29 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, linux-kernel, kernel-team

On Fri, Nov 10, 2017 at 07:43:14AM -0800, Tejun Heo wrote:
> Hello, Shaohua.
> 
> On Thu, Nov 09, 2017 at 08:27:13PM -0800, Shaohua Li wrote:
> > I think the absolute latency would only work for HD. For a SSD, a 4k latency
> > probably is 60us and 1M latency is 500us. The disk must be very contended to
> > make 4k latency reach 500us. Not sensitive doesn't mean no protection. If the
> > use case sets rough latency, say 1ms, there will be no protection for 4k IO at
> > all. The baseline latency is pretty reliable for SSD actually. So I'd rather
> 
> I don't understand how that would mean no protection.  The latency
> naturally includes the queueing time on the host side and, even for a
> fast SSD device, it isn't too difficult to saturate the device to the
> point where the host-side waiting time becomes pretty long.  All
> that's necessary is IOs being issued faster than completed and we can
> almost always do that.

Didn't get this. What did you mean 'queueing time on the host side'? You mean
the application think time delay?

My point is absolute latency doesn't protect as we expected. Let me have an
example. Say 4k latency is 60us, BW is 100MB/s. When 4k BW is 50MB/s, the
latency is 200us. 1M latency is 500us. If you set the absolute latency to
600us, you can't protect the 4k BW to above 50MB/s. To do the protection, you
really want to set the absolute latency below 500us, which doesn't work for the
1M IO.
 
> > keeping the baseline latency for SSD but using absolute latency for HD, which
> > can be done easily by setting DFL_HD_BASELINE_LATENCY to 0.
> 
> I don't think that'd be a good interface choice.  It's too misleading.
> If we actually need to specify baseline + margin, it'd probably be
> better to add another notation - say, "+N" - than overloading the
> meaning of "N".

We don't overload the meaning of "N". Until your next patch, the "N" actually
means "+N".

Ponder a little bit, I think 4ms base latency for HD actually is reasonable. We
have LATENCY_FILTERED_HD to filter out small latency bios, which come from
sequential IO. So remaining IO is random IO. 4k base latency for HD random IO
should be ok. Probably something else is wrong. I think we need understand
what's wrong for HD throttling first before we make any change.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] blk-throtl: make latency= absolute
  2017-11-13  4:29         ` Shaohua Li
@ 2017-11-13 11:27           ` Tejun Heo
  2017-11-13 14:18             ` Tejun Heo
  0 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2017-11-13 11:27 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Jens Axboe, linux-kernel, kernel-team

Hello, Shaohua.

On Sun, Nov 12, 2017 at 08:29:40PM -0800, Shaohua Li wrote:
> Didn't get this. What did you mean 'queueing time on the host side'? You mean
> the application think time delay?
> 
> My point is absolute latency doen't protect as we expected. Let me have an
> example. Say 4k latency is 60us, BW is 100MB/s. When 4k BW is 50MB/s, the
> latency is 200us. 1M latency is 500us. If you set the absolute latency to
> 600us, you can't protect the 4k BW to above 50MB/s. To do the protection, you
> really want to set the absolute latency below 500us, which doesn't work for the
> 1M IO.

What I'm trying to say is that the latency is defined as "from bio
issue to completion", not "in-flight time on device".  Whether the
on-device latency is 50us or 500us, the host side queueing latency can
be in orders of magnitude higher.

For things like starvation protection for managerial workloads which
work fine on rotating disks, the only thing we need to protect against
is excessive host side queue overflowing leading to starvation of such
workloads.  IOW, we're talking about latency target in tens or lower
hundreds of millisecs.  Whether the on-device time is 50 or 500us
doesn't matter that much.

> We don't overload the meaning of "N". Untill your next patch, the "N" actually
> means "+N".
> 
> Ponder a little bit, I think 4ms base latency for HD actually is reasonable. We
> have LATENCY_FILTERED_HD to filter out small latency bios, which come from
> sequential IO. So remaining IO is random IO. 4k base latency for HD random IO
> should be ok. Probably something else is wrong. I think we need understand
> what's wrong for HD throttling first before we make any change.

So, even purely from user-interface perspective, I think it can be
very confusing to use "N" to mean "base + N".  Explicitly saying
what's going on through "+N" or "N%" is a lot more straight-forward.
I mean, we can decide to change the config syntax but not support abs
targets but I think abs targets are useful and it's not like this adds
significant overhead / complexity.

As for why it isn't working well for disks, I think some of it is
coming from buffering behaviors and not handling merges properly.

Write latencies aren't evenly spread across commands.  Most of them
really fast and then one of them or the flush take a really long time.
Filtering based on LATENCY_FILTERED_HD simply ignores those fast
completions which means that the eventual latency spike is attributed
arbitrarily, which doesn't really work.

The other part is that blk-throtl was seeing wildly different IO
numbers than the underlying device does when there are a lot of
merges, which still happens.  This means that iops limits were just
badly broken.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] blk-throtl: make latency= absolute
  2017-11-13 11:27           ` Tejun Heo
@ 2017-11-13 14:18             ` Tejun Heo
  2017-11-13 22:08               ` Shaohua Li
  0 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2017-11-13 14:18 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Jens Axboe, linux-kernel, kernel-team

Hello, Shaohua.  Just a bit of addition.

On Mon, Nov 13, 2017 at 03:27:10AM -0800, Tejun Heo wrote:
> What I'm trying to say is that the latency is defined as "from bio
> issue to completion", not "in-flight time on device".  Whether the
> on-device latency is 50us or 500us, the host side queueing latency can
> be in orders of magnitude higher.
> 
> For things like starvation protection for managerial workloads which
> work fine on rotating disks, the only thing we need to protect against
> is excessive host side queue overflowing leading to starvation of such
> workloads.  IOW, we're talking about latency target in tens or lower
> hundreds of millisecs.  Whether the on-device time is 50 or 500us
> doesn't matter that much.

So, the absolute latency target can express the requirements of the
workload in question - it's saying "if the IO latency stays within
this boundary, regardless of the underlying device, this workload is
gonna be happy enough".  There are workloads which are this way -
e.g. it has some IOs to do and some deadline requirements (like
heartbeat period).  For those workloads, it doesn't matter what the
underlying device is.  It can be a rotating disk, or a slow or
lightning-fast SSD.  As long as the absolute target latency is met,
the workload will be happy.

The % notation can express how much proportional hit the workload is
willing to take to share the underlying device with others - "I'm
willing to take 20% extra hit in latency so that I can be a nice
neighbor", which also makes sense to me.

The baseline + slack (the current one) is the mix of the two.  IOW,
the configuration is dependent on both the workload requirements and
the performance characteristics of the underlying device - you can't
use a single value across different workloads or devices.  We can
absolutely keep supporting this but I think it fits worse than the
previous two and am having a bit of hard time to come up with why we'd
want this.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] blk-throtl: make latency= absolute
  2017-11-13 14:18             ` Tejun Heo
@ 2017-11-13 22:08               ` Shaohua Li
  2017-11-14 14:52                 ` Tejun Heo
  0 siblings, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2017-11-13 22:08 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, linux-kernel, kernel-team

On Mon, Nov 13, 2017 at 06:18:49AM -0800, Tejun Heo wrote:
> Hello, Shaohua.  Just a bit of addition.
> 
> On Mon, Nov 13, 2017 at 03:27:10AM -0800, Tejun Heo wrote:
> > What I'm trying to say is that the latency is defined as "from bio
> > issue to completion", not "in-flight time on device".  Whether the
> > on-device latency is 50us or 500us, the host side queueing latency can
> > be in orders of magnitude higher.
> > 
> > For things like starvation protection for managerial workloads which
> > work fine on rotating disks, the only thing we need to protect against
> > is excessive host side queue overflowing leading to starvation of such
> > workloads.  IOW, we're talking about latency target in tens or lower
> > hundreds of millisecs.  Whether the on-device time is 50 or 500us
> > doesn't matter that much.
> 
> So, the absolute latency target can express the requirements of the
> workload in question - it's saying "if the IO latency stays within
> this boundary, regardless of the underlying device, this workload is
> gonna be happy enough".  There are workloads which are this way -
> e.g. it has some IOs to do and some deadline requirements (like
> heartbeat period).  For those workloads, it doesn't matter what the
> underlying device is.  It can be a rotating disk, or a slow or
> lightening-fast SSD.  As long as the absolute target latency is met,
> the workload will be happy.

I think this is what we don't agree with. The user doesn't really care about
the IO latency. What the user cares about is 'read' syscall latency or 'fsync'
syscall latency. The syscall could do several 4k IO or 1M IO or mixed. To meet
the syscall latency target, we must control the latency for each IO. If we use
absolute latency, it can only control some IOs. In this case, it's very likely
the syscall latency requirement isn't met. So we do need to know what the
underlying device is.

That said, absolute latency is useful for HD. But on the other hand, HD
baseline is always 4ms for any size IO. So absolute latency = 4ms + slack,
unless you want to specify a smaller than 4ms absolute latency.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] blk-throtl: make latency= absolute
  2017-11-13 22:08               ` Shaohua Li
@ 2017-11-14 14:52                 ` Tejun Heo
  0 siblings, 0 replies; 12+ messages in thread
From: Tejun Heo @ 2017-11-14 14:52 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Jens Axboe, linux-kernel, kernel-team

Hello, Shaohua.

On Mon, Nov 13, 2017 at 02:08:04PM -0800, Shaohua Li wrote:
> I think this is what we don't agree with. The user doesn't really care about
> the IO latency. What user care about is 'read' syscall latency or 'fsync'
> syscall latency. The syscall could do several 4k IO or 1M IO or mixed. To meet
> the syscall latency target, we must control the latency for each IO. If we use
> absolute latency, it can only control some IOs. In this case, it's very likely
> the syscall latency requirement isn't met. So we do need to know what the
> underlying device is.

Hmmm... the IO pattern of a workload can often be pretty well defined,
so I don't think it'd be too difficult to say "as long as most IOs
that this workload issues finishes under Xms, this works fine".  It of
course doesn't cover all use cases but all that's being argued is that
it is good enough for some use cases.

> That said, absolute latency is useful for HD. But on the other hand, HD
> baseline is always 4ms for any size IO. So absolute latency = 4ms + slack,
> unless you want to specify a smaller than 4ms absolute latency.

If you do small random IOs and then big ones, there are significant
differences in performance characteristics.  I don't understand how
declaring that the baseline is always 4ms for harddisks would work.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] blk-throtl: add relative percentage support to latency=
  2017-11-09 22:20 ` [PATCH 2/2] blk-throtl: add relative percentage support to latency= Tejun Heo
@ 2017-11-14 22:06   ` Shaohua Li
  0 siblings, 0 replies; 12+ messages in thread
From: Shaohua Li @ 2017-11-14 22:06 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, linux-kernel, kernel-team

On Thu, Nov 09, 2017 at 02:20:00PM -0800, Tejun Heo wrote:
> This patch updates latency= handling so that the latency target can
> also be specified as a percentage.  This allows, in addition to the
> default absolute latency target, to specify the latency target as a
> percentage of the baseline (say, 120% of the expected latency).
> 
> A given blkg can only have either absolute or percentage latency
> target.  The propagation is updated so that we always consider both
> targets and follow whatever is the least protecting on the path to the
> root.
> 
> The percentage latency target is specified and presented with the '%'
> suffix.
> 
>  $ echo 8:16 rbps=$((100<<20)) riops=100 wbps=$((100<<20)) wiops=100 \
> 	idle=$((1000*1000)) latency=120% > io.low
>  $ cat io.low
>  8:16 rbps=104857600 wbps=104857600 riops=100 wiops=100 idle=1000000 latency=120%
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Shaohua Li <shli@kernel.org>

Reviewed-by: Shaohua Li <shli@kernel.org>
> ---
>  block/blk-throttle.c |   66 +++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 51 insertions(+), 15 deletions(-)
> 
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -27,6 +27,7 @@ static int throtl_quantum = 32;
>  #define MIN_THROTL_BPS (320 * 1024)
>  #define MIN_THROTL_IOPS (10)
>  #define DFL_LATENCY_TARGET (-1L)
> +#define DFL_LATENCY_TARGET_PCT (-1L)
>  #define DFL_IDLE_THRESHOLD (0)
>  #define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
>  #define LATENCY_FILTERED_SSD (0)
> @@ -164,8 +165,11 @@ struct throtl_grp {
>  
>  	unsigned long last_check_time;
>  
> +	/* Either both target and target_pct are DFL or neither is */
>  	unsigned long latency_target; /* us */
>  	unsigned long latency_target_conf; /* us */
> +	unsigned long latency_target_pct; /* % */
> +	unsigned long latency_target_pct_conf; /* % */
>  	/* When did we start a new slice */
>  	unsigned long slice_start[2];
>  	unsigned long slice_end[2];
> @@ -511,6 +515,8 @@ static struct blkg_policy_data *throtl_p
>  
>  	tg->latency_target = DFL_LATENCY_TARGET;
>  	tg->latency_target_conf = DFL_LATENCY_TARGET;
> +	tg->latency_target_pct = DFL_LATENCY_TARGET_PCT;
> +	tg->latency_target_pct_conf = DFL_LATENCY_TARGET_PCT;
>  	tg->idletime_threshold = DFL_IDLE_THRESHOLD;
>  	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
>  
> @@ -1417,6 +1423,8 @@ static void tg_conf_updated(struct throt
>  				parent_tg->idletime_threshold);
>  		this_tg->latency_target = max(this_tg->latency_target,
>  				parent_tg->latency_target);
> +		this_tg->latency_target_pct = max(this_tg->latency_target_pct,
> +				parent_tg->latency_target_pct);
>  	}
>  
>  	/*
> @@ -1528,7 +1536,7 @@ static u64 tg_prfill_limit(struct seq_fi
>  	u64 bps_dft;
>  	unsigned int iops_dft;
>  	char idle_time[26] = "";
> -	char latency_time[26] = "";
> +	char latency_time[27] = "";	/* +1 for the optional '%' */
>  
>  	if (!dname)
>  		return 0;
> @@ -1569,8 +1577,11 @@ static u64 tg_prfill_limit(struct seq_fi
>  			snprintf(idle_time, sizeof(idle_time), " idle=%lu",
>  				tg->idletime_threshold_conf);
>  
> -		if (tg->latency_target_conf == ULONG_MAX)
> +		if (tg->latency_target_conf == DFL_LATENCY_TARGET)
>  			strcpy(latency_time, " latency=max");
> +		else if (tg->latency_target_pct_conf)
> +			snprintf(latency_time, sizeof(latency_time),
> +				" latency=%lu%%", tg->latency_target_pct_conf);
>  		else
>  			snprintf(latency_time, sizeof(latency_time),
>  				" latency=%lu", tg->latency_target_conf);
> @@ -1597,7 +1608,7 @@ static ssize_t tg_set_limit(struct kernf
>  	struct throtl_grp *tg;
>  	u64 v[4];
>  	unsigned long idle_time;
> -	unsigned long latency_time;
> +	unsigned long latency_time, latency_pct;
>  	int ret;
>  	int index = of_cft(of)->private;
>  
> @@ -1614,8 +1625,10 @@ static ssize_t tg_set_limit(struct kernf
>  
>  	idle_time = tg->idletime_threshold_conf;
>  	latency_time = tg->latency_target_conf;
> +	latency_pct = tg->latency_target_pct_conf;
>  	while (true) {
>  		char tok[27];	/* wiops=18446744073709551616 */
> +		char is_pct = 0;
>  		char *p;
>  		u64 val = U64_MAX;
>  		int len;
> @@ -1629,7 +1642,11 @@ static ssize_t tg_set_limit(struct kernf
>  		ret = -EINVAL;
>  		p = tok;
>  		strsep(&p, "=");
> -		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
> +		if (!p || (sscanf(p, "%llu%c", &val, &is_pct) < 1 &&
> +			   strcmp(p, "max")))
> +			goto out_finish;
> +
> +		if (is_pct && (is_pct != '%' || strcmp(tok, "latency")))
>  			goto out_finish;
>  
>  		ret = -ERANGE;
> @@ -1637,20 +1654,33 @@ static ssize_t tg_set_limit(struct kernf
>  			goto out_finish;
>  
>  		ret = -EINVAL;
> -		if (!strcmp(tok, "rbps"))
> +		if (!strcmp(tok, "rbps")) {
>  			v[0] = val;
> -		else if (!strcmp(tok, "wbps"))
> +		} else if (!strcmp(tok, "wbps")) {
>  			v[1] = val;
> -		else if (!strcmp(tok, "riops"))
> +		} else if (!strcmp(tok, "riops")) {
>  			v[2] = min_t(u64, val, UINT_MAX);
> -		else if (!strcmp(tok, "wiops"))
> +		} else if (!strcmp(tok, "wiops")) {
>  			v[3] = min_t(u64, val, UINT_MAX);
> -		else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
> +		} else if (off == LIMIT_LOW && !strcmp(tok, "idle")) {
>  			idle_time = val;
> -		else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
> -			latency_time = val;
> -		else
> +		} else if (off == LIMIT_LOW && !strcmp(tok, "latency")) {
> +			/* gonna use max of the two, set the other one to 0 */
> +			if (val != U64_MAX) {
> +				if (is_pct) {
> +					latency_time = 0;
> +					latency_pct = val;
> +				} else {
> +					latency_time = val;
> +					latency_pct = 0;
> +				}
> +			} else {
> +				latency_time = DFL_LATENCY_TARGET;
> +				latency_pct = DFL_LATENCY_TARGET_PCT;
> +			}
> +		} else {
>  			goto out_finish;
> +		}
>  	}
>  
>  	tg->bps_conf[READ][index] = v[0];
> @@ -1674,6 +1704,7 @@ static ssize_t tg_set_limit(struct kernf
>  		tg->iops_conf[WRITE][LIMIT_MAX]);
>  	tg->idletime_threshold_conf = idle_time;
>  	tg->latency_target_conf = latency_time;
> +	tg->latency_target_pct_conf = latency_pct;
>  
>  	/* force user to configure all settings for low limit  */
>  	if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
> @@ -1686,9 +1717,11 @@ static ssize_t tg_set_limit(struct kernf
>  		tg->iops[WRITE][LIMIT_LOW] = 0;
>  		tg->idletime_threshold = DFL_IDLE_THRESHOLD;
>  		tg->latency_target = DFL_LATENCY_TARGET;
> +		tg->latency_target_pct = DFL_LATENCY_TARGET_PCT;
>  	} else if (index == LIMIT_LOW) {
>  		tg->idletime_threshold = tg->idletime_threshold_conf;
>  		tg->latency_target = tg->latency_target_conf;
> +		tg->latency_target_pct = tg->latency_target_pct_conf;
>  	}
>  
>  	blk_throtl_update_limit_valid(tg->td);
> @@ -1799,7 +1832,7 @@ static bool throtl_tg_is_idle(struct thr
>  	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
>  	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
>  	      tg->avg_idletime > tg->idletime_threshold ||
> -	      (tg->latency_target && tg->bio_cnt &&
> +	      ((tg->latency_target || tg->latency_target_pct) && tg->bio_cnt &&
>  		tg->bad_bio_cnt * 5 < tg->bio_cnt);
>  	throtl_log(&tg->service_queue,
>  		"avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
> @@ -2293,13 +2326,16 @@ void blk_throtl_bio_endio(struct bio *bi
>  		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
>  			bio_op(bio), lat);
>  
> -	if (tg->latency_target && lat >= tg->td->filtered_latency) {
> +	if ((tg->latency_target || tg->latency_target_pct) &&
> +	    lat >= tg->td->filtered_latency) {
>  		int bucket;
>  		unsigned int threshold;
>  
>  		bucket = request_bucket_index(
>  			blk_stat_size(&bio->bi_issue_stat));
> -		threshold = tg->latency_target;
> +		threshold = max(tg->latency_target,
> +				tg->latency_target_pct *
> +				tg->td->avg_buckets[bucket].latency / 100);
>  		if (lat > threshold)
>  			tg->bad_bio_cnt++;
>  		/*

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2017-11-14 22:07 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-09 22:19 [PATCH 1/2] blk-throtl: make latency= absolute Tejun Heo
2017-11-09 22:20 ` [PATCH 2/2] blk-throtl: add relative percentage support to latency= Tejun Heo
2017-11-14 22:06   ` Shaohua Li
2017-11-09 23:12 ` [PATCH 1/2] blk-throtl: make latency= absolute Shaohua Li
2017-11-09 23:42   ` Tejun Heo
2017-11-10  4:27     ` Shaohua Li
2017-11-10 15:43       ` Tejun Heo
2017-11-13  4:29         ` Shaohua Li
2017-11-13 11:27           ` Tejun Heo
2017-11-13 14:18             ` Tejun Heo
2017-11-13 22:08               ` Shaohua Li
2017-11-14 14:52                 ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).