All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: axboe@kernel.dk
Cc: linux-block@vger.kernel.org, cgroups@vger.kernel.org,
	linux-kernel@vger.kernel.org, kernel-team@fb.com, newella@fb.com,
	Tejun Heo <tj@kernel.org>
Subject: [PATCH 19/27] blk-iocost: revamp donation amount determination
Date: Tue,  1 Sep 2020 14:52:49 -0400	[thread overview]
Message-ID: <20200901185257.645114-20-tj@kernel.org> (raw)
In-Reply-To: <20200901185257.645114-1-tj@kernel.org>

iocost has various safety nets to combat inuse adjustment calculation
inaccuracies. With Andy's method implemented in transfer_surpluses(), inuse
adjustment calculations are now accurate and we can make donation amount
determinations accurate too.

* Stop keeping track of past usage history and basing decisions on its
  maximum. Act on the immediate usage information instead.

* Remove donation constraints defined by SURPLUS_* constants. Donate
  whatever isn't used.

* Determine the donation amount so that the iocg will end up with a budget
  of MARGIN_TARGET_PCT at the end of the coming period, assuming the same
  usage as in the previous period. TARGET is set at 50% of the period, which
  is the previous maximum. This provides smooth convergence for most
  repetitive IO patterns.

* Apply donation logic early at 20% budget. There's no risk in doing so as
  the calculation is based on the delta between the current budget and the
  target budget at the end of the coming period.

* Remove preemptive iocg activation for zero cost IOs. As donation can reach
  near zero now, the mere activation doesn't provide any protection anymore.
  In the unlikely case that this becomes a problem, the right solution is
  assigning appropriate costs for such IOs.

This significantly improves the donation determination logic while also
simplifying it. Now all donations are immediate, exact and smooth.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andy Newell <newella@fb.com>
---
 block/blk-iocost.c | 133 +++++++++++++++++----------------------------
 1 file changed, 51 insertions(+), 82 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index ecc23b827e5d..694f1487208a 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -217,12 +217,14 @@ enum {
 	MAX_PERIOD		= USEC_PER_SEC,
 
 	/*
-	 * A cgroup's vtime can run 50% behind the device vtime, which
+	 * iocg->vtime is targeted at 50% behind the device vtime, which
 	 * serves as its IO credit buffer.  Surplus weight adjustment is
 	 * immediately canceled if the vtime margin runs below 10%.
 	 */
 	MARGIN_MIN_PCT		= 10,
-	MARGIN_MAX_PCT		= 50,
+	MARGIN_LOW_PCT		= 20,
+	MARGIN_TARGET_PCT	= 50,
+	MARGIN_MAX_PCT		= 100,
 
 	/* Have some play in timer operations */
 	TIMER_SLACK_PCT		= 1,
@@ -234,17 +236,6 @@ enum {
 	 */
 	VTIME_VALID_DUR		= 300 * USEC_PER_SEC,
 
-	/*
-	 * Remember the past three non-zero usages and use the max for
-	 * surplus calculation.  Three slots guarantee that we remember one
-	 * full period usage from the last active stretch even after
-	 * partial deactivation and re-activation periods.  Don't start
-	 * giving away weight before collecting two data points to prevent
-	 * hweight adjustments based on one partial activation period.
-	 */
-	NR_USAGE_SLOTS		= 3,
-	MIN_VALID_USAGES	= 2,
-
 	/* 1/64k is granular enough and can easily be handled w/ u32 */
 	WEIGHT_ONE		= 1 << 16,
 
@@ -280,14 +271,6 @@ enum {
 	/* don't let cmds which take a very long time pin lagging for too long */
 	MAX_LAGGING_PERIODS	= 10,
 
-	/*
-	 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
-	 * donate the surplus.
-	 */
-	SURPLUS_SCALE_PCT	= 125,			/* * 125% */
-	SURPLUS_SCALE_ABS	= WEIGHT_ONE / 50,	/* + 2% */
-	SURPLUS_MIN_ADJ_DELTA	= WEIGHT_ONE / 33,	/* 3% */
-
 	/* switch iff the conditions are met for longer than this */
 	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
 
@@ -376,6 +359,8 @@ struct ioc_params {
 
 struct ioc_margins {
 	s64				min;
+	s64				low;
+	s64				target;
 	s64				max;
 };
 
@@ -514,11 +499,7 @@ struct ioc_gq {
 	struct iocg_stat		desc_stat;
 	struct iocg_stat		last_stat;
 	u64				last_stat_abs_vusage;
-
-	/* usage is recorded as fractions of WEIGHT_ONE */
-	u32				usage_delta_us;
-	int				usage_idx;
-	u32				usages[NR_USAGE_SLOTS];
+	u64				usage_delta_us;
 
 	/* this iocg's depth in the hierarchy and ancestors including self */
 	int				level;
@@ -737,6 +718,8 @@ static void ioc_refresh_margins(struct ioc *ioc)
 	u64 vrate = atomic64_read(&ioc->vtime_rate);
 
 	margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
+	margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
+	margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
 	margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate;
 }
 
@@ -1228,7 +1211,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
 		return false;
 	}
 	if (!atomic_read(&blkg->use_delay) &&
-	    time_before_eq64(vtime, now->vnow + ioc->margins.max))
+	    time_before_eq64(vtime, now->vnow + ioc->margins.target))
 		return false;
 
 	/* use delay */
@@ -1527,7 +1510,7 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
 {
 	struct ioc *ioc = iocg->ioc;
 	u64 vtime = atomic64_read(&iocg->vtime);
-	s64 excess;
+	s64 excess, delta, target, new_hwi;
 
 	/* see whether minimum margin requirement is met */
 	if (waitqueue_active(&iocg->waitq) ||
@@ -1542,15 +1525,28 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
 		vtime += excess;
 	}
 
-	/* add margin */
-	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
-	usage += SURPLUS_SCALE_ABS;
-
-	/* don't bother if the surplus is too small */
-	if (usage + SURPLUS_MIN_ADJ_DELTA > hwm)
-		return hwm;
+	/*
+	 * Let delta be the distance between the iocg's and the device's vtimes
+	 * as a fraction of the period duration. Assuming that the iocg will
+	 * consume the usage determined above, we want to determine new_hwi so
+	 * that delta equals MARGIN_TARGET at the end of the next period.
+	 *
+	 * We need to execute usage worth of IOs while spending the sum of the
+	 * new budget (1 - MARGIN_TARGET) and the leftover from the last period
+	 * (delta):
+	 *
+	 *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
+	 *
+	 * Therefore, the new_hwi is:
+	 *
+	 *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
+	 */
+	delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
+			  now->vnow - ioc->period_at_vtime);
+	target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
+	new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
 
-	return usage;
+	return clamp_t(s64, new_hwi, 1, hwm);
 }
 
 /*
@@ -1812,7 +1808,7 @@ static void ioc_timer_fn(struct timer_list *timer)
 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
 	u32 missed_ppm[2], rq_wait_pct;
 	u64 period_vtime;
-	int prev_busy_level, i;
+	int prev_busy_level;
 
 	/* how were the latencies during the period? */
 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
@@ -1857,11 +1853,10 @@ static void ioc_timer_fn(struct timer_list *timer)
 	}
 	commit_weights(ioc);
 
-	/* calc usages and see whether some weights need to be moved around */
+	/* calc usage and see whether some weights need to be moved around */
 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
-		u64 vdone, vtime, usage_us;
-		u32 hw_active, hw_inuse, usage;
-		int uidx, nr_valid;
+		u64 vdone, vtime, usage_us, usage_dur;
+		u32 usage, hw_active, hw_inuse;
 
 		/*
 		 * Collect unused and wind vtime closer to vnow to prevent
@@ -1886,15 +1881,11 @@ static void ioc_timer_fn(struct timer_list *timer)
 			nr_lagging++;
 
 		/*
-		 * Determine absolute usage factoring in pending and in-flight
-		 * IOs to avoid stalls and high-latency completions appearing as
-		 * idle.
+		 * Determine absolute usage factoring in in-flight IOs to avoid
+		 * high-latency completions appearing as idle.
 		 */
 		usage_us = iocg->usage_delta_us;
-		if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow))
-			usage_us += DIV64_U64_ROUND_UP(
-				cost_to_abs_cost(now.vnow - vtime, hw_inuse),
-				now.vrate);
+
 		if (vdone != vtime) {
 			u64 inflight_us = DIV64_U64_ROUND_UP(
 				cost_to_abs_cost(vtime - vdone, hw_inuse),
@@ -1902,43 +1893,22 @@ static void ioc_timer_fn(struct timer_list *timer)
 			usage_us = max(usage_us, inflight_us);
 		}
 
-		/* convert to hweight based usage ratio and record */
-		uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
-
-		if (time_after64(vtime, now.vnow - ioc->margins.min)) {
-			iocg->usage_idx = uidx;
-			iocg->usages[uidx] = WEIGHT_ONE;
-		} else if (usage_us) {
-			u64 started_at, dur;
-
-			if (time_after64(iocg->activated_at, ioc->period_at))
-				started_at = iocg->activated_at;
-			else
-				started_at = ioc->period_at;
-
-			dur = max_t(u64, now.now - started_at, 1);
+		/* convert to hweight based usage ratio */
+		if (time_after64(iocg->activated_at, ioc->period_at))
+			usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
+		else
+			usage_dur = max_t(u64, now.now - ioc->period_at, 1);
 
-			iocg->usage_idx = uidx;
-			iocg->usages[uidx] = clamp_t(u32,
-				DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur),
+		usage = clamp_t(u32,
+				DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
+						   usage_dur),
 				1, WEIGHT_ONE);
-		}
-
-		/* base the decision on max historical usage */
-		for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) {
-			if (iocg->usages[i]) {
-				usage = max(usage, iocg->usages[i]);
-				nr_valid++;
-			}
-		}
-		if (nr_valid < MIN_VALID_USAGES)
-			usage = WEIGHT_ONE;
 
 		/* see whether there's surplus vtime */
 		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
 		if (hw_inuse < hw_active ||
 		    (!waitqueue_active(&iocg->waitq) &&
-		     time_before64(vtime, now.vnow - ioc->margins.max))) {
+		     time_before64(vtime, now.vnow - ioc->margins.low))) {
 			u32 hwa, hwm, new_hwi;
 
 			/*
@@ -2175,15 +2145,14 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
 	if (!ioc->enabled || !iocg->level)
 		return;
 
-	/* always activate so that even 0 cost IOs get protected to some level */
-	if (!iocg_activate(iocg, &now))
-		return;
-
 	/* calculate the absolute vtime cost */
 	abs_cost = calc_vtime_cost(bio, iocg, false);
 	if (!abs_cost)
 		return;
 
+	if (!iocg_activate(iocg, &now))
+		return;
+
 	iocg->cursor = bio_end_sector(bio);
 
 	vtime = atomic64_read(&iocg->vtime);
-- 
2.26.2


  parent reply	other threads:[~2020-09-01 18:55 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-01 18:52 [PATCHSET for-5.10/block] blk-iocost: iocost: improve donation, debt and excess handling Tejun Heo
2020-09-01 18:52 ` [PATCH 01/27] blk-iocost: ioc_pd_free() shouldn't assume irq disabled Tejun Heo
2020-09-01 18:52 ` [PATCH 02/27] blk-stat: make q->stats->lock irqsafe Tejun Heo
2020-09-01 18:52   ` Tejun Heo
2020-09-01 18:52 ` [PATCH 03/27] blk-iocost: use local[64]_t for percpu stat Tejun Heo
2020-09-01 18:52   ` Tejun Heo
2020-11-20 21:51   ` Stafford Horne
2020-11-20 22:13     ` Tejun Heo
2020-11-20 22:13       ` Tejun Heo
2020-09-01 18:52 ` [PATCH 04/27] blk-iocost: rename propagate_active_weights() to propagate_weights() Tejun Heo
2020-09-01 18:52 ` [PATCH 05/27] blk-iocost: clamp inuse and skip noops in __propagate_weights() Tejun Heo
2020-09-01 18:52 ` [PATCH 06/27] blk-iocost: move iocg_kick_delay() above iocg_kick_waitq() Tejun Heo
2020-09-01 18:52 ` [PATCH 07/27] blk-iocost: make iocg_kick_waitq() call iocg_kick_delay() after paying debt Tejun Heo
2020-09-01 18:52 ` [PATCH 08/27] blk-iocost: s/HWEIGHT_WHOLE/WEIGHT_ONE/g Tejun Heo
2020-09-01 18:52   ` Tejun Heo
2020-09-01 18:52 ` [PATCH 09/27] blk-iocost: use WEIGHT_ONE based fixed point number for weights Tejun Heo
2020-09-01 18:52 ` [PATCH 10/27] blk-iocost: make ioc_now->now and ioc->period_at 64bit Tejun Heo
2020-09-01 18:52 ` [PATCH 11/27] blk-iocost: streamline vtime margin and timer slack handling Tejun Heo
2020-09-01 18:52 ` [PATCH 12/27] blk-iocost: grab ioc->lock for debt handling Tejun Heo
2020-09-01 18:52 ` [PATCH 13/27] blk-iocost: add absolute usage stat Tejun Heo
2020-09-01 18:52   ` Tejun Heo
2020-09-01 18:52 ` [PATCH 14/27] blk-iocost: calculate iocg->usages[] from iocg->local_stat.usage_us Tejun Heo
2020-09-01 18:52   ` Tejun Heo
2020-09-01 18:52 ` [PATCH 15/27] blk-iocost: replace iocg->has_surplus with ->surplus_list Tejun Heo
2020-09-01 18:52 ` [PATCH 16/27] blk-iocost: decouple vrate adjustment from surplus transfers Tejun Heo
2020-09-01 18:52   ` Tejun Heo
2020-09-01 18:52 ` [PATCH 17/27] blk-iocost: restructure surplus donation logic Tejun Heo
2020-09-01 18:52 ` [PATCH 18/27] blk-iocost: implement Andy's method for donation weight updates Tejun Heo
2020-09-01 18:52 ` Tejun Heo [this message]
2020-09-01 18:52 ` [PATCH 20/27] blk-iocost: revamp in-period donation snapbacks Tejun Heo
2020-09-01 18:52   ` Tejun Heo
2020-09-01 18:52 ` [PATCH 21/27] blk-iocost: revamp debt handling Tejun Heo
2020-09-01 18:52   ` Tejun Heo
2020-09-01 18:52 ` [PATCH 22/27] blk-iocost: implement delay adjustment hysteresis Tejun Heo
2020-09-01 18:52 ` [PATCH 23/27] blk-iocost: halve debts if device stays idle Tejun Heo
2020-09-01 18:52 ` [PATCH 24/27] blk-iocost: implement vtime loss compensation Tejun Heo
2020-09-01 18:52 ` [PATCH 25/27] blk-iocost: restore inuse update tracepoints Tejun Heo
2020-09-01 18:52 ` [PATCH 26/27] blk-iocost: add three debug stat - cost.wait, indebt and indelay Tejun Heo
2020-09-01 18:52   ` Tejun Heo
2020-09-01 18:52 ` [PATCH 27/27] blk-iocost: update iocost_monitor.py Tejun Heo
2020-09-01 22:57 ` [PATCHSET for-5.10/block] blk-iocost: iocost: improve donation, debt and excess handling Jens Axboe
2020-09-01 22:57   ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200901185257.645114-20-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=cgroups@vger.kernel.org \
    --cc=kernel-team@fb.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=newella@fb.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.