On Mon 16-05-22 09:47:14, Stefan Roesch wrote:
> This factors out the for loop in balance_dirty_pages() into a new
> function called _balance_dirty_pages(). By factoring out this function
> the async write code can determine if it has to wait to throttle writes
> or not. The function _balance_dirty_pages() returns the sleep time.
> If the sleep time is greater than 0, then the async write code needs to
> throttle.
>
> To maintain the context for consecutive calls of _balance_dirty_pages()
> and maintain the current behavior, a new data structure called bdp_ctx
> has been introduced.
>
> Signed-off-by: Stefan Roesch ...
> ---
>  mm/page-writeback.c | 452 +++++++++++++++++++++++---------------------
>  1 file changed, 239 insertions(+), 213 deletions(-)
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 7e2da284e427..cbb74c0666c6 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -140,6 +140,29 @@ struct dirty_throttle_control {
>  	unsigned long pos_ratio;
>  };
>
> +/* context for consecutive calls to _balance_dirty_pages() */
> +struct bdp_ctx {
> +	long pause;
> +	unsigned long now;
> +	unsigned long start_time;
> +	unsigned long task_ratelimit;
> +	unsigned long dirty_ratelimit;
> +	unsigned long nr_reclaimable;
> +	int nr_dirtied_pause;
> +	bool dirty_exceeded;
> +
> +	struct dirty_throttle_control gdtc_stor;
> +	struct dirty_throttle_control mdtc_stor;
> +	struct dirty_throttle_control *sdtc;
> +} bdp_ctx;

Looking at how much context you propagate into _balance_dirty_pages() I
don't think this suggestion was as great as I'd hoped. I'm sorry for that.
We could actually significantly reduce the amount of context passed in/out
but some things would be difficult to get rid of and some interactions of
code in _balance_dirty_pages() and the caller are actually pretty subtle.
I think something like the attached three patches should make adding
NOWAIT support to balance_dirty_pages() reasonably readable.

								Honza

> +
> +/* initialize _balance_dirty_pages() context */
> +#define BDP_CTX_INIT(ctx, wb)					\
> +	.gdtc_stor = { GDTC_INIT(wb) },				\
> +	.mdtc_stor = { MDTC_INIT(wb, &ctx.gdtc_stor) },		\
> +	.start_time = jiffies,					\
> +	.dirty_exceeded = false
> +
>  /*
>   * Length of period for aging writeout fractions of bdis. This is an
>   * arbitrarily chosen number. The longer the period, the slower fractions will
> @@ -1538,261 +1561,264 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
>  	}
>  }
>
> -/*
> - * balance_dirty_pages() must be called by processes which are generating dirty
> - * data. It looks at the number of dirty pages in the machine and will force
> - * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
> - * If we're over `background_thresh' then the writeback threads are woken to
> - * perform some writeout.
> - */
> -static void balance_dirty_pages(struct bdi_writeback *wb,
> -				unsigned long pages_dirtied)
> +static inline int _balance_dirty_pages(struct bdi_writeback *wb,
> +				       unsigned long pages_dirtied, struct bdp_ctx *ctx)
>  {
> -	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
> -	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
> -	struct dirty_throttle_control * const gdtc = &gdtc_stor;
> -	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
> -						     &mdtc_stor : NULL;
> -	struct dirty_throttle_control *sdtc;
> -	unsigned long nr_reclaimable; /* = file_dirty */
> +	struct dirty_throttle_control * const gdtc = &ctx->gdtc_stor;
> +	struct dirty_throttle_control * const mdtc = mdtc_valid(&ctx->mdtc_stor) ?
> +						     &ctx->mdtc_stor : NULL;
>  	long period;
> -	long pause;
>  	long max_pause;
>  	long min_pause;
> -	int nr_dirtied_pause;
> -	bool dirty_exceeded = false;
> -	unsigned long task_ratelimit;
> -	unsigned long dirty_ratelimit;
>  	struct backing_dev_info *bdi = wb->bdi;
>  	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
> -	unsigned long start_time = jiffies;
>
> -	for (;;) {
> -		unsigned long now = jiffies;
> -		unsigned long dirty, thresh, bg_thresh;
> -		unsigned long m_dirty = 0; /* stop bogus uninit warnings */
> -		unsigned long m_thresh = 0;
> -		unsigned long m_bg_thresh = 0;
> -
> -		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
> -		gdtc->avail = global_dirtyable_memory();
> -		gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
> +	unsigned long dirty, thresh, bg_thresh;
> +	unsigned long m_dirty = 0; /* stop bogus uninit warnings */
> +	unsigned long m_thresh = 0;
> +	unsigned long m_bg_thresh = 0;
>
> -		domain_dirty_limits(gdtc);
> +	ctx->now = jiffies;
> +	ctx->nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
> +	gdtc->avail = global_dirtyable_memory();
> +	gdtc->dirty = ctx->nr_reclaimable + global_node_page_state(NR_WRITEBACK);
>
> -		if (unlikely(strictlimit)) {
> -			wb_dirty_limits(gdtc);
> +	domain_dirty_limits(gdtc);
>
> -			dirty = gdtc->wb_dirty;
> -			thresh = gdtc->wb_thresh;
> -			bg_thresh = gdtc->wb_bg_thresh;
> -		} else {
> -			dirty = gdtc->dirty;
> -			thresh = gdtc->thresh;
> -			bg_thresh = gdtc->bg_thresh;
> -		}
> +	if (unlikely(strictlimit)) {
> +		wb_dirty_limits(gdtc);
>
> -		if (mdtc) {
> -			unsigned long filepages, headroom, writeback;
> +		dirty = gdtc->wb_dirty;
> +		thresh = gdtc->wb_thresh;
> +		bg_thresh = gdtc->wb_bg_thresh;
> +	} else {
> +		dirty = gdtc->dirty;
> +		thresh = gdtc->thresh;
> +		bg_thresh = gdtc->bg_thresh;
> +	}
>
> -			/*
> -			 * If @wb belongs to !root memcg, repeat the same
> -			 * basic calculations for the memcg domain.
> -			 */
> -			mem_cgroup_wb_stats(wb, &filepages, &headroom,
> -					    &mdtc->dirty, &writeback);
> -			mdtc->dirty += writeback;
> -			mdtc_calc_avail(mdtc, filepages, headroom);
> -
> -			domain_dirty_limits(mdtc);
> -
> -			if (unlikely(strictlimit)) {
> -				wb_dirty_limits(mdtc);
> -				m_dirty = mdtc->wb_dirty;
> -				m_thresh = mdtc->wb_thresh;
> -				m_bg_thresh = mdtc->wb_bg_thresh;
> -			} else {
> -				m_dirty = mdtc->dirty;
> -				m_thresh = mdtc->thresh;
> -				m_bg_thresh = mdtc->bg_thresh;
> -			}
> -		}
> +	if (mdtc) {
> +		unsigned long filepages, headroom, writeback;
>
>  		/*
> -		 * Throttle it only when the background writeback cannot
> -		 * catch-up. This avoids (excessively) small writeouts
> -		 * when the wb limits are ramping up in case of !strictlimit.
> -		 *
> -		 * In strictlimit case make decision based on the wb counters
> -		 * and limits. Small writeouts when the wb limits are ramping
> -		 * up are the price we consciously pay for strictlimit-ing.
> -		 *
> -		 * If memcg domain is in effect, @dirty should be under
> -		 * both global and memcg freerun ceilings.
> +		 * If @wb belongs to !root memcg, repeat the same
> +		 * basic calculations for the memcg domain.
>  		 */
> -		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
> -		    (!mdtc ||
> -		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
> -			unsigned long intv;
> -			unsigned long m_intv;
> +		mem_cgroup_wb_stats(wb, &filepages, &headroom,
> +				    &mdtc->dirty, &writeback);
> +		mdtc->dirty += writeback;
> +		mdtc_calc_avail(mdtc, filepages, headroom);
>
> -free_running:
> -			intv = dirty_poll_interval(dirty, thresh);
> -			m_intv = ULONG_MAX;
> +		domain_dirty_limits(mdtc);
>
> -			current->dirty_paused_when = now;
> -			current->nr_dirtied = 0;
> -			if (mdtc)
> -				m_intv = dirty_poll_interval(m_dirty, m_thresh);
> -			current->nr_dirtied_pause = min(intv, m_intv);
> -			break;
> +		if (unlikely(strictlimit)) {
> +			wb_dirty_limits(mdtc);
> +			m_dirty = mdtc->wb_dirty;
> +			m_thresh = mdtc->wb_thresh;
> +			m_bg_thresh = mdtc->wb_bg_thresh;
> +		} else {
> +			m_dirty = mdtc->dirty;
> +			m_thresh = mdtc->thresh;
> +			m_bg_thresh = mdtc->bg_thresh;
>  		}
> +	}
>
> -		if (unlikely(!writeback_in_progress(wb)))
> -			wb_start_background_writeback(wb);
> +	/*
> +	 * Throttle it only when the background writeback cannot
> +	 * catch-up. This avoids (excessively) small writeouts
> +	 * when the wb limits are ramping up in case of !strictlimit.
> +	 *
> +	 * In strictlimit case make decision based on the wb counters
> +	 * and limits. Small writeouts when the wb limits are ramping
> +	 * up are the price we consciously pay for strictlimit-ing.
> +	 *
> +	 * If memcg domain is in effect, @dirty should be under
> +	 * both global and memcg freerun ceilings.
> +	 */
> +	if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
> +	    (!mdtc ||
> +	     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
> +		unsigned long intv;
> +		unsigned long m_intv;
>
> -		mem_cgroup_flush_foreign(wb);
> +free_running:
> +		intv = dirty_poll_interval(dirty, thresh);
> +		m_intv = ULONG_MAX;
> +
> +		current->dirty_paused_when = ctx->now;
> +		current->nr_dirtied = 0;
> +		if (mdtc)
> +			m_intv = dirty_poll_interval(m_dirty, m_thresh);
> +		current->nr_dirtied_pause = min(intv, m_intv);
> +		return 0;
> +	}
> +
> +	if (unlikely(!writeback_in_progress(wb)))
> +		wb_start_background_writeback(wb);
>
> +	mem_cgroup_flush_foreign(wb);
> +
> +	/*
> +	 * Calculate global domain's pos_ratio and select the
> +	 * global dtc by default.
> +	 */
> +	if (!strictlimit) {
> +		wb_dirty_limits(gdtc);
> +
> +		if ((current->flags & PF_LOCAL_THROTTLE) &&
> +		    gdtc->wb_dirty <
> +		    dirty_freerun_ceiling(gdtc->wb_thresh,
> +					  gdtc->wb_bg_thresh))
> +			/*
> +			 * LOCAL_THROTTLE tasks must not be throttled
> +			 * when below the per-wb freerun ceiling.
> +			 */
> +			goto free_running;
> +	}
> +
> +	ctx->dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
> +		((gdtc->dirty > gdtc->thresh) || strictlimit);
> +
> +	wb_position_ratio(gdtc);
> +	ctx->sdtc = gdtc;
> +
> +	if (mdtc) {
>  		/*
> -		 * Calculate global domain's pos_ratio and select the
> -		 * global dtc by default.
> +		 * If memcg domain is in effect, calculate its
> +		 * pos_ratio. @wb should satisfy constraints from
> +		 * both global and memcg domains. Choose the one
> +		 * w/ lower pos_ratio.
>  		 */
>  		if (!strictlimit) {
> -			wb_dirty_limits(gdtc);
> +			wb_dirty_limits(mdtc);
>
>  			if ((current->flags & PF_LOCAL_THROTTLE) &&
> -			    gdtc->wb_dirty <
> -			    dirty_freerun_ceiling(gdtc->wb_thresh,
> -						  gdtc->wb_bg_thresh))
> +			    mdtc->wb_dirty <
> +			    dirty_freerun_ceiling(mdtc->wb_thresh,
> +						  mdtc->wb_bg_thresh))
>  				/*
> -				 * LOCAL_THROTTLE tasks must not be throttled
> -				 * when below the per-wb freerun ceiling.
> +				 * LOCAL_THROTTLE tasks must not be
> +				 * throttled when below the per-wb
> +				 * freerun ceiling.
>  				 */
>  				goto free_running;
>  		}
> +		ctx->dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
> +			((mdtc->dirty > mdtc->thresh) || strictlimit);
>
> -		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
> -			((gdtc->dirty > gdtc->thresh) || strictlimit);
> +		wb_position_ratio(mdtc);
> +		if (mdtc->pos_ratio < gdtc->pos_ratio)
> +			ctx->sdtc = mdtc;
> +	}
>
> -		wb_position_ratio(gdtc);
> -		sdtc = gdtc;
> +	if (ctx->dirty_exceeded && !wb->dirty_exceeded)
> +		wb->dirty_exceeded = 1;
>
> -		if (mdtc) {
> -			/*
> -			 * If memcg domain is in effect, calculate its
> -			 * pos_ratio. @wb should satisfy constraints from
> -			 * both global and memcg domains. Choose the one
> -			 * w/ lower pos_ratio.
> -			 */
> -			if (!strictlimit) {
> -				wb_dirty_limits(mdtc);
> -
> -				if ((current->flags & PF_LOCAL_THROTTLE) &&
> -				    mdtc->wb_dirty <
> -				    dirty_freerun_ceiling(mdtc->wb_thresh,
> -							  mdtc->wb_bg_thresh))
> -					/*
> -					 * LOCAL_THROTTLE tasks must not be
> -					 * throttled when below the per-wb
> -					 * freerun ceiling.
> -					 */
> -					goto free_running;
> -			}
> -			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
> -				((mdtc->dirty > mdtc->thresh) || strictlimit);
> +	if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
> +				   BANDWIDTH_INTERVAL))
> +		__wb_update_bandwidth(gdtc, mdtc, true);
> +
> +	/* throttle according to the chosen dtc */
> +	ctx->dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
> +	ctx->task_ratelimit = ((u64)ctx->dirty_ratelimit * ctx->sdtc->pos_ratio) >>
> +					RATELIMIT_CALC_SHIFT;
> +	max_pause = wb_max_pause(wb, ctx->sdtc->wb_dirty);
> +	min_pause = wb_min_pause(wb, max_pause,
> +				 ctx->task_ratelimit, ctx->dirty_ratelimit,
> +				 &ctx->nr_dirtied_pause);
> +
> +	if (unlikely(ctx->task_ratelimit == 0)) {
> +		period = max_pause;
> +		ctx->pause = max_pause;
> +		goto pause;
> +	}
> +	period = HZ * pages_dirtied / ctx->task_ratelimit;
> +	ctx->pause = period;
> +	if (current->dirty_paused_when)
> +		ctx->pause -= ctx->now - current->dirty_paused_when;
> +	/*
> +	 * For less than 1s think time (ext3/4 may block the dirtier
> +	 * for up to 800ms from time to time on 1-HDD; so does xfs,
> +	 * however at much less frequency), try to compensate it in
> +	 * future periods by updating the virtual time; otherwise just
> +	 * do a reset, as it may be a light dirtier.
> +	 */
> +	if (ctx->pause < min_pause) {
> +		trace_balance_dirty_pages(wb,
> +					  ctx->sdtc->thresh,
> +					  ctx->sdtc->bg_thresh,
> +					  ctx->sdtc->dirty,
> +					  ctx->sdtc->wb_thresh,
> +					  ctx->sdtc->wb_dirty,
> +					  ctx->dirty_ratelimit,
> +					  ctx->task_ratelimit,
> +					  pages_dirtied,
> +					  period,
> +					  min(ctx->pause, 0L),
> +					  ctx->start_time);
> +		if (ctx->pause < -HZ) {
> +			current->dirty_paused_when = ctx->now;
> +			current->nr_dirtied = 0;
> +		} else if (period) {
> +			current->dirty_paused_when += period;
> +			current->nr_dirtied = 0;
> +		} else if (current->nr_dirtied_pause <= pages_dirtied)
> +			current->nr_dirtied_pause += pages_dirtied;
> +		return 0;
> +	}
> +	if (unlikely(ctx->pause > max_pause)) {
> +		/* for occasional dropped task_ratelimit */
> +		ctx->now += min(ctx->pause - max_pause, max_pause);
> +		ctx->pause = max_pause;
> +	}
>
> -		wb_position_ratio(mdtc);
> -		if (mdtc->pos_ratio < gdtc->pos_ratio)
> -			sdtc = mdtc;
> -	}
> +pause:
> +	trace_balance_dirty_pages(wb,
> +				  ctx->sdtc->thresh,
> +				  ctx->sdtc->bg_thresh,
> +				  ctx->sdtc->dirty,
> +				  ctx->sdtc->wb_thresh,
> +				  ctx->sdtc->wb_dirty,
> +				  ctx->dirty_ratelimit,
> +				  ctx->task_ratelimit,
> +				  pages_dirtied,
> +				  period,
> +				  ctx->pause,
> +				  ctx->start_time);
> +
> +	return ctx->pause;
> +}
>
> -		if (dirty_exceeded && !wb->dirty_exceeded)
> -			wb->dirty_exceeded = 1;
> -
> -		if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
> -					   BANDWIDTH_INTERVAL))
> -			__wb_update_bandwidth(gdtc, mdtc, true);
> -
> -		/* throttle according to the chosen dtc */
> -		dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
> -		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
> -						RATELIMIT_CALC_SHIFT;
> -		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
> -		min_pause = wb_min_pause(wb, max_pause,
> -					 task_ratelimit, dirty_ratelimit,
> -					 &nr_dirtied_pause);
> -
> -		if (unlikely(task_ratelimit == 0)) {
> -			period = max_pause;
> -			pause = max_pause;
> -			goto pause;
> -		}
> -		period = HZ * pages_dirtied / task_ratelimit;
> -		pause = period;
> -		if (current->dirty_paused_when)
> -			pause -= now - current->dirty_paused_when;
> -		/*
> -		 * For less than 1s think time (ext3/4 may block the dirtier
> -		 * for up to 800ms from time to time on 1-HDD; so does xfs,
> -		 * however at much less frequency), try to compensate it in
> -		 * future periods by updating the virtual time; otherwise just
> -		 * do a reset, as it may be a light dirtier.
> -		 */
> -		if (pause < min_pause) {
> -			trace_balance_dirty_pages(wb,
> -						  sdtc->thresh,
> -						  sdtc->bg_thresh,
> -						  sdtc->dirty,
> -						  sdtc->wb_thresh,
> -						  sdtc->wb_dirty,
> -						  dirty_ratelimit,
> -						  task_ratelimit,
> -						  pages_dirtied,
> -						  period,
> -						  min(pause, 0L),
> -						  start_time);
> -			if (pause < -HZ) {
> -				current->dirty_paused_when = now;
> -				current->nr_dirtied = 0;
> -			} else if (period) {
> -				current->dirty_paused_when += period;
> -				current->nr_dirtied = 0;
> -			} else if (current->nr_dirtied_pause <= pages_dirtied)
> -				current->nr_dirtied_pause += pages_dirtied;
> +/*
> + * balance_dirty_pages() must be called by processes which are generating dirty
> + * data. It looks at the number of dirty pages in the machine and will force
> + * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
> + * If we're over `background_thresh' then the writeback threads are woken to
> + * perform some writeout.
> + */
> +static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied)
> +{
> +	int ret;
> +	struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };
> +
> +	for (;;) {
> +		ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx);
> +		if (!ret)
>  			break;
> -		}
> -		if (unlikely(pause > max_pause)) {
> -			/* for occasional dropped task_ratelimit */
> -			now += min(pause - max_pause, max_pause);
> -			pause = max_pause;
> -		}
>
> -pause:
> -		trace_balance_dirty_pages(wb,
> -					  sdtc->thresh,
> -					  sdtc->bg_thresh,
> -					  sdtc->dirty,
> -					  sdtc->wb_thresh,
> -					  sdtc->wb_dirty,
> -					  dirty_ratelimit,
> -					  task_ratelimit,
> -					  pages_dirtied,
> -					  period,
> -					  pause,
> -					  start_time);
>  		__set_current_state(TASK_KILLABLE);
> -		wb->dirty_sleep = now;
> -		io_schedule_timeout(pause);
> +		wb->dirty_sleep = ctx.now;
> +		io_schedule_timeout(ctx.pause);
>
> -		current->dirty_paused_when = now + pause;
> +		current->dirty_paused_when = ctx.now + ctx.pause;
>  		current->nr_dirtied = 0;
> -		current->nr_dirtied_pause = nr_dirtied_pause;
> +		current->nr_dirtied_pause = ctx.nr_dirtied_pause;
>
>  		/*
>  		 * This is typically equal to (dirty < thresh) and can also
>  		 * keep "1000+ dd on a slow USB stick" under control.
>  		 */
> -		if (task_ratelimit)
> +		if (ctx.task_ratelimit)
>  			break;
>
>  		/*
> @@ -1805,14 +1831,14 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
>  		 * more page. However wb_dirty has accounting errors. So use
>  		 * the larger and more IO friendly wb_stat_error.
>  		 */
> -		if (sdtc->wb_dirty <= wb_stat_error())
> +		if (ctx.sdtc->wb_dirty <= wb_stat_error())
>  			break;
>
>  		if (fatal_signal_pending(current))
>  			break;
>  	}
>
> -	if (!dirty_exceeded && wb->dirty_exceeded)
> +	if (!ctx.dirty_exceeded && wb->dirty_exceeded)
>  		wb->dirty_exceeded = 0;
>
>  	if (writeback_in_progress(wb))
> @@ -1829,7 +1855,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
>  	if (laptop_mode)
>  		return;
>
> -	if (nr_reclaimable > gdtc->bg_thresh)
> +	if (ctx.nr_reclaimable > ctx.gdtc_stor.bg_thresh)
>  		wb_start_background_writeback(wb);
>  }
>
> --
> 2.30.2
>

-- 
Jan Kara
SUSE Labs, CR
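
[Editorial illustration, not part of the thread above: a minimal sketch of how an
async write path could consume the sleep time returned by _balance_dirty_pages()
without blocking, per the commit message's "if the sleep time is greater than 0,
then the async write code needs to throttle". The function name and the -EAGAIN
convention are assumptions for illustration only; they are not part of the posted
patch or of Jan's attached patches.]

static int balance_dirty_pages_nowait(struct bdi_writeback *wb,
				      unsigned long pages_dirtied)
{
	struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };

	/*
	 * Hypothetical caller: a positive return value is the time a
	 * synchronous dirtier would spend in io_schedule_timeout().
	 * An async writer must not sleep, so it reports -EAGAIN and
	 * leaves it to the caller to retry the write from a context
	 * that is allowed to block.
	 */
	if (_balance_dirty_pages(wb, pages_dirtied, &ctx) > 0)
		return -EAGAIN;
	return 0;
}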