Linux-mm Archive on lore.kernel.org
 help / color / Atom feed
From: Hillf Danton <hdanton@sina.com>
To: mm <linux-mm@kvack.org>
Cc: fsdev <linux-fsdevel@vger.kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux <linux-kernel@vger.kernel.org>,
	Roman Gushchin <guro@fb.com>, Tejun Heo <tj@kernel.org>,
	Jan Kara <jack@suse.cz>, Johannes Weiner <hannes@cmpxchg.org>,
	Shakeel Butt <shakeelb@google.com>,
	Minchan Kim <minchan@kernel.org>, Mel Gorman <mgorman@suse.de>,
	Hillf Danton <hdanton@sina.com>
Subject: [RFC] writeback: add elastic bdi in cgwb bdp
Date: Sat, 12 Oct 2019 21:27:40 +0800
Message-ID: <20191012132740.12968-1-hdanton@sina.com> (raw)


The behaviors of the elastic bdi (ebdi) observed in the current cgwb
bandwidth measurement include

1, like spinning disks on the market, ebdi can do ~128MB/s IOs for
consecutive minutes in a few scenarios, or higher like an SSD, or lower
like a USB key.

2, with ebdi, a bdi_writeback, wb-A, is able to do 80MB/s writeouts in the
current time window of 200ms, while it did 16MB/s in the previous one.

3, it will be either 100MB/s in the next time window if wb-B joins wb-A
writing pages out or 18MB/s if wb-C also decides to chime in.

With the help of the bandwidth gauged above, what is left in balancing
dirty pages (bdp) is to try to make wb-A's laundry speed catch up with the
dirty speed in every 200ms interval without knowing what wb-B is doing.

No heuristic is added in this work because ebdi does bdp without it.

Cc: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Hillf Danton <hdanton@sina.com>
---

--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -157,6 +157,9 @@ struct bdi_writeback {
 	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
 	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
 
+#ifdef CONFIG_CGWB_BDP_WITH_EBDI
+	struct wait_queue_head bdp_waitq;
+#endif
 	union {
 		struct work_struct release_work;
 		struct rcu_head rcu;
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -324,6 +324,10 @@ static int wb_init(struct bdi_writeback
 			goto out_destroy_stat;
 	}
 
+	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		init_waitqueue_head(&wb->bdp_waitq);
+
 	return 0;
 
 out_destroy_stat:
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1551,6 +1551,45 @@ static inline void wb_dirty_limits(struc
 	}
 }
 
+#if defined(CONFIG_CGROUP_WRITEBACK) && defined(CONFIG_CGWB_BDP_WITH_EBDI)
+static bool cgwb_bdp_should_throttle(struct bdi_writeback *wb)
+{
+	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+
+	if (fatal_signal_pending(current))	/* never stall a dying task */
+		return false;
+
+	gdtc.avail = global_dirtyable_memory();
+	/* presumably fills in gdtc.bg_thresh and gdtc.thresh read below */
+	domain_dirty_limits(&gdtc);
+
+	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
+			global_node_page_state(NR_UNSTABLE_NFS) +
+			global_node_page_state(NR_WRITEBACK);
+	/* below bg_thresh: nothing to throttle */
+	if (gdtc.dirty < gdtc.bg_thresh)
+		return false;
+	/* not below bg_thresh: kick background writeback if this wb is idle */
+	if (!writeback_in_progress(wb))
+		wb_start_background_writeback(wb);
+	/*
+	 * Throttle only when global dirty pages exceed gdtc.thresh and this wb
+	 * has dirtied more than it has written plus wb_stat_error(), i.e. its
+	 * laundry lags dirtying (200ms window per changelog, TODO confirm).
+	 */
+	return gdtc.dirty > gdtc.thresh &&
+		wb_stat(wb, WB_DIRTIED) >
+		wb_stat(wb, WB_WRITTEN) +
+		wb_stat_error();
+}
+/* Sleep on wb->bdp_waitq until no throttle is needed, a signal, or HZ. */
+static inline void cgwb_bdp(struct bdi_writeback *wb)
+{
+	wait_event_interruptible_timeout(wb->bdp_waitq,
+			!cgwb_bdp_should_throttle(wb), HZ);
+}
+#endif
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
@@ -1910,7 +1949,11 @@ void balance_dirty_pages_ratelimited(str
 	preempt_enable();
 
 	if (unlikely(current->nr_dirtied >= ratelimit))
-		balance_dirty_pages(wb, current->nr_dirtied);
+		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+		    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+			cgwb_bdp(wb);
+		else
+			balance_dirty_pages(wb, current->nr_dirtied);
 
 	wb_put(wb);
 }
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -632,6 +632,11 @@ void wbc_detach_inode(struct writeback_c
 	if (!wb)
 		return;
 
+	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		if (waitqueue_active(&wb->bdp_waitq))
+			wake_up_all(&wb->bdp_waitq);
+
 	history = inode->i_wb_frn_history;
 	avg_time = inode->i_wb_frn_avg_time;
 
@@ -811,6 +816,9 @@ static long wb_split_bdi_pages(struct bd
 	if (nr_pages == LONG_MAX)
 		return LONG_MAX;
 
+	if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+	    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		return nr_pages;
 	/*
 	 * This may be called on clean wb's and proportional distribution
 	 * may not make sense, just use the original @nr_pages in those
@@ -1599,6 +1607,10 @@ static long writeback_chunk_size(struct
 	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
 		pages = LONG_MAX;
 	else {
+		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) &&
+		    IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+			return work->nr_pages;
+
 		pages = min(wb->avg_write_bandwidth / 2,
 			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
 		pages = min(pages, work->nr_pages);
--



             reply index

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-12 13:27 Hillf Danton [this message]
2019-10-15 10:22 ` Jan Kara
2019-10-15 14:03 ` Hillf Danton
2019-10-15 14:37   ` Tejun Heo
2019-10-16  2:26   ` Hillf Danton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191012132740.12968-1-hdanton@sina.com \
    --to=hdanton@sina.com \
    --cc=akpm@linux-foundation.org \
    --cc=guro@fb.com \
    --cc=hannes@cmpxchg.org \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=minchan@kernel.org \
    --cc=shakeelb@google.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-mm Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-mm/0 linux-mm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-mm linux-mm/ https://lore.kernel.org/linux-mm \
		linux-mm@kvack.org linux-mm@archiver.kernel.org
	public-inbox-index linux-mm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kvack.linux-mm


AGPL code for this site: git clone https://public-inbox.org/ public-inbox