All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC v1] writeback: add elastic bdi in cgwb bdp
@ 2019-10-20 13:00 Hillf Danton
  2019-10-21 23:33 ` kbuild test robot
  2019-10-22  5:56 ` kbuild test robot
  0 siblings, 2 replies; 3+ messages in thread
From: Hillf Danton @ 2019-10-20 13:00 UTC (permalink / raw)
  To: linux-mm
  Cc: fsdev, Andrew Morton, linux-kernel, Tejun Heo, Jan Kara,
	Fengguang Wu, Johannes Weiner, Shakeel Butt, Minchan Kim,
	Mel Gorman, Hillf Danton


The elastic bdi is the mirror bdi of spinning disks, SSD, USB and
other storage devices/instruments on market. The performance of
ebdi goes up and down as the pattern of IO dispatched changes. It
can be approximately estimated as below.

	P = j(..., IO pattern);

In ebdi's view, the bandwidth currently measured in balancing dirty
pages has close relation to its performance because the former is a
part of the latter as represented below.

	B = y(P);

The functions above suggest that there may be a layer violation if
filesystems don't care what is measured at the mm layer, while it
could be better measured somewhere below fs.

It is measured however to the extent that makes every judge happy,
and is playing a role in dispatching IO with the IO pattern entirely
ignored that is volatile in nature.

And it helps to throttle the dirty speed, with the figure ignored
that DRAM in general is x10 faster than ebdi. If B is half of P for
instance, then it is near 5% of dirty speed, only 2 points from the
figure in the snippet below.

/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 * If it is too low then SMP machines will call the (expensive)
 * get_writeback_state too often.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds.
 */

To prevent dirty speed from running away from laundry speed in bdp,
ebdi suggests the walk-dog method to consider in assumption that a
leash churns less in IO pattern.

V1 is based on 5.4-rc3.

Changes since v0
- add CGWB_BDP_WITH_EBDI in mm/Kconfig
- drop wakeup in wbc_detach_inode()
- add wakeup in wb_workfn()

Cc: Tejun Heo <tj@kernel.org>
Cc: Jan Kara <jack@suse.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Hillf Danton <hdanton@sina.com>
---

--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -204,6 +204,14 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
 config MEMORY_BALLOON
 	bool
 
+config CGWB_BDP_WITH_EBDI
+	bool
+	help
+	  This puts the walk-dog method in balancing dirty pages
+	  instead of measuring bandwidth.
+
+	  Say N if unsure.
+
 #
 # support for memory balloon compaction
 config BALLOON_COMPACTION
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -170,6 +170,10 @@ struct bdi_writeback {
 
 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
 
+#ifdef CONFIG_CGWB_BDP_WITH_EBDI
+	struct wait_queue_head bdp_waitq;
+#endif
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct percpu_ref refcnt;	/* used only for !root wb's */
 	struct fprop_local_percpu memcg_completions;
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -324,6 +324,9 @@ static int wb_init(struct bdi_writeback
 			goto out_destroy_stat;
 	}
 
+	if (IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		init_waitqueue_head(&wb->bdp_waitq);
+
 	return 0;
 
 out_destroy_stat:
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1551,6 +1551,41 @@ static inline void wb_dirty_limits(struc
 	}
 }
 
+#ifdef CONFIG_CGWB_BDP_WITH_EBDI
+static bool cgwb_bdp_should_throttle(struct bdi_writeback *wb)
+{
+	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+
+	if (fatal_signal_pending(current))
+		return false;
+
+	gdtc.avail = global_dirtyable_memory();
+
+	domain_dirty_limits(&gdtc);
+
+	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
+			global_node_page_state(NR_UNSTABLE_NFS) +
+			global_node_page_state(NR_WRITEBACK);
+
+	if (gdtc.dirty < gdtc.bg_thresh)
+		return false;
+
+	if (!writeback_in_progress(wb))
+		wb_start_background_writeback(wb);
+
+	return gdtc.dirty > gdtc.thresh &&
+		wb_stat(wb, WB_DIRTIED) >
+		wb_stat(wb, WB_WRITTEN) +
+		wb_stat_error();
+}
+
+static inline void cgwb_bdp(struct bdi_writeback *wb)
+{
+	wait_event_interruptible_timeout(wb->bdp_waitq,
+			!cgwb_bdp_should_throttle(wb), HZ);
+}
+#endif
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
@@ -1910,7 +1945,10 @@ void balance_dirty_pages_ratelimited(str
 	preempt_enable();
 
 	if (unlikely(current->nr_dirtied >= ratelimit))
-		balance_dirty_pages(wb, current->nr_dirtied);
+		if (IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+			cgwb_bdp(wb);
+		else
+			balance_dirty_pages(wb, current->nr_dirtied);
 
 	wb_put(wb);
 }
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -811,6 +811,9 @@ static long wb_split_bdi_pages(struct bd
 	if (nr_pages == LONG_MAX)
 		return LONG_MAX;
 
+	if (IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		return nr_pages;
+
 	/*
 	 * This may be called on clean wb's and proportional distribution
 	 * may not make sense, just use the original @nr_pages in those
@@ -1598,6 +1601,8 @@ static long writeback_chunk_size(struct
 	 */
 	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
 		pages = LONG_MAX;
+	else if (IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		pages = work->nr_pages;
 	else {
 		pages = min(wb->avg_write_bandwidth / 2,
 			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
@@ -2092,6 +2097,10 @@ void wb_workfn(struct work_struct *work)
 		wb_wakeup_delayed(wb);
 
 	current->flags &= ~PF_SWAPWRITE;
+
+	if (IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
+		if (waitqueue_active(&wb->bdp_waitq))
+			wake_up_all(&wb->bdp_waitq);
 }
 
 /*
--



^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC v1] writeback: add elastic bdi in cgwb bdp
  2019-10-20 13:00 [RFC v1] writeback: add elastic bdi in cgwb bdp Hillf Danton
@ 2019-10-21 23:33 ` kbuild test robot
  2019-10-22  5:56 ` kbuild test robot
  1 sibling, 0 replies; 3+ messages in thread
From: kbuild test robot @ 2019-10-21 23:33 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 5738 bytes --]

Hi Hillf,

[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on linus/master]
[cannot apply to v5.4-rc4 next-20191021]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Hillf-Danton/writeback-add-elastic-bdi-in-cgwb-bdp/20191022-070010
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 7d194c2100ad2a6dded545887d02754948ca5241
config: i386-tinyconfig (attached as .config)
compiler: gcc-7 (Debian 7.4.0-14) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   mm/page-writeback.c: In function 'balance_dirty_pages_ratelimited':
>> mm/page-writeback.c:1949:4: error: implicit declaration of function 'cgwb_bdp' [-Werror=implicit-function-declaration]
       cgwb_bdp(wb);
       ^~~~~~~~
   mm/page-writeback.c:1947:5: warning: suggest explicit braces to avoid ambiguous 'else' [-Wdangling-else]
     if (unlikely(current->nr_dirtied >= ratelimit))
        ^
   cc1: some warnings being treated as errors
--
   In file included from mm/backing-dev.c:3:0:
   mm/backing-dev.c: In function 'wb_init':
>> mm/backing-dev.c:328:26: error: 'struct bdi_writeback' has no member named 'bdp_waitq'
      init_waitqueue_head(&wb->bdp_waitq);
                             ^
   include/linux/wait.h:67:26: note: in definition of macro 'init_waitqueue_head'
      __init_waitqueue_head((wq_head), #wq_head, &__key);  \
                             ^~~~~~~
--
   fs/fs-writeback.c: In function 'wb_workfn':
>> fs/fs-writeback.c:2102:27: error: 'struct bdi_writeback' has no member named 'bdp_waitq'
      if (waitqueue_active(&wb->bdp_waitq))
                              ^~
   In file included from include/linux/mmzone.h:10:0,
                    from include/linux/gfp.h:6,
                    from include/linux/slab.h:15,
                    from fs/fs-writeback.c:20:
   fs/fs-writeback.c:2103:19: error: 'struct bdi_writeback' has no member named 'bdp_waitq'
       wake_up_all(&wb->bdp_waitq);
                      ^
   include/linux/wait.h:210:36: note: in definition of macro 'wake_up_all'
    #define wake_up_all(x)   __wake_up(x, TASK_NORMAL, 0, NULL)
                                       ^

vim +/cgwb_bdp +1949 mm/page-writeback.c

  1885	
  1886	/**
  1887	 * balance_dirty_pages_ratelimited - balance dirty memory state
  1888	 * @mapping: address_space which was dirtied
  1889	 *
  1890	 * Processes which are dirtying memory should call in here once for each page
  1891	 * which was newly dirtied.  The function will periodically check the system's
  1892	 * dirty state and will initiate writeback if needed.
  1893	 *
  1894	 * On really big machines, get_writeback_state is expensive, so try to avoid
  1895	 * calling it too often (ratelimiting).  But once we're over the dirty memory
  1896	 * limit we decrease the ratelimiting by a lot, to prevent individual processes
  1897	 * from overshooting the limit by (ratelimit_pages) each.
  1898	 */
  1899	void balance_dirty_pages_ratelimited(struct address_space *mapping)
  1900	{
  1901		struct inode *inode = mapping->host;
  1902		struct backing_dev_info *bdi = inode_to_bdi(inode);
  1903		struct bdi_writeback *wb = NULL;
  1904		int ratelimit;
  1905		int *p;
  1906	
  1907		if (!bdi_cap_account_dirty(bdi))
  1908			return;
  1909	
  1910		if (inode_cgwb_enabled(inode))
  1911			wb = wb_get_create_current(bdi, GFP_KERNEL);
  1912		if (!wb)
  1913			wb = &bdi->wb;
  1914	
  1915		ratelimit = current->nr_dirtied_pause;
  1916		if (wb->dirty_exceeded)
  1917			ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
  1918	
  1919		preempt_disable();
  1920		/*
  1921		 * This prevents one CPU to accumulate too many dirtied pages without
  1922		 * calling into balance_dirty_pages(), which can happen when there are
  1923		 * 1000+ tasks, all of them start dirtying pages at exactly the same
  1924		 * time, hence all honoured too large initial task->nr_dirtied_pause.
  1925		 */
  1926		p =  this_cpu_ptr(&bdp_ratelimits);
  1927		if (unlikely(current->nr_dirtied >= ratelimit))
  1928			*p = 0;
  1929		else if (unlikely(*p >= ratelimit_pages)) {
  1930			*p = 0;
  1931			ratelimit = 0;
  1932		}
  1933		/*
  1934		 * Pick up the dirtied pages by the exited tasks. This avoids lots of
  1935		 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
  1936		 * the dirty throttling and livelock other long-run dirtiers.
  1937		 */
  1938		p = this_cpu_ptr(&dirty_throttle_leaks);
  1939		if (*p > 0 && current->nr_dirtied < ratelimit) {
  1940			unsigned long nr_pages_dirtied;
  1941			nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
  1942			*p -= nr_pages_dirtied;
  1943			current->nr_dirtied += nr_pages_dirtied;
  1944		}
  1945		preempt_enable();
  1946	
  1947		if (unlikely(current->nr_dirtied >= ratelimit))
  1948			if (IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> 1949				cgwb_bdp(wb);
  1950			else
  1951				balance_dirty_pages(wb, current->nr_dirtied);
  1952	
  1953		wb_put(wb);
  1954	}
  1955	EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
  1956	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 7207 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC v1] writeback: add elastic bdi in cgwb bdp
  2019-10-20 13:00 [RFC v1] writeback: add elastic bdi in cgwb bdp Hillf Danton
  2019-10-21 23:33 ` kbuild test robot
@ 2019-10-22  5:56 ` kbuild test robot
  1 sibling, 0 replies; 3+ messages in thread
From: kbuild test robot @ 2019-10-22  5:56 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 5500 bytes --]

Hi Hillf,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on linus/master]
[cannot apply to v5.4-rc4 next-20191021]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Hillf-Danton/writeback-add-elastic-bdi-in-cgwb-bdp/20191022-070010
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 7d194c2100ad2a6dded545887d02754948ca5241
config: x86_64-randconfig-f003-201942 (attached as .config)
compiler: gcc-7 (Debian 7.4.0-14) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/linux/export.h:44:0,
                    from include/linux/linkage.h:7,
                    from include/linux/kernel.h:8,
                    from fs/fs-writeback.c:17:
   fs/fs-writeback.c: In function 'wb_workfn':
   fs/fs-writeback.c:2102:27: error: 'struct bdi_writeback' has no member named 'bdp_waitq'
      if (waitqueue_active(&wb->bdp_waitq))
                              ^
   include/linux/compiler.h:58:52: note: in definition of macro '__trace_if_var'
    #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                       ^~~~
>> fs/fs-writeback.c:2102:3: note: in expansion of macro 'if'
      if (waitqueue_active(&wb->bdp_waitq))
      ^~
   fs/fs-writeback.c:2102:27: error: 'struct bdi_writeback' has no member named 'bdp_waitq'
      if (waitqueue_active(&wb->bdp_waitq))
                              ^
   include/linux/compiler.h:58:61: note: in definition of macro '__trace_if_var'
    #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                                ^~~~
>> fs/fs-writeback.c:2102:3: note: in expansion of macro 'if'
      if (waitqueue_active(&wb->bdp_waitq))
      ^~
   fs/fs-writeback.c:2102:27: error: 'struct bdi_writeback' has no member named 'bdp_waitq'
      if (waitqueue_active(&wb->bdp_waitq))
                              ^
   include/linux/compiler.h:69:3: note: in definition of macro '__trace_if_value'
     (cond) ?     \
      ^~~~
   include/linux/compiler.h:56:28: note: in expansion of macro '__trace_if_var'
    #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                               ^~~~~~~~~~~~~~
>> fs/fs-writeback.c:2102:3: note: in expansion of macro 'if'
      if (waitqueue_active(&wb->bdp_waitq))
      ^~
   In file included from include/linux/mmzone.h:10:0,
                    from include/linux/gfp.h:6,
                    from include/linux/slab.h:15,
                    from fs/fs-writeback.c:20:
   fs/fs-writeback.c:2103:19: error: 'struct bdi_writeback' has no member named 'bdp_waitq'
       wake_up_all(&wb->bdp_waitq);
                      ^
   include/linux/wait.h:210:36: note: in definition of macro 'wake_up_all'
    #define wake_up_all(x)   __wake_up(x, TASK_NORMAL, 0, NULL)
                                       ^

vim +/if +2102 fs/fs-writeback.c

  2057	
  2058	/*
  2059	 * Handle writeback of dirty data for the device backed by this bdi. Also
  2060	 * reschedules periodically and does kupdated style flushing.
  2061	 */
  2062	void wb_workfn(struct work_struct *work)
  2063	{
  2064		struct bdi_writeback *wb = container_of(to_delayed_work(work),
  2065							struct bdi_writeback, dwork);
  2066		long pages_written;
  2067	
  2068		set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
  2069		current->flags |= PF_SWAPWRITE;
  2070	
  2071		if (likely(!current_is_workqueue_rescuer() ||
  2072			   !test_bit(WB_registered, &wb->state))) {
  2073			/*
  2074			 * The normal path.  Keep writing back @wb until its
  2075			 * work_list is empty.  Note that this path is also taken
  2076			 * if @wb is shutting down even when we're running off the
  2077			 * rescuer as work_list needs to be drained.
  2078			 */
  2079			do {
  2080				pages_written = wb_do_writeback(wb);
  2081				trace_writeback_pages_written(pages_written);
  2082			} while (!list_empty(&wb->work_list));
  2083		} else {
  2084			/*
  2085			 * bdi_wq can't get enough workers and we're running off
  2086			 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
  2087			 * enough for efficient IO.
  2088			 */
  2089			pages_written = writeback_inodes_wb(wb, 1024,
  2090							    WB_REASON_FORKER_THREAD);
  2091			trace_writeback_pages_written(pages_written);
  2092		}
  2093	
  2094		if (!list_empty(&wb->work_list))
  2095			wb_wakeup(wb);
  2096		else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
  2097			wb_wakeup_delayed(wb);
  2098	
  2099		current->flags &= ~PF_SWAPWRITE;
  2100	
  2101		if (IS_ENABLED(CONFIG_CGWB_BDP_WITH_EBDI))
> 2102			if (waitqueue_active(&wb->bdp_waitq))
  2103				wake_up_all(&wb->bdp_waitq);
  2104	}
  2105	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 32473 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2019-10-22  5:56 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-20 13:00 [RFC v1] writeback: add elastic bdi in cgwb bdp Hillf Danton
2019-10-21 23:33 ` kbuild test robot
2019-10-22  5:56 ` kbuild test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.