* [PATCH 1/2] mm: add a function to return a bdi_writeback dirty page statistic
From: Li RongQing @ 2018-08-01 10:48 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm, linux-kernel
  Cc: Alexander Viro, Johannes Weiner, Michal Hocko, Vladimir Davydov

This is a preparation for optimising the full writeback that is
triggered when reclaiming memory.

Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
 include/linux/memcontrol.h | 2 +-
 mm/memcontrol.c            | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb116e925..58e29555ac81 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1141,7 +1141,7 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 			 unsigned long *pheadroom, unsigned long *pdirty,
 			 unsigned long *pwriteback);
-
+unsigned long mem_cgroup_wb_dirty_stats(struct bdi_writeback *wb);
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c0280b3143e..82d3061e91d1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3640,6 +3640,12 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	}
 }
 
+unsigned long mem_cgroup_wb_dirty_stats(struct bdi_writeback *wb)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+	return memcg_page_state(memcg, NR_FILE_DIRTY);
+}
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
-- 
2.16.2

* [PATCH 2/2] fs/writeback: do memory cgroup related writeback firstly
From: Li RongQing @ 2018-08-01 10:48 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm, linux-kernel
  Cc: Alexander Viro, Johannes Weiner, Michal Hocko, Vladimir Davydov

When a machine hosts hundreds of memory cgroups that generate
varying amounts of dirty pages, and one of those cgroups is under
heavy memory pressure and keeps reclaiming dirty pages, it triggers
writeback on all cgroups, which is inefficient:

1. If the memory used by a memory cgroup has reached its limit,
writing back the other cgroups is useless.
2. The other cgroups could wait longer and merge their write
requests.

So replace the full flush with writeback of only the memory cgroup
whose tasks are reclaiming memory and triggering writeback; if
little or nothing is written back that way, fall back to a full
flush.

After this patch, write performance improves by about 5% in the setup below:
  $mount -t cgroup none -o memory /cgroups/memory/
  $mkdir /cgroups/memory/x1
  $echo $$ > /cgroups/memory/x1/tasks
  $echo 100M > /cgroups/memory/x1/memory.limit_in_bytes
  $cd /cgroups/memory/
  $seq 10000|xargs  mkdir
  $fio -filename=/home/test1 -direct=0 -iodepth 1 -thread -rw=write -ioengine=libaio -bs=16k -size=20G
Before:
WRITE: io=20480MB, aggrb=779031KB/s, minb=779031KB/s, maxb=779031KB/s, mint=26920msec, maxt=26920msec
After:
WRITE: io=20480MB, aggrb=831708KB/s, minb=831708KB/s, maxb=831708KB/s, mint=25215msec, maxt=25215msec

This patch can also reduce I/O utilisation in a setup like the
following: a machine has two disks, one storing all kinds of logs
(which should see little I/O pressure) and the other storing Hadoop
data (which writes lots of data to disk). In practice both disks
show high I/O utilisation, because whenever Hadoop reclaims memory
it wakes up writeback on every memory cgroup.

Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
 fs/fs-writeback.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 471d863958bc..475cada5d1cf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,6 +35,11 @@
  */
 #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
 
+/*
+ * If memcg writebacks cover more dirty pages than this, skip the full flush
+ */
+#define MIN_WB_DIRTY_PAGES 64
+
 struct wb_completion {
 	atomic_t		cnt;
 };
@@ -2005,6 +2010,32 @@ void wakeup_flusher_threads(enum wb_reason reason)
 	if (blk_needs_flush_plug(current))
 		blk_schedule_flush_plug(current);
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+	if (reason == WB_REASON_VMSCAN) {
+		unsigned long tmp, pdirty = 0;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+			struct bdi_writeback *wb = wb_find_current(bdi);
+
+			if (wb) {
+				tmp = mem_cgroup_wb_dirty_stats(wb);
+				if (tmp) {
+					pdirty += tmp;
+					wb_start_writeback(wb, reason);
+
+					if (wb == &bdi->wb)
+						pdirty += MIN_WB_DIRTY_PAGES;
+				}
+			}
+		}
+		rcu_read_unlock();
+
+		if (pdirty > MIN_WB_DIRTY_PAGES)
+			return;
+	}
+#endif
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
 		__wakeup_flusher_threads_bdi(bdi, reason);
-- 
2.16.2

* Re: [PATCH 1/2] mm: add a function to return a bdi_writeback dirty page statistic
From: Michal Hocko @ 2018-08-01 11:03 UTC (permalink / raw)
  To: Li RongQing
  Cc: linux-fsdevel, linux-mm, linux-kernel, Alexander Viro,
	Johannes Weiner, Vladimir Davydov

On Wed 01-08-18 18:48:35, Li RongQing wrote:
> This is a preparation for optimising the full writeback that is
> triggered when reclaiming memory.

Please do not add unused functions. This makes review harder without a
good reason. Besides that, we already have mem_cgroup_wb_stats. Why
can't you reuse it?
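For illustration, reusing it could look something like this (a minimal
sketch only; the helper name is made up, and the extra counters are
simply discarded):

	/*
	 * Illustrative sketch: obtain the per-wb dirty count through
	 * the existing mem_cgroup_wb_stats() rather than adding a new
	 * helper; the other three counters are unused here.
	 */
	static unsigned long wb_dirty_pages(struct bdi_writeback *wb)
	{
		unsigned long filepages, headroom, dirty, writeback;

		mem_cgroup_wb_stats(wb, &filepages, &headroom,
				    &dirty, &writeback);
		return dirty;
	}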

> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> Signed-off-by: Li RongQing <lirongqing@baidu.com>
> ---
>  include/linux/memcontrol.h | 2 +-
>  mm/memcontrol.c            | 6 ++++++
>  2 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 6c6fb116e925..58e29555ac81 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1141,7 +1141,7 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
>  void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>  			 unsigned long *pheadroom, unsigned long *pdirty,
>  			 unsigned long *pwriteback);
> -
> +unsigned long mem_cgroup_wb_dirty_stats(struct bdi_writeback *wb);
>  #else	/* CONFIG_CGROUP_WRITEBACK */
>  
>  static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8c0280b3143e..82d3061e91d1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3640,6 +3640,12 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>  	}
>  }
>  
> +unsigned long mem_cgroup_wb_dirty_stats(struct bdi_writeback *wb)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
> +
> +	return memcg_page_state(memcg, NR_FILE_DIRTY);
> +}
>  #else	/* CONFIG_CGROUP_WRITEBACK */
>  
>  static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
> -- 
> 2.16.2

-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 2/2] fs/writeback: do memory cgroup related writeback firstly
From: Michal Hocko @ 2018-08-01 11:16 UTC (permalink / raw)
  To: Li RongQing
  Cc: linux-fsdevel, linux-mm, linux-kernel, Alexander Viro,
	Johannes Weiner, Vladimir Davydov

On Wed 01-08-18 18:48:36, Li RongQing wrote:
> When a machine hosts hundreds of memory cgroups that generate
> varying amounts of dirty pages, and one of those cgroups is under
> heavy memory pressure and keeps reclaiming dirty pages, it triggers
> writeback on all cgroups, which is inefficient:
> 
> 1. If the memory used by a memory cgroup has reached its limit,
> writing back the other cgroups is useless.
> 2. The other cgroups could wait longer and merge their write
> requests.
> 
> So replace the full flush with writeback of only the memory cgroup
> whose tasks are reclaiming memory and triggering writeback; if
> little or nothing is written back that way, fall back to a full
> flush.
> 
> After this patch, write performance improves by about 5% in the setup below:
>   $mount -t cgroup none -o memory /cgroups/memory/
>   $mkdir /cgroups/memory/x1
>   $echo $$ > /cgroups/memory/x1/tasks
>   $echo 100M > /cgroups/memory/x1/memory.limit_in_bytes
>   $cd /cgroups/memory/
>   $seq 10000|xargs  mkdir
>   $fio -filename=/home/test1 -direct=0 -iodepth 1 -thread -rw=write -ioengine=libaio -bs=16k -size=20G
> Before:
> WRITE: io=20480MB, aggrb=779031KB/s, minb=779031KB/s, maxb=779031KB/s, mint=26920msec, maxt=26920msec
> After:
> WRITE: io=20480MB, aggrb=831708KB/s, minb=831708KB/s, maxb=831708KB/s, mint=25215msec, maxt=25215msec

Have you tried the v2 interface, which should be much more effective
when flushing IO?
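For reference, an illustrative equivalent of the setup above on the
unified (v2) hierarchy, which has per-cgroup writeback support (paths
and values are examples only):

  $ mount -t cgroup2 none /sys/fs/cgroup
  $ echo "+memory +io" > /sys/fs/cgroup/cgroup.subtree_control
  $ mkdir /sys/fs/cgroup/x1
  $ echo $$ > /sys/fs/cgroup/x1/cgroup.procs
  $ echo 100M > /sys/fs/cgroup/x1/memory.max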

> This patch can also reduce I/O utilisation in a setup like the
> following: a machine has two disks, one storing all kinds of logs
> (which should see little I/O pressure) and the other storing Hadoop
> data (which writes lots of data to disk). In practice both disks
> show high I/O utilisation, because whenever Hadoop reclaims memory
> it wakes up writeback on every memory cgroup.

This is not my domain and that might be the reason why the above doesn't
really explain what is going on here. But from my understanding the
flushing behavior for v1 is inherently suboptimal because we lack any
per-memcg throttling and per-cgroup writeback support. It seems that you
are just trying to paper over this limitation with another ad-hoc
measure.

I might be wrong here but I completely fail to see how this can help to
isolate flushing behavior to the memcg under reclaim.
 
> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> Signed-off-by: Li RongQing <lirongqing@baidu.com>
> ---
>  fs/fs-writeback.c | 31 +++++++++++++++++++++++++++++++
>  1 file changed, 31 insertions(+)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 471d863958bc..475cada5d1cf 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -35,6 +35,11 @@
>   */
>  #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
>  
> +/*
> + * If memcg writebacks cover more dirty pages than this, skip the full flush
> + */
> +#define MIN_WB_DIRTY_PAGES 64
> +
>  struct wb_completion {
>  	atomic_t		cnt;
>  };
> @@ -2005,6 +2010,32 @@ void wakeup_flusher_threads(enum wb_reason reason)
>  	if (blk_needs_flush_plug(current))
>  		blk_schedule_flush_plug(current);
>  
> +#ifdef CONFIG_CGROUP_WRITEBACK
> +	if (reason == WB_REASON_VMSCAN) {
> +		unsigned long tmp, pdirty = 0;
> +
> +		rcu_read_lock();
> +		list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
> +			struct bdi_writeback *wb = wb_find_current(bdi);
> +
> +			if (wb) {
> +				tmp = mem_cgroup_wb_dirty_stats(wb);
> +				if (tmp) {
> +					pdirty += tmp;
> +					wb_start_writeback(wb, reason);
> +
> +					if (wb == &bdi->wb)
> +						pdirty += MIN_WB_DIRTY_PAGES;
> +				}
> +			}
> +		}
> +		rcu_read_unlock();
> +
> +		if (pdirty > MIN_WB_DIRTY_PAGES)
> +			return;
> +	}
> +#endif
> +
>  	rcu_read_lock();
>  	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
>  		__wakeup_flusher_threads_bdi(bdi, reason);
> -- 
> 2.16.2

-- 
Michal Hocko
SUSE Labs
