From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Subject: Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
Date: Tue, 16 Mar 2010 11:32:38 +0900
Message-ID: <20100316113238.f7d74848.nishimura@mxp.nes.nec.co.jp>
In-Reply-To: <1268609202-15581-5-git-send-email-arighi@develer.com>
References: <1268609202-15581-1-git-send-email-arighi@develer.com> <1268609202-15581-5-git-send-email-arighi@develer.com>
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
To: Andrea Righi <arighi@develer.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>, linux-kernel@vger.kernel.org, Trond Myklebust, linux-mm@kvack.org, Suleiman Souhlal, Andrew Morton, containers@lists.linux-foundation.org, Vivek Goyal, Balbir Singh
List-Id: containers.vger.kernel.org

On Mon, 15 Mar 2010 00:26:41 +0100, Andrea Righi <arighi@develer.com> wrote:
> Infrastructure to account dirty pages per cgroup and add dirty limit
> interfaces in the cgroupfs:
>
> - Direct write-out: memory.dirty_ratio, memory.dirty_bytes
>
> - Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
>
> Signed-off-by: Andrea Righi <arighi@develer.com>
> ---
> include/linux/memcontrol.h | 92 ++++++++-
> mm/memcontrol.c | 484 +++++++++++++++++++++++++++++++++++++++++---
> 2 files changed, 540 insertions(+), 36 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 88d3f9e..0602ec9 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -19,12 +19,55 @@
>
> #ifndef _LINUX_MEMCONTROL_H
> #define _LINUX_MEMCONTROL_H
> +
> +#include <linux/writeback.h>
> #include <linux/cgroup.h>
> +
> struct mem_cgroup;
> struct page_cgroup;
> struct page;
> struct mm_struct;
>
> +/* Cgroup memory statistics items exported to the kernel */
> +enum mem_cgroup_read_page_stat_item {
> + MEMCG_NR_DIRTYABLE_PAGES,
> + MEMCG_NR_RECLAIM_PAGES,
> + MEMCG_NR_WRITEBACK,
> + MEMCG_NR_DIRTY_WRITEBACK_PAGES,
> +};
> +
> +/* File cache pages accounting */
> +enum mem_cgroup_write_page_stat_item {
> + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
> + MEMCG_NR_FILE_DIRTY, /* # of dirty pages in page cache */
> + MEMCG_NR_FILE_WRITEBACK, /* # of pages under writeback */
> + MEMCG_NR_FILE_WRITEBACK_TEMP, /* # of pages under writeback using
> + temporary buffers */
> + MEMCG_NR_FILE_UNSTABLE_NFS, /* # of NFS unstable pages */
> +
> + MEMCG_NR_FILE_NSTAT,
> +};
> +
> +/* Dirty memory parameters */
> +struct vm_dirty_param {
> + int dirty_ratio;
> + int dirty_background_ratio;
> + unsigned long dirty_bytes;
> + unsigned long dirty_background_bytes;
> +};
> +
> +/*
> + * TODO: provide a validation check routine. And retry if validation
> + * fails.
> + */ > +static inline void get_global_vm_dirty_param(struct vm_dirty_param *param) > +{ > + param->dirty_ratio = vm_dirty_ratio; > + param->dirty_bytes = vm_dirty_bytes; > + param->dirty_background_ratio = dirty_background_ratio; > + param->dirty_background_bytes = dirty_background_bytes; > +} > + > #ifdef CONFIG_CGROUP_MEM_RES_CTLR > /* > * All "charge" functions with gfp_mask should use GFP_KERNEL or > @@ -117,6 +160,25 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, > extern int do_swap_account; > #endif > > +extern bool mem_cgroup_has_dirty_limit(void); > +extern void get_vm_dirty_param(struct vm_dirty_param *param); > +extern s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item); > + > +extern void mem_cgroup_update_page_stat(struct page *page, > + enum mem_cgroup_write_page_stat_item idx, bool charge); > + > +static inline void mem_cgroup_inc_page_stat(struct page *page, > + enum mem_cgroup_write_page_stat_item idx) > +{ > + mem_cgroup_update_page_stat(page, idx, true); > +} > + > +static inline void mem_cgroup_dec_page_stat(struct page *page, > + enum mem_cgroup_write_page_stat_item idx) > +{ > + mem_cgroup_update_page_stat(page, idx, false); > +} > + > static inline bool mem_cgroup_disabled(void) > { > if (mem_cgroup_subsys.disabled) > @@ -124,12 +186,6 @@ static inline bool mem_cgroup_disabled(void) > return false; > } > > -enum mem_cgroup_page_stat_item { > - MEMCG_NR_FILE_MAPPED, > - MEMCG_NR_FILE_NSTAT, > -}; > - > -void mem_cgroup_update_stat(struct page *page, int idx, bool charge); > unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > gfp_t gfp_mask, int nid, > int zid); > @@ -299,8 +355,18 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) > { > } > > -static inline void mem_cgroup_update_file_mapped(struct page *page, > - int val) > +static inline s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item) > +{ > + return -ENOSYS; > +} > + > +static inline void mem_cgroup_inc_page_stat(struct page *page, > + enum mem_cgroup_write_page_stat_item idx) > +{ > +} > + > +static inline void mem_cgroup_dec_page_stat(struct page *page, > + enum mem_cgroup_write_page_stat_item idx) > { > } > > @@ -311,6 +377,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > return 0; > } > > +static inline bool mem_cgroup_has_dirty_limit(void) > +{ > + return false; > +} > + > +static inline void get_vm_dirty_param(struct vm_dirty_param *param) > +{ > + get_global_vm_dirty_param(param); > +} > + > #endif /* CONFIG_CGROUP_MEM_CONT */ > > #endif /* _LINUX_MEMCONTROL_H */ > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index b7c23ea..91770d0 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -80,14 +80,21 @@ enum mem_cgroup_stat_index { > /* > * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 
> */ > - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ > + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ > MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ > - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ > MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ > MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ > MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ > MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ > > + /* File cache pages accounting */ > + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ > + MEM_CGROUP_STAT_FILE_DIRTY, /* # of dirty pages in page cache */ > + MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ > + MEM_CGROUP_STAT_WRITEBACK_TEMP, /* # of pages under writeback using > + temporary buffers */ > + MEM_CGROUP_STAT_UNSTABLE_NFS, /* # of NFS unstable pages */ > + > MEM_CGROUP_STAT_NSTATS, > }; > > @@ -95,6 +102,19 @@ struct mem_cgroup_stat_cpu { > s64 count[MEM_CGROUP_STAT_NSTATS]; > }; > > +/* Per cgroup page statistics */ > +struct mem_cgroup_page_stat { > + enum mem_cgroup_read_page_stat_item item; > + s64 value; > +}; > + > +enum { > + MEM_CGROUP_DIRTY_RATIO, > + MEM_CGROUP_DIRTY_BYTES, > + MEM_CGROUP_DIRTY_BACKGROUND_RATIO, > + MEM_CGROUP_DIRTY_BACKGROUND_BYTES, > +}; > + > /* > * per-zone information in memory controller. > */ > @@ -208,6 +228,9 @@ struct mem_cgroup { > > unsigned int swappiness; > > + /* control memory cgroup dirty pages */ > + struct vm_dirty_param dirty_param; > + > /* set when res.limit == memsw.limit */ > bool memsw_is_minimum; > > @@ -1033,6 +1056,189 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) > return swappiness; > } > > +static bool dirty_param_is_valid(struct vm_dirty_param *param) > +{ > + if (param->dirty_ratio && param->dirty_bytes) > + return false; > + if (param->dirty_background_ratio && param->dirty_background_bytes) > + return false; > + return true; > +} > + > +static void __mem_cgroup_get_dirty_param(struct vm_dirty_param *param, > + struct mem_cgroup *mem) > +{ > + param->dirty_ratio = mem->dirty_param.dirty_ratio; > + param->dirty_bytes = mem->dirty_param.dirty_bytes; > + param->dirty_background_ratio = mem->dirty_param.dirty_background_ratio; > + param->dirty_background_bytes = mem->dirty_param.dirty_background_bytes; > +} > + > +/* > + * get_vm_dirty_param() - get dirty memory parameters of the current memcg > + * @param: a structure that is filled with the dirty memory settings > + * > + * The function fills @param with the current memcg dirty memory settings. If > + * memory cgroup is disabled or in case of error the structure is filled with > + * the global dirty memory settings. > + */ > +void get_vm_dirty_param(struct vm_dirty_param *param) > +{ > + struct mem_cgroup *memcg; > + > + if (mem_cgroup_disabled()) { > + get_global_vm_dirty_param(param); > + return; > + } > + /* > + * It's possible that "current" may be moved to other cgroup while we > + * access cgroup. But precise check is meaningless because the task can > + * be moved after our access and writeback tends to take long time. > + * At least, "memcg" will not be freed under rcu_read_lock(). 
> + */ > + while (1) { > + rcu_read_lock(); > + memcg = mem_cgroup_from_task(current); > + if (likely(memcg)) > + __mem_cgroup_get_dirty_param(param, memcg); > + else > + get_global_vm_dirty_param(param); > + rcu_read_unlock(); > + /* > + * Since global and memcg vm_dirty_param are not protected we > + * try to speculatively read them and retry if we get > + * inconsistent values. > + */ > + if (likely(dirty_param_is_valid(param))) > + break; > + } > +} > + > +/* > + * mem_cgroup_has_dirty_limit() - check if current memcg has local dirty limits > + * > + * Return true if the current memory cgroup has local dirty memory settings, > + * false otherwise. > + */ > +bool mem_cgroup_has_dirty_limit(void) > +{ > + struct mem_cgroup *mem; > + > + if (mem_cgroup_disabled()) > + return false; > + mem = mem_cgroup_from_task(current); > + return mem && !mem_cgroup_is_root(mem); > +} > + > +static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg) > +{ > + if (!do_swap_account) > + return nr_swap_pages > 0; > + return !memcg->memsw_is_minimum && > + (res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0); > +} > + > +static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg, > + enum mem_cgroup_read_page_stat_item item) > +{ > + s64 ret; > + > + switch (item) { > + case MEMCG_NR_DIRTYABLE_PAGES: > + ret = mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) + > + mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE); > + if (mem_cgroup_can_swap(memcg)) > + ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) + > + mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON); > + break; > + case MEMCG_NR_RECLAIM_PAGES: > + ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) + > + mem_cgroup_read_stat(memcg, > + MEM_CGROUP_STAT_UNSTABLE_NFS); > + break; > + case MEMCG_NR_WRITEBACK: > + ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); > + break; > + case MEMCG_NR_DIRTY_WRITEBACK_PAGES: > + ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) + > + mem_cgroup_read_stat(memcg, > + MEM_CGROUP_STAT_UNSTABLE_NFS); > + break; > + default: > + BUG(); > + break; > + } > + return ret; > +} > + > +static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data) > +{ > + struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data; > + > + stat->value += mem_cgroup_get_local_page_stat(mem, stat->item); > + return 0; > +} > + > +static unsigned long long > +memcg_get_hierarchical_free_pages(struct mem_cgroup *mem) > +{ > + struct cgroup *cgroup; > + unsigned long long min_free, free; > + > + min_free = res_counter_read_u64(&mem->res, RES_LIMIT) - > + res_counter_read_u64(&mem->res, RES_USAGE); > + cgroup = mem->css.cgroup; > + if (!mem->use_hierarchy) > + goto out; > + > + while (cgroup->parent) { > + cgroup = cgroup->parent; > + mem = mem_cgroup_from_cont(cgroup); > + if (!mem->use_hierarchy) > + break; > + free = res_counter_read_u64(&mem->res, RES_LIMIT) - > + res_counter_read_u64(&mem->res, RES_USAGE); > + min_free = min(min_free, free); > + } > +out: > + /* Translate free memory in pages */ > + return min_free >> PAGE_SHIFT; > +} > + > +/* > + * mem_cgroup_page_stat() - get memory cgroup file cache statistics > + * @item: memory statistic item exported to the kernel > + * > + * Return the accounted statistic value, or a negative value in case of error. 
> + */
> +s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
> +{
> + struct mem_cgroup_page_stat stat = {};
> + struct mem_cgroup *mem;
> +
> + rcu_read_lock();
> + mem = mem_cgroup_from_task(current);
> + if (mem && !mem_cgroup_is_root(mem)) {
> + /*
> + * If we're looking for dirtyable pages we need to evaluate
> + * free pages depending on the limit and usage of the parents
> + * first of all.
> + */
> + if (item == MEMCG_NR_DIRTYABLE_PAGES)
> + stat.value = memcg_get_hierarchical_free_pages(mem);
> + /*
> + * Recursively evaluate page statistics against all cgroup
> + * under hierarchy tree
> + */
> + stat.item = item;
> + mem_cgroup_walk_tree(mem, &stat, mem_cgroup_page_stat_cb);
> + } else
> + stat.value = -EINVAL;
> + rcu_read_unlock();
> +
> + return stat.value;
> +}
> +

Hmm, mem_cgroup_page_stat() can return a negative value, but in [5/5] you add a BUG_ON() that fires when it returns a negative value. What happens if current is moved to the root cgroup between mem_cgroup_has_dirty_limit() and mem_cgroup_page_stat()? How about making mem_cgroup_has_dirty_limit() return the target mem_cgroup, and passing that mem_cgroup to mem_cgroup_page_stat()?
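For example, a rough sketch of that interface (untested; it only reuses helpers that already exist in this patch, and error handling still needs to be worked out):

	/* Resolve the memcg once; NULL means "fall back to global limits". */
	struct mem_cgroup *mem_cgroup_has_dirty_limit(void)
	{
		struct mem_cgroup *mem;

		if (mem_cgroup_disabled())
			return NULL;
		mem = mem_cgroup_from_task(current);
		if (!mem || mem_cgroup_is_root(mem))
			return NULL;
		return mem;	/* valid under the caller's rcu_read_lock() */
	}

	s64 mem_cgroup_page_stat(struct mem_cgroup *mem,
				 enum mem_cgroup_read_page_stat_item item)
	{
		struct mem_cgroup_page_stat stat = {};

		/* Dirtyable pages depend on the limit/usage of the parents. */
		if (item == MEMCG_NR_DIRTYABLE_PAGES)
			stat.value = memcg_get_hierarchical_free_pages(mem);
		/* Recursively sum the statistic over the hierarchy. */
		stat.item = item;
		mem_cgroup_walk_tree(mem, &stat, mem_cgroup_page_stat_cb);
		return stat.value;
	}

The caller would then look up the memcg once under rcu_read_lock() and use the same pointer for both the limit check and the statistics read, so a concurrent move of current cannot make the two calls disagree.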
> static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
> {
> int *val = data;
> @@ -1344,24 +1550,16 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
> return true;
> }
>
> -/*
> - * Currently used to update mapped file statistics, but the routine can be
> - * generalized to update other statistics as well.
> - */
> -void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
> +static void __mem_cgroup_update_page_stat(struct page_cgroup *pc,
> + int idx, bool charge)
> {
> struct mem_cgroup *mem;
> - int val;
> + int val = charge ? 1 : -1;
>
> mem = pc->mem_cgroup;
> if (!mem || !PageCgroupUsed(pc))
> return;
>
> - if (charge)
> - val = 1;
> - else
> - val = -1;
> -
> switch (idx) {
> case MEMCG_NR_FILE_MAPPED:
> if (charge) {
> @@ -1377,6 +1575,62 @@ void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
> }
> idx = MEM_CGROUP_STAT_FILE_MAPPED;
> break;
> + case MEMCG_NR_FILE_DIRTY:
> + if (charge) {
> + if (!PageCgroupFileDirty(pc))
> + SetPageCgroupFileDirty(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileDirty(pc))
> + ClearPageCgroupFileDirty(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_FILE_DIRTY;
> + break;
> + case MEMCG_NR_FILE_WRITEBACK:
> + if (charge) {
> + if (!PageCgroupFileWriteback(pc))
> + SetPageCgroupFileWriteback(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileWriteback(pc))
> + ClearPageCgroupFileWriteback(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_WRITEBACK;
> + break;
> + case MEMCG_NR_FILE_WRITEBACK_TEMP:
> + if (charge) {
> + if (!PageCgroupFileWritebackTemp(pc))
> + SetPageCgroupFileWritebackTemp(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileWritebackTemp(pc))
> + ClearPageCgroupFileWritebackTemp(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
> + break;
> + case MEMCG_NR_FILE_UNSTABLE_NFS:
> + if (charge) {
> + if (!PageCgroupFileUnstableNFS(pc))
> + SetPageCgroupFileUnstableNFS(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileUnstableNFS(pc))
> + ClearPageCgroupFileUnstableNFS(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
> + break;
> default:
> BUG();
> break;
> @@ -1384,34 +1638,81 @@ void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
> /*
> * Preemption is already disabled. We can use __this_cpu_xxx
> */
> - __this_cpu_add(mem->stat->count[idx], val);
> + if (val)
> + __this_cpu_add(mem->stat->count[idx], val);
> }
>
> -void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
> +/*
> + * mem_cgroup_update_page_stat() - update memcg file cache's accounting
> + * @page: the page involved in a file cache operation.
> + * @idx: the particular file cache statistic.
> + * @charge: true to increment, false to decrement the statistic specified
> + * by @idx.
> + *
> + * Update memory cgroup file cache's accounting.
> + */
> +void mem_cgroup_update_page_stat(struct page *page,
> + enum mem_cgroup_write_page_stat_item idx, bool charge)
> {
> struct page_cgroup *pc;
>
> + if (mem_cgroup_disabled())
> + return;
> pc = lookup_page_cgroup(page);
> if (!pc || mem_cgroup_is_root(pc->mem_cgroup))
> return;
>
> if (trylock_page_cgroup(pc)) {
> - __mem_cgroup_update_stat(pc, idx, charge);
> + __mem_cgroup_update_page_stat(pc, idx, charge);
> unlock_page_cgroup(pc);
> }
> - return;
> }
> +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat);
>
> -static void mem_cgroup_migrate_stat(struct page_cgroup *pc,
> +/* Update file cache accounted statistics on task migration. */
> +static void mem_cgroup_migrate_file_stat(struct page_cgroup *pc,
> struct mem_cgroup *from, struct mem_cgroup *to)
> {
> if (PageCgroupFileMapped(pc)) {
> __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> - if (!mem_cgroup_is_root(to)) {
> - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> - } else {
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(
> + to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> + else
> ClearPageCgroupFileMapped(pc);
> - }
> + }
> + if (PageCgroupFileDirty(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(
> + to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> + else
> + ClearPageCgroupFileDirty(pc);
> + }
> + if (PageCgroupFileWriteback(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(
> + to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> + else
> + ClearPageCgroupFileWriteback(pc);
> + }
> + if (PageCgroupFileWritebackTemp(pc)) {
> + __this_cpu_dec(
> + from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(to->stat->count[
> + MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> + else
> + ClearPageCgroupFileWritebackTemp(pc);
> + }
> + if (PageCgroupFileUnstableNFS(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(
> + to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> + else
> + ClearPageCgroupFileUnstableNFS(pc);
> }
> }
>
> @@ -1425,6 +1726,23 @@ __mem_cgroup_stat_fixup(struct page_cgroup *pc, struct mem_cgroup *mem)
> __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> ClearPageCgroupFileMapped(pc);
> }
> + if (PageCgroupFileDirty(pc)) {
> + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> + ClearPageCgroupFileDirty(pc);
> + }
> + if (PageCgroupFileWriteback(pc)) {
> + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> + ClearPageCgroupFileWriteback(pc);
> + }
> + if (PageCgroupFileWritebackTemp(pc)) {
> + __this_cpu_dec(
> + mem->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> + ClearPageCgroupFileWritebackTemp(pc);
> + }
> + if (PageCgroupFileUnstableNFS(pc)) {
> + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> + ClearPageCgroupFileUnstableNFS(pc);
> + }
> }
>
> /*
> @@ -1863,7 +2181,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
> VM_BUG_ON(pc->mem_cgroup != from);
>
> page = pc->page;
> - mem_cgroup_migrate_stat(pc, from, to);
> + mem_cgroup_migrate_file_stat(pc, from, to);
> mem_cgroup_charge_statistics(from, pc, false);
> if (uncharge)
> /* This is not "cancel", but cancel_charge does all we need. */
> @@ -3168,10 +3486,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
> enum {
> MCS_CACHE,
> MCS_RSS,
> - MCS_FILE_MAPPED,
> MCS_PGPGIN,
> MCS_PGPGOUT,
> MCS_SWAP,
> + MCS_FILE_MAPPED,
> + MCS_FILE_DIRTY,
> + MCS_WRITEBACK,
> + MCS_WRITEBACK_TEMP,
> + MCS_UNSTABLE_NFS,
> MCS_INACTIVE_ANON,
> MCS_ACTIVE_ANON,
> MCS_INACTIVE_FILE,
> @@ -3190,10 +3512,14 @@ struct {
> } memcg_stat_strings[NR_MCS_STAT] = {
> {"cache", "total_cache"},
> {"rss", "total_rss"},
> - {"mapped_file", "total_mapped_file"},
> {"pgpgin", "total_pgpgin"},
> {"pgpgout", "total_pgpgout"},
> {"swap", "total_swap"},
> + {"mapped_file", "total_mapped_file"},
> + {"filedirty", "dirty_pages"},
> + {"writeback", "writeback_pages"},
> + {"writeback_tmp", "writeback_temp_pages"},
> + {"nfs", "nfs_unstable"},
> {"inactive_anon", "total_inactive_anon"},
> {"active_anon", "total_active_anon"},
> {"inactive_file", "total_inactive_file"},

Why not use "total_xxx" (e.g. "total_dirty", "total_writeback") for the total_name of the new entries?

> @@ -3212,8 +3538,6 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
> s->stat[MCS_CACHE] += val * PAGE_SIZE;
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
> s->stat[MCS_RSS] += val * PAGE_SIZE;
> - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
> - s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
> s->stat[MCS_PGPGIN] += val;
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
> @@ -3222,6 +3546,16 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
> s->stat[MCS_SWAP] += val * PAGE_SIZE;
> }
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
> + s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
> + s->stat[MCS_FILE_DIRTY] += val;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
> + s->stat[MCS_WRITEBACK] += val;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
> + s->stat[MCS_WRITEBACK_TEMP] += val;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
> + s->stat[MCS_UNSTABLE_NFS] += val;
>

I don't have a strong objection, but I would prefer showing them in bytes. And can you add something like this to mem_cgroup_stat_show():

	for (i = 0; i < NR_MCS_STAT; i++) {
		if (i == MCS_SWAP && !do_swap_account)
			continue;
+		if (i >= MCS_FILE_STAT_START && i <= MCS_FILE_STAT_END &&
+		    mem_cgroup_is_root(mem_cont))
+			continue;
		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
	}

so that the file stats are not shown in the root cgroup? They are meaningless there anyway. Of course, you'd better mention this in [2/5] too.
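Note that MCS_FILE_STAT_START/MCS_FILE_STAT_END above are assumed range markers, not defined by this patch. One illustrative way to define them in the reordered enum (whether mapped_file should sit inside the skipped range is a judgment call, since this patch also stops accounting it for the root cgroup):

	enum {
		MCS_CACHE,
		MCS_RSS,
		MCS_PGPGIN,
		MCS_PGPGOUT,
		MCS_SWAP,
		/* file stats below are not accounted for the root cgroup */
		MCS_FILE_MAPPED,
		MCS_FILE_STAT_START = MCS_FILE_MAPPED,
		MCS_FILE_DIRTY,
		MCS_WRITEBACK,
		MCS_WRITEBACK_TEMP,
		MCS_UNSTABLE_NFS,
		MCS_FILE_STAT_END = MCS_UNSTABLE_NFS,
		MCS_INACTIVE_ANON,
		/* ... remaining entries as before ... */
	};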
Thanks,
Daisuke Nishimura.
> /* per zone stat */ > val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); > @@ -3583,6 +3917,63 @@ unlock: > return ret; > } > > +static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); > + > + switch (cft->private) { > + case MEM_CGROUP_DIRTY_RATIO: > + return memcg->dirty_param.dirty_ratio; > + case MEM_CGROUP_DIRTY_BYTES: > + return memcg->dirty_param.dirty_bytes; > + case MEM_CGROUP_DIRTY_BACKGROUND_RATIO: > + return memcg->dirty_param.dirty_background_ratio; > + case MEM_CGROUP_DIRTY_BACKGROUND_BYTES: > + return memcg->dirty_param.dirty_background_bytes; > + default: > + BUG(); > + } > +} > + > +static int > +mem_cgroup_dirty_write(struct cgroup *cgrp, struct cftype *cft, u64 val) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); > + int type = cft->private; > + > + if (cgrp->parent == NULL) > + return -EINVAL; > + if ((type == MEM_CGROUP_DIRTY_RATIO || > + type == MEM_CGROUP_DIRTY_BACKGROUND_RATIO) && val > 100) > + return -EINVAL; > + /* > + * TODO: provide a validation check routine. And retry if validation > + * fails. > + */ > + switch (type) { > + case MEM_CGROUP_DIRTY_RATIO: > + memcg->dirty_param.dirty_ratio = val; > + memcg->dirty_param.dirty_bytes = 0; > + break; > + case MEM_CGROUP_DIRTY_BYTES: > + memcg->dirty_param.dirty_ratio = 0; > + memcg->dirty_param.dirty_bytes = val; > + break; > + case MEM_CGROUP_DIRTY_BACKGROUND_RATIO: > + memcg->dirty_param.dirty_background_ratio = val; > + memcg->dirty_param.dirty_background_bytes = 0; > + break; > + case MEM_CGROUP_DIRTY_BACKGROUND_BYTES: > + memcg->dirty_param.dirty_background_ratio = 0; > + memcg->dirty_param.dirty_background_bytes = val; > + break; > + default: > + BUG(); > + break; > + } > + return 0; > +} > + > static struct cftype mem_cgroup_files[] = { > { > .name = "usage_in_bytes", > @@ -3634,6 +4025,30 @@ static struct cftype mem_cgroup_files[] = { > .write_u64 = mem_cgroup_swappiness_write, > }, > { > + .name = "dirty_ratio", > + .read_u64 = mem_cgroup_dirty_read, > + .write_u64 = mem_cgroup_dirty_write, > + .private = MEM_CGROUP_DIRTY_RATIO, > + }, > + { > + .name = "dirty_bytes", > + .read_u64 = mem_cgroup_dirty_read, > + .write_u64 = mem_cgroup_dirty_write, > + .private = MEM_CGROUP_DIRTY_BYTES, > + }, > + { > + .name = "dirty_background_ratio", > + .read_u64 = mem_cgroup_dirty_read, > + .write_u64 = mem_cgroup_dirty_write, > + .private = MEM_CGROUP_DIRTY_BACKGROUND_RATIO, > + }, > + { > + .name = "dirty_background_bytes", > + .read_u64 = mem_cgroup_dirty_read, > + .write_u64 = mem_cgroup_dirty_write, > + .private = MEM_CGROUP_DIRTY_BACKGROUND_BYTES, > + }, > + { > .name = "move_charge_at_immigrate", > .read_u64 = mem_cgroup_move_charge_read, > .write_u64 = mem_cgroup_move_charge_write, > @@ -3892,8 +4307,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) > mem->last_scanned_child = 0; > spin_lock_init(&mem->reclaim_param_lock); > > - if (parent) > + if (parent) { > mem->swappiness = get_swappiness(parent); > + mem->dirty_param = parent->dirty_param; > + } else { > + while (1) { > + get_global_vm_dirty_param(&mem->dirty_param); > + /* > + * Since global dirty parameters are not protected we > + * try to speculatively read them and retry if we get > + * inconsistent values. > + */ > + if (likely(dirty_param_is_valid(&mem->dirty_param))) > + break; > + } > + } > atomic_set(&mem->refcnt, 1); > mem->move_charge_at_immigrate = 0; > mutex_init(&mem->thresholds_lock); > -- > 1.6.3.3 >