From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755525AbbAOStY (ORCPT ); Thu, 15 Jan 2015 13:49:24 -0500 Received: from forward-corp1f.mail.yandex.net ([95.108.130.40]:52058 "EHLO forward-corp1f.mail.yandex.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755454AbbAOStU (ORCPT ); Thu, 15 Jan 2015 13:49:20 -0500 Authentication-Results: smtpcorp1m.mail.yandex.net; dkim=pass header.i=@yandex-team.ru Subject: [PATCH 3/6] memcg: track shared inodes with dirty pages From: Konstantin Khebnikov To: linux-mm@kvack.org, cgroups@vger.kernel.org Cc: Roman Gushchin , Jan Kara , Dave Chinner , linux-kernel@vger.kernel.org, Tejun Heo , linux-fsdevel@vger.kernel.org, koct9i@gmail.com Date: Thu, 15 Jan 2015 21:49:14 +0300 Message-ID: <20150115184914.10450.51964.stgit@buzz> In-Reply-To: <20150115180242.10450.92.stgit@buzz> References: <20150115180242.10450.92.stgit@buzz> User-Agent: StGit/0.17.1-dirty MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org From: Konstantin Khlebnikov An inode is owned by only one memory cgroup, but if it's shared it might contain pages from multiple cgroups. This patch detects this situation in the memory reclaimer and marks the dirty inode with the flag I_DIRTY_SHARED, which is cleared only when the data is completely written. Memcg writeback always writes such inodes. 
Signed-off-by: Konstantin Khlebnikov --- fs/fs-writeback.c | 4 ++-- include/linux/fs.h | 3 +++ include/linux/memcontrol.h | 4 ++++ mm/memcontrol.c | 20 ++++++++++++++++++++ mm/vmscan.c | 4 ++++ 5 files changed, 33 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9034768..fda6a64 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -484,7 +484,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) */ spin_lock(&inode->i_lock); - dirty = inode->i_state & I_DIRTY; + dirty = inode->i_state & (I_DIRTY | I_DIRTY_SHARED); inode->i_state &= ~I_DIRTY; /* @@ -501,7 +501,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) smp_mb(); if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - inode->i_state |= I_DIRTY_PAGES; + inode->i_state |= I_DIRTY_PAGES | (dirty & I_DIRTY_SHARED); spin_unlock(&inode->i_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index ee2e3c0..303f0ad 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1741,6 +1741,8 @@ struct super_operations { * * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). * + * I_DIRTY_SHARED Dirty pages belong to multiple memory cgroups. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? 
*/ #define I_DIRTY_SYNC (1 << 0) @@ -1757,6 +1759,7 @@ struct super_operations { #define __I_DIO_WAKEUP 9 #define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) +#define I_DIRTY_SHARED (1 << 11) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ae05563..3f89e9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,6 +181,8 @@ void mem_cgroup_forget_mapping(struct address_space *mapping); bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long *dirty, unsigned long *thresh, unsigned long *bg_thresh); bool mem_cgroup_dirty_exceeded(struct inode *inode); +void mem_cgroup_poke_writeback(struct address_space *mapping, + struct mem_cgroup *memcg); #else /* CONFIG_MEMCG */ struct mem_cgroup; @@ -358,6 +360,8 @@ static inline void mem_cgroup_forget_mapping(struct address_space *mapping) {} static inline bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long *dirty, unsigned long *thresh, unsigned long *bg_thresh) { return false; } static inline bool mem_cgroup_dirty_exceeded(struct inode *inode) { return false; } +static inline void mem_cgroup_poke_writeback(struct address_space *mapping, + struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17d966a3b..d9d345c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6064,6 +6064,9 @@ bool mem_cgroup_dirty_exceeded(struct inode *inode) if (mapping->backing_dev_info->dirty_exceeded) return true; + if (inode->i_state & I_DIRTY_SHARED) + return true; + rcu_read_lock(); memcg = rcu_dereference(mapping->i_memcg); for (; memcg; memcg = parent_mem_cgroup(memcg)) { @@ -6084,6 +6087,23 @@ bool mem_cgroup_dirty_exceeded(struct inode *inode) return memcg != NULL; } +void mem_cgroup_poke_writeback(struct address_space *mapping, + struct mem_cgroup *memcg) +{ + struct inode *inode = mapping->host; + + if 
(rcu_access_pointer(mapping->i_memcg) == memcg || + !memcg->dirty_exceeded) + return; + + if ((inode->i_state & (I_DIRTY_PAGES|I_DIRTY_SHARED)) == I_DIRTY_PAGES) { + spin_lock(&inode->i_lock); + if (inode->i_state & I_DIRTY_PAGES) + inode->i_state |= I_DIRTY_SHARED; + spin_unlock(&inode->i_lock); + } +} + /* * subsys_initcall() for memory controller. * diff --git a/mm/vmscan.c b/mm/vmscan.c index ab2505c..75165fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1013,6 +1013,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); + if (!global_reclaim(sc)) + mem_cgroup_poke_writeback(mapping, + sc->target_mem_cgroup); + goto keep_locked; }