* [PATCH v2] mm: workingset: make workingset detection logic memcg aware
From: Vladimir Davydov @ 2016-01-24 16:56 UTC
  To: Andrew Morton; +Cc: Johannes Weiner, Michal Hocko, linux-mm, linux-kernel

Currently, inactive_age is maintained per zone, which results in
unexpected file page activations in case memory cgroups are used. For
example, if the total number of active pages is big, a memory cgroup
might get every refaulted file page activated even if refault distance
is much greater than the number of active file pages in the cgroup. This
patch fixes this issue by making inactive_age per lruvec.

The patch is pretty straightforward and self-explaining, but there are
two things that should be noted:

 - workingset_{eviction,activation} need to get lruvec given a page.
   On the default hierarchy one can safely access page->mem_cgroup
   provided the page is pinned, but on the legacy hierarchy a page can
   be migrated from one cgroup to another at any moment, so extra care
   must be taken to assure page->mem_cgroup will stay put.

   workingset_eviction is passed a locked page, so it is safe to use
   page->mem_cgroup in this function. workingset_activation is trickier:
   it is called from mark_page_accessed, where the page is not
   necessarily locked. To protect it against page->mem_cgroup change, we
   move it to __activate_page, which is called by mark_page_accessed
   once there's enough pages on percpu pagevec. This function is called
   with zone->lru_lock held, which rules out page charge migration.

 - To calculate refault distance correctly even in case a page is
   refaulted by a different cgroup, we need to store memcg id in shadow
   entry. There's no problem with it on 64-bit, but on 32-bit there's
   not much space left in radix tree slot after storing information
   about node, zone, and memory cgroup, so we can't just save eviction
   counter as is, because it would trim max refault distance making it
   unusable.

   To overcome this problem, we increase refault distance granularity,
   as proposed by Johannes Weiner. We disregard 10 least significant
   bits of eviction counter. This reduces refault distance accuracy to
   4MB, which is still fine. With the default NODE_SHIFT (3) this leaves
   us 9 bits for storing eviction counter, hence maximal refault
   distance will be 2GB, which should be enough for 32-bit systems.
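
   For reference, here is the rough bit budget behind these numbers,
   assuming 4KB pages and ZONES_SHIFT == 2 (both of which are
   config-dependent and not spelled out above):

     32 bits total
      -  2  RADIX_TREE_EXCEPTIONAL_SHIFT
      -  2  ZONES_SHIFT
      -  3  NODES_SHIFT
      - 16  memory cgroup id
      =  9  bits left for the eviction counter

     granularity          = 2^10 pages * 4KB = 4MB
     max refault distance = 2^9 * 4MB        = 2GB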

Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
---
Changes from v1:
 - Handle refaults by different cgroups properly (Johannes).

v1: http://www.spinics.net/lists/linux-mm/msg92466.html

 include/linux/memcontrol.h |  44 ++++++++++++++++
 include/linux/mmzone.h     |   7 +--
 include/linux/swap.h       |   1 +
 mm/memcontrol.c            |  25 ---------
 mm/swap.c                  |   5 +-
 mm/truncate.c              |   1 +
 mm/workingset.c            | 125 ++++++++++++++++++++++++++++++++++++++++-----
 7 files changed, 166 insertions(+), 42 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9ae48d4aeb5e..fd67027bf2e7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -366,6 +366,42 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 }
 
 /*
+ * We restrict the id in the range of [1, 65535], so it can fit into
+ * an unsigned short.
+ */
+#define MEM_CGROUP_ID_SHIFT	16
+#define MEM_CGROUP_ID_MAX	((1 << MEM_CGROUP_ID_SHIFT) - 1)
+
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+	return memcg->css.id;
+}
+
+/*
+ * A helper function to get mem_cgroup from ID. must be called under
+ * rcu_read_lock().  The caller is responsible for calling
+ * css_tryget_online() if the mem_cgroup is used for charging. (dropping
+ * refcnt from swap can be called against removed memcg.)
+ */
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+	struct cgroup_subsys_state *css;
+
+	css = css_from_id(id, &memory_cgrp_subsys);
+	return mem_cgroup_from_css(css);
+}
+
+static inline void mem_cgroup_get(struct mem_cgroup *memcg)
+{
+	css_get(&memcg->css);
+}
+
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+	css_put(&memcg->css);
+}
+
+/*
  * For memory reclaim.
  */
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
@@ -590,6 +626,14 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 	return true;
 }
 
+static inline void mem_cgroup_get(struct mem_cgroup *memcg)
+{
+}
+
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+}
+
 static inline bool
 mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7b6c2cfee390..684368ccea50 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -211,6 +211,10 @@ struct zone_reclaim_stat {
 struct lruvec {
 	struct list_head lists[NR_LRU_LISTS];
 	struct zone_reclaim_stat reclaim_stat;
+
+	/* Evictions & activations on the inactive file list */
+	atomic_long_t		inactive_age;
+
 #ifdef CONFIG_MEMCG
 	struct zone *zone;
 #endif
@@ -487,9 +491,6 @@ struct zone {
 	spinlock_t		lru_lock;
 	struct lruvec		lruvec;
 
-	/* Evictions & activations on the inactive file list */
-	atomic_long_t		inactive_age;
-
 	/*
 	 * When free pages are below this point, additional steps are taken
 	 * when reading the number of free pages to avoid per-cpu counter
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b14a2bb33514..b3713332c754 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -252,6 +252,7 @@ struct swap_info_struct {
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
+void workingset_release_shadow(void *shadow);
 extern struct list_lru workingset_shadow_nodes;
 
 static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..4ea79f225fe8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 	return (memcg == root_mem_cgroup);
 }
 
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX	USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
-	return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock().  The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
-	struct cgroup_subsys_state *css;
-
-	css = css_from_id(id, &memory_cgrp_subsys);
-	return mem_cgroup_from_css(css);
-}
-
 #ifndef CONFIG_SLOB
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
diff --git a/mm/swap.c b/mm/swap.c
index 09fe5e97714a..4b5d7a1f9742 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -270,6 +270,9 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
 
 		__count_vm_event(PGACTIVATE);
 		update_page_reclaim_stat(lruvec, file, 1);
+
+		if (file)
+			workingset_activation(page);
 	}
 }
 
@@ -375,8 +378,6 @@ void mark_page_accessed(struct page *page)
 		else
 			__lru_cache_activate_page(page);
 		ClearPageReferenced(page);
-		if (page_is_file_cache(page))
-			workingset_activation(page);
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
 	}
diff --git a/mm/truncate.c b/mm/truncate.c
index e3ee0e27cd17..a8bae846d399 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -52,6 +52,7 @@ static void clear_exceptional_entry(struct address_space *mapping,
 			goto unlock;
 		radix_tree_replace_slot(slot, NULL);
 		mapping->nrexceptional--;
+		workingset_release_shadow(entry);
 		if (!node)
 			goto unlock;
 		workingset_node_shadows_dec(node);
diff --git a/mm/workingset.c b/mm/workingset.c
index 61ead9e5549d..30298eaee397 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -142,7 +142,7 @@
  *		Implementation
  *
  * For each zone's file LRU lists, a counter for inactive evictions
- * and activations is maintained (zone->inactive_age).
+ * and activations is maintained (lruvec->inactive_age).
  *
  * On eviction, a snapshot of this counter (along with some bits to
  * identify the zone) is stored in the now empty page cache radix tree
@@ -152,8 +152,72 @@
  * refault distance will immediately activate the refaulting page.
  */
 
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+#ifdef CONFIG_MEMCG
+/*
+ * On 32-bit there is not much space left in radix tree slot after
+ * storing information about node, zone, and memory cgroup, so we
+ * disregard 10 least significant bits of eviction counter. This
+ * reduces refault distance accuracy to 4MB, which is still fine.
+ *
+ * With the default NODE_SHIFT (3) this leaves us 9 bits for storing
+ * eviction counter, hence maximal refault distance will be 2GB, which
+ * should be enough for 32-bit systems.
+ */
+#ifdef CONFIG_64BIT
+# define REFAULT_DISTANCE_GRANULARITY		0
+#else
+# define REFAULT_DISTANCE_GRANULARITY		10
+#endif
+
+static unsigned long pack_shadow_memcg(unsigned long eviction,
+				       struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled())
+		return eviction;
+
+	eviction >>= REFAULT_DISTANCE_GRANULARITY;
+	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | mem_cgroup_id(memcg);
+	return eviction;
+}
+
+static unsigned long unpack_shadow_memcg(unsigned long entry,
+					 unsigned long *mask,
+					 struct mem_cgroup **memcg)
+{
+	if (mem_cgroup_disabled()) {
+		*memcg = NULL;
+		return entry;
+	}
+
+	rcu_read_lock();
+	*memcg = mem_cgroup_from_id(entry & MEM_CGROUP_ID_MAX);
+	rcu_read_unlock();
+
+	entry >>= MEM_CGROUP_ID_SHIFT;
+	entry <<= REFAULT_DISTANCE_GRANULARITY;
+	*mask >>= MEM_CGROUP_ID_SHIFT - REFAULT_DISTANCE_GRANULARITY;
+	return entry;
+}
+#else /* !CONFIG_MEMCG */
+static unsigned long pack_shadow_memcg(unsigned long eviction,
+				       struct mem_cgroup *memcg)
+{
+	return eviction;
+}
+
+static unsigned long unpack_shadow_memcg(unsigned long entry,
+					 unsigned long *mask,
+					 struct mem_cgroup **memcg)
+{
+	*memcg = NULL;
+	return entry;
+}
+#endif /* CONFIG_MEMCG */
+
+static void *pack_shadow(unsigned long eviction, struct zone *zone,
+			 struct mem_cgroup *memcg)
 {
+	eviction = pack_shadow_memcg(eviction, memcg);
 	eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
 	eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
 	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -163,6 +227,7 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
 
 static void unpack_shadow(void *shadow,
 			  struct zone **zone,
+			  struct mem_cgroup **memcg,
 			  unsigned long *distance)
 {
 	unsigned long entry = (unsigned long)shadow;
@@ -170,19 +235,23 @@ static void unpack_shadow(void *shadow,
 	unsigned long refault;
 	unsigned long mask;
 	int zid, nid;
+	struct lruvec *lruvec;
 
 	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
 	zid = entry & ((1UL << ZONES_SHIFT) - 1);
 	entry >>= ZONES_SHIFT;
 	nid = entry & ((1UL << NODES_SHIFT) - 1);
 	entry >>= NODES_SHIFT;
-	eviction = entry;
 
-	*zone = NODE_DATA(nid)->node_zones + zid;
-
-	refault = atomic_long_read(&(*zone)->inactive_age);
 	mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
 			RADIX_TREE_EXCEPTIONAL_SHIFT);
+
+	eviction = unpack_shadow_memcg(entry, &mask, memcg);
+
+	*zone = NODE_DATA(nid)->node_zones + zid;
+	lruvec = mem_cgroup_zone_lruvec(*zone, *memcg);
+
+	refault = atomic_long_read(&lruvec->inactive_age);
 	/*
 	 * The unsigned subtraction here gives an accurate distance
 	 * across inactive_age overflows in most cases.
@@ -213,10 +282,16 @@ static void unpack_shadow(void *shadow,
 void *workingset_eviction(struct address_space *mapping, struct page *page)
 {
 	struct zone *zone = page_zone(page);
+	struct mem_cgroup *memcg = page_memcg(page);
+	struct lruvec *lruvec;
 	unsigned long eviction;
 
-	eviction = atomic_long_inc_return(&zone->inactive_age);
-	return pack_shadow(eviction, zone);
+	if (!mem_cgroup_disabled())
+		mem_cgroup_get(memcg);
+
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	eviction = atomic_long_inc_return(&lruvec->inactive_age);
+	return pack_shadow(eviction, zone, memcg);
 }
 
 /**
@@ -230,13 +305,22 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
  */
 bool workingset_refault(void *shadow)
 {
-	unsigned long refault_distance;
+	unsigned long refault_distance, nr_active;
 	struct zone *zone;
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
 
-	unpack_shadow(shadow, &zone, &refault_distance);
+	unpack_shadow(shadow, &zone, &memcg, &refault_distance);
 	inc_zone_state(zone, WORKINGSET_REFAULT);
 
-	if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+	if (!mem_cgroup_disabled()) {
+		lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+		nr_active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
+		mem_cgroup_put(memcg);
+	} else
+		nr_active = zone_page_state(zone, NR_ACTIVE_FILE);
+
+	if (refault_distance <= nr_active) {
 		inc_zone_state(zone, WORKINGSET_ACTIVATE);
 		return true;
 	}
@@ -249,7 +333,23 @@ bool workingset_refault(void *shadow)
  */
 void workingset_activation(struct page *page)
 {
-	atomic_long_inc(&page_zone(page)->inactive_age);
+	struct lruvec *lruvec;
+
+	lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+	atomic_long_inc(&lruvec->inactive_age);
+}
+
+void workingset_release_shadow(void *shadow)
+{
+	unsigned long refault_distance;
+	struct zone *zone;
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	unpack_shadow(shadow, &zone, &memcg, &refault_distance);
+	mem_cgroup_put(memcg);
 }
 
 /*
@@ -348,6 +448,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
 		if (node->slots[i]) {
 			BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+			workingset_release_shadow(node->slots[i]);
 			node->slots[i] = NULL;
 			BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
 			node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
-- 
2.1.4

* Re: [PATCH v2] mm: workingset: make workingset detection logic memcg aware
From: Johannes Weiner @ 2016-01-25 16:39 UTC
  To: Vladimir Davydov; +Cc: Andrew Morton, Michal Hocko, linux-mm, linux-kernel

On Sun, Jan 24, 2016 at 07:56:16PM +0300, Vladimir Davydov wrote:
> Currently, inactive_age is maintained per zone, which results in
> unexpected file page activations in case memory cgroups are used. For
> example, if the total number of active pages is big, a memory cgroup
> might get every refaulted file page activated even if refault distance
> is much greater than the number of active file pages in the cgroup. This
> patch fixes this issue by making inactive_age per lruvec.

Argh!!

It's great that you're still interested in this and kept working on
it. I just regret that I worked on the same stuff the last couple days
without pinging you before picking it up. Oh well...

However, my patches are sufficiently different that I think it makes
sense to discuss them both and figure out the best end result. I have
some comments below and will follow up this email with my version.

> The patch is pretty straightforward and self-explaining, but there are
> two things that should be noted:
> 
>  - workingset_{eviction,activation} need to get lruvec given a page.
>    On the default hierarchy one can safely access page->mem_cgroup
>    provided the page is pinned, but on the legacy hierarchy a page can
>    be migrated from one cgroup to another at any moment, so extra care
>    must be taken to assure page->mem_cgroup will stay put.
> 
>    workingset_eviction is passed a locked page, so it is safe to use
>    page->mem_cgroup in this function. workingset_activation is trickier:
>    it is called from mark_page_accessed, where the page is not
>    necessarily locked. To protect it against page->mem_cgroup change, we
>    move it to __activate_page, which is called by mark_page_accessed
>    once there's enough pages on percpu pagevec. This function is called
>    with zone->lru_lock held, which rules out page charge migration.

When a page moves to another cgroup at the same time it's activated,
there really is no wrong lruvec to age. Both would be correct. The
locking guarantees a stable answer, but we don't need it here. It's
enough to take the rcu lock here to ensure page_memcg() isn't freed.
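
In code, that amounts to something like the following (this is what
patch 3/3 below does in workingset_activation()):

	rcu_read_lock();
	lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
	atomic_long_inc(&lruvec->inactive_age);
	rcu_read_unlock();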

>  - To calculate refault distance correctly even in case a page is
>    refaulted by a different cgroup, we need to store memcg id in shadow
>    entry. There's no problem with it on 64-bit, but on 32-bit there's
>    not much space left in radix tree slot after storing information
>    about node, zone, and memory cgroup, so we can't just save eviction
>    counter as is, because it would trim max refault distance making it
>    unusable.
> 
>    To overcome this problem, we increase refault distance granularity,
>    as proposed by Johannes Weiner. We disregard 10 least significant
>    bits of eviction counter. This reduces refault distance accuracy to
>    4MB, which is still fine. With the default NODE_SHIFT (3) this leaves
>    us 9 bits for storing eviction counter, hence maximal refault
>    distance will be 2GB, which should be enough for 32-bit systems.

If we limit it to 2G it becomes a clear-cut correctness issue once you
have more memory. Instead, we should continue to stretch out the
distance with an ever-increasing bucket size. The more memory you
have, the less important the granularity becomes anyway. With 8G, an
8M granularity is still okay, and so forth. And once we get beyond a
certain point, and it creates problems for people, it should be fair
enough to recommend upgrading to 64 bit.
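
As a rough sketch, this is essentially the bucket_order calculation
that patch 1/3 below introduces in workingset_init():

	/* Bits left for the eviction timestamp in the shadow entry */
	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
	/* Highest order needed to cover all of memory */
	max_order = fls_long(totalram_pages - 1);
	if (max_order > timestamp_bits)
		bucket_order = max_order - timestamp_bits;
	/* Evictions are then grouped into 2^bucket_order page buckets */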

> @@ -152,8 +152,72 @@
>   * refault distance will immediately activate the refaulting page.
>   */
>  
> -static void *pack_shadow(unsigned long eviction, struct zone *zone)
> +#ifdef CONFIG_MEMCG
> +/*
> + * On 32-bit there is not much space left in radix tree slot after
> + * storing information about node, zone, and memory cgroup, so we
> + * disregard 10 least significant bits of eviction counter. This
> + * reduces refault distance accuracy to 4MB, which is still fine.
> + *
> + * With the default NODE_SHIFT (3) this leaves us 9 bits for storing
> + * eviction counter, hence maximal refault distance will be 2GB, which
> + * should be enough for 32-bit systems.
> + */
> +#ifdef CONFIG_64BIT
> +# define REFAULT_DISTANCE_GRANULARITY		0
> +#else
> +# define REFAULT_DISTANCE_GRANULARITY		10
> +#endif
> +
> +static unsigned long pack_shadow_memcg(unsigned long eviction,
> +				       struct mem_cgroup *memcg)
> +{
> +	if (mem_cgroup_disabled())
> +		return eviction;
> +
> +	eviction >>= REFAULT_DISTANCE_GRANULARITY;
> +	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | mem_cgroup_id(memcg);
> +	return eviction;
> +}
> +
> +static unsigned long unpack_shadow_memcg(unsigned long entry,
> +					 unsigned long *mask,
> +					 struct mem_cgroup **memcg)
> +{
> +	if (mem_cgroup_disabled()) {
> +		*memcg = NULL;
> +		return entry;
> +	}
> +
> +	rcu_read_lock();
> +	*memcg = mem_cgroup_from_id(entry & MEM_CGROUP_ID_MAX);
> +	rcu_read_unlock();
> +
> +	entry >>= MEM_CGROUP_ID_SHIFT;
> +	entry <<= REFAULT_DISTANCE_GRANULARITY;
> +	*mask >>= MEM_CGROUP_ID_SHIFT - REFAULT_DISTANCE_GRANULARITY;
> +	return entry;
> +}
> +#else /* !CONFIG_MEMCG */
> +static unsigned long pack_shadow_memcg(unsigned long eviction,
> +				       struct mem_cgroup *memcg)
> +{
> +	return eviction;
> +}
> +
> +static unsigned long unpack_shadow_memcg(unsigned long entry,
> +					 unsigned long *mask,
> +					 struct mem_cgroup **memcg)
> +{
> +	*memcg = NULL;
> +	return entry;
> +}
> +#endif /* CONFIG_MEMCG */
> +
> +static void *pack_shadow(unsigned long eviction, struct zone *zone,
> +			 struct mem_cgroup *memcg)
>  {
> +	eviction = pack_shadow_memcg(eviction, memcg);
>  	eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
>  	eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
>  	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);

For !CONFIG_MEMCG, we can define MEMCG_ID_SHIFT to 0 and pass in a
cssid of 0. That would save much of the special casing here.

> @@ -213,10 +282,16 @@ static void unpack_shadow(void *shadow,
>  void *workingset_eviction(struct address_space *mapping, struct page *page)
>  {
>  	struct zone *zone = page_zone(page);
> +	struct mem_cgroup *memcg = page_memcg(page);
> +	struct lruvec *lruvec;
>  	unsigned long eviction;
>  
> -	eviction = atomic_long_inc_return(&zone->inactive_age);
> -	return pack_shadow(eviction, zone);
> +	if (!mem_cgroup_disabled())
> +		mem_cgroup_get(memcg);
> +
> +	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> +	eviction = atomic_long_inc_return(&lruvec->inactive_age);
> +	return pack_shadow(eviction, zone, memcg);

I don't think we need to hold a reference to the memcg here; it should
be enough to verify whether the cssid still exists upon refault.

What could theoretically happen then is that the original memcg gets
deleted, a new one will reuse the same id and then refault the same
pages. However, there are two things that should make this acceptable:
1. a couple pages don't matter in this case, and sharing data between
cgroups on a large scale already leads to weird accounting artifacts
in other places; this wouldn't be much different. 2. from a system
perspective, those pages were in fact recently evicted, so even if we
activate them by accident in the new cgroup, it wouldn't be entirely
unreasonable. The workload will shake out what the true active list
frequency is on its own.

So I think we can just do away with the reference counting for now,
and reconsider it should the false sharing in this case create new
problems that are worse than existing consequences of false sharing.
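
A minimal sketch of that refault-side check (patch 3/3 below implements
it this way):

	rcu_read_lock();
	memcg = mem_cgroup_from_id(memcgid);
	if (!mem_cgroup_disabled() && !memcg) {
		/* The remembered cgroup is gone, don't count the refault */
		rcu_read_unlock();
		return false;
	}
	/* ... look up the lruvec and read inactive_age under RCU ... */
	rcu_read_unlock();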

> @@ -230,13 +305,22 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
>   */
>  bool workingset_refault(void *shadow)
>  {
> -	unsigned long refault_distance;
> +	unsigned long refault_distance, nr_active;
>  	struct zone *zone;
> +	struct mem_cgroup *memcg;
> +	struct lruvec *lruvec;
>  
> -	unpack_shadow(shadow, &zone, &refault_distance);
> +	unpack_shadow(shadow, &zone, &memcg, &refault_distance);
>  	inc_zone_state(zone, WORKINGSET_REFAULT);
>  
> -	if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
> +	if (!mem_cgroup_disabled()) {
> +		lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> +		nr_active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
> +		mem_cgroup_put(memcg);
> +	} else
> +		nr_active = zone_page_state(zone, NR_ACTIVE_FILE);

This is basically get_lru_size(), so I reused that instead.

Thanks!

* [PATCH 1/3] mm: workingset: eviction buckets for bigmem/lowbit machines
From: Johannes Weiner @ 2016-01-25 16:41 UTC
  To: Vladimir Davydov; +Cc: Andrew Morton, Michal Hocko, linux-mm, linux-kernel

Space in the shadow entry will get tight once we need to identify the
memcg as well. Add eviction buckets to stretch out the representable
refault distance by sacrificing granularity.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/workingset.c | 40 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/mm/workingset.c b/mm/workingset.c
index 61ead9e5549d..6f3ba184ffb2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,8 +152,23 @@
  * refault distance will immediately activate the refaulting page.
  */
 
+#define EVICTION_SHIFT (NODES_SHIFT + ZONES_SHIFT +	\
+			RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+
+/*
+ * Eviction timestamps need to be able to cover the full range of
+ * actionable refaults. However, bits are tight in the radix tree
+ * entry, and after storing the identifier for the lruvec there might
+ * not be enough left to represent every single actionable refault. In
+ * that case, we have to sacrifice granularity for distance, and group
+ * evictions into coarser buckets by shaving off lower timestamp bits.
+ */
+static unsigned int bucket_order;
+
 static void *pack_shadow(unsigned long eviction, struct zone *zone)
 {
+	eviction >>= bucket_order;
 	eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
 	eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
 	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -168,7 +183,6 @@ static void unpack_shadow(void *shadow,
 	unsigned long entry = (unsigned long)shadow;
 	unsigned long eviction;
 	unsigned long refault;
-	unsigned long mask;
 	int zid, nid;
 
 	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
@@ -176,13 +190,12 @@ static void unpack_shadow(void *shadow,
 	entry >>= ZONES_SHIFT;
 	nid = entry & ((1UL << NODES_SHIFT) - 1);
 	entry >>= NODES_SHIFT;
-	eviction = entry;
+	eviction = entry << bucket_order;
 
 	*zone = NODE_DATA(nid)->node_zones + zid;
 
 	refault = atomic_long_read(&(*zone)->inactive_age);
-	mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
-			RADIX_TREE_EXCEPTIONAL_SHIFT);
+
 	/*
 	 * The unsigned subtraction here gives an accurate distance
 	 * across inactive_age overflows in most cases.
@@ -199,7 +212,7 @@ static void unpack_shadow(void *shadow,
 	 * inappropriate activation leading to pressure on the active
 	 * list is not a problem.
 	 */
-	*distance = (refault - eviction) & mask;
+	*distance = (refault - eviction) & EVICTION_MASK;
 }
 
 /**
@@ -398,8 +411,25 @@ static struct lock_class_key shadow_nodes_key;
 
 static int __init workingset_init(void)
 {
+	unsigned int timestamp_bits;
+	unsigned int max_order;
 	int ret;
 
+	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+	/*
+	 * Calculate the eviction bucket size to cover the longest
+	 * actionable refault distance, which is currently half of
+	 * memory (totalram_pages/2). However, memory hotplug may add
+	 * some more pages at runtime, so keep working with up to
+	 * double the initial memory by using totalram_pages as-is.
+	 */
+	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+	max_order = fls_long(totalram_pages - 1);
+	if (max_order > timestamp_bits)
+		bucket_order = max_order - timestamp_bits;
+	printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
+	       timestamp_bits, max_order, bucket_order);
+
 	ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
 	if (ret)
 		goto err;
-- 
2.7.0

* [PATCH 2/3] mm: workingset: separate shadow unpacking and refault calculation
From: Johannes Weiner @ 2016-01-25 16:41 UTC
  To: Vladimir Davydov; +Cc: Andrew Morton, Michal Hocko, linux-mm, linux-kernel

This used to be alright, but we're going to add memcg awareness, and
then the difference between unpacking static data from the radix tree
entry and dealing with dynamic objects and doing calculations becomes
more pronounced and would make things awkward.

Keep unpacking simple and move the higher-level logic to
workingset_refault().

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/workingset.c | 55 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/mm/workingset.c b/mm/workingset.c
index 6f3ba184ffb2..ac6eb7bc1faa 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -176,13 +176,10 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
 	return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
-static void unpack_shadow(void *shadow,
-			  struct zone **zone,
-			  unsigned long *distance)
+static void unpack_shadow(void *shadow, struct zone **zonep,
+			  unsigned long *evictionp)
 {
 	unsigned long entry = (unsigned long)shadow;
-	unsigned long eviction;
-	unsigned long refault;
 	int zid, nid;
 
 	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
@@ -190,29 +187,10 @@ static void unpack_shadow(void *shadow,
 	entry >>= ZONES_SHIFT;
 	nid = entry & ((1UL << NODES_SHIFT) - 1);
 	entry >>= NODES_SHIFT;
-	eviction = entry << bucket_order;
-
-	*zone = NODE_DATA(nid)->node_zones + zid;
 
-	refault = atomic_long_read(&(*zone)->inactive_age);
+	*zonep = NODE_DATA(nid)->node_zones + zid;
+	*evictionp = entry << bucket_order;
 
-	/*
-	 * The unsigned subtraction here gives an accurate distance
-	 * across inactive_age overflows in most cases.
-	 *
-	 * There is a special case: usually, shadow entries have a
-	 * short lifetime and are either refaulted or reclaimed along
-	 * with the inode before they get too old.  But it is not
-	 * impossible for the inactive_age to lap a shadow entry in
-	 * the field, which can then can result in a false small
-	 * refault distance, leading to a false activation should this
-	 * old entry actually refault again.  However, earlier kernels
-	 * used to deactivate unconditionally with *every* reclaim
-	 * invocation for the longest time, so the occasional
-	 * inappropriate activation leading to pressure on the active
-	 * list is not a problem.
-	 */
-	*distance = (refault - eviction) & EVICTION_MASK;
 }
 
 /**
@@ -244,9 +222,32 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
 bool workingset_refault(void *shadow)
 {
 	unsigned long refault_distance;
+	unsigned long eviction;
+	unsigned long refault;
 	struct zone *zone;
 
-	unpack_shadow(shadow, &zone, &refault_distance);
+	unpack_shadow(shadow, &zone, &eviction);
+
+	refault = atomic_long_read(&zone->inactive_age);
+
+	/*
+	 * The unsigned subtraction here gives an accurate distance
+	 * across inactive_age overflows in most cases.
+	 *
+	 * There is a special case: usually, shadow entries have a
+	 * short lifetime and are either refaulted or reclaimed along
+	 * with the inode before they get too old.  But it is not
+	 * impossible for the inactive_age to lap a shadow entry in
+	 * the field, which can then can result in a false small
+	 * refault distance, leading to a false activation should this
+	 * old entry actually refault again.  However, earlier kernels
+	 * used to deactivate unconditionally with *every* reclaim
+	 * invocation for the longest time, so the occasional
+	 * inappropriate activation leading to pressure on the active
+	 * list is not a problem.
+	 */
+	refault_distance = (refault - eviction) & EVICTION_MASK;
+
 	inc_zone_state(zone, WORKINGSET_REFAULT);
 
 	if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
-- 
2.7.0

* [PATCH 3/3] mm: workingset: cgroup-aware
From: Johannes Weiner @ 2016-01-25 16:42 UTC
  To: Vladimir Davydov; +Cc: Andrew Morton, Michal Hocko, linux-mm, linux-kernel

TODO:
- cgroup-aware shadow node reclaim

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/memcontrol.h | 39 ++++++++++++++++++++++++++++++
 include/linux/mmzone.h     | 11 ++++-----
 include/linux/swap.h       |  1 +
 mm/memcontrol.c            | 25 --------------------
 mm/vmscan.c                | 18 +++++++-------
 mm/workingset.c            | 59 +++++++++++++++++++++++++++++++++++++---------
 6 files changed, 102 insertions(+), 51 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 792c8981e633..705aba54a50d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -89,6 +89,10 @@ enum mem_cgroup_events_target {
 };
 
 #ifdef CONFIG_MEMCG
+
+#define MEM_CGROUP_ID_SHIFT	16
+#define MEM_CGROUP_ID_MAX	USHRT_MAX
+
 struct mem_cgroup_stat_cpu {
 	long count[MEMCG_NR_STAT];
 	unsigned long events[MEMCG_NR_EVENTS];
@@ -312,6 +316,25 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+	return memcg->css.id;
+}
+
+/**
+ * mem_cgroup_from_id - look up a memcg from an id
+ * @id: the id to look up
+ *
+ * Caller must hold rcu_read_lock() and use css_tryget() as necessary.
+ */
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+	struct cgroup_subsys_state *css;
+
+	css = css_from_id(id, &memory_cgrp_subsys);
+	return mem_cgroup_from_css(css);
+}
+
 /**
  * parent_mem_cgroup - find the accounting parent of a memcg
  * @memcg: memcg whose parent to find
@@ -496,6 +519,10 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
 
 #else /* CONFIG_MEMCG */
+
+#define MEM_CGROUP_ID_SHIFT	0
+#define MEM_CGROUP_ID_MAX	0
+
 struct mem_cgroup;
 
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
@@ -580,6 +607,18 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+	WARN_ON_ONCE(id);
+	/* XXX: This should always return root_mem_cgroup */
+	return NULL;
+}
+
 static inline bool mem_cgroup_disabled(void)
 {
 	return true;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 33bb1b19273e..a7d8eeb6658a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -209,10 +209,12 @@ struct zone_reclaim_stat {
 };
 
 struct lruvec {
-	struct list_head lists[NR_LRU_LISTS];
-	struct zone_reclaim_stat reclaim_stat;
+	struct list_head		lists[NR_LRU_LISTS];
+	struct zone_reclaim_stat	reclaim_stat;
+	/* Evictions & activations on the inactive file list */
+	atomic_long_t			inactive_age;
 #ifdef CONFIG_MEMCG
-	struct zone *zone;
+	struct zone			*zone;
 #endif
 };
 
@@ -487,9 +489,6 @@ struct zone {
 	spinlock_t		lru_lock;
 	struct lruvec		lruvec;
 
-	/* Evictions & activations on the inactive file list */
-	atomic_long_t		inactive_age;
-
 	/*
 	 * When free pages are below this point, additional steps are taken
 	 * when reading the number of free pages to avoid per-cpu counter
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b14a2bb33514..1cf3065c143b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -317,6 +317,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..4ea79f225fe8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 	return (memcg == root_mem_cgroup);
 }
 
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX	USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
-	return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock().  The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
-	struct cgroup_subsys_state *css;
-
-	css = css_from_id(id, &memory_cgrp_subsys);
-	return mem_cgroup_from_css(css);
-}
-
 #ifndef CONFIG_SLOB
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 30e0cd7a0ceb..f4ac04c0d35a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,7 +213,7 @@ bool zone_reclaimable(struct zone *zone)
 		zone_reclaimable_pages(zone) * 6;
 }
 
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
 	if (!mem_cgroup_disabled())
 		return mem_cgroup_get_lru_size(lruvec, lru);
@@ -1931,8 +1931,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
 	unsigned long inactive;
 	unsigned long active;
 
-	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-	active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+	inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+	active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
 
 	return active > inactive;
 }
@@ -2071,7 +2071,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_file_is_low(lruvec) &&
-	    get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2097,10 +2097,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * anon in [0], file in [1]
 	 */
 
-	anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
-		get_lru_size(lruvec, LRU_INACTIVE_ANON);
-	file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
-		get_lru_size(lruvec, LRU_INACTIVE_FILE);
+	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
 
 	spin_lock_irq(&zone->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2138,7 +2138,7 @@ out:
 			unsigned long size;
 			unsigned long scan;
 
-			size = get_lru_size(lruvec, lru);
+			size = lruvec_lru_size(lruvec, lru);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
diff --git a/mm/workingset.c b/mm/workingset.c
index ac6eb7bc1faa..fe69da29bbad 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,7 +152,8 @@
  * refault distance will immediately activate the refaulting page.
  */
 
-#define EVICTION_SHIFT (NODES_SHIFT + ZONES_SHIFT +	\
+#define EVICTION_SHIFT (MEM_CGROUP_ID_SHIFT +		\
+			NODES_SHIFT + ZONES_SHIFT +	\
 			RADIX_TREE_EXCEPTIONAL_SHIFT)
 #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
 
@@ -166,9 +167,10 @@
  */
 static unsigned int bucket_order;
 
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
 {
 	eviction >>= bucket_order;
+	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
 	eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
 	eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
 	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -176,21 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
 	return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
-static void unpack_shadow(void *shadow, struct zone **zonep,
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
 			  unsigned long *evictionp)
 {
 	unsigned long entry = (unsigned long)shadow;
-	int zid, nid;
+	int memcgid, nid, zid;
 
 	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
 	zid = entry & ((1UL << ZONES_SHIFT) - 1);
 	entry >>= ZONES_SHIFT;
 	nid = entry & ((1UL << NODES_SHIFT) - 1);
 	entry >>= NODES_SHIFT;
+	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+	entry >>= MEM_CGROUP_ID_SHIFT;
 
+	*memcgidp = memcgid;
 	*zonep = NODE_DATA(nid)->node_zones + zid;
 	*evictionp = entry << bucket_order;
-
 }
 
 /**
@@ -203,11 +207,20 @@ static void unpack_shadow(void *shadow, struct zone **zonep,
  */
 void *workingset_eviction(struct address_space *mapping, struct page *page)
 {
+	struct mem_cgroup *memcg = page_memcg(page);
 	struct zone *zone = page_zone(page);
+	int memcgid = mem_cgroup_id(memcg);
 	unsigned long eviction;
+	struct lruvec *lruvec;
+
+	/* Page is fully exclusive and pins page->mem_cgroup */
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+	VM_BUG_ON_PAGE(page_count(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
-	eviction = atomic_long_inc_return(&zone->inactive_age);
-	return pack_shadow(eviction, zone);
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	eviction = atomic_long_inc_return(&lruvec->inactive_age);
+	return pack_shadow(memcgid, zone, eviction);
 }
 
 /**
@@ -222,13 +235,32 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
 bool workingset_refault(void *shadow)
 {
 	unsigned long refault_distance;
+	unsigned long active_file;
+	struct mem_cgroup *memcg;
 	unsigned long eviction;
+	struct lruvec *lruvec;
 	unsigned long refault;
 	struct zone *zone;
+	int memcgid;
 
-	unpack_shadow(shadow, &zone, &eviction);
+	unpack_shadow(shadow, &memcgid, &zone, &eviction);
 
-	refault = atomic_long_read(&zone->inactive_age);
+	rcu_read_lock();
+	memcg = mem_cgroup_from_id(memcgid);
+	/*
+	 * Don't count a refault if the remembered memcg has been
+	 * deleted since. XXX: On !CONFIG_MEMCG, this will always
+	 * return NULL; it would be better if the root_mem_cgroup
+	 * existed in all configurations instead.
+	 */
+	if (!mem_cgroup_disabled() && !memcg) {
+		rcu_read_unlock();
+		return false;
+	}
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	refault = atomic_long_read(&lruvec->inactive_age);
+	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	rcu_read_unlock();
 
 	/*
 	 * The unsigned subtraction here gives an accurate distance
@@ -250,7 +282,7 @@ bool workingset_refault(void *shadow)
 
 	inc_zone_state(zone, WORKINGSET_REFAULT);
 
-	if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+	if (refault_distance <= active_file) {
 		inc_zone_state(zone, WORKINGSET_ACTIVATE);
 		return true;
 	}
@@ -263,7 +295,12 @@ bool workingset_refault(void *shadow)
  */
 void workingset_activation(struct page *page)
 {
-	atomic_long_inc(&page_zone(page)->inactive_age);
+	struct lruvec *lruvec;
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+	atomic_long_inc(&lruvec->inactive_age);
+	rcu_read_unlock();
 }
 
 /*
-- 
2.7.0

* Re: [PATCH v2] mm: workingset: make workingset detection logic memcg aware
From: Vladimir Davydov @ 2016-01-26  8:27 UTC
  To: Johannes Weiner; +Cc: Andrew Morton, Michal Hocko, linux-mm, linux-kernel

On Mon, Jan 25, 2016 at 11:39:07AM -0500, Johannes Weiner wrote:
> On Sun, Jan 24, 2016 at 07:56:16PM +0300, Vladimir Davydov wrote:
> > Currently, inactive_age is maintained per zone, which results in
> > unexpected file page activations in case memory cgroups are used. For
> > example, if the total number of active pages is big, a memory cgroup
> > might get every refaulted file page activated even if refault distance
> > is much greater than the number of active file pages in the cgroup. This
> > patch fixes this issue by making inactive_age per lruvec.
> 
> Argh!!
> 
> It's great that you're still interested in this and kept working on
> it. I just regret that I worked on the same stuff the last couple days
> without pinging you before picking it up. Oh well...

:-)

> 
> However, my patches are sufficiently different that I think it makes
> sense to discuss them both and figure out the best end result.  I have
> some comments below and will followup this email with my version.
> 
> > The patch is pretty straightforward and self-explaining, but there are
> > two things that should be noted:
> > 
> >  - workingset_{eviction,activation} need to get lruvec given a page.
> >    On the default hierarchy one can safely access page->mem_cgroup
> >    provided the page is pinned, but on the legacy hierarchy a page can
> >    be migrated from one cgroup to another at any moment, so extra care
> >    must be taken to assure page->mem_cgroup will stay put.
> > 
> >    workingset_eviction is passed a locked page, so it is safe to use
> >    page->mem_cgroup in this function. workingset_activation is trickier:
> >    it is called from mark_page_accessed, where the page is not
> >    necessarily locked. To protect it against page->mem_cgroup change, we
> >    move it to __activate_page, which is called by mark_page_accessed
> >    once there's enough pages on percpu pagevec. This function is called
> >    with zone->lru_lock held, which rules out page charge migration.
> 
> When a page moves to another cgroup at the same time it's activated,
> there really is no wrong lruvec to age. Both would be correct. The
> locking guarantees a stable answer, but we don't need it here. It's
> enough to take the rcu lock here to ensure page_memcg() isn't freed.

Yeah, that's true, but I think that taking the rcu lock when memcg is
disabled or even compiled out is ugly. Maybe we should introduce
helpers lock/unlock_page_memcg(), which would be no-ops on the unified
hierarchy while on the legacy hierarchy they would act just like
mem_cgroup_begin/end_page_stat, i.e. we could just rename the latter
two. This would also simplify dropping the legacy code once the legacy
hierarchy has passed away.
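
A rough sketch of what such helpers could look like - hypothetical, not
part of either patch set, assuming mem_cgroup_begin/end_page_stat()
keep their current signatures:

	static inline struct mem_cgroup *lock_page_memcg(struct page *page)
	{
		/* On the default hierarchy a pinned page's memcg is stable */
		if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
			return page_memcg(page);
		/* Legacy hierarchy: block charge migration for this page */
		return mem_cgroup_begin_page_stat(page);
	}

	static inline void unlock_page_memcg(struct mem_cgroup *memcg)
	{
		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
			mem_cgroup_end_page_stat(memcg);
	}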

> 
> >  - To calculate refault distance correctly even in case a page is
> >    refaulted by a different cgroup, we need to store memcg id in shadow
> >    entry. There's no problem with it on 64-bit, but on 32-bit there's
> >    not much space left in radix tree slot after storing information
> >    about node, zone, and memory cgroup, so we can't just save eviction
> >    counter as is, because it would trim max refault distance making it
> >    unusable.
> > 
> >    To overcome this problem, we increase refault distance granularity,
> >    as proposed by Johannes Weiner. We disregard 10 least significant
> >    bits of eviction counter. This reduces refault distance accuracy to
> >    4MB, which is still fine. With the default NODE_SHIFT (3) this leaves
> >    us 9 bits for storing eviction counter, hence maximal refault
> >    distance will be 2GB, which should be enough for 32-bit systems.
> 
> If we limit it to 2G it becomes a clear-cut correctness issue once you
> have more memory. Instead, we should continue to stretch out the
> distance with an ever-increasing bucket size. The more memory you
> have, the less important the granularity becomes anyway. With 8G, an
> 8M granularity is still okay, and so forth. And once we get beyond a
> certain point, and it creates problems for people, it should be fair
> enough to recommend upgrading to 64 bit.

That's a great idea. I'm all for it.

> 
> > @@ -152,8 +152,72 @@
> >   * refault distance will immediately activate the refaulting page.
> >   */
> >  
> > -static void *pack_shadow(unsigned long eviction, struct zone *zone)
> > +#ifdef CONFIG_MEMCG
> > +/*
> > + * On 32-bit there is not much space left in radix tree slot after
> > + * storing information about node, zone, and memory cgroup, so we
> > + * disregard 10 least significant bits of eviction counter. This
> > + * reduces refault distance accuracy to 4MB, which is still fine.
> > + *
> > + * With the default NODE_SHIFT (3) this leaves us 9 bits for storing
> > + * eviction counter, hence maximal refault distance will be 2GB, which
> > + * should be enough for 32-bit systems.
> > + */
> > +#ifdef CONFIG_64BIT
> > +# define REFAULT_DISTANCE_GRANULARITY		0
> > +#else
> > +# define REFAULT_DISTANCE_GRANULARITY		10
> > +#endif
> > +
> > +static unsigned long pack_shadow_memcg(unsigned long eviction,
> > +				       struct mem_cgroup *memcg)
> > +{
> > +	if (mem_cgroup_disabled())
> > +		return eviction;
> > +
> > +	eviction >>= REFAULT_DISTANCE_GRANULARITY;
> > +	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | mem_cgroup_id(memcg);
> > +	return eviction;
> > +}
> > +
> > +static unsigned long unpack_shadow_memcg(unsigned long entry,
> > +					 unsigned long *mask,
> > +					 struct mem_cgroup **memcg)
> > +{
> > +	if (mem_cgroup_disabled()) {
> > +		*memcg = NULL;
> > +		return entry;
> > +	}
> > +
> > +	rcu_read_lock();
> > +	*memcg = mem_cgroup_from_id(entry & MEM_CGROUP_ID_MAX);
> > +	rcu_read_unlock();
> > +
> > +	entry >>= MEM_CGROUP_ID_SHIFT;
> > +	entry <<= REFAULT_DISTANCE_GRANULARITY;
> > +	*mask >>= MEM_CGROUP_ID_SHIFT - REFAULT_DISTANCE_GRANULARITY;
> > +	return entry;
> > +}
> > +#else /* !CONFIG_MEMCG */
> > +static unsigned long pack_shadow_memcg(unsigned long eviction,
> > +				       struct mem_cgroup *memcg)
> > +{
> > +	return eviction;
> > +}
> > +
> > +static unsigned long unpack_shadow_memcg(unsigned long entry,
> > +					 unsigned long *mask,
> > +					 struct mem_cgroup **memcg)
> > +{
> > +	*memcg = NULL;
> > +	return entry;
> > +}
> > +#endif /* CONFIG_MEMCG */
> > +
> > +static void *pack_shadow(unsigned long eviction, struct zone *zone,
> > +			 struct mem_cgroup *memcg)
> >  {
> > +	eviction = pack_shadow_memcg(eviction, memcg);
> >  	eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
> >  	eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
> >  	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
> 
> For !CONFIG_MEMCG, we can define MEMCG_ID_SHIFT to 0 and pass in a
> cssid of 0. That would save much of the special casing here.

But what about the mem_cgroup_disabled() case? Do we want to waste 16
bits there as well?

> 
> > @@ -213,10 +282,16 @@ static void unpack_shadow(void *shadow,
> >  void *workingset_eviction(struct address_space *mapping, struct page *page)
> >  {
> >  	struct zone *zone = page_zone(page);
> > +	struct mem_cgroup *memcg = page_memcg(page);
> > +	struct lruvec *lruvec;
> >  	unsigned long eviction;
> >  
> > -	eviction = atomic_long_inc_return(&zone->inactive_age);
> > -	return pack_shadow(eviction, zone);
> > +	if (!mem_cgroup_disabled())
> > +		mem_cgroup_get(memcg);
> > +
> > +	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> > +	eviction = atomic_long_inc_return(&lruvec->inactive_age);
> > +	return pack_shadow(eviction, zone, memcg);
> 
> I don't think we need to hold a reference to the memcg here, it should
> be enough to verify whether the cssid is still existent upon refault.
> 
> What could theoretically happen then is that the original memcg gets
> deleted, a new one will reuse the same id and then refault the same
> pages. However, there are two things that should make this acceptable:
> 1. a couple pages don't matter in this case, and sharing data between
> cgroups on a large scale already leads to weird accounting artifacts
> in other places; this wouldn't be much different. 2. from a system
> perspective, those pages were in fact recently evicted, so even if we
> activate them by accident in the new cgroup, it wouldn't be entirely
> unreasonable. The workload will shake out what the true active list
> frequency is on its own.
> 
> So I think we can just do away with the reference counting for now,
> and reconsider it should the false sharing in this case create new
> problems that are worse than existing consequences of false sharing.

Memory cgroup creation/destruction are rare events, so I agree that we
can close our eyes on this and not clutter the code with
mem_cgroup_get/put.

> 
> > @@ -230,13 +305,22 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
> >   */
> >  bool workingset_refault(void *shadow)
> >  {
> > -	unsigned long refault_distance;
> > +	unsigned long refault_distance, nr_active;
> >  	struct zone *zone;
> > +	struct mem_cgroup *memcg;
> > +	struct lruvec *lruvec;
> >  
> > -	unpack_shadow(shadow, &zone, &refault_distance);
> > +	unpack_shadow(shadow, &zone, &memcg, &refault_distance);
> >  	inc_zone_state(zone, WORKINGSET_REFAULT);
> >  
> > -	if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
> > +	if (!mem_cgroup_disabled()) {
> > +		lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> > +		nr_active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
> > +		mem_cgroup_put(memcg);
> > +	} else
> > +		nr_active = zone_page_state(zone, NR_ACTIVE_FILE);
> 
> This is basically get_lru_size(), so I reused that instead.

Yeah. In the previous version I did the same.

From a quick glance, I think that, in general, your patch set looks better.

Thanks,
Vladimir
