* [PATCH] mm: fix vm-scalability regression in cgroup-aware workingset code
From: Johannes Weiner @ 2016-06-22 18:20 UTC
  To: Andrew Morton
  Cc: Ye Xiaolong, Michal Hocko, Vladimir Davydov, linux-mm, cgroups,
	linux-kernel, kernel-team

23047a96d7cf ("mm: workingset: per-cgroup cache thrash detection")
added a page->mem_cgroup lookup to the cache eviction, refault, and
activation paths, as well as locking to the activation path, and the
vm-scalability tests showed a 23% regression. While the test in
question is an artificial worst-case scenario that doesn't occur in
real workloads - reading two sparse files in parallel at full CPU
speed just to hammer the LRU paths - there are still optimizations
that can be made in those paths.

Inline the lookup functions to eliminate calls. Also, page->mem_cgroup
doesn't need to be stabilized when counting an activation; we merely
need to hold the RCU lock to prevent the memcg from being freed.

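In essence, the activation path goes from stabilizing page->mem_cgroup
against the page being moved to a bare RCU read-side section. A
condensed sketch of the before and after - not the literal diff, which
follows below:

	/* before: pins page->mem_cgroup so the page can't switch groups */
	lock_page_memcg(page);
	memcg = page_memcg(page);
	/* ... account the activation ... */
	unlock_page_memcg(page);

	/* after: only keeps the memcg object itself from being freed */
	rcu_read_lock();
	memcg = page_memcg_rcu(page);	/* READ_ONCE(page->mem_cgroup) */
	/* ... account the activation ... */
	rcu_read_unlock();
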
This cuts down on overhead quite a bit:

23047a96d7cfcfca 063f6715e77a7be5770d6081fe
---------------- --------------------------
         %stddev     %change         %stddev
             \          |                \
  21621405 ±  0%     +11.3%   24069657 ±  2%  vm-scalability.throughput

Reported-by: Ye Xiaolong <xiaolong.ye@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/memcontrol.h | 39 ++++++++++++++++++++++++++++++++++++++-
 include/linux/mm.h         |  8 ++++++++
 mm/memcontrol.c            | 39 ---------------------------------------
 mm/workingset.c            | 10 ++++++----
 4 files changed, 52 insertions(+), 44 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a221663687d5..16609ab1a032 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -310,7 +310,44 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
 
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
 
-struct lruvec *mem_cgroup_lruvec(struct pglist_data *, struct mem_cgroup *);
+static inline struct mem_cgroup_per_node *
+mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
+{
+	return memcg->nodeinfo[nid];
+}
+
+/**
+ * mem_cgroup_lruvec - get the lru list vector for a memcg node
+ * @pgdat: node of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
+ *
+ * Returns the lru list vector holding pages for a given @pgdat and @memcg.
+ * This can be the node lruvec, if the memory controller is disabled.
+ */
+static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
+					       struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_per_node *mz;
+	struct lruvec *lruvec;
+
+	if (mem_cgroup_disabled()) {
+		lruvec = node_lruvec(pgdat);
+		goto out;
+	}
+
+	mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->pgdat here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->pgdat != pgdat))
+		lruvec->pgdat = pgdat;
+	return lruvec;
+}
+
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 14de8810d02e..0f7a4d89b52a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -978,11 +978,19 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
 {
 	return page->mem_cgroup;
 }
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+	return READ_ONCE(page->mem_cgroup);
+}
 #else
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
 	return NULL;
 }
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+	return NULL;
+}
 #endif
 
 /*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2b6574a13d0e..603e9b030e46 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -319,12 +319,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 #endif /* !CONFIG_SLOB */
 
-static struct mem_cgroup_per_node *
-mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
-{
-	return memcg->nodeinfo[nid];
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -927,39 +921,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
 /**
- * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
- * @node: node of the wanted lruvec
- * @memcg: memcg of the wanted lruvec
- *
- * Returns the lru list vector holding pages for a given @node or a given
- * @memcg and @zone. This can be the node lruvec, if the memory controller
- * is disabled.
- */
-struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
-				 struct mem_cgroup *memcg)
-{
-	struct mem_cgroup_per_node *mz;
-	struct lruvec *lruvec;
-
-	if (mem_cgroup_disabled()) {
-		lruvec = node_lruvec(pgdat);
-		goto out;
-	}
-
-	mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
-	lruvec = &mz->lruvec;
-out:
-	/*
-	 * Since a node can be onlined after the mem_cgroup was created,
-	 * we have to be prepared to initialize lruvec->zone here;
-	 * and if offlined then reonlined, we need to reinitialize it.
-	 */
-	if (unlikely(lruvec->pgdat != pgdat))
-		lruvec->pgdat = pgdat;
-	return lruvec;
-}
-
-/**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
  * @zone: zone of the page
diff --git a/mm/workingset.c b/mm/workingset.c
index 236493eaf480..d4c864066fde 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -302,9 +302,10 @@ bool workingset_refault(void *shadow)
  */
 void workingset_activation(struct page *page)
 {
+	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
-	lock_page_memcg(page);
+	rcu_read_lock();
 	/*
 	 * Filter non-memcg pages here, e.g. unmap can call
 	 * mark_page_accessed() on VDSO pages.
@@ -312,12 +313,13 @@ void workingset_activation(struct page *page)
 	 * XXX: See workingset_refault() - this should return
 	 * root_mem_cgroup even for !CONFIG_MEMCG.
 	 */
-	if (!mem_cgroup_disabled() && !page_memcg(page))
+	memcg = page_memcg_rcu(page);
+	if (!mem_cgroup_disabled() && !memcg)
 		goto out;
-	lruvec = mem_cgroup_lruvec(page_pgdat(page), page_memcg(page));
+	lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
 	atomic_long_inc(&lruvec->inactive_age);
 out:
-	unlock_page_memcg(page);
+	rcu_read_unlock();
 }
 
 /*
-- 
2.8.3


* [PATCH rebase] mm: fix vm-scalability regression in cgroup-aware workingset code
From: Johannes Weiner @ 2016-06-24 17:51 UTC
  To: Andrew Morton
  Cc: Ye Xiaolong, Michal Hocko, Vladimir Davydov, linux-mm, cgroups,
	linux-kernel, kernel-team

This is a rebased version on top of mmots sans the nodelru stuff.

---

23047a96d7cf ("mm: workingset: per-cgroup cache thrash detection")
added a page->mem_cgroup lookup to the cache eviction, refault, and
activation paths, as well as locking to the activation path, and the
vm-scalability tests showed a 23% regression. While the test in
question is an artificial worst-case scenario that doesn't occur in
real workloads - reading two sparse files in parallel at full CPU
speed just to hammer the LRU paths - there are still optimizations
that can be made in those paths.

Inline the lookup functions to eliminate calls. Also, page->mem_cgroup
doesn't need to be stabilized when counting an activation; we merely
need to hold the RCU lock to prevent the memcg from being freed.

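The inlining half is mechanical: the lookup helpers move out of
mm/memcontrol.c and into the header as static inline functions, so the
compiler can expand them at each call site and drop the call/return
overhead. Schematically - a sketch of the pattern, not the literal diff:

	/* before: declared in the header, defined out-of-line in
	 * mm/memcontrol.c - every lookup pays a function call */
	struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);

	/* after: defined static inline in include/linux/memcontrol.h,
	 * folded into each caller */
	static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
							    struct mem_cgroup *memcg)
	{
		/* ... body as in the hunk below ... */
	}
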
This cuts down on overhead quite a bit:

23047a96d7cfcfca 063f6715e77a7be5770d6081fe
---------------- --------------------------
         %stddev     %change         %stddev
             \          |                \
  21621405 ±  0%     +11.3%   24069657 ±  2%  vm-scalability.throughput

Reported-by: Ye Xiaolong <xiaolong.ye@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/memcontrol.h | 44 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mm.h         |  8 ++++++++
 mm/memcontrol.c            | 42 ------------------------------------------
 mm/workingset.c            | 10 ++++++----
 4 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 71aff733a497..104efa6874db 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,7 @@
 #include <linux/mmzone.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/mm.h>
 
 struct mem_cgroup;
 struct page;
@@ -314,7 +315,48 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
 
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
 
-struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
+static inline struct mem_cgroup_per_zone *
+mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
+{
+	int nid = zone_to_nid(zone);
+	int zid = zone_idx(zone);
+
+	return &memcg->nodeinfo[nid]->zoneinfo[zid];
+}
+
+/**
+ * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
+ * @zone: zone of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
+ *
+ * Returns the lru list vector holding pages for the given @zone and
+ * @memcg.  This can be the global zone lruvec, if the memory controller
+ * is disabled.
+ */
+static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
+						    struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
+
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
+
+	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
+}
+
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c606fe4f9a7f..b21e5f30378e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -973,11 +973,19 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
 {
 	return page->mem_cgroup;
 }
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+	return READ_ONCE(page->mem_cgroup);
+}
 #else
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
 	return NULL;
 }
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+	return NULL;
+}
 #endif
 
 /*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3e8f9e5e9291..40dfca3ef4bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -323,15 +323,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 #endif /* !CONFIG_SLOB */
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
-{
-	int nid = zone_to_nid(zone);
-	int zid = zone_idx(zone);
-
-	return &memcg->nodeinfo[nid]->zoneinfo[zid];
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -944,39 +935,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
 /**
- * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
- * @zone: zone of the wanted lruvec
- * @memcg: memcg of the wanted lruvec
- *
- * Returns the lru list vector holding pages for the given @zone and
- * @mem.  This can be the global zone lruvec, if the memory controller
- * is disabled.
- */
-struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
-				      struct mem_cgroup *memcg)
-{
-	struct mem_cgroup_per_zone *mz;
-	struct lruvec *lruvec;
-
-	if (mem_cgroup_disabled()) {
-		lruvec = &zone->lruvec;
-		goto out;
-	}
-
-	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
-	lruvec = &mz->lruvec;
-out:
-	/*
-	 * Since a node can be onlined after the mem_cgroup was created,
-	 * we have to be prepared to initialize lruvec->zone here;
-	 * and if offlined then reonlined, we need to reinitialize it.
-	 */
-	if (unlikely(lruvec->zone != zone))
-		lruvec->zone = zone;
-	return lruvec;
-}
-
-/**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
  * @zone: zone of the page
diff --git a/mm/workingset.c b/mm/workingset.c
index 8a75f8d2916a..8252de4566e9 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -305,9 +305,10 @@ bool workingset_refault(void *shadow)
  */
 void workingset_activation(struct page *page)
 {
+	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
-	lock_page_memcg(page);
+	rcu_read_lock();
 	/*
 	 * Filter non-memcg pages here, e.g. unmap can call
 	 * mark_page_accessed() on VDSO pages.
@@ -315,12 +316,13 @@ void workingset_activation(struct page *page)
 	 * XXX: See workingset_refault() - this should return
 	 * root_mem_cgroup even for !CONFIG_MEMCG.
 	 */
-	if (!mem_cgroup_disabled() && !page_memcg(page))
+	memcg = page_memcg_rcu(page);
+	if (!mem_cgroup_disabled() && !memcg)
 		goto out;
-	lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+	lruvec = mem_cgroup_zone_lruvec(page_zone(page), memcg);
 	atomic_long_inc(&lruvec->inactive_age);
 out:
-	unlock_page_memcg(page);
+	rcu_read_unlock();
 }
 
 /*
-- 
2.8.3


* Re: [PATCH rebase] mm: fix vm-scalability regression in cgroup-aware workingset code
From: Michal Hocko @ 2016-06-27 13:05 UTC
  To: Johannes Weiner
  Cc: Andrew Morton, Ye Xiaolong, Vladimir Davydov, linux-mm, cgroups,
	linux-kernel, kernel-team

[Sorry for the late reply]

On Fri 24-06-16 13:51:01, Johannes Weiner wrote:
> This is a rebased version on top of mmots sans the nodelru stuff.
> 
> ---
> 
> 23047a96d7cf ("mm: workingset: per-cgroup cache thrash detection")
> added a page->mem_cgroup lookup to the cache eviction, refault, and
> activation paths, as well as locking to the activation path, and the
> vm-scalability tests showed a 23% regression. While the test in
> question is an artificial worst-case scenario that doesn't occur in
> real workloads - reading two sparse files in parallel at full CPU
> speed just to hammer the LRU paths - there are still optimizations
> that can be made in those paths.
> 
> Inline the lookup functions to eliminate calls. Also, page->mem_cgroup
> doesn't need to be stabilized when counting an activation; we merely
> need to hold the RCU lock to prevent the memcg from being freed.
> 
> This cuts down on overhead quite a bit:
> 
> 23047a96d7cfcfca 063f6715e77a7be5770d6081fe
> ---------------- --------------------------
>          %stddev     %change         %stddev
>              \          |                \
>   21621405 ±  0%     +11.3%   24069657 ±  2%  vm-scalability.throughput
> 
> Reported-by: Ye Xiaolong <xiaolong.ye@intel.com>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Michal Hocko <mhocko@suse.com>

Minor note below

> +static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> +{

I guess rcu_read_lock_held() here would be appropriate

> +	return READ_ONCE(page->mem_cgroup);
> +}
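
i.e. something along these lines (just a sketch of what I mean):

	static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
	{
		WARN_ON_ONCE(!rcu_read_lock_held());
		return READ_ONCE(page->mem_cgroup);
	}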
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH rebase] mm: fix vm-scalability regression in cgroup-aware workingset code
From: Johannes Weiner @ 2016-07-07 19:40 UTC
  To: Michal Hocko
  Cc: Andrew Morton, Ye Xiaolong, Vladimir Davydov, linux-mm, cgroups,
	linux-kernel, kernel-team

Hi Michal,

[sorry for the delay, I was traveling with no connectivity]

On Mon, Jun 27, 2016 at 03:05:28PM +0200, Michal Hocko wrote:
> On Fri 24-06-16 13:51:01, Johannes Weiner wrote:
>
> Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!

> Minor note below
> 
> > +static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> > +{
> 
> I guess rcu_read_lock_held() here would be appropriate
> 
> > +	return READ_ONCE(page->mem_cgroup);

Agreed.

Andrew, could you please fold this?

From ed49e364e47c933d84533a0d8bd355831b5ca9f1 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 7 Jul 2016 15:38:26 -0400
Subject: [PATCH] mm: fix vm-scalability regression in cgroup-aware workingset
 code fix

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/mm.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b21e5f30378e..97065e1f0237 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -975,6 +975,7 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
 }
 static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
 {
+	WARN_ON_ONCE(!rcu_read_lock_held());
 	return READ_ONCE(page->mem_cgroup);
 }
 #else
@@ -984,6 +985,7 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
 }
 static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
 {
+	WARN_ON_ONCE(!rcu_read_lock_held());
 	return NULL;
 }
 #endif
-- 
2.9.0
