All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm: Make count list_lru_one::nr_items lockless
@ 2017-09-19 15:06 ` Kirill Tkhai
  0 siblings, 0 replies; 14+ messages in thread
From: Kirill Tkhai @ 2017-09-19 15:06 UTC (permalink / raw)
  To: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin, akpm

During the reclaiming slab of a memcg, shrink_slab iterates
over all registered shrinkers in the system, and tries to count
and consume objects related to the cgroup. In case of memory
pressure, this behaves bad: I observe high system time and
time spent in list_lru_count_one() for many processes on RHEL7
kernel (collected via $perf record --call-graph fp -j k -a):

0,50%  nixstatsagent  [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
0,26%  nixstatsagent  [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
0,23%  nixstatsagent  [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
0,15%  nixstatsagent  [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
0,15%  nixstatsagent  [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2

0,94%  mysqld         [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
0,57%  mysqld         [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
0,51%  mysqld         [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
0,32%  mysqld         [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
0,32%  mysqld         [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2

0,73%  sshd           [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
0,35%  sshd           [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
0,32%  sshd           [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
0,21%  sshd           [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
0,21%  sshd           [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2

This patch aims to make super_cache_count() (and other functions,
which count LRU nr_items) more effective.
It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
__list_lru_count_one() count nr_items lockless to minimize
overhead introduced by locking operation, and to make parallel
reclaims more scalable.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
---
 include/linux/list_lru.h |    3 ++
 mm/list_lru.c            |   59 +++++++++++++++++++++++++++++-----------------
 2 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index fa7fd03cb5f9..a55258100e40 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -31,6 +31,7 @@ struct list_lru_one {
 };
 
 struct list_lru_memcg {
+	struct rcu_head		rcu;
 	/* array of per cgroup lists, indexed by memcg_cache_id */
 	struct list_lru_one	*lru[0];
 };
@@ -42,7 +43,7 @@ struct list_lru_node {
 	struct list_lru_one	lru;
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
 	/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
-	struct list_lru_memcg	*memcg_lrus;
+	struct list_lru_memcg	__rcu *memcg_lrus;
 #endif
 	long nr_items;
 } ____cacheline_aligned_in_smp;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 7a40fa2be858..9fdb24818dae 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -52,14 +52,15 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru)
 static inline struct list_lru_one *
 list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
 {
+	struct list_lru_memcg *memcg_lrus;
 	/*
-	 * The lock protects the array of per cgroup lists from relocation
-	 * (see memcg_update_list_lru_node).
+	 * Either lock or RCU protects the array of per cgroup lists
+	 * from relocation (see memcg_update_list_lru_node).
 	 */
-	lockdep_assert_held(&nlru->lock);
-	if (nlru->memcg_lrus && idx >= 0)
-		return nlru->memcg_lrus->lru[idx];
-
+	memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
+					   lockdep_is_held(&nlru->lock));
+	if (memcg_lrus && idx >= 0)
+		return memcg_lrus->lru[idx];
 	return &nlru->lru;
 }
 
@@ -168,10 +169,10 @@ static unsigned long __list_lru_count_one(struct list_lru *lru,
 	struct list_lru_one *l;
 	unsigned long count;
 
-	spin_lock(&nlru->lock);
+	rcu_read_lock();
 	l = list_lru_from_memcg_idx(nlru, memcg_idx);
 	count = l->nr_items;
-	spin_unlock(&nlru->lock);
+	rcu_read_unlock();
 
 	return count;
 }
@@ -323,24 +324,33 @@ static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
 
 static int memcg_init_list_lru_node(struct list_lru_node *nlru)
 {
+	struct list_lru_memcg *memcg_lrus;
 	int size = memcg_nr_cache_ids;
 
-	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
-	if (!nlru->memcg_lrus)
+	memcg_lrus = kmalloc(sizeof(*memcg_lrus) +
+			     size * sizeof(void *), GFP_KERNEL);
+	if (!memcg_lrus)
 		return -ENOMEM;
 
-	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
-		kfree(nlru->memcg_lrus);
+	if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
+		kfree(memcg_lrus);
 		return -ENOMEM;
 	}
+	RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus);
 
 	return 0;
 }
 
 static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
 {
-	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
-	kfree(nlru->memcg_lrus);
+	struct list_lru_memcg *memcg_lrus;
+	/*
+	 * This is called when shrinker has already been unregistered,
+	 * and nobody can use it. So, there is no need to use kfree_rcu().
+	 */
+	memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
+	__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
+	kfree(memcg_lrus);
 }
 
 static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -350,8 +360,9 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 
 	BUG_ON(old_size > new_size);
 
-	old = nlru->memcg_lrus;
-	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+	old = rcu_dereference_protected(nlru->memcg_lrus,
+					lockdep_is_held(&list_lrus_mutex));
+	new = kmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
@@ -360,29 +371,33 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 		return -ENOMEM;
 	}
 
-	memcpy(new, old, old_size * sizeof(void *));
+	memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
 
 	/*
-	 * The lock guarantees that we won't race with a reader
-	 * (see list_lru_from_memcg_idx).
+	 * The locking below allows readers that hold nlru->lock avoid taking
+	 * rcu_read_lock (see list_lru_from_memcg_idx).
 	 *
 	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
 	 * we have to use IRQ-safe primitives here to avoid deadlock.
 	 */
 	spin_lock_irq(&nlru->lock);
-	nlru->memcg_lrus = new;
+	rcu_assign_pointer(nlru->memcg_lrus, new);
 	spin_unlock_irq(&nlru->lock);
 
-	kfree(old);
+	kfree_rcu(old, rcu);
 	return 0;
 }
 
 static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
 					      int old_size, int new_size)
 {
+	struct list_lru_memcg *memcg_lrus;
+
+	memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus,
+					       lockdep_is_held(&list_lrus_mutex));
 	/* do not bother shrinking the array back to the old size, because we
 	 * cannot handle allocation failures here */
-	__memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
+	__memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size);
 }
 
 static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] mm: Make count list_lru_one::nr_items lockless
@ 2017-09-19 15:06 ` Kirill Tkhai
  0 siblings, 0 replies; 14+ messages in thread
From: Kirill Tkhai @ 2017-09-19 15:06 UTC (permalink / raw)
  To: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin, akpm

During the reclaiming slab of a memcg, shrink_slab iterates
over all registered shrinkers in the system, and tries to count
and consume objects related to the cgroup. In case of memory
pressure, this behaves bad: I observe high system time and
time spent in list_lru_count_one() for many processes on RHEL7
kernel (collected via $perf record --call-graph fp -j k -a):

0,50%  nixstatsagent  [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
0,26%  nixstatsagent  [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
0,23%  nixstatsagent  [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
0,15%  nixstatsagent  [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
0,15%  nixstatsagent  [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2

0,94%  mysqld         [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
0,57%  mysqld         [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
0,51%  mysqld         [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
0,32%  mysqld         [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
0,32%  mysqld         [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2

0,73%  sshd           [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
0,35%  sshd           [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
0,32%  sshd           [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
0,21%  sshd           [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
0,21%  sshd           [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2

This patch aims to make super_cache_count() (and other functions,
which count LRU nr_items) more effective.
It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
__list_lru_count_one() count nr_items lockless to minimize
overhead introduced by locking operation, and to make parallel
reclaims more scalable.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
---
 include/linux/list_lru.h |    3 ++
 mm/list_lru.c            |   59 +++++++++++++++++++++++++++++-----------------
 2 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index fa7fd03cb5f9..a55258100e40 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -31,6 +31,7 @@ struct list_lru_one {
 };
 
 struct list_lru_memcg {
+	struct rcu_head		rcu;
 	/* array of per cgroup lists, indexed by memcg_cache_id */
 	struct list_lru_one	*lru[0];
 };
@@ -42,7 +43,7 @@ struct list_lru_node {
 	struct list_lru_one	lru;
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
 	/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
-	struct list_lru_memcg	*memcg_lrus;
+	struct list_lru_memcg	__rcu *memcg_lrus;
 #endif
 	long nr_items;
 } ____cacheline_aligned_in_smp;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 7a40fa2be858..9fdb24818dae 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -52,14 +52,15 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru)
 static inline struct list_lru_one *
 list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
 {
+	struct list_lru_memcg *memcg_lrus;
 	/*
-	 * The lock protects the array of per cgroup lists from relocation
-	 * (see memcg_update_list_lru_node).
+	 * Either lock or RCU protects the array of per cgroup lists
+	 * from relocation (see memcg_update_list_lru_node).
 	 */
-	lockdep_assert_held(&nlru->lock);
-	if (nlru->memcg_lrus && idx >= 0)
-		return nlru->memcg_lrus->lru[idx];
-
+	memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
+					   lockdep_is_held(&nlru->lock));
+	if (memcg_lrus && idx >= 0)
+		return memcg_lrus->lru[idx];
 	return &nlru->lru;
 }
 
@@ -168,10 +169,10 @@ static unsigned long __list_lru_count_one(struct list_lru *lru,
 	struct list_lru_one *l;
 	unsigned long count;
 
-	spin_lock(&nlru->lock);
+	rcu_read_lock();
 	l = list_lru_from_memcg_idx(nlru, memcg_idx);
 	count = l->nr_items;
-	spin_unlock(&nlru->lock);
+	rcu_read_unlock();
 
 	return count;
 }
@@ -323,24 +324,33 @@ static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
 
 static int memcg_init_list_lru_node(struct list_lru_node *nlru)
 {
+	struct list_lru_memcg *memcg_lrus;
 	int size = memcg_nr_cache_ids;
 
-	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
-	if (!nlru->memcg_lrus)
+	memcg_lrus = kmalloc(sizeof(*memcg_lrus) +
+			     size * sizeof(void *), GFP_KERNEL);
+	if (!memcg_lrus)
 		return -ENOMEM;
 
-	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
-		kfree(nlru->memcg_lrus);
+	if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
+		kfree(memcg_lrus);
 		return -ENOMEM;
 	}
+	RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus);
 
 	return 0;
 }
 
 static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
 {
-	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
-	kfree(nlru->memcg_lrus);
+	struct list_lru_memcg *memcg_lrus;
+	/*
+	 * This is called when shrinker has already been unregistered,
+	 * and nobody can use it. So, there is no need to use kfree_rcu().
+	 */
+	memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
+	__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
+	kfree(memcg_lrus);
 }
 
 static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -350,8 +360,9 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 
 	BUG_ON(old_size > new_size);
 
-	old = nlru->memcg_lrus;
-	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+	old = rcu_dereference_protected(nlru->memcg_lrus,
+					lockdep_is_held(&list_lrus_mutex));
+	new = kmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
@@ -360,29 +371,33 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 		return -ENOMEM;
 	}
 
-	memcpy(new, old, old_size * sizeof(void *));
+	memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
 
 	/*
-	 * The lock guarantees that we won't race with a reader
-	 * (see list_lru_from_memcg_idx).
+	 * The locking below allows readers that hold nlru->lock avoid taking
+	 * rcu_read_lock (see list_lru_from_memcg_idx).
 	 *
 	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
 	 * we have to use IRQ-safe primitives here to avoid deadlock.
 	 */
 	spin_lock_irq(&nlru->lock);
-	nlru->memcg_lrus = new;
+	rcu_assign_pointer(nlru->memcg_lrus, new);
 	spin_unlock_irq(&nlru->lock);
 
-	kfree(old);
+	kfree_rcu(old, rcu);
 	return 0;
 }
 
 static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
 					      int old_size, int new_size)
 {
+	struct list_lru_memcg *memcg_lrus;
+
+	memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus,
+					       lockdep_is_held(&list_lrus_mutex));
 	/* do not bother shrinking the array back to the old size, because we
 	 * cannot handle allocation failures here */
-	__memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
+	__memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size);
 }
 
 static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
  2017-09-19 15:06 ` Kirill Tkhai
@ 2017-09-27 21:15   ` Andrew Morton
  -1 siblings, 0 replies; 14+ messages in thread
From: Andrew Morton @ 2017-09-27 21:15 UTC (permalink / raw)
  To: Kirill Tkhai; +Cc: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin

On Tue, 19 Sep 2017 18:06:33 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:

> During the reclaiming slab of a memcg, shrink_slab iterates
> over all registered shrinkers in the system, and tries to count
> and consume objects related to the cgroup. In case of memory
> pressure, this behaves bad: I observe high system time and
> time spent in list_lru_count_one() for many processes on RHEL7
> kernel (collected via $perf record --call-graph fp -j k -a):
> 
> 0,50%  nixstatsagent  [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
> 0,26%  nixstatsagent  [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
> 0,23%  nixstatsagent  [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
> 0,15%  nixstatsagent  [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
> 0,15%  nixstatsagent  [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
> 
> 0,94%  mysqld         [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
> 0,57%  mysqld         [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
> 0,51%  mysqld         [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
> 0,32%  mysqld         [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
> 0,32%  mysqld         [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
> 
> 0,73%  sshd           [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
> 0,35%  sshd           [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
> 0,32%  sshd           [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
> 0,21%  sshd           [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
> 0,21%  sshd           [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
> 
> This patch aims to make super_cache_count() (and other functions,
> which count LRU nr_items) more effective.
> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
> __list_lru_count_one() count nr_items lockless to minimize
> overhead introduced by locking operation, and to make parallel
> reclaims more scalable.

And...  what were the effects of the patch?  Did you not run the same
performance tests after applying it?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
@ 2017-09-27 21:15   ` Andrew Morton
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Morton @ 2017-09-27 21:15 UTC (permalink / raw)
  To: Kirill Tkhai; +Cc: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin

On Tue, 19 Sep 2017 18:06:33 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:

> During the reclaiming slab of a memcg, shrink_slab iterates
> over all registered shrinkers in the system, and tries to count
> and consume objects related to the cgroup. In case of memory
> pressure, this behaves bad: I observe high system time and
> time spent in list_lru_count_one() for many processes on RHEL7
> kernel (collected via $perf record --call-graph fp -j k -a):
> 
> 0,50%  nixstatsagent  [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
> 0,26%  nixstatsagent  [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
> 0,23%  nixstatsagent  [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
> 0,15%  nixstatsagent  [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
> 0,15%  nixstatsagent  [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
> 
> 0,94%  mysqld         [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
> 0,57%  mysqld         [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
> 0,51%  mysqld         [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
> 0,32%  mysqld         [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
> 0,32%  mysqld         [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
> 
> 0,73%  sshd           [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
> 0,35%  sshd           [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
> 0,32%  sshd           [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
> 0,21%  sshd           [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
> 0,21%  sshd           [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
> 
> This patch aims to make super_cache_count() (and other functions,
> which count LRU nr_items) more effective.
> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
> __list_lru_count_one() count nr_items lockless to minimize
> overhead introduced by locking operation, and to make parallel
> reclaims more scalable.

And...  what were the effects of the patch?  Did you not run the same
performance tests after applying it?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
  2017-09-27 21:15   ` Andrew Morton
@ 2017-09-28  7:48     ` Kirill Tkhai
  -1 siblings, 0 replies; 14+ messages in thread
From: Kirill Tkhai @ 2017-09-28  7:48 UTC (permalink / raw)
  To: Andrew Morton; +Cc: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin

On 28.09.2017 00:15, Andrew Morton wrote:
> On Tue, 19 Sep 2017 18:06:33 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> 
>> During the reclaiming slab of a memcg, shrink_slab iterates
>> over all registered shrinkers in the system, and tries to count
>> and consume objects related to the cgroup. In case of memory
>> pressure, this behaves bad: I observe high system time and
>> time spent in list_lru_count_one() for many processes on RHEL7
>> kernel (collected via $perf record --call-graph fp -j k -a):
>>
>> 0,50%  nixstatsagent  [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
>> 0,26%  nixstatsagent  [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
>> 0,23%  nixstatsagent  [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
>> 0,15%  nixstatsagent  [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
>> 0,15%  nixstatsagent  [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
>>
>> 0,94%  mysqld         [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
>> 0,57%  mysqld         [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
>> 0,51%  mysqld         [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
>> 0,32%  mysqld         [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
>> 0,32%  mysqld         [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
>>
>> 0,73%  sshd           [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
>> 0,35%  sshd           [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
>> 0,32%  sshd           [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
>> 0,21%  sshd           [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
>> 0,21%  sshd           [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
>>
>> This patch aims to make super_cache_count() (and other functions,
>> which count LRU nr_items) more effective.
>> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
>> __list_lru_count_one() count nr_items lockless to minimize
>> overhead introduced by locking operation, and to make parallel
>> reclaims more scalable.
> 
> And...  what were the effects of the patch?  Did you not run the same
> performance tests after applying it?

I've just detected the such high usage of shrink slab on production node. It's rather
difficult to make it use another kernel, than it uses, only kpatches are possible.
So, I haven't estimated how it acts on node's performance.
On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
So, it's a little bit untested in this way.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
@ 2017-09-28  7:48     ` Kirill Tkhai
  0 siblings, 0 replies; 14+ messages in thread
From: Kirill Tkhai @ 2017-09-28  7:48 UTC (permalink / raw)
  To: Andrew Morton; +Cc: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin

On 28.09.2017 00:15, Andrew Morton wrote:
> On Tue, 19 Sep 2017 18:06:33 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> 
>> During the reclaiming slab of a memcg, shrink_slab iterates
>> over all registered shrinkers in the system, and tries to count
>> and consume objects related to the cgroup. In case of memory
>> pressure, this behaves bad: I observe high system time and
>> time spent in list_lru_count_one() for many processes on RHEL7
>> kernel (collected via $perf record --call-graph fp -j k -a):
>>
>> 0,50%  nixstatsagent  [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
>> 0,26%  nixstatsagent  [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
>> 0,23%  nixstatsagent  [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
>> 0,15%  nixstatsagent  [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
>> 0,15%  nixstatsagent  [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
>>
>> 0,94%  mysqld         [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
>> 0,57%  mysqld         [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
>> 0,51%  mysqld         [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
>> 0,32%  mysqld         [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
>> 0,32%  mysqld         [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
>>
>> 0,73%  sshd           [kernel.vmlinux]  [k] _raw_spin_lock                [k] _raw_spin_lock
>> 0,35%  sshd           [kernel.vmlinux]  [k] shrink_slab                   [k] shrink_slab
>> 0,32%  sshd           [kernel.vmlinux]  [k] super_cache_count             [k] super_cache_count
>> 0,21%  sshd           [kernel.vmlinux]  [k] __list_lru_count_one.isra.2   [k] _raw_spin_lock
>> 0,21%  sshd           [kernel.vmlinux]  [k] list_lru_count_one            [k] __list_lru_count_one.isra.2
>>
>> This patch aims to make super_cache_count() (and other functions,
>> which count LRU nr_items) more effective.
>> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
>> __list_lru_count_one() count nr_items lockless to minimize
>> overhead introduced by locking operation, and to make parallel
>> reclaims more scalable.
> 
> And...  what were the effects of the patch?  Did you not run the same
> performance tests after applying it?

I've just detected the such high usage of shrink slab on production node. It's rather
difficult to make it use another kernel, than it uses, only kpatches are possible.
So, I haven't estimated how it acts on node's performance.
On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
So, it's a little bit untested in this way.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
  2017-09-28  7:48     ` Kirill Tkhai
@ 2017-09-28 21:02       ` Andrew Morton
  -1 siblings, 0 replies; 14+ messages in thread
From: Andrew Morton @ 2017-09-28 21:02 UTC (permalink / raw)
  To: Kirill Tkhai; +Cc: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin

On Thu, 28 Sep 2017 10:48:55 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:

> >> This patch aims to make super_cache_count() (and other functions,
> >> which count LRU nr_items) more effective.
> >> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
> >> __list_lru_count_one() count nr_items lockless to minimize
> >> overhead introduced by locking operation, and to make parallel
> >> reclaims more scalable.
> > 
> > And...  what were the effects of the patch?  Did you not run the same
> > performance tests after applying it?
> 
> I've just detected the such high usage of shrink slab on production node. It's rather
> difficult to make it use another kernel, than it uses, only kpatches are possible.
> So, I haven't estimated how it acts on node's performance.
> On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
> So, it's a little bit untested in this way.

Well that's a problem.  The patch increases list_lru.o text size by a
lot (4800->5696) which will have a cost.  And we don't have proof that
any benefit is worth that cost.  It shouldn't be too hard to cook up a
synthetic test to trigger memcg slab reclaim and then run a
before-n-after benchmark?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
@ 2017-09-28 21:02       ` Andrew Morton
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Morton @ 2017-09-28 21:02 UTC (permalink / raw)
  To: Kirill Tkhai; +Cc: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin

On Thu, 28 Sep 2017 10:48:55 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:

> >> This patch aims to make super_cache_count() (and other functions,
> >> which count LRU nr_items) more effective.
> >> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
> >> __list_lru_count_one() count nr_items lockless to minimize
> >> overhead introduced by locking operation, and to make parallel
> >> reclaims more scalable.
> > 
> > And...  what were the effects of the patch?  Did you not run the same
> > performance tests after applying it?
> 
> I've just detected the such high usage of shrink slab on production node. It's rather
> difficult to make it use another kernel, than it uses, only kpatches are possible.
> So, I haven't estimated how it acts on node's performance.
> On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
> So, it's a little bit untested in this way.

Well that's a problem.  The patch increases list_lru.o text size by a
lot (4800->5696) which will have a cost.  And we don't have proof that
any benefit is worth that cost.  It shouldn't be too hard to cook up a
synthetic test to trigger memcg slab reclaim and then run a
before-n-after benchmark?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
  2017-09-28 21:02       ` Andrew Morton
@ 2017-09-29  8:15         ` Kirill Tkhai
  -1 siblings, 0 replies; 14+ messages in thread
From: Kirill Tkhai @ 2017-09-29  8:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin

On 29.09.2017 00:02, Andrew Morton wrote:
> On Thu, 28 Sep 2017 10:48:55 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> 
>>>> This patch aims to make super_cache_count() (and other functions,
>>>> which count LRU nr_items) more effective.
>>>> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
>>>> __list_lru_count_one() count nr_items lockless to minimize
>>>> overhead introduced by locking operation, and to make parallel
>>>> reclaims more scalable.
>>>
>>> And...  what were the effects of the patch?  Did you not run the same
>>> performance tests after applying it?
>>
>> I've just detected the such high usage of shrink slab on production node. It's rather
>> difficult to make it use another kernel, than it uses, only kpatches are possible.
>> So, I haven't estimated how it acts on node's performance.
>> On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
>> So, it's a little bit untested in this way.
> 
> Well that's a problem.  The patch increases list_lru.o text size by a
> lot (4800->5696) which will have a cost.  And we don't have proof that
> any benefit is worth that cost.  It shouldn't be too hard to cook up a
> synthetic test to trigger memcg slab reclaim and then run a
> before-n-after benchmark?

Ok, then, please, ignore this for a while, I'll try to do it a little bit later.

Kirill

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
@ 2017-09-29  8:15         ` Kirill Tkhai
  0 siblings, 0 replies; 14+ messages in thread
From: Kirill Tkhai @ 2017-09-29  8:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: vdavydov.dev, apolyakov, linux-kernel, linux-mm, aryabinin

On 29.09.2017 00:02, Andrew Morton wrote:
> On Thu, 28 Sep 2017 10:48:55 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> 
>>>> This patch aims to make super_cache_count() (and other functions,
>>>> which count LRU nr_items) more effective.
>>>> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
>>>> __list_lru_count_one() count nr_items lockless to minimize
>>>> overhead introduced by locking operation, and to make parallel
>>>> reclaims more scalable.
>>>
>>> And...  what were the effects of the patch?  Did you not run the same
>>> performance tests after applying it?
>>
>> I've just detected the such high usage of shrink slab on production node. It's rather
>> difficult to make it use another kernel, than it uses, only kpatches are possible.
>> So, I haven't estimated how it acts on node's performance.
>> On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
>> So, it's a little bit untested in this way.
> 
> Well that's a problem.  The patch increases list_lru.o text size by a
> lot (4800->5696) which will have a cost.  And we don't have proof that
> any benefit is worth that cost.  It shouldn't be too hard to cook up a
> synthetic test to trigger memcg slab reclaim and then run a
> before-n-after benchmark?

Ok, then, please, ignore this for a while, I'll try to do it a little bit later.

Kirill

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
  2017-09-29  8:15         ` Kirill Tkhai
@ 2017-11-30  0:27           ` Shakeel Butt
  -1 siblings, 0 replies; 14+ messages in thread
From: Shakeel Butt @ 2017-11-30  0:27 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: Andrew Morton, Vladimir Davydov, apolyakov, LKML, Linux MM,
	Andrey Ryabinin

On Fri, Sep 29, 2017 at 1:15 AM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> On 29.09.2017 00:02, Andrew Morton wrote:
>> On Thu, 28 Sep 2017 10:48:55 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>
>>>>> This patch aims to make super_cache_count() (and other functions,
>>>>> which count LRU nr_items) more effective.
>>>>> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
>>>>> __list_lru_count_one() count nr_items lockless to minimize
>>>>> overhead introduced by locking operation, and to make parallel
>>>>> reclaims more scalable.
>>>>
>>>> And...  what were the effects of the patch?  Did you not run the same
>>>> performance tests after applying it?
>>>
>>> I've just detected the such high usage of shrink slab on production node. It's rather
>>> difficult to make it use another kernel, than it uses, only kpatches are possible.
>>> So, I haven't estimated how it acts on node's performance.
>>> On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
>>> So, it's a little bit untested in this way.
>>
>> Well that's a problem.  The patch increases list_lru.o text size by a
>> lot (4800->5696) which will have a cost.  And we don't have proof that
>> any benefit is worth that cost.  It shouldn't be too hard to cook up a
>> synthetic test to trigger memcg slab reclaim and then run a
>> before-n-after benchmark?
>
> Ok, then, please, ignore this for a while, I'll try to do it a little bit later.
>

I rebased this patch on linus tree (replacing kfree_rcu with call_rcu
as there is no kvfree_rcu) and did some experiments. I think the patch
is worth to be included.

Setup: running a fork-bomb in a memcg of 200MiB on a 8GiB and 4 vcpu
VM and recording the trace with 'perf record -g -a'.

The trace without the patch:

+  34.19%     fb.sh  [kernel.kallsyms]  [k] queued_spin_lock_slowpath
+  30.77%     fb.sh  [kernel.kallsyms]  [k] _raw_spin_lock
+   3.53%     fb.sh  [kernel.kallsyms]  [k] list_lru_count_one
+   2.26%     fb.sh  [kernel.kallsyms]  [k] super_cache_count
+   1.68%     fb.sh  [kernel.kallsyms]  [k] shrink_slab
+   0.59%     fb.sh  [kernel.kallsyms]  [k] down_read_trylock
+   0.48%     fb.sh  [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
+   0.38%     fb.sh  [kernel.kallsyms]  [k] shrink_node_memcg
+   0.32%     fb.sh  [kernel.kallsyms]  [k] queue_work_on
+   0.26%     fb.sh  [kernel.kallsyms]  [k] count_shadow_nodes

With the patch:

+   0.16%     swapper  [kernel.kallsyms]    [k] default_idle
+   0.13%     oom_reaper  [kernel.kallsyms]    [k] mutex_spin_on_owner
+   0.05%     perf  [kernel.kallsyms]    [k] copy_user_generic_string
+   0.05%     init.real  [kernel.kallsyms]    [k] wait_consider_task
+   0.05%     kworker/0:0  [kernel.kallsyms]    [k] finish_task_switch
+   0.04%     kworker/2:1  [kernel.kallsyms]    [k] finish_task_switch
+   0.04%     kworker/3:1  [kernel.kallsyms]    [k] finish_task_switch
+   0.04%     kworker/1:0  [kernel.kallsyms]    [k] finish_task_switch
+   0.03%     binary  [kernel.kallsyms]    [k] copy_page


Kirill, can you resend your patch with this info or do you want me
send the rebased patch?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
@ 2017-11-30  0:27           ` Shakeel Butt
  0 siblings, 0 replies; 14+ messages in thread
From: Shakeel Butt @ 2017-11-30  0:27 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: Andrew Morton, Vladimir Davydov, apolyakov, LKML, Linux MM,
	Andrey Ryabinin

On Fri, Sep 29, 2017 at 1:15 AM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> On 29.09.2017 00:02, Andrew Morton wrote:
>> On Thu, 28 Sep 2017 10:48:55 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>
>>>>> This patch aims to make super_cache_count() (and other functions,
>>>>> which count LRU nr_items) more effective.
>>>>> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
>>>>> __list_lru_count_one() count nr_items lockless to minimize
>>>>> overhead introduced by locking operation, and to make parallel
>>>>> reclaims more scalable.
>>>>
>>>> And...  what were the effects of the patch?  Did you not run the same
>>>> performance tests after applying it?
>>>
>>> I've just detected the such high usage of shrink slab on production node. It's rather
>>> difficult to make it use another kernel, than it uses, only kpatches are possible.
>>> So, I haven't estimated how it acts on node's performance.
>>> On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
>>> So, it's a little bit untested in this way.
>>
>> Well that's a problem.  The patch increases list_lru.o text size by a
>> lot (4800->5696) which will have a cost.  And we don't have proof that
>> any benefit is worth that cost.  It shouldn't be too hard to cook up a
>> synthetic test to trigger memcg slab reclaim and then run a
>> before-n-after benchmark?
>
> Ok, then, please, ignore this for a while, I'll try to do it a little bit later.
>

I rebased this patch on linus tree (replacing kfree_rcu with call_rcu
as there is no kvfree_rcu) and did some experiments. I think the patch
is worth to be included.

Setup: running a fork-bomb in a memcg of 200MiB on a 8GiB and 4 vcpu
VM and recording the trace with 'perf record -g -a'.

The trace without the patch:

+  34.19%     fb.sh  [kernel.kallsyms]  [k] queued_spin_lock_slowpath
+  30.77%     fb.sh  [kernel.kallsyms]  [k] _raw_spin_lock
+   3.53%     fb.sh  [kernel.kallsyms]  [k] list_lru_count_one
+   2.26%     fb.sh  [kernel.kallsyms]  [k] super_cache_count
+   1.68%     fb.sh  [kernel.kallsyms]  [k] shrink_slab
+   0.59%     fb.sh  [kernel.kallsyms]  [k] down_read_trylock
+   0.48%     fb.sh  [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
+   0.38%     fb.sh  [kernel.kallsyms]  [k] shrink_node_memcg
+   0.32%     fb.sh  [kernel.kallsyms]  [k] queue_work_on
+   0.26%     fb.sh  [kernel.kallsyms]  [k] count_shadow_nodes

With the patch:

+   0.16%     swapper  [kernel.kallsyms]    [k] default_idle
+   0.13%     oom_reaper  [kernel.kallsyms]    [k] mutex_spin_on_owner
+   0.05%     perf  [kernel.kallsyms]    [k] copy_user_generic_string
+   0.05%     init.real  [kernel.kallsyms]    [k] wait_consider_task
+   0.05%     kworker/0:0  [kernel.kallsyms]    [k] finish_task_switch
+   0.04%     kworker/2:1  [kernel.kallsyms]    [k] finish_task_switch
+   0.04%     kworker/3:1  [kernel.kallsyms]    [k] finish_task_switch
+   0.04%     kworker/1:0  [kernel.kallsyms]    [k] finish_task_switch
+   0.03%     binary  [kernel.kallsyms]    [k] copy_page


Kirill, can you resend your patch with this info or do you want me
send the rebased patch?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
  2017-11-30  0:27           ` Shakeel Butt
@ 2017-11-30 10:36             ` Kirill Tkhai
  -1 siblings, 0 replies; 14+ messages in thread
From: Kirill Tkhai @ 2017-11-30 10:36 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Andrew Morton, Vladimir Davydov, apolyakov, LKML, Linux MM,
	Andrey Ryabinin

On 30.11.2017 03:27, Shakeel Butt wrote:
> On Fri, Sep 29, 2017 at 1:15 AM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>> On 29.09.2017 00:02, Andrew Morton wrote:
>>> On Thu, 28 Sep 2017 10:48:55 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>>
>>>>>> This patch aims to make super_cache_count() (and other functions,
>>>>>> which count LRU nr_items) more effective.
>>>>>> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
>>>>>> __list_lru_count_one() count nr_items lockless to minimize
>>>>>> overhead introduced by locking operation, and to make parallel
>>>>>> reclaims more scalable.
>>>>>
>>>>> And...  what were the effects of the patch?  Did you not run the same
>>>>> performance tests after applying it?
>>>>
>>>> I've just detected the such high usage of shrink slab on production node. It's rather
>>>> difficult to make it use another kernel, than it uses, only kpatches are possible.
>>>> So, I haven't estimated how it acts on node's performance.
>>>> On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
>>>> So, it's a little bit untested in this way.
>>>
>>> Well that's a problem.  The patch increases list_lru.o text size by a
>>> lot (4800->5696) which will have a cost.  And we don't have proof that
>>> any benefit is worth that cost.  It shouldn't be too hard to cook up a
>>> synthetic test to trigger memcg slab reclaim and then run a
>>> before-n-after benchmark?
>>
>> Ok, then, please, ignore this for a while, I'll try to do it a little bit later.
>>
> 
> I rebased this patch on linus tree (replacing kfree_rcu with call_rcu
> as there is no kvfree_rcu) and did some experiments. I think the patch
> is worth to be included.
> 
> Setup: running a fork-bomb in a memcg of 200MiB on a 8GiB and 4 vcpu
> VM and recording the trace with 'perf record -g -a'.
> 
> The trace without the patch:
> 
> +  34.19%     fb.sh  [kernel.kallsyms]  [k] queued_spin_lock_slowpath
> +  30.77%     fb.sh  [kernel.kallsyms]  [k] _raw_spin_lock
> +   3.53%     fb.sh  [kernel.kallsyms]  [k] list_lru_count_one
> +   2.26%     fb.sh  [kernel.kallsyms]  [k] super_cache_count
> +   1.68%     fb.sh  [kernel.kallsyms]  [k] shrink_slab
> +   0.59%     fb.sh  [kernel.kallsyms]  [k] down_read_trylock
> +   0.48%     fb.sh  [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
> +   0.38%     fb.sh  [kernel.kallsyms]  [k] shrink_node_memcg
> +   0.32%     fb.sh  [kernel.kallsyms]  [k] queue_work_on
> +   0.26%     fb.sh  [kernel.kallsyms]  [k] count_shadow_nodes
> 
> With the patch:
> 
> +   0.16%     swapper  [kernel.kallsyms]    [k] default_idle
> +   0.13%     oom_reaper  [kernel.kallsyms]    [k] mutex_spin_on_owner
> +   0.05%     perf  [kernel.kallsyms]    [k] copy_user_generic_string
> +   0.05%     init.real  [kernel.kallsyms]    [k] wait_consider_task
> +   0.05%     kworker/0:0  [kernel.kallsyms]    [k] finish_task_switch
> +   0.04%     kworker/2:1  [kernel.kallsyms]    [k] finish_task_switch
> +   0.04%     kworker/3:1  [kernel.kallsyms]    [k] finish_task_switch
> +   0.04%     kworker/1:0  [kernel.kallsyms]    [k] finish_task_switch
> +   0.03%     binary  [kernel.kallsyms]    [k] copy_page
> 
> 
> Kirill, can you resend your patch with this info or do you want me
> send the rebased patch?

Shakeel, thanks you for the testing! I'll resend the patch as "v2".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Make count list_lru_one::nr_items lockless
@ 2017-11-30 10:36             ` Kirill Tkhai
  0 siblings, 0 replies; 14+ messages in thread
From: Kirill Tkhai @ 2017-11-30 10:36 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Andrew Morton, Vladimir Davydov, apolyakov, LKML, Linux MM,
	Andrey Ryabinin

On 30.11.2017 03:27, Shakeel Butt wrote:
> On Fri, Sep 29, 2017 at 1:15 AM, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>> On 29.09.2017 00:02, Andrew Morton wrote:
>>> On Thu, 28 Sep 2017 10:48:55 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>>
>>>>>> This patch aims to make super_cache_count() (and other functions,
>>>>>> which count LRU nr_items) more effective.
>>>>>> It allows list_lru_node::memcg_lrus to be RCU-accessed, and makes
>>>>>> __list_lru_count_one() count nr_items lockless to minimize
>>>>>> overhead introduced by locking operation, and to make parallel
>>>>>> reclaims more scalable.
>>>>>
>>>>> And...  what were the effects of the patch?  Did you not run the same
>>>>> performance tests after applying it?
>>>>
>>>> I've just detected the such high usage of shrink slab on production node. It's rather
>>>> difficult to make it use another kernel, than it uses, only kpatches are possible.
>>>> So, I haven't estimated how it acts on node's performance.
>>>> On test node I see, that the patch obviously removes raw_spin_lock from perf profile.
>>>> So, it's a little bit untested in this way.
>>>
>>> Well that's a problem.  The patch increases list_lru.o text size by a
>>> lot (4800->5696) which will have a cost.  And we don't have proof that
>>> any benefit is worth that cost.  It shouldn't be too hard to cook up a
>>> synthetic test to trigger memcg slab reclaim and then run a
>>> before-n-after benchmark?
>>
>> Ok, then, please, ignore this for a while, I'll try to do it a little bit later.
>>
> 
> I rebased this patch on linus tree (replacing kfree_rcu with call_rcu
> as there is no kvfree_rcu) and did some experiments. I think the patch
> is worth to be included.
> 
> Setup: running a fork-bomb in a memcg of 200MiB on a 8GiB and 4 vcpu
> VM and recording the trace with 'perf record -g -a'.
> 
> The trace without the patch:
> 
> +  34.19%     fb.sh  [kernel.kallsyms]  [k] queued_spin_lock_slowpath
> +  30.77%     fb.sh  [kernel.kallsyms]  [k] _raw_spin_lock
> +   3.53%     fb.sh  [kernel.kallsyms]  [k] list_lru_count_one
> +   2.26%     fb.sh  [kernel.kallsyms]  [k] super_cache_count
> +   1.68%     fb.sh  [kernel.kallsyms]  [k] shrink_slab
> +   0.59%     fb.sh  [kernel.kallsyms]  [k] down_read_trylock
> +   0.48%     fb.sh  [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
> +   0.38%     fb.sh  [kernel.kallsyms]  [k] shrink_node_memcg
> +   0.32%     fb.sh  [kernel.kallsyms]  [k] queue_work_on
> +   0.26%     fb.sh  [kernel.kallsyms]  [k] count_shadow_nodes
> 
> With the patch:
> 
> +   0.16%     swapper  [kernel.kallsyms]    [k] default_idle
> +   0.13%     oom_reaper  [kernel.kallsyms]    [k] mutex_spin_on_owner
> +   0.05%     perf  [kernel.kallsyms]    [k] copy_user_generic_string
> +   0.05%     init.real  [kernel.kallsyms]    [k] wait_consider_task
> +   0.05%     kworker/0:0  [kernel.kallsyms]    [k] finish_task_switch
> +   0.04%     kworker/2:1  [kernel.kallsyms]    [k] finish_task_switch
> +   0.04%     kworker/3:1  [kernel.kallsyms]    [k] finish_task_switch
> +   0.04%     kworker/1:0  [kernel.kallsyms]    [k] finish_task_switch
> +   0.03%     binary  [kernel.kallsyms]    [k] copy_page
> 
> 
> Kirill, can you resend your patch with this info or do you want me
> send the rebased patch?

Shakeel, thanks you for the testing! I'll resend the patch as "v2".

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-11-30 10:37 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-19 15:06 [PATCH] mm: Make count list_lru_one::nr_items lockless Kirill Tkhai
2017-09-19 15:06 ` Kirill Tkhai
2017-09-27 21:15 ` Andrew Morton
2017-09-27 21:15   ` Andrew Morton
2017-09-28  7:48   ` Kirill Tkhai
2017-09-28  7:48     ` Kirill Tkhai
2017-09-28 21:02     ` Andrew Morton
2017-09-28 21:02       ` Andrew Morton
2017-09-29  8:15       ` Kirill Tkhai
2017-09-29  8:15         ` Kirill Tkhai
2017-11-30  0:27         ` Shakeel Butt
2017-11-30  0:27           ` Shakeel Butt
2017-11-30 10:36           ` Kirill Tkhai
2017-11-30 10:36             ` Kirill Tkhai

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.