* [PATCH 1/5] mm: vmscan: make global slab shrink lockless
2023-02-20 9:16 [PATCH 0/5] make slab shrink lockless Qi Zheng
@ 2023-02-20 9:16 ` Qi Zheng
2023-02-20 9:16 ` [PATCH 2/5] mm: vmscan: make memcg " Qi Zheng
` (3 subsequent siblings)
4 siblings, 0 replies; 16+ messages in thread
From: Qi Zheng @ 2023-02-20 9:16 UTC (permalink / raw)
To: akpm, hannes, shakeelb, mhocko, roman.gushchin, muchun.song,
david, shy828301
Cc: tkhai, sultan, dave, penguin-kernel, paulmck, linux-mm,
linux-kernel, Qi Zheng
The shrinker_rwsem is a global lock in the shrinkers subsystem;
it can easily cause blocking in the following cases:
a. the write lock of shrinker_rwsem was held for too long.
For example, there are many memcgs in the system, which
causes some paths to hold locks and traverse it for too
long. (e.g. expand_shrinker_info())
b. the read lock of shrinker_rwsem was held for too long,
and a writer came at this time. Then this writer will be
forced to wait and block all subsequent readers.
For example:
- be scheduled when the read lock of shrinker_rwsem is
held in do_shrink_slab()
- some shrinkers are blocked for too long, like the case
mentioned in the patchset[1].
Therefore, many times in history ([2],[3],[4],[5]), some
people wanted to replace the shrinker_rwsem readers with SRCU,
but they all gave up because SRCU was not unconditionally
enabled.
But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
the SRCU is unconditionally enabled. So it's time to use
SRCU to protect readers who previously held shrinker_rwsem.
[1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
[2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
[3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
[4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
[5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/vmscan.c | 27 +++++++++++----------------
1 file changed, 11 insertions(+), 16 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8..95a3d6ddc6c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
LIST_HEAD(shrinker_list);
DECLARE_RWSEM(shrinker_rwsem);
+DEFINE_SRCU(shrinker_srcu);
#ifdef CONFIG_MEMCG
static int shrinker_nr_max;
@@ -699,7 +700,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
+ list_add_tail_rcu(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
shrinker_debugfs_add(shrinker);
up_write(&shrinker_rwsem);
@@ -753,13 +754,15 @@ void unregister_shrinker(struct shrinker *shrinker)
return;
down_write(&shrinker_rwsem);
- list_del(&shrinker->list);
+ list_del_rcu(&shrinker->list);
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
debugfs_entry = shrinker_debugfs_remove(shrinker);
up_write(&shrinker_rwsem);
+ synchronize_srcu(&shrinker_srcu);
+
debugfs_remove_recursive(debugfs_entry);
kfree(shrinker->nr_deferred);
@@ -779,6 +782,7 @@ void synchronize_shrinkers(void)
{
down_write(&shrinker_rwsem);
up_write(&shrinker_rwsem);
+ synchronize_srcu(&shrinker_srcu);
}
EXPORT_SYMBOL(synchronize_shrinkers);
@@ -989,6 +993,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{
unsigned long ret, freed = 0;
struct shrinker *shrinker;
+ int srcu_idx;
/*
* The root memcg might be allocated even though memcg is disabled
@@ -1000,10 +1005,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
- if (!down_read_trylock(&shrinker_rwsem))
- goto out;
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
- list_for_each_entry(shrinker, &shrinker_list, list) {
+ list_for_each_entry_srcu(shrinker, &shrinker_list, list,
+ srcu_read_lock_held(&shrinker_srcu)) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -1014,19 +1019,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (ret == SHRINK_EMPTY)
ret = 0;
freed += ret;
- /*
- * Bail out if someone want to register a new shrinker to
- * prevent the registration from being stalled for long periods
- * by parallel ongoing shrinking.
- */
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
}
- up_read(&shrinker_rwsem);
-out:
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
cond_resched();
return freed;
}
--
2.20.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-20 9:16 [PATCH 0/5] make slab shrink lockless Qi Zheng
2023-02-20 9:16 ` [PATCH 1/5] mm: vmscan: make global " Qi Zheng
@ 2023-02-20 9:16 ` Qi Zheng
2023-02-21 21:28 ` Kirill Tkhai
2023-02-21 21:43 ` Kirill Tkhai
2023-02-20 9:16 ` [PATCH 3/5] mm: shrinkers: make count and scan in shrinker debugfs lockless Qi Zheng
` (2 subsequent siblings)
4 siblings, 2 replies; 16+ messages in thread
From: Qi Zheng @ 2023-02-20 9:16 UTC (permalink / raw)
To: akpm, hannes, shakeelb, mhocko, roman.gushchin, muchun.song,
david, shy828301
Cc: tkhai, sultan, dave, penguin-kernel, paulmck, linux-mm,
linux-kernel, Qi Zheng
Like global slab shrink, since commit 1cd0bd06093c
("rcu: Remove CONFIG_SRCU"), it's time to use SRCU
to protect readers who previously held shrinker_rwsem.
We can test with the following script:
```
DIR="/root/shrinker/memcg/mnt"
do_create()
{
mkdir /sys/fs/cgroup/memory/test
echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
for i in `seq 0 $1`;
do
mkdir /sys/fs/cgroup/memory/test/$i;
echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
mkdir -p $DIR/$i;
done
}
do_mount()
{
for i in `seq $1 $2`;
do
mount -t tmpfs $i $DIR/$i;
done
}
do_touch()
{
for i in `seq $1 $2`;
do
echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
done
}
do_create 2000
do_mount 0 2000
do_touch 0 1000
```
Before applying:
46.60% [kernel] [k] down_read_trylock
18.70% [kernel] [k] up_read
15.44% [kernel] [k] shrink_slab
4.37% [kernel] [k] _find_next_bit
2.75% [kernel] [k] xa_load
2.07% [kernel] [k] idr_find
1.73% [kernel] [k] do_shrink_slab
1.42% [kernel] [k] shrink_lruvec
0.74% [kernel] [k] shrink_node
0.60% [kernel] [k] list_lru_count_one
After applying:
19.53% [kernel] [k] _find_next_bit
14.63% [kernel] [k] do_shrink_slab
14.58% [kernel] [k] shrink_slab
11.83% [kernel] [k] shrink_lruvec
9.33% [kernel] [k] __blk_flush_plug
6.67% [kernel] [k] mem_cgroup_iter
3.73% [kernel] [k] list_lru_count_one
2.43% [kernel] [k] shrink_node
1.96% [kernel] [k] super_cache_count
1.78% [kernel] [k] __rcu_read_unlock
1.38% [kernel] [k] __srcu_read_lock
1.30% [kernel] [k] xas_descend
We can see that the readers are no longer blocked.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/vmscan.c | 56 ++++++++++++++++++++++++++++++-----------------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 95a3d6ddc6c1..dc47396ecd0e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
+#include <linux/srcu.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
int nid)
{
- return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
- lockdep_is_held(&shrinker_rwsem));
+ return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
+ &shrinker_srcu,
+ lockdep_is_held(&shrinker_rwsem));
+}
+
+static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
+ int nid)
+{
+ return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
+ &shrinker_srcu);
+}
+
+static void free_shrinker_info_rcu(struct rcu_head *head)
+{
+ kvfree(container_of(head, struct shrinker_info, rcu));
}
static int expand_one_shrinker_info(struct mem_cgroup *memcg,
@@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
defer_size - old_defer_size);
rcu_assign_pointer(pn->shrinker_info, new);
- kvfree_rcu(old, rcu);
+ call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
}
return 0;
@@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
+ int srcu_idx;
- rcu_read_lock();
- info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ info = shrinker_info_srcu(memcg, nid);
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
set_bit(shrinker_id, info->map);
- rcu_read_unlock();
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
}
}
@@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
return -ENOSYS;
down_write(&shrinker_rwsem);
- /* This may call shrinker, so it must use down_read_trylock() */
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
@@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}
@@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
- int i, nid;
+ int i, nid, srcu_idx;
long nr;
struct mem_cgroup *parent;
struct shrinker_info *child_info, *parent_info;
@@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
parent = root_mem_cgroup;
/* Prevent from concurrent shrinker_info expand */
- down_read(&shrinker_rwsem);
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
for_each_node(nid) {
- child_info = shrinker_info_protected(memcg, nid);
- parent_info = shrinker_info_protected(parent, nid);
+ child_info = shrinker_info_srcu(memcg, nid);
+ parent_info = shrinker_info_srcu(parent, nid);
for (i = 0; i < shrinker_nr_max; i++) {
nr = atomic_long_read(&child_info->nr_deferred[i]);
atomic_long_add(nr, &parent_info->nr_deferred[i]);
}
}
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
}
static bool cgroup_reclaim(struct scan_control *sc)
@@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
+ int srcu_idx;
int i;
if (!mem_cgroup_online(memcg))
return 0;
- if (!down_read_trylock(&shrinker_rwsem))
- return 0;
-
- info = shrinker_info_protected(memcg, nid);
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ info = shrinker_info_srcu(memcg, nid);
if (unlikely(!info))
goto unlock;
@@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
set_shrinker_bit(memcg, nid, i);
}
freed += ret;
-
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
}
unlock:
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
return freed;
}
#else /* CONFIG_MEMCG */
--
2.20.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-20 9:16 ` [PATCH 2/5] mm: vmscan: make memcg " Qi Zheng
@ 2023-02-21 21:28 ` Kirill Tkhai
2023-02-22 7:32 ` Qi Zheng
2023-02-21 21:43 ` Kirill Tkhai
1 sibling, 1 reply; 16+ messages in thread
From: Kirill Tkhai @ 2023-02-21 21:28 UTC (permalink / raw)
To: Qi Zheng, akpm, hannes, shakeelb, mhocko, roman.gushchin,
muchun.song, david, shy828301
Cc: sultan, dave, penguin-kernel, paulmck, linux-mm, linux-kernel
On 20.02.2023 12:16, Qi Zheng wrote:
> Like global slab shrink, since commit 1cd0bd06093c
> ("rcu: Remove CONFIG_SRCU"), it's time to use SRCU
> to protect readers who previously held shrinker_rwsem.
>
> We can test with the following script:
>
> ```
> DIR="/root/shrinker/memcg/mnt"
>
> do_create()
> {
> mkdir /sys/fs/cgroup/memory/test
> echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> for i in `seq 0 $1`;
> do
> mkdir /sys/fs/cgroup/memory/test/$i;
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> mkdir -p $DIR/$i;
> done
> }
>
> do_mount()
> {
> for i in `seq $1 $2`;
> do
> mount -t tmpfs $i $DIR/$i;
> done
> }
>
> do_touch()
> {
> for i in `seq $1 $2`;
> do
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
> done
> }
>
> do_create 2000
> do_mount 0 2000
> do_touch 0 1000
> ```
>
> Before applying:
>
> 46.60% [kernel] [k] down_read_trylock
> 18.70% [kernel] [k] up_read
> 15.44% [kernel] [k] shrink_slab
> 4.37% [kernel] [k] _find_next_bit
> 2.75% [kernel] [k] xa_load
> 2.07% [kernel] [k] idr_find
> 1.73% [kernel] [k] do_shrink_slab
> 1.42% [kernel] [k] shrink_lruvec
> 0.74% [kernel] [k] shrink_node
> 0.60% [kernel] [k] list_lru_count_one
>
> After applying:
>
> 19.53% [kernel] [k] _find_next_bit
> 14.63% [kernel] [k] do_shrink_slab
> 14.58% [kernel] [k] shrink_slab
> 11.83% [kernel] [k] shrink_lruvec
> 9.33% [kernel] [k] __blk_flush_plug
> 6.67% [kernel] [k] mem_cgroup_iter
> 3.73% [kernel] [k] list_lru_count_one
> 2.43% [kernel] [k] shrink_node
> 1.96% [kernel] [k] super_cache_count
> 1.78% [kernel] [k] __rcu_read_unlock
> 1.38% [kernel] [k] __srcu_read_lock
> 1.30% [kernel] [k] xas_descend
>
> We can see that the readers is no longer blocked.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> mm/vmscan.c | 56 ++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 95a3d6ddc6c1..dc47396ecd0e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -57,6 +57,7 @@
> #include <linux/khugepaged.h>
> #include <linux/rculist_nulls.h>
> #include <linux/random.h>
> +#include <linux/srcu.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
> static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
> int nid)
> {
> - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
> - lockdep_is_held(&shrinker_rwsem));
> + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu,
> + lockdep_is_held(&shrinker_rwsem));
> +}
> +
> +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
> + int nid)
> +{
> + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu);
> +}
> +
> +static void free_shrinker_info_rcu(struct rcu_head *head)
> +{
> + kvfree(container_of(head, struct shrinker_info, rcu));
> }
>
> static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> @@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> defer_size - old_defer_size);
>
> rcu_assign_pointer(pn->shrinker_info, new);
> - kvfree_rcu(old, rcu);
> + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
> }
>
> return 0;
> @@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
> {
> if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
> struct shrinker_info *info;
> + int srcu_idx;
>
> - rcu_read_lock();
> - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> /* Pairs with smp mb in shrink_slab() */
> smp_mb__before_atomic();
> set_bit(shrinker_id, info->map);
> - rcu_read_unlock();
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
> }
>
> @@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> return -ENOSYS;
>
> down_write(&shrinker_rwsem);
> - /* This may call shrinker, so it must use down_read_trylock() */
> id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
> if (id < 0)
> goto unlock;
> @@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
> }
>
> @@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
> }
>
> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> {
> - int i, nid;
> + int i, nid, srcu_idx;
> long nr;
> struct mem_cgroup *parent;
> struct shrinker_info *child_info, *parent_info;
> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> parent = root_mem_cgroup;
>
> /* Prevent from concurrent shrinker_info expand */
> - down_read(&shrinker_rwsem);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
Don't we still have to be protected against parallel expand_one_shrinker_info()?
It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand*
right after we've dereferenced it here.
> for_each_node(nid) {
> - child_info = shrinker_info_protected(memcg, nid);
> - parent_info = shrinker_info_protected(parent, nid);
> + child_info = shrinker_info_srcu(memcg, nid);
> + parent_info = shrinker_info_srcu(parent, nid);
> for (i = 0; i < shrinker_nr_max; i++) {
> nr = atomic_long_read(&child_info->nr_deferred[i]);
> atomic_long_add(nr, &parent_info->nr_deferred[i]);
> }
> }
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
>
> static bool cgroup_reclaim(struct scan_control *sc)
> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> {
> struct shrinker_info *info;
> unsigned long ret, freed = 0;
> + int srcu_idx;
> int i;
>
> if (!mem_cgroup_online(memcg))
> return 0;
>
> - if (!down_read_trylock(&shrinker_rwsem))
> - return 0;
> -
> - info = shrinker_info_protected(memcg, nid);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> if (unlikely(!info))
> goto unlock;
>
> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> set_shrinker_bit(memcg, nid, i);
> }
> freed += ret;
> -
> - if (rwsem_is_contended(&shrinker_rwsem)) {
> - freed = freed ? : 1;
> - break;
> - }
> }
> unlock:
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> return freed;
> }
> #else /* CONFIG_MEMCG */
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-21 21:28 ` Kirill Tkhai
@ 2023-02-22 7:32 ` Qi Zheng
2023-02-22 19:58 ` Kirill Tkhai
0 siblings, 1 reply; 16+ messages in thread
From: Qi Zheng @ 2023-02-22 7:32 UTC (permalink / raw)
To: Kirill Tkhai
Cc: sultan, dave, penguin-kernel, paulmck, linux-mm, linux-kernel,
Andrew Morton, Johannes Weiner, Shakeel Butt, Michal Hocko,
Roman Gushchin, Muchun Song, David Hildenbrand, Yang Shi
On 2023/2/22 05:28, Kirill Tkhai wrote:
> On 20.02.2023 12:16, Qi Zheng wrote:
<...>
>>
>> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>> {
>> - int i, nid;
>> + int i, nid, srcu_idx;
>> long nr;
>> struct mem_cgroup *parent;
>> struct shrinker_info *child_info, *parent_info;
>> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>> parent = root_mem_cgroup;
>>
>> /* Prevent from concurrent shrinker_info expand */
>> - down_read(&shrinker_rwsem);
>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>
> Don't we still have to be protected against parallel expand_one_shrinker_info()?
>
> It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand*
> right after we've dereferenced it here.
Hi Kirill,
Oh, indeed. We may wrongly reparent the child's nr_deferred to the old
parent's nr_deferred (it is about to be freed by call_srcu).
The reparent_shrinker_deferred() will only be called on the offline
path (not a hotspot path), so we may be able to use shrinker_mutex
introduced later for protection. What do you think?
Thanks,
Qi
>
>> for_each_node(nid) {
>> - child_info = shrinker_info_protected(memcg, nid);
>> - parent_info = shrinker_info_protected(parent, nid);
>> + child_info = shrinker_info_srcu(memcg, nid);
>> + parent_info = shrinker_info_srcu(parent, nid);
>> for (i = 0; i < shrinker_nr_max; i++) {
>> nr = atomic_long_read(&child_info->nr_deferred[i]);
>> atomic_long_add(nr, &parent_info->nr_deferred[i]);
>> }
>> }
>> - up_read(&shrinker_rwsem);
>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> }
>>
>> static bool cgroup_reclaim(struct scan_control *sc)
>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> {
>> struct shrinker_info *info;
>> unsigned long ret, freed = 0;
>> + int srcu_idx;
>> int i;
>>
>> if (!mem_cgroup_online(memcg))
>> return 0;
>>
>> - if (!down_read_trylock(&shrinker_rwsem))
>> - return 0;
>> -
>> - info = shrinker_info_protected(memcg, nid);
>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>> + info = shrinker_info_srcu(memcg, nid);
>> if (unlikely(!info))
>> goto unlock;
>>
>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> set_shrinker_bit(memcg, nid, i);
>> }
>> freed += ret;
>> -
>> - if (rwsem_is_contended(&shrinker_rwsem)) {
>> - freed = freed ? : 1;
>> - break;
>> - }
>> }
>> unlock:
>> - up_read(&shrinker_rwsem);
>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> return freed;
>> }
>> #else /* CONFIG_MEMCG */
>
--
Thanks,
Qi
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-22 7:32 ` Qi Zheng
@ 2023-02-22 19:58 ` Kirill Tkhai
2023-02-23 4:36 ` Qi Zheng
0 siblings, 1 reply; 16+ messages in thread
From: Kirill Tkhai @ 2023-02-22 19:58 UTC (permalink / raw)
To: Qi Zheng
Cc: sultan, dave, penguin-kernel, paulmck, linux-mm, linux-kernel,
Andrew Morton, Johannes Weiner, Shakeel Butt, Michal Hocko,
Roman Gushchin, Muchun Song, David Hildenbrand, Yang Shi
On 22.02.2023 10:32, Qi Zheng wrote:
>
>
> On 2023/2/22 05:28, Kirill Tkhai wrote:
>> On 20.02.2023 12:16, Qi Zheng wrote:
> <...>
>>> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>>> {
>>> - int i, nid;
>>> + int i, nid, srcu_idx;
>>> long nr;
>>> struct mem_cgroup *parent;
>>> struct shrinker_info *child_info, *parent_info;
>>> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>>> parent = root_mem_cgroup;
>>> /* Prevent from concurrent shrinker_info expand */
>>> - down_read(&shrinker_rwsem);
>>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>>
>> Don't we still have to be protected against parallel expand_one_shrinker_info()?
>>
>> It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand*
>> right after we've dereferenced it here.
>
> Hi Kirill,
>
> Oh, indeed. We may wrongly reparent the child's nr_deferred to the old
> parent's nr_deferred (it is about to be freed by call_srcu).
>
> The reparent_shrinker_deferred() will only be called on the offline
> path (not a hotspot path), so we may be able to use shrinker_mutex
> introduced later for protection. What do you think?
It looks good to me. One more thing I analyzed is whether we need to
care about two reparent_shrinker_deferred() calls executing in parallel.
Possible, we should leave rwsem there as it was used before..
>>
>>> for_each_node(nid) {
>>> - child_info = shrinker_info_protected(memcg, nid);
>>> - parent_info = shrinker_info_protected(parent, nid);
>>> + child_info = shrinker_info_srcu(memcg, nid);
>>> + parent_info = shrinker_info_srcu(parent, nid);
>>> for (i = 0; i < shrinker_nr_max; i++) {
>>> nr = atomic_long_read(&child_info->nr_deferred[i]);
>>> atomic_long_add(nr, &parent_info->nr_deferred[i]);
>>> }
>>> }
>>> - up_read(&shrinker_rwsem);
>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>> }
>>> static bool cgroup_reclaim(struct scan_control *sc)
>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>> {
>>> struct shrinker_info *info;
>>> unsigned long ret, freed = 0;
>>> + int srcu_idx;
>>> int i;
>>> if (!mem_cgroup_online(memcg))
>>> return 0;
>>> - if (!down_read_trylock(&shrinker_rwsem))
>>> - return 0;
>>> -
>>> - info = shrinker_info_protected(memcg, nid);
>>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>>> + info = shrinker_info_srcu(memcg, nid);
>>> if (unlikely(!info))
>>> goto unlock;
>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>> set_shrinker_bit(memcg, nid, i);
>>> }
>>> freed += ret;
>>> -
>>> - if (rwsem_is_contended(&shrinker_rwsem)) {
>>> - freed = freed ? : 1;
>>> - break;
>>> - }
>>> }
>>> unlock:
>>> - up_read(&shrinker_rwsem);
>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>> return freed;
>>> }
>>> #else /* CONFIG_MEMCG */
>>
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-22 19:58 ` Kirill Tkhai
@ 2023-02-23 4:36 ` Qi Zheng
0 siblings, 0 replies; 16+ messages in thread
From: Qi Zheng @ 2023-02-23 4:36 UTC (permalink / raw)
To: Kirill Tkhai
Cc: sultan, dave, penguin-kernel, paulmck, linux-mm, linux-kernel,
Andrew Morton, Johannes Weiner, Shakeel Butt, Michal Hocko,
Roman Gushchin, Muchun Song, David Hildenbrand, Yang Shi
On 2023/2/23 03:58, Kirill Tkhai wrote:
> On 22.02.2023 10:32, Qi Zheng wrote:
>>
>>
>> On 2023/2/22 05:28, Kirill Tkhai wrote:
>>> On 20.02.2023 12:16, Qi Zheng wrote:
>> <...>
>>>> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>>>> {
>>>> - int i, nid;
>>>> + int i, nid, srcu_idx;
>>>> long nr;
>>>> struct mem_cgroup *parent;
>>>> struct shrinker_info *child_info, *parent_info;
>>>> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>>>> parent = root_mem_cgroup;
>>>> /* Prevent from concurrent shrinker_info expand */
>>>> - down_read(&shrinker_rwsem);
>>>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>
>>> Don't we still have to be protected against parallel expand_one_shrinker_info()?
>>>
>>> It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand*
>>> right after we've dereferenced it here.
>>
>> Hi Kirill,
>>
>> Oh, indeed. We may wrongly reparent the child's nr_deferred to the old
>> parent's nr_deferred (it is about to be freed by call_srcu).
>>
>> The reparent_shrinker_deferred() will only be called on the offline
>> path (not a hotspot path), so we may be able to use shrinker_mutex
>> introduced later for protection. What do you think?
>
> It looks good for me. One more thing I'd analyzed is whether we want to have
> is two reparent_shrinker_deferred() are executing in parallel.
I see that mem_cgroup_css_offline() is already protected by
cgroup_mutex, so maybe shrinker_mutex is enough here. :)
>
> Possible, we should leave rwsem there as it was used before..
>
>>>
>>>> for_each_node(nid) {
>>>> - child_info = shrinker_info_protected(memcg, nid);
>>>> - parent_info = shrinker_info_protected(parent, nid);
>>>> + child_info = shrinker_info_srcu(memcg, nid);
>>>> + parent_info = shrinker_info_srcu(parent, nid);
>>>> for (i = 0; i < shrinker_nr_max; i++) {
>>>> nr = atomic_long_read(&child_info->nr_deferred[i]);
>>>> atomic_long_add(nr, &parent_info->nr_deferred[i]);
>>>> }
>>>> }
>>>> - up_read(&shrinker_rwsem);
>>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>> }
>>>> static bool cgroup_reclaim(struct scan_control *sc)
>>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>> {
>>>> struct shrinker_info *info;
>>>> unsigned long ret, freed = 0;
>>>> + int srcu_idx;
>>>> int i;
>>>> if (!mem_cgroup_online(memcg))
>>>> return 0;
>>>> - if (!down_read_trylock(&shrinker_rwsem))
>>>> - return 0;
>>>> -
>>>> - info = shrinker_info_protected(memcg, nid);
>>>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>> + info = shrinker_info_srcu(memcg, nid);
>>>> if (unlikely(!info))
>>>> goto unlock;
>>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>> set_shrinker_bit(memcg, nid, i);
>>>> }
>>>> freed += ret;
>>>> -
>>>> - if (rwsem_is_contended(&shrinker_rwsem)) {
>>>> - freed = freed ? : 1;
>>>> - break;
>>>> - }
>>>> }
>>>> unlock:
>>>> - up_read(&shrinker_rwsem);
>>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>> return freed;
>>>> }
>>>> #else /* CONFIG_MEMCG */
>>>
>>
>
--
Thanks,
Qi
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-20 9:16 ` [PATCH 2/5] mm: vmscan: make memcg " Qi Zheng
2023-02-21 21:28 ` Kirill Tkhai
@ 2023-02-21 21:43 ` Kirill Tkhai
2023-02-22 8:16 ` Qi Zheng
1 sibling, 1 reply; 16+ messages in thread
From: Kirill Tkhai @ 2023-02-21 21:43 UTC (permalink / raw)
To: Qi Zheng, akpm, hannes, shakeelb, mhocko, roman.gushchin,
muchun.song, david, shy828301
Cc: sultan, dave, penguin-kernel, paulmck, linux-mm, linux-kernel
On 20.02.2023 12:16, Qi Zheng wrote:
> Like global slab shrink, since commit 1cd0bd06093c
> ("rcu: Remove CONFIG_SRCU"), it's time to use SRCU
> to protect readers who previously held shrinker_rwsem.
>
> We can test with the following script:
>
> ```
> DIR="/root/shrinker/memcg/mnt"
>
> do_create()
> {
> mkdir /sys/fs/cgroup/memory/test
> echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> for i in `seq 0 $1`;
> do
> mkdir /sys/fs/cgroup/memory/test/$i;
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> mkdir -p $DIR/$i;
> done
> }
>
> do_mount()
> {
> for i in `seq $1 $2`;
> do
> mount -t tmpfs $i $DIR/$i;
> done
> }
>
> do_touch()
> {
> for i in `seq $1 $2`;
> do
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
> done
> }
>
> do_create 2000
> do_mount 0 2000
> do_touch 0 1000
> ```
>
> Before applying:
>
> 46.60% [kernel] [k] down_read_trylock
> 18.70% [kernel] [k] up_read
> 15.44% [kernel] [k] shrink_slab
> 4.37% [kernel] [k] _find_next_bit
> 2.75% [kernel] [k] xa_load
> 2.07% [kernel] [k] idr_find
> 1.73% [kernel] [k] do_shrink_slab
> 1.42% [kernel] [k] shrink_lruvec
> 0.74% [kernel] [k] shrink_node
> 0.60% [kernel] [k] list_lru_count_one
>
> After applying:
>
> 19.53% [kernel] [k] _find_next_bit
> 14.63% [kernel] [k] do_shrink_slab
> 14.58% [kernel] [k] shrink_slab
> 11.83% [kernel] [k] shrink_lruvec
> 9.33% [kernel] [k] __blk_flush_plug
> 6.67% [kernel] [k] mem_cgroup_iter
> 3.73% [kernel] [k] list_lru_count_one
> 2.43% [kernel] [k] shrink_node
> 1.96% [kernel] [k] super_cache_count
> 1.78% [kernel] [k] __rcu_read_unlock
> 1.38% [kernel] [k] __srcu_read_lock
> 1.30% [kernel] [k] xas_descend
>
> We can see that the readers are no longer blocked.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> mm/vmscan.c | 56 ++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 95a3d6ddc6c1..dc47396ecd0e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -57,6 +57,7 @@
> #include <linux/khugepaged.h>
> #include <linux/rculist_nulls.h>
> #include <linux/random.h>
> +#include <linux/srcu.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
> static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
> int nid)
> {
> - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
> - lockdep_is_held(&shrinker_rwsem));
> + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu,
> + lockdep_is_held(&shrinker_rwsem));
> +}
> +
> +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
> + int nid)
> +{
> + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu);
> +}
> +
> +static void free_shrinker_info_rcu(struct rcu_head *head)
> +{
> + kvfree(container_of(head, struct shrinker_info, rcu));
> }
>
> static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> @@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> defer_size - old_defer_size);
>
> rcu_assign_pointer(pn->shrinker_info, new);
> - kvfree_rcu(old, rcu);
> + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
> }
>
> return 0;
> @@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
> {
> if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
> struct shrinker_info *info;
> + int srcu_idx;
>
> - rcu_read_lock();
> - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> /* Pairs with smp mb in shrink_slab() */
> smp_mb__before_atomic();
> set_bit(shrinker_id, info->map);
> - rcu_read_unlock();
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
> }
>
> @@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> return -ENOSYS;
>
> down_write(&shrinker_rwsem);
> - /* This may call shrinker, so it must use down_read_trylock() */
> id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
> if (id < 0)
> goto unlock;
> @@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
> }
>
> @@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
> }
>
> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> {
> - int i, nid;
> + int i, nid, srcu_idx;
> long nr;
> struct mem_cgroup *parent;
> struct shrinker_info *child_info, *parent_info;
> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> parent = root_mem_cgroup;
>
> /* Prevent from concurrent shrinker_info expand */
> - down_read(&shrinker_rwsem);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> for_each_node(nid) {
> - child_info = shrinker_info_protected(memcg, nid);
> - parent_info = shrinker_info_protected(parent, nid);
> + child_info = shrinker_info_srcu(memcg, nid);
> + parent_info = shrinker_info_srcu(parent, nid);
> for (i = 0; i < shrinker_nr_max; i++) {
> nr = atomic_long_read(&child_info->nr_deferred[i]);
> atomic_long_add(nr, &parent_info->nr_deferred[i]);
> }
> }
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
>
> static bool cgroup_reclaim(struct scan_control *sc)
> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> {
> struct shrinker_info *info;
> unsigned long ret, freed = 0;
> + int srcu_idx;
> int i;
>
> if (!mem_cgroup_online(memcg))
> return 0;
>
> - if (!down_read_trylock(&shrinker_rwsem))
> - return 0;
> -
> - info = shrinker_info_protected(memcg, nid);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> if (unlikely(!info))
> goto unlock;
There is shrinker_nr_max dereference under this hunk. It's not in the patch:
for_each_set_bit(i, info->map, shrinker_nr_max) {
Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :(
It looks like we should save size of info->map as a new member of struct shrinker_info.
>
> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> set_shrinker_bit(memcg, nid, i);
> }
> freed += ret;
> -
> - if (rwsem_is_contended(&shrinker_rwsem)) {
> - freed = freed ? : 1;
> - break;
> - }
> }
> unlock:
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> return freed;
> }
> #else /* CONFIG_MEMCG */
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-21 21:43 ` Kirill Tkhai
@ 2023-02-22 8:16 ` Qi Zheng
2023-02-22 8:21 ` Qi Zheng
0 siblings, 1 reply; 16+ messages in thread
From: Qi Zheng @ 2023-02-22 8:16 UTC (permalink / raw)
To: Kirill Tkhai
Cc: sultan, dave, penguin-kernel, paulmck, linux-mm, linux-kernel,
Andrew Morton, Johannes Weiner, Shakeel Butt, Michal Hocko,
Roman Gushchin, Muchun Song, David Hildenbrand, Yang Shi
Hi Kirill,
On 2023/2/22 05:43, Kirill Tkhai wrote:
> On 20.02.2023 12:16, Qi Zheng wrote:
>> Like global slab shrink, since commit 1cd0bd06093c<...>
>> static bool cgroup_reclaim(struct scan_control *sc)
>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> {
>> struct shrinker_info *info;
>> unsigned long ret, freed = 0;
>> + int srcu_idx;
>> int i;
>>
>> if (!mem_cgroup_online(memcg))
>> return 0;
>>
>> - if (!down_read_trylock(&shrinker_rwsem))
>> - return 0;
>> -
>> - info = shrinker_info_protected(memcg, nid);
>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>> + info = shrinker_info_srcu(memcg, nid);
>> if (unlikely(!info))
>> goto unlock;
>
> There is shrinker_nr_max dereference under this hunk. It's not in the patch:
>
> for_each_set_bit(i, info->map, shrinker_nr_max) {
>
> Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :(
Oh, indeed.
>
> It looks like we should save size of info->map as a new member of struct shrinker_info.
Agree, then we only traverse info->map_size each time. Like below:
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6eda2ab205d..f1b5d2803007 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -97,6 +97,7 @@ struct shrinker_info {
struct rcu_head rcu;
atomic_long_t *nr_deferred;
unsigned long *map;
+ int map_size;
};
struct lruvec_stats_percpu {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f94bfe540675..dd07eb107915 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head
*head)
kvfree(container_of(head, struct shrinker_info, rcu));
}
-static int expand_one_shrinker_info(struct mem_cgroup *memcg,
- int map_size, int defer_size,
- int old_map_size, int old_defer_size)
+static int expand_one_shrinker_info(struct mem_cgroup *memcg, int
new_nr_max,
+ int old_nr_max)
{
+ int map_size, defer_size, old_map_size, old_defer_size;
struct shrinker_info *new, *old;
struct mem_cgroup_per_node *pn;
int nid;
- int size = map_size + defer_size;
+ int size;
+
+ map_size = shrinker_map_size(new_nr_max);
+ defer_size = shrinker_defer_size(new_nr_max);
+ old_map_size = shrinker_map_size(shrinker_nr_max);
+ old_defer_size = shrinker_defer_size(shrinker_nr_max);
+ size = map_size + defer_size;
for_each_node(nid) {
pn = memcg->nodeinfo[nid];
@@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct
mem_cgroup *memcg,
new->nr_deferred = (atomic_long_t *)(new + 1);
new->map = (void *)new->nr_deferred + defer_size;
+ new->map_size = new_nr_max;
/* map: set all old bits, clear all new bits */
memset(new->map, (int)0xff, old_map_size);
@@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
}
info->nr_deferred = (atomic_long_t *)(info + 1);
info->map = (void *)info->nr_deferred + defer_size;
+ info->map_size = shrinker_nr_max;
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info,
info);
}
mutex_unlock(&shrinker_mutex);
@@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id)
{
int ret = 0;
int new_nr_max = new_id + 1;
- int map_size, defer_size = 0;
- int old_map_size, old_defer_size = 0;
struct mem_cgroup *memcg;
if (!need_expand(new_nr_max))
@@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id)
lockdep_assert_held(&shrinker_mutex);
- map_size = shrinker_map_size(new_nr_max);
- defer_size = shrinker_defer_size(new_nr_max);
- old_map_size = shrinker_map_size(shrinker_nr_max);
- old_defer_size = shrinker_defer_size(shrinker_nr_max);
-
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- ret = expand_one_shrinker_info(memcg, map_size, defer_size,
- old_map_size,
old_defer_size);
+ ret = expand_one_shrinker_info(memcg, new_nr_max,
shrinker_nr_max);
if (ret) {
mem_cgroup_iter_break(NULL, memcg);
goto out;
@@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t
gfp_mask, int nid,
if (unlikely(!info))
goto unlock;
- for_each_set_bit(i, info->map, shrinker_nr_max) {
+ for_each_set_bit(i, info->map, info->map_size) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
I will send the v2.
Thanks,
Qi
>
>>
>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> set_shrinker_bit(memcg, nid, i);
>> }
>> freed += ret;
>> -
>> - if (rwsem_is_contended(&shrinker_rwsem)) {
>> - freed = freed ? : 1;
>> - break;
>> - }
>> }
>> unlock:
>> - up_read(&shrinker_rwsem);
>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> return freed;
>> }
>> #else /* CONFIG_MEMCG */
>
--
Thanks,
Qi
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-22 8:16 ` Qi Zheng
@ 2023-02-22 8:21 ` Qi Zheng
2023-02-22 20:05 ` Kirill Tkhai
0 siblings, 1 reply; 16+ messages in thread
From: Qi Zheng @ 2023-02-22 8:21 UTC (permalink / raw)
To: Kirill Tkhai
Cc: sultan, dave, penguin-kernel, paulmck, linux-mm, linux-kernel,
Andrew Morton, Johannes Weiner, Shakeel Butt, Michal Hocko,
Roman Gushchin, Muchun Song, David Hildenbrand, Yang Shi
On 2023/2/22 16:16, Qi Zheng wrote:
> Hi Kirill,
>
> On 2023/2/22 05:43, Kirill Tkhai wrote:
>> On 20.02.2023 12:16, Qi Zheng wrote:
>>> Like global slab shrink, since commit 1cd0bd06093c<...>
>>> static bool cgroup_reclaim(struct scan_control *sc)
>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t
>>> gfp_mask, int nid,
>>> {
>>> struct shrinker_info *info;
>>> unsigned long ret, freed = 0;
>>> + int srcu_idx;
>>> int i;
>>> if (!mem_cgroup_online(memcg))
>>> return 0;
>>> - if (!down_read_trylock(&shrinker_rwsem))
>>> - return 0;
>>> -
>>> - info = shrinker_info_protected(memcg, nid);
>>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>>> + info = shrinker_info_srcu(memcg, nid);
>>> if (unlikely(!info))
>>> goto unlock;
>>
>> There is shrinker_nr_max dereference under this hunk. It's not in the
>> patch:
>>
>> for_each_set_bit(i, info->map, shrinker_nr_max) {
>>
>> Since shrinker_nr_max may grow in parallel, this leads to access
>> beyond allocated memory :(
>
> Oh, indeed.
>
>>
>> It looks like we should save size of info->map as a new member of
>> struct shrinker_info.
>
> Agree, then we only traverse info->map_size each time. Like below:
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index b6eda2ab205d..f1b5d2803007 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -97,6 +97,7 @@ struct shrinker_info {
> struct rcu_head rcu;
> atomic_long_t *nr_deferred;
> unsigned long *map;
> + int map_size;
> };
>
> struct lruvec_stats_percpu {
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index f94bfe540675..dd07eb107915 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head
> *head)
> kvfree(container_of(head, struct shrinker_info, rcu));
> }
>
> -static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> - int map_size, int defer_size,
> - int old_map_size, int old_defer_size)
> +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int
> new_nr_max,
> + int old_nr_max)
> {
> + int map_size, defer_size, old_map_size, old_defer_size;
> struct shrinker_info *new, *old;
> struct mem_cgroup_per_node *pn;
> int nid;
> - int size = map_size + defer_size;
> + int size;
> +
> + map_size = shrinker_map_size(new_nr_max);
> + defer_size = shrinker_defer_size(new_nr_max);
> + old_map_size = shrinker_map_size(shrinker_nr_max);
> + old_defer_size = shrinker_defer_size(shrinker_nr_max);
Perhaps these should still be calculated outside the loop as before.
> + size = map_size + defer_size;
>
> for_each_node(nid) {
> pn = memcg->nodeinfo[nid];
> @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct
> mem_cgroup *memcg,
>
> new->nr_deferred = (atomic_long_t *)(new + 1);
> new->map = (void *)new->nr_deferred + defer_size;
> + new->map_size = new_nr_max;
>
> /* map: set all old bits, clear all new bits */
> memset(new->map, (int)0xff, old_map_size);
> @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
> }
> info->nr_deferred = (atomic_long_t *)(info + 1);
> info->map = (void *)info->nr_deferred + defer_size;
> + info->map_size = shrinker_nr_max;
> rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info,
> info);
> }
> mutex_unlock(&shrinker_mutex);
> @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id)
> {
> int ret = 0;
> int new_nr_max = new_id + 1;
> - int map_size, defer_size = 0;
> - int old_map_size, old_defer_size = 0;
> struct mem_cgroup *memcg;
>
> if (!need_expand(new_nr_max))
> @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id)
>
> lockdep_assert_held(&shrinker_mutex);
>
> - map_size = shrinker_map_size(new_nr_max);
> - defer_size = shrinker_defer_size(new_nr_max);
> - old_map_size = shrinker_map_size(shrinker_nr_max);
> - old_defer_size = shrinker_defer_size(shrinker_nr_max);
> -
> memcg = mem_cgroup_iter(NULL, NULL, NULL);
> do {
> - ret = expand_one_shrinker_info(memcg, map_size, defer_size,
> - old_map_size,
> old_defer_size);
> + ret = expand_one_shrinker_info(memcg, new_nr_max,
> shrinker_nr_max);
> if (ret) {
> mem_cgroup_iter_break(NULL, memcg);
> goto out;
> @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t
> gfp_mask, int nid,
> if (unlikely(!info))
> goto unlock;
>
> - for_each_set_bit(i, info->map, shrinker_nr_max) {
> + for_each_set_bit(i, info->map, info->map_size) {
> struct shrink_control sc = {
> .gfp_mask = gfp_mask,
> .nid = nid,
>
> I will send the v2.
>
> Thanks,
> Qi
>
>>
>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t
>>> gfp_mask, int nid,
>>> set_shrinker_bit(memcg, nid, i);
>>> }
>>> freed += ret;
>>> -
>>> - if (rwsem_is_contended(&shrinker_rwsem)) {
>>> - freed = freed ? : 1;
>>> - break;
>>> - }
>>> }
>>> unlock:
>>> - up_read(&shrinker_rwsem);
>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>> return freed;
>>> }
>>> #else /* CONFIG_MEMCG */
>>
>
--
Thanks,
Qi
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-22 8:21 ` Qi Zheng
@ 2023-02-22 20:05 ` Kirill Tkhai
2023-02-23 4:37 ` Qi Zheng
0 siblings, 1 reply; 16+ messages in thread
From: Kirill Tkhai @ 2023-02-22 20:05 UTC (permalink / raw)
To: Qi Zheng
Cc: sultan, dave, penguin-kernel, paulmck, linux-mm, linux-kernel,
Andrew Morton, Johannes Weiner, Shakeel Butt, Michal Hocko,
Roman Gushchin, Muchun Song, David Hildenbrand, Yang Shi
On 22.02.2023 11:21, Qi Zheng wrote:
>
>
> On 2023/2/22 16:16, Qi Zheng wrote:
>> Hi Kirill,
>>
>> On 2023/2/22 05:43, Kirill Tkhai wrote:
>>> On 20.02.2023 12:16, Qi Zheng wrote:
>>>> Like global slab shrink, since commit 1cd0bd06093c<...>
>>>> static bool cgroup_reclaim(struct scan_control *sc)
>>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>> {
>>>> struct shrinker_info *info;
>>>> unsigned long ret, freed = 0;
>>>> + int srcu_idx;
>>>> int i;
>>>> if (!mem_cgroup_online(memcg))
>>>> return 0;
>>>> - if (!down_read_trylock(&shrinker_rwsem))
>>>> - return 0;
>>>> -
>>>> - info = shrinker_info_protected(memcg, nid);
>>>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>> + info = shrinker_info_srcu(memcg, nid);
>>>> if (unlikely(!info))
>>>> goto unlock;
>>>
>>> There is shrinker_nr_max dereference under this hunk. It's not in the patch:
>>>
>>> for_each_set_bit(i, info->map, shrinker_nr_max) {
>>>
>>> Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :(
>>
>> Oh, indeed.
>>
>>>
>>> It looks like we should save size of info->map as a new member of struct shrinker_info.
>>
>> Agree, then we only traverse info->map_size each time. Like below:
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index b6eda2ab205d..f1b5d2803007 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -97,6 +97,7 @@ struct shrinker_info {
>> struct rcu_head rcu;
>> atomic_long_t *nr_deferred;
>> unsigned long *map;
>> + int map_size;
Sure, like this. The only thing (after rethinking) I want to say is that although "size" was
my suggestion, it now makes me think that the name "size" implies a size in bytes.
Possibly, something like map_nr_max would be better here.
>> };
>>
>> struct lruvec_stats_percpu {
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index f94bfe540675..dd07eb107915 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head)
>> kvfree(container_of(head, struct shrinker_info, rcu));
>> }
>>
>> -static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>> - int map_size, int defer_size,
>> - int old_map_size, int old_defer_size)
>> +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max,
>> + int old_nr_max)
>> {
>> + int map_size, defer_size, old_map_size, old_defer_size;
>> struct shrinker_info *new, *old;
>> struct mem_cgroup_per_node *pn;
>> int nid;
>> - int size = map_size + defer_size;
>> + int size;
>> +
>> + map_size = shrinker_map_size(new_nr_max);
>> + defer_size = shrinker_defer_size(new_nr_max);
>> + old_map_size = shrinker_map_size(shrinker_nr_max);
>> + old_defer_size = shrinker_defer_size(shrinker_nr_max);
>
> Perhaps these should still be calculated outside the loop as before.
Yeah, for me it's also better.
>> + size = map_size + defer_size;
>>
>> for_each_node(nid) {
>> pn = memcg->nodeinfo[nid];
>> @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>>
>> new->nr_deferred = (atomic_long_t *)(new + 1);
>> new->map = (void *)new->nr_deferred + defer_size;
>> + new->map_size = new_nr_max;
>>
>> /* map: set all old bits, clear all new bits */
>> memset(new->map, (int)0xff, old_map_size);
>> @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
>> }
>> info->nr_deferred = (atomic_long_t *)(info + 1);
>> info->map = (void *)info->nr_deferred + defer_size;
>> + info->map_size = shrinker_nr_max;
>> rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
>> }
>> mutex_unlock(&shrinker_mutex);
>> @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id)
>> {
>> int ret = 0;
>> int new_nr_max = new_id + 1;
>> - int map_size, defer_size = 0;
>> - int old_map_size, old_defer_size = 0;
>> struct mem_cgroup *memcg;
>>
>> if (!need_expand(new_nr_max))
>> @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id)
>>
>> lockdep_assert_held(&shrinker_mutex);
>>
>> - map_size = shrinker_map_size(new_nr_max);
>> - defer_size = shrinker_defer_size(new_nr_max);
>> - old_map_size = shrinker_map_size(shrinker_nr_max);
>> - old_defer_size = shrinker_defer_size(shrinker_nr_max);
>> -
>> memcg = mem_cgroup_iter(NULL, NULL, NULL);
>> do {
>> - ret = expand_one_shrinker_info(memcg, map_size, defer_size,
>> - old_map_size, old_defer_size);
>> + ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max);
>> if (ret) {
>> mem_cgroup_iter_break(NULL, memcg);
>> goto out;
>> @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> if (unlikely(!info))
>> goto unlock;
>>
>> - for_each_set_bit(i, info->map, shrinker_nr_max) {
>> + for_each_set_bit(i, info->map, info->map_size) {
>> struct shrink_control sc = {
>> .gfp_mask = gfp_mask,
>> .nid = nid,
>>
>> I will send the v2.
>>
>> Thanks,
>> Qi
>>
>>>
>>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>> set_shrinker_bit(memcg, nid, i);
>>>> }
>>>> freed += ret;
>>>> -
>>>> - if (rwsem_is_contended(&shrinker_rwsem)) {
>>>> - freed = freed ? : 1;
>>>> - break;
>>>> - }
>>>> }
>>>> unlock:
>>>> - up_read(&shrinker_rwsem);
>>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>> return freed;
>>>> }
>>>> #else /* CONFIG_MEMCG */
>>>
>>
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
2023-02-22 20:05 ` Kirill Tkhai
@ 2023-02-23 4:37 ` Qi Zheng
0 siblings, 0 replies; 16+ messages in thread
From: Qi Zheng @ 2023-02-23 4:37 UTC (permalink / raw)
To: Kirill Tkhai
Cc: sultan, dave, penguin-kernel, paulmck, linux-mm, linux-kernel,
Andrew Morton, Johannes Weiner, Shakeel Butt, Michal Hocko,
Roman Gushchin, Muchun Song, David Hildenbrand, Yang Shi
On 2023/2/23 04:05, Kirill Tkhai wrote:
> On 22.02.2023 11:21, Qi Zheng wrote:
>>
>>
>> On 2023/2/22 16:16, Qi Zheng wrote:
>>> Hi Kirill,
>>>
>>> On 2023/2/22 05:43, Kirill Tkhai wrote:
>>>> On 20.02.2023 12:16, Qi Zheng wrote:
>>>>> Like global slab shrink, since commit 1cd0bd06093c<...>
>>>>> static bool cgroup_reclaim(struct scan_control *sc)
>>>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>> {
>>>>> struct shrinker_info *info;
>>>>> unsigned long ret, freed = 0;
>>>>> + int srcu_idx;
>>>>> int i;
>>>>> if (!mem_cgroup_online(memcg))
>>>>> return 0;
>>>>> - if (!down_read_trylock(&shrinker_rwsem))
>>>>> - return 0;
>>>>> -
>>>>> - info = shrinker_info_protected(memcg, nid);
>>>>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>> + info = shrinker_info_srcu(memcg, nid);
>>>>> if (unlikely(!info))
>>>>> goto unlock;
>>>>
>>>> There is shrinker_nr_max dereference under this hunk. It's not in the patch:
>>>>
>>>> for_each_set_bit(i, info->map, shrinker_nr_max) {
>>>>
>>>> Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :(
>>>
>>> Oh, indeed.
>>>
>>>>
>>>> It looks like we should save size of info->map as a new member of struct shrinker_info.
>>>
>>> Agree, then we only traverse info->map_size each time. Like below:
>>>
>>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>> index b6eda2ab205d..f1b5d2803007 100644
>>> --- a/include/linux/memcontrol.h
>>> +++ b/include/linux/memcontrol.h
>>> @@ -97,6 +97,7 @@ struct shrinker_info {
>>> struct rcu_head rcu;
>>> atomic_long_t *nr_deferred;
>>> unsigned long *map;
>>> + int map_size;
>
> Sure, like this. The only thing (after rethinking) I want to say is that despite "size" was
> may suggestion, now it makes me think that name "size" is about size in bytes.
>
> Possible, something like map_nr_max would be better here.
Agree. Will change to it.
>
>>> };
>>>
>>> struct lruvec_stats_percpu {
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index f94bfe540675..dd07eb107915 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head)
>>> kvfree(container_of(head, struct shrinker_info, rcu));
>>> }
>>>
>>> -static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>>> - int map_size, int defer_size,
>>> - int old_map_size, int old_defer_size)
>>> +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max,
>>> + int old_nr_max)
>>> {
>>> + int map_size, defer_size, old_map_size, old_defer_size;
>>> struct shrinker_info *new, *old;
>>> struct mem_cgroup_per_node *pn;
>>> int nid;
>>> - int size = map_size + defer_size;
>>> + int size;
>>> +
>>> + map_size = shrinker_map_size(new_nr_max);
>>> + defer_size = shrinker_defer_size(new_nr_max);
>>> + old_map_size = shrinker_map_size(shrinker_nr_max);
>>> + old_defer_size = shrinker_defer_size(shrinker_nr_max);
>>
>> Perhaps these should still be calculated outside the loop as before.
>
> Yeah, for me it's also better.
>
>>> + size = map_size + defer_size;
>>>
>>> for_each_node(nid) {
>>> pn = memcg->nodeinfo[nid];
>>> @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>>>
>>> new->nr_deferred = (atomic_long_t *)(new + 1);
>>> new->map = (void *)new->nr_deferred + defer_size;
>>> + new->map_size = new_nr_max;
>>>
>>> /* map: set all old bits, clear all new bits */
>>> memset(new->map, (int)0xff, old_map_size);
>>> @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
>>> }
>>> info->nr_deferred = (atomic_long_t *)(info + 1);
>>> info->map = (void *)info->nr_deferred + defer_size;
>>> + info->map_size = shrinker_nr_max;
>>> rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
>>> }
>>> mutex_unlock(&shrinker_mutex);
>>> @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id)
>>> {
>>> int ret = 0;
>>> int new_nr_max = new_id + 1;
>>> - int map_size, defer_size = 0;
>>> - int old_map_size, old_defer_size = 0;
>>> struct mem_cgroup *memcg;
>>>
>>> if (!need_expand(new_nr_max))
>>> @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id)
>>>
>>> lockdep_assert_held(&shrinker_mutex);
>>>
>>> - map_size = shrinker_map_size(new_nr_max);
>>> - defer_size = shrinker_defer_size(new_nr_max);
>>> - old_map_size = shrinker_map_size(shrinker_nr_max);
>>> - old_defer_size = shrinker_defer_size(shrinker_nr_max);
>>> -
>>> memcg = mem_cgroup_iter(NULL, NULL, NULL);
>>> do {
>>> - ret = expand_one_shrinker_info(memcg, map_size, defer_size,
>>> - old_map_size, old_defer_size);
>>> + ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max);
>>> if (ret) {
>>> mem_cgroup_iter_break(NULL, memcg);
>>> goto out;
>>> @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>> if (unlikely(!info))
>>> goto unlock;
>>>
>>> - for_each_set_bit(i, info->map, shrinker_nr_max) {
>>> + for_each_set_bit(i, info->map, info->map_size) {
>>> struct shrink_control sc = {
>>> .gfp_mask = gfp_mask,
>>> .nid = nid,
>>>
>>> I will send the v2.
>>>
>>> Thanks,
>>> Qi
>>>
>>>>
>>>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>> set_shrinker_bit(memcg, nid, i);
>>>>> }
>>>>> freed += ret;
>>>>> -
>>>>> - if (rwsem_is_contended(&shrinker_rwsem)) {
>>>>> - freed = freed ? : 1;
>>>>> - break;
>>>>> - }
>>>>> }
>>>>> unlock:
>>>>> - up_read(&shrinker_rwsem);
>>>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>> return freed;
>>>>> }
>>>>> #else /* CONFIG_MEMCG */
>>>>
>>>
>>
>
--
Thanks,
Qi
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 3/5] mm: shrinkers: make count and scan in shrinker debugfs lockless
2023-02-20 9:16 [PATCH 0/5] make slab shrink lockless Qi Zheng
2023-02-20 9:16 ` [PATCH 1/5] mm: vmscan: make global " Qi Zheng
2023-02-20 9:16 ` [PATCH 2/5] mm: vmscan: make memcg " Qi Zheng
@ 2023-02-20 9:16 ` Qi Zheng
2023-02-20 13:22 ` kernel test robot
2023-02-20 9:16 ` [PATCH 4/5] mm: vmscan: remove shrinker_rwsem from synchronize_shrinkers() Qi Zheng
2023-02-20 9:16 ` [PATCH 5/5] mm: shrinkers: convert shrinker_rwsem to mutex Qi Zheng
4 siblings, 1 reply; 16+ messages in thread
From: Qi Zheng @ 2023-02-20 9:16 UTC (permalink / raw)
To: akpm, hannes, shakeelb, mhocko, roman.gushchin, muchun.song,
david, shy828301
Cc: tkhai, sultan, dave, penguin-kernel, paulmck, linux-mm,
linux-kernel, Qi Zheng
Like global and memcg slab shrink, also use SRCU to
make count and scan operations in memory shrinker
debugfs lockless.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/shrinker_debug.c | 24 +++++++-----------------
1 file changed, 7 insertions(+), 17 deletions(-)
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 39c3491e28a3..6a26e8ac40aa 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -9,6 +9,7 @@
/* defined in vmscan.c */
extern struct rw_semaphore shrinker_rwsem;
extern struct list_head shrinker_list;
+extern struct srcu_struct shrinker_srcu;
static DEFINE_IDA(shrinker_debugfs_ida);
static struct dentry *shrinker_debugfs_root;
@@ -49,18 +50,13 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
struct mem_cgroup *memcg;
unsigned long total;
bool memcg_aware;
- int ret, nid;
+ int ret, nid, srcu_idx;
count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
if (!count_per_node)
return -ENOMEM;
- ret = down_read_killable(&shrinker_rwsem);
- if (ret) {
- kfree(count_per_node);
- return ret;
- }
- rcu_read_lock();
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
@@ -91,8 +87,7 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
}
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- rcu_read_unlock();
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
kfree(count_per_node);
return ret;
@@ -115,9 +110,8 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
.gfp_mask = GFP_KERNEL,
};
struct mem_cgroup *memcg = NULL;
- int nid;
+ int nid, srcu_idx;
char kbuf[72];
- ssize_t ret;
read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1);
if (copy_from_user(kbuf, buf, read_len))
@@ -146,11 +140,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
return -EINVAL;
}
- ret = down_read_killable(&shrinker_rwsem);
- if (ret) {
- mem_cgroup_put(memcg);
- return ret;
- }
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
sc.nid = nid;
sc.memcg = memcg;
@@ -159,7 +149,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
shrinker->scan_objects(shrinker, &sc);
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
mem_cgroup_put(memcg);
return size;
--
2.20.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH 3/5] mm: shrinkers: make count and scan in shrinker debugfs lockless
2023-02-20 9:16 ` [PATCH 3/5] mm: shrinkers: make count and scan in shrinker debugfs lockless Qi Zheng
@ 2023-02-20 13:22 ` kernel test robot
0 siblings, 0 replies; 16+ messages in thread
From: kernel test robot @ 2023-02-20 13:22 UTC (permalink / raw)
To: Qi Zheng, akpm, hannes, shakeelb, mhocko, roman.gushchin,
muchun.song, david, shy828301
Cc: llvm, oe-kbuild-all, tkhai, sultan, dave, penguin-kernel,
paulmck, linux-mm, linux-kernel, Qi Zheng
Hi Qi,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on next-20230220]
[cannot apply to device-mapper-dm/for-next linus/master v6.2]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Qi-Zheng/mm-vmscan-make-global-slab-shrink-lockless/20230220-171954
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20230220091637.64865-4-zhengqi.arch%40bytedance.com
patch subject: [PATCH 3/5] mm: shrinkers: make count and scan in shrinker debugfs lockless
config: riscv-randconfig-r036-20230219 (https://download.01.org/0day-ci/archive/20230220/202302202134.OFjSh3rl-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project db89896bbbd2251fff457699635acbbedeead27f)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install riscv cross compiling tool for clang build
# apt-get install binutils-riscv64-linux-gnu
# https://github.com/intel-lab-lkp/linux/commit/b5b7259339a7a5cfae5a120356750c5a9e151d12
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Qi-Zheng/mm-vmscan-make-global-slab-shrink-lockless/20230220-171954
git checkout b5b7259339a7a5cfae5a120356750c5a9e151d12
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202302202134.OFjSh3rl-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> mm/shrinker_debug.c:88:11: warning: variable 'ret' is used uninitialized whenever 'do' loop exits because its condition is false [-Wsometimes-uninitialized]
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mm/shrinker_debug.c:93:9: note: uninitialized use occurs here
return ret;
^~~
mm/shrinker_debug.c:88:11: note: remove the condition if it is always true
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1
>> mm/shrinker_debug.c:78:7: warning: variable 'ret' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
if (!memcg_aware) {
^~~~~~~~~~~~
mm/shrinker_debug.c:93:9: note: uninitialized use occurs here
return ret;
^~~
mm/shrinker_debug.c:78:3: note: remove the 'if' if its condition is always false
if (!memcg_aware) {
^~~~~~~~~~~~~~~~~~~
mm/shrinker_debug.c:53:9: note: initialize the variable 'ret' to silence this warning
int ret, nid, srcu_idx;
^
= 0
2 warnings generated.
vim +88 mm/shrinker_debug.c
5035ebc644aec9 Roman Gushchin 2022-05-31 45
5035ebc644aec9 Roman Gushchin 2022-05-31 46 static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
5035ebc644aec9 Roman Gushchin 2022-05-31 47 {
5035ebc644aec9 Roman Gushchin 2022-05-31 48 struct shrinker *shrinker = m->private;
5035ebc644aec9 Roman Gushchin 2022-05-31 49 unsigned long *count_per_node;
5035ebc644aec9 Roman Gushchin 2022-05-31 50 struct mem_cgroup *memcg;
5035ebc644aec9 Roman Gushchin 2022-05-31 51 unsigned long total;
5035ebc644aec9 Roman Gushchin 2022-05-31 52 bool memcg_aware;
b5b7259339a7a5 Qi Zheng 2023-02-20 53 int ret, nid, srcu_idx;
5035ebc644aec9 Roman Gushchin 2022-05-31 54
5035ebc644aec9 Roman Gushchin 2022-05-31 55 count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
5035ebc644aec9 Roman Gushchin 2022-05-31 56 if (!count_per_node)
5035ebc644aec9 Roman Gushchin 2022-05-31 57 return -ENOMEM;
5035ebc644aec9 Roman Gushchin 2022-05-31 58
b5b7259339a7a5 Qi Zheng 2023-02-20 59 srcu_idx = srcu_read_lock(&shrinker_srcu);
5035ebc644aec9 Roman Gushchin 2022-05-31 60
5035ebc644aec9 Roman Gushchin 2022-05-31 61 memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
5035ebc644aec9 Roman Gushchin 2022-05-31 62
5035ebc644aec9 Roman Gushchin 2022-05-31 63 memcg = mem_cgroup_iter(NULL, NULL, NULL);
5035ebc644aec9 Roman Gushchin 2022-05-31 64 do {
5035ebc644aec9 Roman Gushchin 2022-05-31 65 if (memcg && !mem_cgroup_online(memcg))
5035ebc644aec9 Roman Gushchin 2022-05-31 66 continue;
5035ebc644aec9 Roman Gushchin 2022-05-31 67
5035ebc644aec9 Roman Gushchin 2022-05-31 68 total = shrinker_count_objects(shrinker,
5035ebc644aec9 Roman Gushchin 2022-05-31 69 memcg_aware ? memcg : NULL,
5035ebc644aec9 Roman Gushchin 2022-05-31 70 count_per_node);
5035ebc644aec9 Roman Gushchin 2022-05-31 71 if (total) {
5035ebc644aec9 Roman Gushchin 2022-05-31 72 seq_printf(m, "%lu", mem_cgroup_ino(memcg));
5035ebc644aec9 Roman Gushchin 2022-05-31 73 for_each_node(nid)
5035ebc644aec9 Roman Gushchin 2022-05-31 74 seq_printf(m, " %lu", count_per_node[nid]);
5035ebc644aec9 Roman Gushchin 2022-05-31 75 seq_putc(m, '\n');
5035ebc644aec9 Roman Gushchin 2022-05-31 76 }
5035ebc644aec9 Roman Gushchin 2022-05-31 77
5035ebc644aec9 Roman Gushchin 2022-05-31 @78 if (!memcg_aware) {
5035ebc644aec9 Roman Gushchin 2022-05-31 79 mem_cgroup_iter_break(NULL, memcg);
5035ebc644aec9 Roman Gushchin 2022-05-31 80 break;
5035ebc644aec9 Roman Gushchin 2022-05-31 81 }
5035ebc644aec9 Roman Gushchin 2022-05-31 82
5035ebc644aec9 Roman Gushchin 2022-05-31 83 if (signal_pending(current)) {
5035ebc644aec9 Roman Gushchin 2022-05-31 84 mem_cgroup_iter_break(NULL, memcg);
5035ebc644aec9 Roman Gushchin 2022-05-31 85 ret = -EINTR;
5035ebc644aec9 Roman Gushchin 2022-05-31 86 break;
5035ebc644aec9 Roman Gushchin 2022-05-31 87 }
5035ebc644aec9 Roman Gushchin 2022-05-31 @88 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
5035ebc644aec9 Roman Gushchin 2022-05-31 89
b5b7259339a7a5 Qi Zheng 2023-02-20 90 srcu_read_unlock(&shrinker_srcu, srcu_idx);
5035ebc644aec9 Roman Gushchin 2022-05-31 91
5035ebc644aec9 Roman Gushchin 2022-05-31 92 kfree(count_per_node);
5035ebc644aec9 Roman Gushchin 2022-05-31 @93 return ret;
5035ebc644aec9 Roman Gushchin 2022-05-31 94 }
5035ebc644aec9 Roman Gushchin 2022-05-31 95 DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
5035ebc644aec9 Roman Gushchin 2022-05-31 96
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 4/5] mm: vmscan: remove shrinker_rwsem from synchronize_shrinkers()
2023-02-20 9:16 [PATCH 0/5] make slab shrink lockless Qi Zheng
` (2 preceding siblings ...)
2023-02-20 9:16 ` [PATCH 3/5] mm: shrinkers: make count and scan in shrinker debugfs lockless Qi Zheng
@ 2023-02-20 9:16 ` Qi Zheng
2023-02-20 9:16 ` [PATCH 5/5] mm: shrinkers: convert shrinker_rwsem to mutex Qi Zheng
4 siblings, 0 replies; 16+ messages in thread
From: Qi Zheng @ 2023-02-20 9:16 UTC (permalink / raw)
To: akpm, hannes, shakeelb, mhocko, roman.gushchin, muchun.song,
david, shy828301
Cc: tkhai, sultan, dave, penguin-kernel, paulmck, linux-mm,
linux-kernel, Qi Zheng
Now there are no readers of shrinker_rwsem, so
synchronize_shrinkers() does not need to hold the
write lock of shrinker_rwsem to wait for all running
shrinkers to complete; synchronize_srcu() is enough.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/vmscan.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dc47396ecd0e..b8da56beba96 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -787,15 +787,11 @@ EXPORT_SYMBOL(unregister_shrinker);
/**
* synchronize_shrinkers - Wait for all running shrinkers to complete.
*
- * This is equivalent to calling unregister_shrink() and register_shrinker(),
- * but atomically and with less overhead. This is useful to guarantee that all
- * shrinker invocations have seen an update, before freeing memory, similar to
- * rcu.
+ * This is useful to guarantee that all shrinker invocations have seen an
+ * update, before freeing memory.
*/
void synchronize_shrinkers(void)
{
- down_write(&shrinker_rwsem);
- up_write(&shrinker_rwsem);
synchronize_srcu(&shrinker_srcu);
}
EXPORT_SYMBOL(synchronize_shrinkers);
--
2.20.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 5/5] mm: shrinkers: convert shrinker_rwsem to mutex
2023-02-20 9:16 [PATCH 0/5] make slab shrink lockless Qi Zheng
` (3 preceding siblings ...)
2023-02-20 9:16 ` [PATCH 4/5] mm: vmscan: remove shrinker_rwsem from synchronize_shrinkers() Qi Zheng
@ 2023-02-20 9:16 ` Qi Zheng
4 siblings, 0 replies; 16+ messages in thread
From: Qi Zheng @ 2023-02-20 9:16 UTC (permalink / raw)
To: akpm, hannes, shakeelb, mhocko, roman.gushchin, muchun.song,
david, shy828301
Cc: tkhai, sultan, dave, penguin-kernel, paulmck, linux-mm,
linux-kernel, Qi Zheng
Now there are no readers of shrinker_rwsem, so we
can simply replace it with mutex lock.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
drivers/md/dm-cache-metadata.c | 2 +-
drivers/md/dm-thin-metadata.c | 2 +-
fs/super.c | 2 +-
mm/shrinker_debug.c | 14 +++++++-------
mm/vmscan.c | 30 +++++++++++++++---------------
5 files changed, 25 insertions(+), 25 deletions(-)
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index acffed750e3e..9e0c69958587 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
* Replacement block manager (new_bm) is created and old_bm destroyed outside of
* cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
* shrinker associated with the block manager's bufio client vs cmd root_lock).
- * - must take shrinker_rwsem without holding cmd->root_lock
+ * - must take shrinker_mutex without holding cmd->root_lock
*/
new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
CACHE_MAX_CONCURRENT_LOCKS);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index fd464fb024c3..9f5cb52c5763 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1887,7 +1887,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
* Replacement block manager (new_bm) is created and old_bm destroyed outside of
* pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
* shrinker associated with the block manager's bufio client vs pmd root_lock).
- * - must take shrinker_rwsem without holding pmd->root_lock
+ * - must take shrinker_mutex without holding pmd->root_lock
*/
new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
THIN_MAX_CONCURRENT_LOCKS);
diff --git a/fs/super.c b/fs/super.c
index 84332d5cb817..91a4037b1d95 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 6a26e8ac40aa..4c3a5d9afb5c 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -7,7 +7,7 @@
#include <linux/memcontrol.h>
/* defined in vmscan.c */
-extern struct rw_semaphore shrinker_rwsem;
+extern struct mutex shrinker_mutex;
extern struct list_head shrinker_list;
extern struct srcu_struct shrinker_srcu;
@@ -167,7 +167,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
char buf[128];
int id;
- lockdep_assert_held(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_mutex);
/* debugfs isn't initialized yet, add debugfs entries later. */
if (!shrinker_debugfs_root)
@@ -210,7 +210,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
if (!new)
return -ENOMEM;
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
old = shrinker->name;
shrinker->name = new;
@@ -228,7 +228,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
shrinker->debugfs_entry = entry;
}
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
kfree_const(old);
@@ -240,7 +240,7 @@ struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker)
{
struct dentry *entry = shrinker->debugfs_entry;
- lockdep_assert_held(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_mutex);
kfree_const(shrinker->name);
shrinker->name = NULL;
@@ -265,14 +265,14 @@ static int __init shrinker_debugfs_init(void)
shrinker_debugfs_root = dentry;
/* Create debugfs entries for shrinkers registered at boot */
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
list_for_each_entry(shrinker, &shrinker_list, list)
if (!shrinker->debugfs_entry) {
ret = shrinker_debugfs_add(shrinker);
if (ret)
break;
}
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
return ret;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b8da56beba96..f94bfe540675 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,7 +35,7 @@
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
-#include <linux/rwsem.h>
+#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -202,7 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
}
LIST_HEAD(shrinker_list);
-DECLARE_RWSEM(shrinker_rwsem);
+DEFINE_MUTEX(shrinker_mutex);
DEFINE_SRCU(shrinker_srcu);
#ifdef CONFIG_MEMCG
@@ -224,7 +224,7 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
{
return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
&shrinker_srcu,
- lockdep_is_held(&shrinker_rwsem));
+ lockdep_is_held(&shrinker_mutex));
}
static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
@@ -297,7 +297,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
int nid, size, ret = 0;
int map_size, defer_size = 0;
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
map_size = shrinker_map_size(shrinker_nr_max);
defer_size = shrinker_defer_size(shrinker_nr_max);
size = map_size + defer_size;
@@ -312,7 +312,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
info->map = (void *)info->nr_deferred + defer_size;
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
}
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
return ret;
}
@@ -337,7 +337,7 @@ static int expand_shrinker_info(int new_id)
if (!root_mem_cgroup)
goto out;
- lockdep_assert_held(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_mutex);
map_size = shrinker_map_size(new_nr_max);
defer_size = shrinker_defer_size(new_nr_max);
@@ -384,7 +384,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
if (mem_cgroup_disabled())
return -ENOSYS;
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
@@ -398,7 +398,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
shrinker->id = id;
ret = 0;
unlock:
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
return ret;
}
@@ -408,7 +408,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
BUG_ON(id < 0);
- lockdep_assert_held(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_mutex);
idr_remove(&shrinker_idr, id);
}
@@ -701,9 +701,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
shrinker->name = NULL;
#endif
if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
unregister_memcg_shrinker(shrinker);
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
return;
}
@@ -713,11 +713,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
void register_shrinker_prepared(struct shrinker *shrinker)
{
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
list_add_tail_rcu(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
shrinker_debugfs_add(shrinker);
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
}
static int __register_shrinker(struct shrinker *shrinker)
@@ -767,13 +767,13 @@ void unregister_shrinker(struct shrinker *shrinker)
if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
list_del_rcu(&shrinker->list);
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
debugfs_entry = shrinker_debugfs_remove(shrinker);
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
synchronize_srcu(&shrinker_srcu);
--
2.20.1
^ permalink raw reply related [flat|nested] 16+ messages in thread