From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
To: Shakeel Butt <shakeelb@google.com>,
	Minchan Kim <minchan@kernel.org>,
	Huang Ying <ying.huang@intel.com>,
	Mel Gorman <mgorman@techsingularity.net>,
	Vladimir Davydov <vdavydov.dev@gmail.com>,
	Michal Hocko <mhocko@kernel.org>,
	Greg Thelen <gthelen@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v2] mm, shrinker: make shrinker_list lockless
Date: Thu, 9 Nov 2017 19:26:50 +0900	[thread overview]
Message-ID: <2940c150-577a-30a8-fac3-cf59a49b84b4@I-love.SAKURA.ne.jp> (raw)
In-Reply-To: <20171108173740.115166-1-shakeelb@google.com>

On 2017/11/09 2:37, Shakeel Butt wrote:
> In our production environment, we have observed that the job loader gets
> stuck for tens of seconds while doing a mount operation. It turns out that
> it was stuck in register_shrinker() while some unrelated job was under
> memory pressure and spending time in shrink_slab(). Our machines have a
> lot of shrinkers registered, and jobs under memory pressure have to
> traverse all of those memcg-aware shrinkers, which in turn affects
> unrelated jobs that want to register their own shrinkers.
> 
> This patch makes the shrinker_list traversal lockless so that shrinker
> registration remains fast. For shrinker unregistration, an atomic counter
> has been introduced to avoid a synchronize_rcu() call. The fields of
> struct shrinker have been rearranged to make sure that its size does
> not increase on x86_64.
> 
> The shrinker functions are allowed to reschedule and thus cannot be
> called under the RCU read lock. One way to resolve that is to use an
> SRCU read lock, but then ifdefs have to be used as SRCU is behind
> CONFIG_SRCU. Another way is to just release the RCU read lock before
> calling the shrinker and reacquire it on return. The atomic counter
> will make sure that the shrinker entry is not freed under us.
> 
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> ---
> Changelog since v1:
> - release and reacquire rcu lock across shrinker call.
> 
>  include/linux/shrinker.h |  4 +++-
>  mm/vmscan.c              | 54 ++++++++++++++++++++++++++++++------------------
>  2 files changed, 37 insertions(+), 21 deletions(-)
> 

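As a side note, the pattern the changelog describes (pin the current entry
with a counter, drop the RCU read lock around a callback that may sleep,
reacquire it on return) boils down to roughly the sketch below. This is only
an illustration with made-up names (walk_entries()/entry_cb()), not code
taken from the patch.

----------
/*
 * Minimal sketch of the "pin, drop RCU, call, reacquire" pattern.
 * walk_entries() and entry_cb() are hypothetical names.
 */
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/atomic.h>

struct entry {
	struct list_head list;
	atomic_t nr_active;		/* pins the entry across the callback */
};

static LIST_HEAD(entry_list);

static void walk_entries(void (*entry_cb)(struct entry *))
{
	struct entry *e;

	rcu_read_lock();
	list_for_each_entry_rcu(e, &entry_list, list) {
		atomic_inc(&e->nr_active);	/* unregister waits for this to drop */
		rcu_read_unlock();
		entry_cb(e);			/* may sleep */
		rcu_read_lock();
		atomic_dec(&e->nr_active);
	}
	rcu_read_unlock();
}
----------

The unregister side then does list_del_rcu(), waits for nr_active to reach
zero, and only after that frees the entry, which is roughly what the changes
below do.
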
If you can accept serialized register_shrinker()/unregister_shrinker(),
I think that something like what is shown below can do it.

----------
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 388ff29..e2272dd 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -62,9 +62,10 @@ struct shrinker {
 
 	int seeks;	/* seeks to recreate an obj */
 	long batch;	/* reclaim batch size, 0 = default */
-	unsigned long flags;
+	unsigned int flags;
 
 	/* These are for internal use */
+	atomic_t nr_active; /* Counted only if !SHRINKER_PERMANENT */
 	struct list_head list;
 	/* objs pending delete, per node */
 	atomic_long_t *nr_deferred;
@@ -74,6 +75,7 @@ struct shrinker {
 /* Flags */
 #define SHRINKER_NUMA_AWARE	(1 << 0)
 #define SHRINKER_MEMCG_AWARE	(1 << 1)
+#define SHRINKER_PERMANENT	(1 << 2)
 
 extern int register_shrinker(struct shrinker *);
 extern void unregister_shrinker(struct shrinker *);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1c1bc95..e963359 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -157,7 +157,7 @@ struct scan_control {
 unsigned long vm_total_pages;
 
 static LIST_HEAD(shrinker_list);
-static DECLARE_RWSEM(shrinker_rwsem);
+static DEFINE_MUTEX(shrinker_lock);
 
 #ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
@@ -285,9 +285,10 @@ int register_shrinker(struct shrinker *shrinker)
 	if (!shrinker->nr_deferred)
 		return -ENOMEM;
 
-	down_write(&shrinker_rwsem);
-	list_add_tail(&shrinker->list, &shrinker_list);
-	up_write(&shrinker_rwsem);
+	atomic_set(&shrinker->nr_active, 0);
+	mutex_lock(&shrinker_lock);
+	list_add_tail_rcu(&shrinker->list, &shrinker_list);
+	mutex_unlock(&shrinker_lock);
 	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
@@ -297,9 +298,14 @@ int register_shrinker(struct shrinker *shrinker)
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
-	down_write(&shrinker_rwsem);
-	list_del(&shrinker->list);
-	up_write(&shrinker_rwsem);
+	BUG_ON(shrinker->flags & SHRINKER_PERMANENT);
+	mutex_lock(&shrinker_lock);
+	list_del_rcu(&shrinker->list);
+	synchronize_rcu();
+	while (atomic_read(&shrinker->nr_active))
+		msleep(1);
+	synchronize_rcu();
+	mutex_unlock(&shrinker_lock);
 	kfree(shrinker->nr_deferred);
 }
 EXPORT_SYMBOL(unregister_shrinker);
@@ -468,18 +474,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 	if (nr_scanned == 0)
 		nr_scanned = SWAP_CLUSTER_MAX;
 
-	if (!down_read_trylock(&shrinker_rwsem)) {
-		/*
-		 * If we would return 0, our callers would understand that we
-		 * have nothing else to shrink and give up trying. By returning
-		 * 1 we keep it going and assume we'll be able to shrink next
-		 * time.
-		 */
-		freed = 1;
-		goto out;
-	}
-
-	list_for_each_entry(shrinker, &shrinker_list, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
+		bool permanent;
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
@@ -498,11 +495,16 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 			sc.nid = 0;
 
+		permanent = (shrinker->flags & SHRINKER_PERMANENT);
+		if (!permanent)
+			atomic_inc(&shrinker->nr_active);
+		rcu_read_unlock();
 		freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+		rcu_read_lock();
+		if (!permanent)
+			atomic_dec(&shrinker->nr_active);
 	}
-
-	up_read(&shrinker_rwsem);
-out:
+	rcu_read_unlock();
 	cond_resched();
 	return freed;
 }
----------
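
For completeness, under this scheme a shrinker that is compiled in and never
unregistered would opt out of the nr_active accounting by setting the new
flag at registration time. A hypothetical caller (my_count()/my_scan() are
placeholders, not existing callbacks) could look like this:

----------
#include <linux/shrinker.h>

/* Placeholder callbacks; a real user supplies its own implementations. */
static unsigned long my_count(struct shrinker *s, struct shrink_control *sc);
static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc);

static struct shrinker my_shrinker = {
	.count_objects	= my_count,
	.scan_objects	= my_scan,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_PERMANENT,	/* never unregistered, so not counted */
};

static int __init my_init(void)
{
	return register_shrinker(&my_shrinker);
}
----------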

If you want parallel register_shrinker()/unregister_shrinker(), something
like what is shown below, applied on top of the change above, will do it.

----------
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index e2272dd..471b2f6 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -67,6 +67,7 @@ struct shrinker {
 	/* These are for internal use */
 	atomic_t nr_active; /* Counted only if !SHRINKER_PERMANENT */
 	struct list_head list;
+	struct list_head gc_list;
 	/* objs pending delete, per node */
 	atomic_long_t *nr_deferred;
 };
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e963359..a216dc5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -157,7 +157,7 @@ struct scan_control {
 unsigned long vm_total_pages;
 
 static LIST_HEAD(shrinker_list);
-static DEFINE_MUTEX(shrinker_lock);
+static DEFINE_SPINLOCK(shrinker_lock);
 
 #ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
@@ -286,9 +286,9 @@ int register_shrinker(struct shrinker *shrinker)
 		return -ENOMEM;
 
 	atomic_set(&shrinker->nr_active, 0);
-	mutex_lock(&shrinker_lock);
+	spin_lock(&shrinker_lock);
 	list_add_tail_rcu(&shrinker->list, &shrinker_list);
-	mutex_unlock(&shrinker_lock);
+	spin_unlock(&shrinker_lock);
 	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
@@ -298,15 +298,30 @@ int register_shrinker(struct shrinker *shrinker)
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
+	static LIST_HEAD(shrinker_gc_list);
+	struct shrinker *gc;
+
 	BUG_ON(shrinker->flags & SHRINKER_PERMANENT);
-	mutex_lock(&shrinker_lock);
+	spin_lock(&shrinker_lock);
 	list_del_rcu(&shrinker->list);
+	/*
+	 * Need to update ->list.next if concurrently unregistering shrinkers
+	 * can find this shrinker, for this shrinker's unregistration might
+	 * complete before their unregistrations complete.
+	 */
+	list_for_each_entry(gc, &shrinker_gc_list, gc_list) {
+		if (gc->list.next == &shrinker->list)
+			rcu_assign_pointer(gc->list.next, shrinker->list.next);
+	}
+	list_add_tail(&shrinker->gc_list, &shrinker_gc_list);
+	spin_unlock(&shrinker_lock);
 	synchronize_rcu();
 	while (atomic_read(&shrinker->nr_active))
 		msleep(1);
 	synchronize_rcu();
-	mutex_unlock(&shrinker_lock);
+	spin_lock(&shrinker_lock);
+	list_del(&shrinker->gc_list);
+	spin_unlock(&shrinker_lock);
 	kfree(shrinker->nr_deferred);
 }
 EXPORT_SYMBOL(unregister_shrinker);
----------
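
To spell out the race the gc_list above is meant to close (this is my reading
of the change, stated as an assumption rather than quoted from anywhere), the
problematic interleaving is something like the following:

----------
/*
 * Hypothetical interleaving the gc_list scan guards against:
 *
 *   reader:        pins A (A->nr_active++), drops rcu_read_lock(),
 *                  calls do_shrink_slab() and may sleep for a while
 *   unregister(A): list_del_rcu(&A->list), puts A on shrinker_gc_list,
 *                  then blocks waiting for A->nr_active to drop to zero
 *   unregister(B): list_del_rcu(&B->list) where B == A->list.next; its
 *                  grace periods and nr_active wait are trivially
 *                  satisfied (the reader never pinned B), so B is freed
 *
 * Without the gc_list scan, the reader later reacquires rcu_read_lock()
 * and advances via A->list.next into freed memory.  The scan rewrites
 * A->list.next to B->list.next before unregister(B) can complete.
 */
----------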

FYI: when I posted the above change at
http://lkml.kernel.org/r/201411231350.DHI12456.OLOFFJSFtQVMHO@I-love.SAKURA.ne.jp ,
Michal Hocko commented as follows.

  I thought that {un}register_shrinker are really unlikely
  paths called during initialization and tear down which usually do not
  happen during OOM conditions.

  I cannot judge the patch itself as this is out of my area but is the
  complexity worth it?
