linux-kernel.vger.kernel.org archive mirror
* [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely
@ 2019-08-02  1:53 Henry Burns
  2019-08-02  1:53 ` [PATCH 2/2] mm/zsmalloc.c: Fix race condition in zs_destroy_pool Henry Burns
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Henry Burns @ 2019-08-02  1:53 UTC (permalink / raw)
  To: Minchan Kim, Nitin Gupta
  Cc: Sergey Senozhatsky, Shakeel Butt, Jonathan Adams, linux-mm,
	linux-kernel, Henry Burns

In zs_page_migrate() we call putback_zspage() after we have finished
migrating all pages in this zspage. However, the return value is ignored.
If a zs_free() races in between zs_page_isolate() and zs_page_migrate(),
freeing the last object in the zspage, putback_zspage() will leave the page
in ZS_EMPTY for potentially an unbounded amount of time.

To fix this, we need to do the same thing as zs_page_putback() does:
schedule free_work to occur.  To avoid duplicated code, move the
sequence to a new putback_zspage_deferred() function which both
zs_page_migrate() and zs_page_putback() call.

Signed-off-by: Henry Burns <henryburns@google.com>
---
 mm/zsmalloc.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 1cda3fe0c2d9..efa660a87787 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1901,6 +1901,22 @@ static void dec_zspage_isolation(struct zspage *zspage)
 	zspage->isolated--;
 }
 
+static void putback_zspage_deferred(struct zs_pool *pool,
+				    struct size_class *class,
+				    struct zspage *zspage)
+{
+	enum fullness_group fg;
+
+	fg = putback_zspage(class, zspage);
+	/*
+	 * Due to page_lock, we cannot free zspage immediately
+	 * so let's defer.
+	 */
+	if (fg == ZS_EMPTY)
+		schedule_work(&pool->free_work);
+
+}
+
 static void replace_sub_page(struct size_class *class, struct zspage *zspage,
 				struct page *newpage, struct page *oldpage)
 {
@@ -2070,7 +2086,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
 	 * the list if @page is final isolated subpage in the zspage.
 	 */
 	if (!is_zspage_isolated(zspage))
-		putback_zspage(class, zspage);
+		putback_zspage_deferred(pool, class, zspage);
 
 	reset_page(page);
 	put_page(page);
@@ -2115,15 +2131,9 @@ static void zs_page_putback(struct page *page)
 
 	spin_lock(&class->lock);
 	dec_zspage_isolation(zspage);
-	if (!is_zspage_isolated(zspage)) {
-		fg = putback_zspage(class, zspage);
-		/*
-		 * Due to page_lock, we cannot free zspage immediately
-		 * so let's defer.
-		 */
-		if (fg == ZS_EMPTY)
-			schedule_work(&pool->free_work);
-	}
+	if (!is_zspage_isolated(zspage))
+		putback_zspage_deferred(pool, class, zspage);
+
 	spin_unlock(&class->lock);
 }
 
-- 
2.22.0.770.g0f2c4a37fd-goog



* [PATCH 2/2] mm/zsmalloc.c: Fix race condition in zs_destroy_pool
  2019-08-02  1:53 [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely Henry Burns
@ 2019-08-02  1:53 ` Henry Burns
  2019-08-05  4:28   ` Minchan Kim
  2019-08-02 14:58 ` [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely Shakeel Butt
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Henry Burns @ 2019-08-02  1:53 UTC (permalink / raw)
  To: Minchan Kim, Nitin Gupta
  Cc: Sergey Senozhatsky, Shakeel Butt, Jonathan Adams, linux-mm,
	linux-kernel, Henry Burns

In zs_destroy_pool() we call flush_work(&pool->free_work). However, we
have no guarantee that migration isn't happening in the background
at that time.

Since migration can't directly free pages, it relies on free_work
being scheduled to free the pages.  But there's nothing preventing an
in-progress migrate from queuing the work *after*
zs_unregister_migration() has called flush_work().  That would leave
pages still pointing at the inode when we free it.

Since we know at destroy time all objects should be free, no new
migrations can come in (since zs_page_isolate() fails for fully-free
zspages).  This means it is sufficient to track a "# isolated zspages"
count by class, and have the destroy logic ensure all such pages have
drained before proceeding.  Keeping that state under the class
spinlock keeps the logic straightforward.

Signed-off-by: Henry Burns <henryburns@google.com>
---
 mm/zsmalloc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index efa660a87787..1f16ed4d6a13 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -53,6 +53,7 @@
 #include <linux/zpool.h>
 #include <linux/mount.h>
 #include <linux/migrate.h>
+#include <linux/wait.h>
 #include <linux/pagemap.h>
 #include <linux/fs.h>
 
@@ -206,6 +207,10 @@ struct size_class {
 	int objs_per_zspage;
 	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
 	int pages_per_zspage;
+#ifdef CONFIG_COMPACTION
+	/* Number of zspages currently isolated by compaction */
+	int isolated;
+#endif
 
 	unsigned int index;
 	struct zs_size_stat stats;
@@ -267,6 +272,8 @@ struct zs_pool {
 #ifdef CONFIG_COMPACTION
 	struct inode *inode;
 	struct work_struct free_work;
+	/* A workqueue for when migration races with async_free_zspage() */
+	struct wait_queue_head migration_wait;
 #endif
 };
 
@@ -1917,6 +1924,21 @@ static void putback_zspage_deferred(struct zs_pool *pool,
 
 }
 
+static inline void zs_class_dec_isolated(struct zs_pool *pool,
+					 struct size_class *class)
+{
+	assert_spin_locked(&class->lock);
+	VM_BUG_ON(class->isolated <= 0);
+	class->isolated--;
+	/*
+	 * There's no possibility of racing, since wait_for_isolated_drain()
+	 * checks the isolated count under &class->lock after enqueuing
+	 * on migration_wait.
+	 */
+	if (class->isolated == 0 && waitqueue_active(&pool->migration_wait))
+		wake_up_all(&pool->migration_wait);
+}
+
 static void replace_sub_page(struct size_class *class, struct zspage *zspage,
 				struct page *newpage, struct page *oldpage)
 {
@@ -1986,6 +2008,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
 	 */
 	if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
 		get_zspage_mapping(zspage, &class_idx, &fullness);
+		class->isolated++;
 		remove_zspage(class, zspage, fullness);
 	}
 
@@ -2085,8 +2108,14 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
 	 * Page migration is done so let's putback isolated zspage to
 	 * the list if @page is final isolated subpage in the zspage.
 	 */
-	if (!is_zspage_isolated(zspage))
+	if (!is_zspage_isolated(zspage)) {
+		/*
+		 * We still hold the class lock while all of this is happening,
+		 * so we cannot race with zs_destroy_pool()
+		 */
 		putback_zspage_deferred(pool, class, zspage);
+		zs_class_dec_isolated(pool, class);
+	}
 
 	reset_page(page);
 	put_page(page);
@@ -2131,9 +2160,11 @@ static void zs_page_putback(struct page *page)
 
 	spin_lock(&class->lock);
 	dec_zspage_isolation(zspage);
-	if (!is_zspage_isolated(zspage))
-		putback_zspage_deferred(pool, class, zspage);
 
+	if (!is_zspage_isolated(zspage)) {
+		putback_zspage_deferred(pool, class, zspage);
+		zs_class_dec_isolated(pool, class);
+	}
 	spin_unlock(&class->lock);
 }
 
@@ -2156,8 +2187,36 @@ static int zs_register_migration(struct zs_pool *pool)
 	return 0;
 }
 
+static bool class_isolated_are_drained(struct size_class *class)
+{
+	bool ret;
+
+	spin_lock(&class->lock);
+	ret = class->isolated == 0;
+	spin_unlock(&class->lock);
+	return ret;
+}
+
+/* Function for resolving migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+	int i;
+
+	/*
+	 * We're in the process of destroying the pool, so there are no
+	 * active allocations. zs_page_isolate() fails for completely free
+	 * zspages, so we need only wait for each size_class's isolated
+	 * count to hit zero.
+	 */
+	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+		wait_event(pool->migration_wait,
+			   class_isolated_are_drained(pool->size_class[i]));
+	}
+}
+
 static void zs_unregister_migration(struct zs_pool *pool)
 {
+	wait_for_isolated_drain(pool); /* This can block */
 	flush_work(&pool->free_work);
 	iput(pool->inode);
 }
@@ -2401,6 +2460,8 @@ struct zs_pool *zs_create_pool(const char *name)
 	if (!pool->name)
 		goto err;
 
+	init_waitqueue_head(&pool->migration_wait);
+
 	if (create_cache(pool))
 		goto err;
 
@@ -2466,6 +2527,7 @@ struct zs_pool *zs_create_pool(const char *name)
 		class->index = i;
 		class->pages_per_zspage = pages_per_zspage;
 		class->objs_per_zspage = objs_per_zspage;
+		class->isolated = 0;
 		spin_lock_init(&class->lock);
 		pool->size_class[i] = class;
 		for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
-- 
2.22.0.770.g0f2c4a37fd-goog



* Re: [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely
  2019-08-02  1:53 [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely Henry Burns
  2019-08-02  1:53 ` [PATCH 2/2] mm/zsmalloc.c: Fix race condition in zs_destroy_pool Henry Burns
@ 2019-08-02 14:58 ` Shakeel Butt
  2019-08-02 15:09 ` Jonathan Adams
  2019-08-05  4:14 ` Minchan Kim
  3 siblings, 0 replies; 9+ messages in thread
From: Shakeel Butt @ 2019-08-02 14:58 UTC (permalink / raw)
  To: Henry Burns
  Cc: Minchan Kim, Nitin Gupta, Sergey Senozhatsky, Jonathan Adams,
	Linux MM, LKML

On Thu, Aug 1, 2019 at 6:53 PM Henry Burns <henryburns@google.com> wrote:
>
> In zs_page_migrate() we call putback_zspage() after we have finished
> migrating all pages in this zspage. However, the return value is ignored.
> If a zs_free() races in between zs_page_isolate() and zs_page_migrate(),
> freeing the last object in the zspage, putback_zspage() will leave the page
> in ZS_EMPTY for potentially an unbounded amount of time.
>
> To fix this, we need to do the same thing as zs_page_putback() does:
> schedule free_work to occur.  To avoid duplicated code, move the
> sequence to a new putback_zspage_deferred() function which both
> zs_page_migrate() and zs_page_putback() call.
>
> Signed-off-by: Henry Burns <henryburns@google.com>

Reviewed-by: Shakeel Butt <shakeelb@google.com>

> ---
>  mm/zsmalloc.c | 30 ++++++++++++++++++++----------
>  1 file changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 1cda3fe0c2d9..efa660a87787 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -1901,6 +1901,22 @@ static void dec_zspage_isolation(struct zspage *zspage)
>         zspage->isolated--;
>  }
>
> +static void putback_zspage_deferred(struct zs_pool *pool,
> +                                   struct size_class *class,
> +                                   struct zspage *zspage)
> +{
> +       enum fullness_group fg;
> +
> +       fg = putback_zspage(class, zspage);
> +       /*
> +        * Due to page_lock, we cannot free zspage immediately
> +        * so let's defer.
> +        */
> +       if (fg == ZS_EMPTY)
> +               schedule_work(&pool->free_work);
> +
> +}
> +
>  static void replace_sub_page(struct size_class *class, struct zspage *zspage,
>                                 struct page *newpage, struct page *oldpage)
>  {
> @@ -2070,7 +2086,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
>          * the list if @page is final isolated subpage in the zspage.
>          */
>         if (!is_zspage_isolated(zspage))
> -               putback_zspage(class, zspage);
> +               putback_zspage_deferred(pool, class, zspage);
>
>         reset_page(page);
>         put_page(page);
> @@ -2115,15 +2131,9 @@ static void zs_page_putback(struct page *page)
>
>         spin_lock(&class->lock);
>         dec_zspage_isolation(zspage);
> -       if (!is_zspage_isolated(zspage)) {
> -               fg = putback_zspage(class, zspage);
> -               /*
> -                * Due to page_lock, we cannot free zspage immediately
> -                * so let's defer.
> -                */
> -               if (fg == ZS_EMPTY)
> -                       schedule_work(&pool->free_work);
> -       }
> +       if (!is_zspage_isolated(zspage))
> +               putback_zspage_deferred(pool, class, zspage);
> +
>         spin_unlock(&class->lock);
>  }
>
> --
> 2.22.0.770.g0f2c4a37fd-goog
>


* Re: [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely
  2019-08-02  1:53 [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely Henry Burns
  2019-08-02  1:53 ` [PATCH 2/2] mm/zsmalloc.c: Fix race condition in zs_destroy_pool Henry Burns
  2019-08-02 14:58 ` [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely Shakeel Butt
@ 2019-08-02 15:09 ` Jonathan Adams
  2019-08-05  4:14 ` Minchan Kim
  3 siblings, 0 replies; 9+ messages in thread
From: Jonathan Adams @ 2019-08-02 15:09 UTC (permalink / raw)
  To: Henry Burns
  Cc: Minchan Kim, Nitin Gupta, Sergey Senozhatsky, Shakeel Butt,
	Linux-MM, LKML

On Thu, Aug 1, 2019 at 6:53 PM Henry Burns <henryburns@google.com> wrote:
>
> In zs_page_migrate() we call putback_zspage() after we have finished
> migrating all pages in this zspage. However, the return value is ignored.
> If a zs_free() races in between zs_page_isolate() and zs_page_migrate(),
> freeing the last object in the zspage, putback_zspage() will leave the page
> in ZS_EMPTY for potentially an unbounded amount of time.
>
> To fix this, we need to do the same thing as zs_page_putback() does:
> schedule free_work to occur.  To avoid duplicated code, move the
> sequence to a new putback_zspage_deferred() function which both
> zs_page_migrate() and zs_page_putback() call.
>
> Signed-off-by: Henry Burns <henryburns@google.com>

Reviewed-by: Jonathan Adams <jwadams@google.com>
>
> ---
>  mm/zsmalloc.c | 30 ++++++++++++++++++++----------
>  1 file changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 1cda3fe0c2d9..efa660a87787 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -1901,6 +1901,22 @@ static void dec_zspage_isolation(struct zspage *zspage)
>         zspage->isolated--;
>  }
>
> +static void putback_zspage_deferred(struct zs_pool *pool,
> +                                   struct size_class *class,
> +                                   struct zspage *zspage)
> +{
> +       enum fullness_group fg;
> +
> +       fg = putback_zspage(class, zspage);
> +       /*
> +        * Due to page_lock, we cannot free zspage immediately
> +        * so let's defer.
> +        */
> +       if (fg == ZS_EMPTY)
> +               schedule_work(&pool->free_work);
> +
> +}
> +
>  static void replace_sub_page(struct size_class *class, struct zspage *zspage,
>                                 struct page *newpage, struct page *oldpage)
>  {
> @@ -2070,7 +2086,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
>          * the list if @page is final isolated subpage in the zspage.
>          */
>         if (!is_zspage_isolated(zspage))
> -               putback_zspage(class, zspage);
> +               putback_zspage_deferred(pool, class, zspage);
>
>         reset_page(page);
>         put_page(page);
> @@ -2115,15 +2131,9 @@ static void zs_page_putback(struct page *page)
>
>         spin_lock(&class->lock);
>         dec_zspage_isolation(zspage);
> -       if (!is_zspage_isolated(zspage)) {
> -               fg = putback_zspage(class, zspage);
> -               /*
> -                * Due to page_lock, we cannot free zspage immediately
> -                * so let's defer.
> -                */
> -               if (fg == ZS_EMPTY)
> -                       schedule_work(&pool->free_work);
> -       }
> +       if (!is_zspage_isolated(zspage))
> +               putback_zspage_deferred(pool, class, zspage);
> +
>         spin_unlock(&class->lock);
>  }
>
> --
> 2.22.0.770.g0f2c4a37fd-goog
>


* Re: [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely
  2019-08-02  1:53 [PATCH 1/2] mm/zsmalloc.c: Migration can leave pages in ZS_EMPTY indefinitely Henry Burns
                   ` (2 preceding siblings ...)
  2019-08-02 15:09 ` Jonathan Adams
@ 2019-08-05  4:14 ` Minchan Kim
  3 siblings, 0 replies; 9+ messages in thread
From: Minchan Kim @ 2019-08-05  4:14 UTC (permalink / raw)
  To: Henry Burns
  Cc: Nitin Gupta, Sergey Senozhatsky, Shakeel Butt, Jonathan Adams,
	linux-mm, linux-kernel

On Thu, Aug 01, 2019 at 06:53:31PM -0700, Henry Burns wrote:
> In zs_page_migrate() we call putback_zspage() after we have finished
> migrating all pages in this zspage. However, the return value is ignored.
> If a zs_free() races in between zs_page_isolate() and zs_page_migrate(),
> freeing the last object in the zspage, putback_zspage() will leave the page
> in ZS_EMPTY for potentially an unbounded amount of time.

Nice catch.

> 
> To fix this, we need to do the same thing as zs_page_putback() does:
> schedule free_work to occur.  To avoid duplicated code, move the
> sequence to a new putback_zspage_deferred() function which both
> zs_page_migrate() and zs_page_putback() call.
> 
> Signed-off-by: Henry Burns <henryburns@google.com>
Cc: <stable@vger.kernel.org>    [4.8+]
Acked-by: Minchan Kim <minchan@kernel.org>

Below is just a trivial nit:

> ---
>  mm/zsmalloc.c | 30 ++++++++++++++++++++----------
>  1 file changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 1cda3fe0c2d9..efa660a87787 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -1901,6 +1901,22 @@ static void dec_zspage_isolation(struct zspage *zspage)
>  	zspage->isolated--;
>  }
>  
> +static void putback_zspage_deferred(struct zs_pool *pool,
> +				    struct size_class *class,
> +				    struct zspage *zspage)
> +{
> +	enum fullness_group fg;
> +
> +	fg = putback_zspage(class, zspage);
> +	/*
> +	 * Due to page_lock, we cannot free zspage immediately
> +	 * so let's defer.
> +	 */

Could you move this comment to the function's description, since it is
now a function?

Thanks.


* Re: [PATCH 2/2] mm/zsmalloc.c: Fix race condition in zs_destroy_pool
  2019-08-02  1:53 ` [PATCH 2/2] mm/zsmalloc.c: Fix race condition in zs_destroy_pool Henry Burns
@ 2019-08-05  4:28   ` Minchan Kim
  2019-08-05 17:34     ` Henry Burns
  0 siblings, 1 reply; 9+ messages in thread
From: Minchan Kim @ 2019-08-05  4:28 UTC (permalink / raw)
  To: Henry Burns
  Cc: Nitin Gupta, Sergey Senozhatsky, Shakeel Butt, Jonathan Adams,
	linux-mm, linux-kernel

Hi Henry,

On Thu, Aug 01, 2019 at 06:53:32PM -0700, Henry Burns wrote:
> In zs_destroy_pool() we call flush_work(&pool->free_work). However, we
> have no guarantee that migration isn't happening in the background
> at that time.
> 
> Since migration can't directly free pages, it relies on free_work
> being scheduled to free the pages.  But there's nothing preventing an
> in-progress migrate from queuing the work *after*
> zs_unregister_migration() has called flush_work().  Which would mean
> pages still pointing at the inode when we free it.

We already unregister the shrinker, so there is no upcoming async free
call via the shrinker; the only remaining concern is a direct call to
the zs_compact API from the user. Is that what you are describing?

If so, can't we add a flag to indicate the pool is being destroyed,
plus a global counter of how many zs_compact calls are in flight?

Then zs_unregister_migration() in zs_destroy_pool() can set the flag to
block upcoming zs_compact calls and wait until the global counter drops
to zero. Once that's done, finally flush the work.

My point is that the granularity should be global, not per-class.
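
For illustration, a rough sketch of that flag-plus-counter scheme might
look like the following (the destroying, compact_in_flight and
compact_wait members are hypothetical and are not part of either posted
patch):

	/* Sketch only: refuse new compaction once the pool is being torn down. */
	unsigned long zs_compact(struct zs_pool *pool)
	{
		unsigned long pages_freed = 0;

		if (READ_ONCE(pool->destroying))
			return 0;
		atomic_inc(&pool->compact_in_flight);
		/* ... existing per-class compaction loop, accumulating pages_freed ... */
		if (atomic_dec_and_test(&pool->compact_in_flight))
			wake_up_all(&pool->compact_wait);
		return pages_freed;
	}

	static void zs_unregister_migration(struct zs_pool *pool)
	{
		/* Block new zs_compact() callers, then drain those in flight. */
		WRITE_ONCE(pool->destroying, true);
		wait_event(pool->compact_wait,
			   atomic_read(&pool->compact_in_flight) == 0);
		flush_work(&pool->free_work);
		iput(pool->inode);
	}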

Thanks.

> 
> Since we know at destroy time all objects should be free, no new
> migrations can come in (since zs_page_isolate() fails for fully-free
> zspages).  This means it is sufficient to track a "# isolated zspages"
> count by class, and have the destroy logic ensure all such pages have
> drained before proceeding.  Keeping that state under the class
> spinlock keeps the logic straightforward.
> 
> Signed-off-by: Henry Burns <henryburns@google.com>
> ---
>  mm/zsmalloc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 65 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index efa660a87787..1f16ed4d6a13 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -53,6 +53,7 @@
>  #include <linux/zpool.h>
>  #include <linux/mount.h>
>  #include <linux/migrate.h>
> +#include <linux/wait.h>
>  #include <linux/pagemap.h>
>  #include <linux/fs.h>
>  
> @@ -206,6 +207,10 @@ struct size_class {
>  	int objs_per_zspage;
>  	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
>  	int pages_per_zspage;
> +#ifdef CONFIG_COMPACTION
> +	/* Number of zspages currently isolated by compaction */
> +	int isolated;
> +#endif
>  
>  	unsigned int index;
>  	struct zs_size_stat stats;
> @@ -267,6 +272,8 @@ struct zs_pool {
>  #ifdef CONFIG_COMPACTION
>  	struct inode *inode;
>  	struct work_struct free_work;
> +	/* A workqueue for when migration races with async_free_zspage() */
> +	struct wait_queue_head migration_wait;
>  #endif
>  };
>  
> @@ -1917,6 +1924,21 @@ static void putback_zspage_deferred(struct zs_pool *pool,
>  
>  }
>  
> +static inline void zs_class_dec_isolated(struct zs_pool *pool,
> +					 struct size_class *class)
> +{
> +	assert_spin_locked(&class->lock);
> +	VM_BUG_ON(class->isolated <= 0);
> +	class->isolated--;
> +	/*
> +	 * There's no possibility of racing, since wait_for_isolated_drain()
> +	 * checks the isolated count under &class->lock after enqueuing
> +	 * on migration_wait.
> +	 */
> +	if (class->isolated == 0 && waitqueue_active(&pool->migration_wait))
> +		wake_up_all(&pool->migration_wait);
> +}
> +
>  static void replace_sub_page(struct size_class *class, struct zspage *zspage,
>  				struct page *newpage, struct page *oldpage)
>  {
> @@ -1986,6 +2008,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
>  	 */
>  	if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
>  		get_zspage_mapping(zspage, &class_idx, &fullness);
> +		class->isolated++;
>  		remove_zspage(class, zspage, fullness);
>  	}
>  
> @@ -2085,8 +2108,14 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
>  	 * Page migration is done so let's putback isolated zspage to
>  	 * the list if @page is final isolated subpage in the zspage.
>  	 */
> -	if (!is_zspage_isolated(zspage))
> +	if (!is_zspage_isolated(zspage)) {
> +		/*
> +		 * We still hold the class lock while all of this is happening,
> +		 * so we cannot race with zs_destroy_pool()
> +		 */
>  		putback_zspage_deferred(pool, class, zspage);
> +		zs_class_dec_isolated(pool, class);
> +	}
>  
>  	reset_page(page);
>  	put_page(page);
> @@ -2131,9 +2160,11 @@ static void zs_page_putback(struct page *page)
>  
>  	spin_lock(&class->lock);
>  	dec_zspage_isolation(zspage);
> -	if (!is_zspage_isolated(zspage))
> -		putback_zspage_deferred(pool, class, zspage);
>  
> +	if (!is_zspage_isolated(zspage)) {
> +		putback_zspage_deferred(pool, class, zspage);
> +		zs_class_dec_isolated(pool, class);
> +	}
>  	spin_unlock(&class->lock);
>  }
>  
> @@ -2156,8 +2187,36 @@ static int zs_register_migration(struct zs_pool *pool)
>  	return 0;
>  }
>  
> +static bool class_isolated_are_drained(struct size_class *class)
> +{
> +	bool ret;
> +
> +	spin_lock(&class->lock);
> +	ret = class->isolated == 0;
> +	spin_unlock(&class->lock);
> +	return ret;
> +}
> +
> +/* Function for resolving migration */
> +static void wait_for_isolated_drain(struct zs_pool *pool)
> +{
> +	int i;
> +
> +	/*
> +	 * We're in the process of destroying the pool, so there are no
> +	 * active allocations. zs_page_isolate() fails for completely free
> +	 * zspages, so we need only wait for each size_class's isolated
> +	 * count to hit zero.
> +	 */
> +	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
> +		wait_event(pool->migration_wait,
> +			   class_isolated_are_drained(pool->size_class[i]));
> +	}
> +}
> +
>  static void zs_unregister_migration(struct zs_pool *pool)
>  {
> +	wait_for_isolated_drain(pool); /* This can block */
>  	flush_work(&pool->free_work);
>  	iput(pool->inode);
>  }
> @@ -2401,6 +2460,8 @@ struct zs_pool *zs_create_pool(const char *name)
>  	if (!pool->name)
>  		goto err;
>  
> +	init_waitqueue_head(&pool->migration_wait);
> +
>  	if (create_cache(pool))
>  		goto err;
>  
> @@ -2466,6 +2527,7 @@ struct zs_pool *zs_create_pool(const char *name)
>  		class->index = i;
>  		class->pages_per_zspage = pages_per_zspage;
>  		class->objs_per_zspage = objs_per_zspage;
> +		class->isolated = 0;
>  		spin_lock_init(&class->lock);
>  		pool->size_class[i] = class;
>  		for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
> -- 
> 2.22.0.770.g0f2c4a37fd-goog
> 


* Re: [PATCH 2/2] mm/zsmalloc.c: Fix race condition in zs_destroy_pool
  2019-08-05  4:28   ` Minchan Kim
@ 2019-08-05 17:34     ` Henry Burns
  2019-08-06  1:38       ` Minchan Kim
  0 siblings, 1 reply; 9+ messages in thread
From: Henry Burns @ 2019-08-05 17:34 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Nitin Gupta, Sergey Senozhatsky, Shakeel Butt, Jonathan Adams,
	Linux MM, LKML

On Sun, Aug 4, 2019 at 9:28 PM Minchan Kim <minchan@kernel.org> wrote:
>
> Hi Henry,
>
> On Thu, Aug 01, 2019 at 06:53:32PM -0700, Henry Burns wrote:
> > In zs_destroy_pool() we call flush_work(&pool->free_work). However, we
> > have no guarantee that migration isn't happening in the background
> > at that time.
> >
> > Since migration can't directly free pages, it relies on free_work
> > being scheduled to free the pages.  But there's nothing preventing an
> > in-progress migrate from queuing the work *after*
> > zs_unregister_migration() has called flush_work().  Which would mean
> > pages still pointing at the inode when we free it.
>
> We already unregister shrinker so there is no upcoming async free call
> via shrinker so the only concern is zs_compact API direct call from
> the user. Is that what what you desribe from the description?

What I am describing is a call to zsmalloc_aops->migratepage() by
kcompactd (which can schedule the free work in either
zs_page_migrate() or zs_page_putback() should the zspage become empty).

While we are migrating a page, we remove it from the class. Suppose
zs_free() loses a race with migration. We would schedule
async_free_zspage() to handle freeing that zspage; however, we have no
guarantee that migration has finished by the time
flush_work(&pool->free_work) returns. In that case we then call
iput(inode), and now we have a page pointing to a non-existent inode.
(At which point something like kcompactd would potentially BUG() if it
tries to get a page from the inode that doesn't exist anymore.)


>
> If so, can't we add a flag to indicate destroy of the pool and
> global counter to indicate how many of zs_compact was nested?
>
> So, zs_unregister_migration in zs_destroy_pool can set the flag to
> prevent upcoming zs_compact call and wait until the global counter
> will be zero. Once it's done, finally flush the work.
>
> My point is it's not a per-class granuarity but global.

We could have a pool-level counter of isolated pages, and wait for it
to drain to zero before starting flush_work(&pool->free_work). However,
that would require an atomic_long in zs_pool, and we would have to eat
the cost of any contention on that counter. Still, it might be
preferable to a per-class granularity.
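
For concreteness, that pool-level variant might look roughly like the
following (isolated_pages is a hypothetical atomic_long_t in
struct zs_pool; this is a sketch of the idea, not the posted patch):

	static void zs_pool_dec_isolated(struct zs_pool *pool)
	{
		VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
		/* Last isolated page dropped; let pool destruction proceed. */
		if (atomic_long_dec_and_test(&pool->isolated_pages))
			wake_up_all(&pool->migration_wait);
	}

	static void zs_unregister_migration(struct zs_pool *pool)
	{
		/* Wait for in-flight migrations to drop their isolation counts. */
		wait_event(pool->migration_wait,
			   atomic_long_read(&pool->isolated_pages) == 0);
		flush_work(&pool->free_work);
		iput(pool->inode);
	}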

>
> Thanks.
>
> >
> > Since we know at destroy time all objects should be free, no new
> > migrations can come in (since zs_page_isolate() fails for fully-free
> > zspages).  This means it is sufficient to track a "# isolated zspages"
> > count by class, and have the destroy logic ensure all such pages have
> > drained before proceeding.  Keeping that state under the class
> > spinlock keeps the logic straightforward.
> >
> > Signed-off-by: Henry Burns <henryburns@google.com>
> > ---
> >  mm/zsmalloc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 65 insertions(+), 3 deletions(-)
> >
> > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > index efa660a87787..1f16ed4d6a13 100644
> > --- a/mm/zsmalloc.c
> > +++ b/mm/zsmalloc.c
> > @@ -53,6 +53,7 @@
> >  #include <linux/zpool.h>
> >  #include <linux/mount.h>
> >  #include <linux/migrate.h>
> > +#include <linux/wait.h>
> >  #include <linux/pagemap.h>
> >  #include <linux/fs.h>
> >
> > @@ -206,6 +207,10 @@ struct size_class {
> >       int objs_per_zspage;
> >       /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
> >       int pages_per_zspage;
> > +#ifdef CONFIG_COMPACTION
> > +     /* Number of zspages currently isolated by compaction */
> > +     int isolated;
> > +#endif
> >
> >       unsigned int index;
> >       struct zs_size_stat stats;
> > @@ -267,6 +272,8 @@ struct zs_pool {
> >  #ifdef CONFIG_COMPACTION
> >       struct inode *inode;
> >       struct work_struct free_work;
> > +     /* A workqueue for when migration races with async_free_zspage() */
> > +     struct wait_queue_head migration_wait;
> >  #endif
> >  };
> >
> > @@ -1917,6 +1924,21 @@ static void putback_zspage_deferred(struct zs_pool *pool,
> >
> >  }
> >
> > +static inline void zs_class_dec_isolated(struct zs_pool *pool,
> > +                                      struct size_class *class)
> > +{
> > +     assert_spin_locked(&class->lock);
> > +     VM_BUG_ON(class->isolated <= 0);
> > +     class->isolated--;
> > +     /*
> > +      * There's no possibility of racing, since wait_for_isolated_drain()
> > +      * checks the isolated count under &class->lock after enqueuing
> > +      * on migration_wait.
> > +      */
> > +     if (class->isolated == 0 && waitqueue_active(&pool->migration_wait))
> > +             wake_up_all(&pool->migration_wait);
> > +}
> > +
> >  static void replace_sub_page(struct size_class *class, struct zspage *zspage,
> >                               struct page *newpage, struct page *oldpage)
> >  {
> > @@ -1986,6 +2008,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
> >        */
> >       if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
> >               get_zspage_mapping(zspage, &class_idx, &fullness);
> > +             class->isolated++;
> >               remove_zspage(class, zspage, fullness);
> >       }
> >
> > @@ -2085,8 +2108,14 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
> >        * Page migration is done so let's putback isolated zspage to
> >        * the list if @page is final isolated subpage in the zspage.
> >        */
> > -     if (!is_zspage_isolated(zspage))
> > +     if (!is_zspage_isolated(zspage)) {
> > +             /*
> > +              * We still hold the class lock while all of this is happening,
> > +              * so we cannot race with zs_destroy_pool()
> > +              */
> >               putback_zspage_deferred(pool, class, zspage);
> > +             zs_class_dec_isolated(pool, class);
> > +     }
> >
> >       reset_page(page);
> >       put_page(page);
> > @@ -2131,9 +2160,11 @@ static void zs_page_putback(struct page *page)
> >
> >       spin_lock(&class->lock);
> >       dec_zspage_isolation(zspage);
> > -     if (!is_zspage_isolated(zspage))
> > -             putback_zspage_deferred(pool, class, zspage);
> >
> > +     if (!is_zspage_isolated(zspage)) {
> > +             putback_zspage_deferred(pool, class, zspage);
> > +             zs_class_dec_isolated(pool, class);
> > +     }
> >       spin_unlock(&class->lock);
> >  }
> >
> > @@ -2156,8 +2187,36 @@ static int zs_register_migration(struct zs_pool *pool)
> >       return 0;
> >  }
> >
> > +static bool class_isolated_are_drained(struct size_class *class)
> > +{
> > +     bool ret;
> > +
> > +     spin_lock(&class->lock);
> > +     ret = class->isolated == 0;
> > +     spin_unlock(&class->lock);
> > +     return ret;
> > +}
> > +
> > +/* Function for resolving migration */
> > +static void wait_for_isolated_drain(struct zs_pool *pool)
> > +{
> > +     int i;
> > +
> > +     /*
> > +      * We're in the process of destroying the pool, so there are no
> > +      * active allocations. zs_page_isolate() fails for completely free
> > +      * zspages, so we need only wait for each size_class's isolated
> > +      * count to hit zero.
> > +      */
> > +     for (i = 0; i < ZS_SIZE_CLASSES; i++) {
> > +             wait_event(pool->migration_wait,
> > +                        class_isolated_are_drained(pool->size_class[i]));
> > +     }
> > +}
> > +
> >  static void zs_unregister_migration(struct zs_pool *pool)
> >  {
> > +     wait_for_isolated_drain(pool); /* This can block */
> >       flush_work(&pool->free_work);
> >       iput(pool->inode);
> >  }
> > @@ -2401,6 +2460,8 @@ struct zs_pool *zs_create_pool(const char *name)
> >       if (!pool->name)
> >               goto err;
> >
> > +     init_waitqueue_head(&pool->migration_wait);
> > +
> >       if (create_cache(pool))
> >               goto err;
> >
> > @@ -2466,6 +2527,7 @@ struct zs_pool *zs_create_pool(const char *name)
> >               class->index = i;
> >               class->pages_per_zspage = pages_per_zspage;
> >               class->objs_per_zspage = objs_per_zspage;
> > +             class->isolated = 0;
> >               spin_lock_init(&class->lock);
> >               pool->size_class[i] = class;
> >               for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
> > --
> > 2.22.0.770.g0f2c4a37fd-goog
> >


* Re: [PATCH 2/2] mm/zsmalloc.c: Fix race condition in zs_destroy_pool
  2019-08-05 17:34     ` Henry Burns
@ 2019-08-06  1:38       ` Minchan Kim
  2019-08-06 19:53         ` Henry Burns
  0 siblings, 1 reply; 9+ messages in thread
From: Minchan Kim @ 2019-08-06  1:38 UTC (permalink / raw)
  To: Henry Burns
  Cc: Nitin Gupta, Sergey Senozhatsky, Shakeel Butt, Jonathan Adams,
	Linux MM, LKML

On Mon, Aug 05, 2019 at 10:34:41AM -0700, Henry Burns wrote:
> On Sun, Aug 4, 2019 at 9:28 PM Minchan Kim <minchan@kernel.org> wrote:
> >
> > Hi Henry,
> >
> > On Thu, Aug 01, 2019 at 06:53:32PM -0700, Henry Burns wrote:
> > > In zs_destroy_pool() we call flush_work(&pool->free_work). However, we
> > > have no guarantee that migration isn't happening in the background
> > > at that time.
> > >
> > > Since migration can't directly free pages, it relies on free_work
> > > being scheduled to free the pages.  But there's nothing preventing an
> > > in-progress migrate from queuing the work *after*
> > > zs_unregister_migration() has called flush_work().  Which would mean
> > > pages still pointing at the inode when we free it.
> >
> > We already unregister shrinker so there is no upcoming async free call
> > via shrinker so the only concern is zs_compact API direct call from
> > the user. Is that what what you desribe from the description?
> 
> What I am describing is a call to zsmalloc_aops->migratepage() by
> kcompactd (which can call schedule work in either
> zs_page_migrate() or zs_page_putback should the zspage become empty).
> 
> While we are migrating a page, we remove it from the class. Suppose
> zs_free() loses a race with migration. We would schedule
> async_free_zspage() to handle freeing that zspage, however we have no
> guarantee that migration has finished
> by the time we finish flush_work(&pool->work). In that case we then
> call iput(inode), and now we have a page
> pointing to a non-existent inode. (At which point something like
> kcompactd would potentially BUG() if it tries to get a page
> (from the inode) that doesn't exist anymore)
> 

True.
I totally got internal migration and external migration mixed up. :-/

> 
> >
> > If so, can't we add a flag to indicate destroy of the pool and
> > global counter to indicate how many of zs_compact was nested?
> >
> > So, zs_unregister_migration in zs_destroy_pool can set the flag to
> > prevent upcoming zs_compact call and wait until the global counter
> > will be zero. Once it's done, finally flush the work.
> >
> > My point is it's not a per-class granuarity but global.
> 
> We could have a pool level counter of isolated pages, and wait for
> that to finish before starting flush_work(&pool->work); However,
> that would require an atomic_long in zs_pool, and we would have to eat
> the cost of any contention over that lock. Still, it might be
> preferable to a per-class granularity.

That would be better performance-wise, but how significant is it?
Migration is already not a hot path, so adding an atomic variable in
that path wouldn't cause a noticeable slowdown.

Rather than performance, my worry is maintenance, so I prefer something
simple and not fragile.

> 
> >
> > Thanks.
> >
> > >
> > > Since we know at destroy time all objects should be free, no new
> > > migrations can come in (since zs_page_isolate() fails for fully-free
> > > zspages).  This means it is sufficient to track a "# isolated zspages"
> > > count by class, and have the destroy logic ensure all such pages have
> > > drained before proceeding.  Keeping that state under the class
> > > spinlock keeps the logic straightforward.
> > >
> > > Signed-off-by: Henry Burns <henryburns@google.com>
> > > ---
> > >  mm/zsmalloc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 65 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > > index efa660a87787..1f16ed4d6a13 100644
> > > --- a/mm/zsmalloc.c
> > > +++ b/mm/zsmalloc.c
> > > @@ -53,6 +53,7 @@
> > >  #include <linux/zpool.h>
> > >  #include <linux/mount.h>
> > >  #include <linux/migrate.h>
> > > +#include <linux/wait.h>
> > >  #include <linux/pagemap.h>
> > >  #include <linux/fs.h>
> > >
> > > @@ -206,6 +207,10 @@ struct size_class {
> > >       int objs_per_zspage;
> > >       /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
> > >       int pages_per_zspage;
> > > +#ifdef CONFIG_COMPACTION
> > > +     /* Number of zspages currently isolated by compaction */
> > > +     int isolated;
> > > +#endif
> > >
> > >       unsigned int index;
> > >       struct zs_size_stat stats;
> > > @@ -267,6 +272,8 @@ struct zs_pool {
> > >  #ifdef CONFIG_COMPACTION
> > >       struct inode *inode;
> > >       struct work_struct free_work;
> > > +     /* A workqueue for when migration races with async_free_zspage() */
> > > +     struct wait_queue_head migration_wait;
> > >  #endif
> > >  };
> > >
> > > @@ -1917,6 +1924,21 @@ static void putback_zspage_deferred(struct zs_pool *pool,
> > >
> > >  }
> > >
> > > +static inline void zs_class_dec_isolated(struct zs_pool *pool,
> > > +                                      struct size_class *class)
> > > +{
> > > +     assert_spin_locked(&class->lock);
> > > +     VM_BUG_ON(class->isolated <= 0);
> > > +     class->isolated--;
> > > +     /*
> > > +      * There's no possibility of racing, since wait_for_isolated_drain()
> > > +      * checks the isolated count under &class->lock after enqueuing
> > > +      * on migration_wait.
> > > +      */
> > > +     if (class->isolated == 0 && waitqueue_active(&pool->migration_wait))
> > > +             wake_up_all(&pool->migration_wait);
> > > +}
> > > +
> > >  static void replace_sub_page(struct size_class *class, struct zspage *zspage,
> > >                               struct page *newpage, struct page *oldpage)
> > >  {
> > > @@ -1986,6 +2008,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
> > >        */
> > >       if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
> > >               get_zspage_mapping(zspage, &class_idx, &fullness);
> > > +             class->isolated++;
> > >               remove_zspage(class, zspage, fullness);
> > >       }
> > >
> > > @@ -2085,8 +2108,14 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
> > >        * Page migration is done so let's putback isolated zspage to
> > >        * the list if @page is final isolated subpage in the zspage.
> > >        */
> > > -     if (!is_zspage_isolated(zspage))
> > > +     if (!is_zspage_isolated(zspage)) {
> > > +             /*
> > > +              * We still hold the class lock while all of this is happening,
> > > +              * so we cannot race with zs_destroy_pool()
> > > +              */
> > >               putback_zspage_deferred(pool, class, zspage);
> > > +             zs_class_dec_isolated(pool, class);
> > > +     }
> > >
> > >       reset_page(page);
> > >       put_page(page);
> > > @@ -2131,9 +2160,11 @@ static void zs_page_putback(struct page *page)
> > >
> > >       spin_lock(&class->lock);
> > >       dec_zspage_isolation(zspage);
> > > -     if (!is_zspage_isolated(zspage))
> > > -             putback_zspage_deferred(pool, class, zspage);
> > >
> > > +     if (!is_zspage_isolated(zspage)) {
> > > +             putback_zspage_deferred(pool, class, zspage);
> > > +             zs_class_dec_isolated(pool, class);
> > > +     }
> > >       spin_unlock(&class->lock);
> > >  }
> > >
> > > @@ -2156,8 +2187,36 @@ static int zs_register_migration(struct zs_pool *pool)
> > >       return 0;
> > >  }
> > >
> > > +static bool class_isolated_are_drained(struct size_class *class)
> > > +{
> > > +     bool ret;
> > > +
> > > +     spin_lock(&class->lock);
> > > +     ret = class->isolated == 0;
> > > +     spin_unlock(&class->lock);
> > > +     return ret;
> > > +}
> > > +
> > > +/* Function for resolving migration */
> > > +static void wait_for_isolated_drain(struct zs_pool *pool)
> > > +{
> > > +     int i;
> > > +
> > > +     /*
> > > +      * We're in the process of destroying the pool, so there are no
> > > +      * active allocations. zs_page_isolate() fails for completely free
> > > +      * zspages, so we need only wait for each size_class's isolated
> > > +      * count to hit zero.
> > > +      */
> > > +     for (i = 0; i < ZS_SIZE_CLASSES; i++) {
> > > +             wait_event(pool->migration_wait,
> > > +                        class_isolated_are_drained(pool->size_class[i]));
> > > +     }
> > > +}
> > > +
> > >  static void zs_unregister_migration(struct zs_pool *pool)
> > >  {
> > > +     wait_for_isolated_drain(pool); /* This can block */
> > >       flush_work(&pool->free_work);
> > >       iput(pool->inode);
> > >  }
> > > @@ -2401,6 +2460,8 @@ struct zs_pool *zs_create_pool(const char *name)
> > >       if (!pool->name)
> > >               goto err;
> > >
> > > +     init_waitqueue_head(&pool->migration_wait);
> > > +
> > >       if (create_cache(pool))
> > >               goto err;
> > >
> > > @@ -2466,6 +2527,7 @@ struct zs_pool *zs_create_pool(const char *name)
> > >               class->index = i;
> > >               class->pages_per_zspage = pages_per_zspage;
> > >               class->objs_per_zspage = objs_per_zspage;
> > > +             class->isolated = 0;
> > >               spin_lock_init(&class->lock);
> > >               pool->size_class[i] = class;
> > >               for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
> > > --
> > > 2.22.0.770.g0f2c4a37fd-goog
> > >


* Re: [PATCH 2/2] mm/zsmalloc.c: Fix race condition in zs_destroy_pool
  2019-08-06  1:38       ` Minchan Kim
@ 2019-08-06 19:53         ` Henry Burns
  0 siblings, 0 replies; 9+ messages in thread
From: Henry Burns @ 2019-08-06 19:53 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Nitin Gupta, Sergey Senozhatsky, Shakeel Butt, Jonathan Adams,
	Linux MM, LKML, henrywolfeburns

By the way, I will lose access to this email in 3 days, so I've cc'd a
personal email.


On Mon, Aug 5, 2019 at 6:38 PM Minchan Kim <minchan@kernel.org> wrote:
> On Mon, Aug 05, 2019 at 10:34:41AM -0700, Henry Burns wrote:
> > On Sun, Aug 4, 2019 at 9:28 PM Minchan Kim <minchan@kernel.org> wrote:
> > > On Thu, Aug 01, 2019 at 06:53:32PM -0700, Henry Burns wrote:
> > > > In zs_destroy_pool() we call flush_work(&pool->free_work). However, we
> > > > have no guarantee that migration isn't happening in the background
> > > > at that time.
> > > >
> > > > Since migration can't directly free pages, it relies on free_work
> > > > being scheduled to free the pages.  But there's nothing preventing an
> > > > in-progress migrate from queuing the work *after*
> > > > zs_unregister_migration() has called flush_work().  Which would mean
> > > > pages still pointing at the inode when we free it.
> > >
> > > We already unregister shrinker so there is no upcoming async free call
> > > via shrinker so the only concern is zs_compact API direct call from
> > > the user. Is that what what you desribe from the description?
> >
> > What I am describing is a call to zsmalloc_aops->migratepage() by
> > kcompactd (which can call schedule work in either
> > zs_page_migrate() or zs_page_putback should the zspage become empty).
> >
> > While we are migrating a page, we remove it from the class. Suppose
> > zs_free() loses a race with migration. We would schedule
> > async_free_zspage() to handle freeing that zspage, however we have no
> > guarantee that migration has finished
> > by the time we finish flush_work(&pool->work). In that case we then
> > call iput(inode), and now we have a page
> > pointing to a non-existent inode. (At which point something like
> > kcompactd would potentially BUG() if it tries to get a page
> > (from the inode) that doesn't exist anymore)
> >
>
> True.
> I totally got mixed up internal migration and external migration. :-/
>
> >
> > >
> > > If so, can't we add a flag to indicate destroy of the pool and
> > > global counter to indicate how many of zs_compact was nested?
> > >
> > > So, zs_unregister_migration in zs_destroy_pool can set the flag to
> > > prevent upcoming zs_compact call and wait until the global counter
> > > will be zero. Once it's done, finally flush the work.
> > >
> > > My point is it's not a per-class granuarity but global.
> >
> > We could have a pool level counter of isolated pages, and wait for
> > that to finish before starting flush_work(&pool->work); However,
> > that would require an atomic_long in zs_pool, and we would have to eat
> > the cost of any contention over that lock. Still, it might be
> > preferable to a per-class granularity.
>
> That would be better for performance-wise but how it's significant?
> Migration is not already hot path so adding a atomic variable in that path
> wouldn't make noticible slow.
>
> Rather than performance, my worry is maintainance so prefer simple and
> not fragile.

It sounds to me like you are saying that the current approach is fine;
does that match your understanding?

>
> >
> > >
> > > Thanks.
> > >
> > > >
> > > > Since we know at destroy time all objects should be free, no new
> > > > migrations can come in (since zs_page_isolate() fails for fully-free
> > > > zspages).  This means it is sufficient to track a "# isolated zspages"
> > > > count by class, and have the destroy logic ensure all such pages have
> > > > drained before proceeding.  Keeping that state under the class
> > > > spinlock keeps the logic straightforward.
> > > >
> > > > Signed-off-by: Henry Burns <henryburns@google.com>
> > > > ---
> > > >  mm/zsmalloc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 65 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > > > index efa660a87787..1f16ed4d6a13 100644
> > > > --- a/mm/zsmalloc.c
> > > > +++ b/mm/zsmalloc.c
> > > > @@ -53,6 +53,7 @@
> > > >  #include <linux/zpool.h>
> > > >  #include <linux/mount.h>
> > > >  #include <linux/migrate.h>
> > > > +#include <linux/wait.h>
> > > >  #include <linux/pagemap.h>
> > > >  #include <linux/fs.h>
> > > >
> > > > @@ -206,6 +207,10 @@ struct size_class {
> > > >       int objs_per_zspage;
> > > >       /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
> > > >       int pages_per_zspage;
> > > > +#ifdef CONFIG_COMPACTION
> > > > +     /* Number of zspages currently isolated by compaction */
> > > > +     int isolated;
> > > > +#endif
> > > >
> > > >       unsigned int index;
> > > >       struct zs_size_stat stats;
> > > > @@ -267,6 +272,8 @@ struct zs_pool {
> > > >  #ifdef CONFIG_COMPACTION
> > > >       struct inode *inode;
> > > >       struct work_struct free_work;
> > > > +     /* A workqueue for when migration races with async_free_zspage() */
> > > > +     struct wait_queue_head migration_wait;
> > > >  #endif
> > > >  };
> > > >
> > > > @@ -1917,6 +1924,21 @@ static void putback_zspage_deferred(struct zs_pool *pool,
> > > >
> > > >  }
> > > >
> > > > +static inline void zs_class_dec_isolated(struct zs_pool *pool,
> > > > +                                      struct size_class *class)
> > > > +{
> > > > +     assert_spin_locked(&class->lock);
> > > > +     VM_BUG_ON(class->isolated <= 0);
> > > > +     class->isolated--;
> > > > +     /*
> > > > +      * There's no possibility of racing, since wait_for_isolated_drain()
> > > > +      * checks the isolated count under &class->lock after enqueuing
> > > > +      * on migration_wait.
> > > > +      */
> > > > +     if (class->isolated == 0 && waitqueue_active(&pool->migration_wait))
> > > > +             wake_up_all(&pool->migration_wait);
> > > > +}
> > > > +
> > > >  static void replace_sub_page(struct size_class *class, struct zspage *zspage,
> > > >                               struct page *newpage, struct page *oldpage)
> > > >  {
> > > > @@ -1986,6 +2008,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
> > > >        */
> > > >       if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
> > > >               get_zspage_mapping(zspage, &class_idx, &fullness);
> > > > +             class->isolated++;
> > > >               remove_zspage(class, zspage, fullness);
> > > >       }
> > > >
> > > > @@ -2085,8 +2108,14 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
> > > >        * Page migration is done so let's putback isolated zspage to
> > > >        * the list if @page is final isolated subpage in the zspage.
> > > >        */
> > > > -     if (!is_zspage_isolated(zspage))
> > > > +     if (!is_zspage_isolated(zspage)) {
> > > > +             /*
> > > > +              * We still hold the class lock while all of this is happening,
> > > > +              * so we cannot race with zs_destroy_pool()
> > > > +              */
> > > >               putback_zspage_deferred(pool, class, zspage);
> > > > +             zs_class_dec_isolated(pool, class);
> > > > +     }
> > > >
> > > >       reset_page(page);
> > > >       put_page(page);
> > > > @@ -2131,9 +2160,11 @@ static void zs_page_putback(struct page *page)
> > > >
> > > >       spin_lock(&class->lock);
> > > >       dec_zspage_isolation(zspage);
> > > > -     if (!is_zspage_isolated(zspage))
> > > > -             putback_zspage_deferred(pool, class, zspage);
> > > >
> > > > +     if (!is_zspage_isolated(zspage)) {
> > > > +             putback_zspage_deferred(pool, class, zspage);
> > > > +             zs_class_dec_isolated(pool, class);
> > > > +     }
> > > >       spin_unlock(&class->lock);
> > > >  }
> > > >
> > > > @@ -2156,8 +2187,36 @@ static int zs_register_migration(struct zs_pool *pool)
> > > >       return 0;
> > > >  }
> > > >
> > > > +static bool class_isolated_are_drained(struct size_class *class)
> > > > +{
> > > > +     bool ret;
> > > > +
> > > > +     spin_lock(&class->lock);
> > > > +     ret = class->isolated == 0;
> > > > +     spin_unlock(&class->lock);
> > > > +     return ret;
> > > > +}
> > > > +
> > > > +/* Function for resolving migration */
> > > > +static void wait_for_isolated_drain(struct zs_pool *pool)
> > > > +{
> > > > +     int i;
> > > > +
> > > > +     /*
> > > > +      * We're in the process of destroying the pool, so there are no
> > > > +      * active allocations. zs_page_isolate() fails for completely free
> > > > +      * zspages, so we need only wait for each size_class's isolated
> > > > +      * count to hit zero.
> > > > +      */
> > > > +     for (i = 0; i < ZS_SIZE_CLASSES; i++) {
> > > > +             wait_event(pool->migration_wait,
> > > > +                        class_isolated_are_drained(pool->size_class[i]));
> > > > +     }
> > > > +}
> > > > +
> > > >  static void zs_unregister_migration(struct zs_pool *pool)
> > > >  {
> > > > +     wait_for_isolated_drain(pool); /* This can block */
> > > >       flush_work(&pool->free_work);
> > > >       iput(pool->inode);
> > > >  }
> > > > @@ -2401,6 +2460,8 @@ struct zs_pool *zs_create_pool(const char *name)
> > > >       if (!pool->name)
> > > >               goto err;
> > > >
> > > > +     init_waitqueue_head(&pool->migration_wait);
> > > > +
> > > >       if (create_cache(pool))
> > > >               goto err;
> > > >
> > > > @@ -2466,6 +2527,7 @@ struct zs_pool *zs_create_pool(const char *name)
> > > >               class->index = i;
> > > >               class->pages_per_zspage = pages_per_zspage;
> > > >               class->objs_per_zspage = objs_per_zspage;
> > > > +             class->isolated = 0;
> > > >               spin_lock_init(&class->lock);
> > > >               pool->size_class[i] = class;
> > > >               for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
> > > > --
> > > > 2.22.0.770.g0f2c4a37fd-goog
> > > >

