All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] close various race windows for swap
@ 2021-04-08 13:08 Miaohe Lin
  2021-04-08 13:08 ` [PATCH 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
                   ` (5 more replies)
  0 siblings, 6 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-08 13:08 UTC (permalink / raw)
  To: akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, tim.c.chen, linux-kernel,
	linux-mm, linmiaohe

Hi all,
When I was investigating the swap code, I found some possible race
windows. This series aims to fix all these races. But using current
get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible because swap_readpage() may take a really
long time. And to reduce the performance overhead on the hot-path as
much as possible, it appears we can use the percpu_ref to close this
race window (as suggested by Huang, Ying). Patch 1 adds percpu_ref
support for swap and the rest of the patches use this to close various
race windows. More details can be found in the respective changelogs.
Thanks!

Miaohe Lin (5):
  mm/swapfile: add percpu_ref support for swap
  swap: fix do_swap_page() race with swapoff
  mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff
  mm/swap_state: fix potential faulted in race in swap_ra_info()
  mm/swap_state: fix swap_cluster_readahead() race with swapoff

 include/linux/swap.h |  4 +++-
 mm/memory.c          | 10 +++++++++
 mm/swap_state.c      | 33 +++++++++++++++++++++--------
 mm/swapfile.c        | 50 +++++++++++++++++++++++++++-----------------
 4 files changed, 68 insertions(+), 29 deletions(-)

-- 
2.19.1


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-08 13:08 [PATCH 0/5] close various race windows for swap Miaohe Lin
@ 2021-04-08 13:08 ` Miaohe Lin
  2021-04-12  3:30     ` Huang, Ying
  2021-04-08 13:08 ` [PATCH 2/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-08 13:08 UTC (permalink / raw)
  To: akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, tim.c.chen, linux-kernel,
	linux-mm, linmiaohe

We will use percpu-refcount to serialize against concurrent swapoff. This
patch adds the percpu_ref support for later fixup.

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 include/linux/swap.h |  2 ++
 mm/swapfile.c        | 25 ++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..849ba5265c11 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -240,6 +240,7 @@ struct swap_cluster_list {
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
+	struct percpu_ref users;	/* serialization against concurrent swapoff */
 	unsigned long	flags;		/* SWP_USED etc: see above */
 	signed short	prio;		/* swap priority of this type */
 	struct plist_node list;		/* entry in swap_active_head */
@@ -260,6 +261,7 @@ struct swap_info_struct {
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	unsigned int old_block_size;	/* seldom referenced */
+	struct completion comp;		/* seldom referenced */
 #ifdef CONFIG_FRONTSWAP
 	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
 	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..724173cd7d0c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
 #include <linux/export.h>
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
+#include <linux/completion.h>
 
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
@@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
 	spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+	struct swap_info_struct *si;
+
+	si = container_of(ref, struct swap_info_struct, users);
+	complete(&si->comp);
+	percpu_ref_exit(&si->users);
+}
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
 	struct swap_cluster_info *ci = si->cluster_info;
@@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 	 * Guarantee swap_map, cluster_info, etc. fields are valid
 	 * between get/put_swap_device() if SWP_VALID bit is set
 	 */
-	synchronize_rcu();
+	percpu_ref_reinit(&p->users);
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
 	_enable_swap_info(p);
@@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
+
+	percpu_ref_kill(&p->users);
 	/*
 	 * wait for swap operations protected by get/put_swap_device()
 	 * to complete
 	 */
-	synchronize_rcu();
+	wait_for_completion(&p->comp);
 
 	flush_work(&p->discard_work);
 
@@ -3132,7 +3144,7 @@ static bool swap_discardable(struct swap_info_struct *si)
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
 	struct swap_info_struct *p;
-	struct filename *name;
+	struct filename *name = NULL;
 	struct file *swap_file = NULL;
 	struct address_space *mapping;
 	int prio;
@@ -3163,6 +3175,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	INIT_WORK(&p->discard_work, swap_discard_work);
 
+	init_completion(&p->comp);
+	error = percpu_ref_init(&p->users, swap_users_ref_free,
+				PERCPU_REF_INIT_DEAD, GFP_KERNEL);
+	if (unlikely(error))
+		goto bad_swap;
+
 	name = getname(specialfile);
 	if (IS_ERR(name)) {
 		error = PTR_ERR(name);
@@ -3356,6 +3374,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 bad_swap_unlock_inode:
 	inode_unlock(inode);
 bad_swap:
+	percpu_ref_exit(&p->users);
 	free_percpu(p->percpu_cluster);
 	p->percpu_cluster = NULL;
 	free_percpu(p->cluster_next_cpu);
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 72+ messages in thread

* [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-08 13:08 [PATCH 0/5] close various race windows for swap Miaohe Lin
  2021-04-08 13:08 ` [PATCH 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
@ 2021-04-08 13:08 ` Miaohe Lin
  2021-04-08 21:34   ` Tim Chen
                     ` (3 more replies)
  2021-04-08 13:08 ` [PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() " Miaohe Lin
                   ` (3 subsequent siblings)
  5 siblings, 4 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-08 13:08 UTC (permalink / raw)
  To: akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, tim.c.chen, linux-kernel,
	linux-mm, linmiaohe

When I was investigating the swap code, I found the below possible race
window:

CPU 1					CPU 2
-----					-----
do_swap_page
  synchronous swap_readpage
    alloc_page_vma
					swapoff
					  release swap_file, bdev, or ...
      swap_readpage
	check sis->flags is ok
	  access swap_file, bdev...[oops!]
					    si->flags = 0

Using the current get/put_swap_device() to guard against concurrent swapoff
for swap_readpage() looks terrible because swap_readpage() may take a really
long time. And this race may not be really pernicious because swapoff is
usually done only at system shutdown. To reduce the performance overhead on
the hot-path as much as possible, it appears we can use the percpu_ref to
close this race window (as suggested by Huang, Ying).

Fixes: 235b62176712 ("mm/swap: add cluster lock")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 include/linux/swap.h |  2 +-
 mm/memory.c          | 10 ++++++++++
 mm/swapfile.c        | 28 +++++++++++-----------------
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 849ba5265c11..9066addb57fd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-	rcu_read_unlock();
+	percpu_ref_put(&si->users);
 }
 
 #else /* CONFIG_SWAP */
diff --git a/mm/memory.c b/mm/memory.c
index cc71a445c76c..8543c47b955c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = NULL, *swapcache;
+	struct swap_info_struct *si = NULL;
 	swp_entry_t entry;
 	pte_t pte;
 	int locked;
@@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	}
 
 
+	si = get_swap_device(entry);
+	/* In case we raced with swapoff. */
+	if (unlikely(!si))
+		goto out;
+
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry, vma, vmf->address);
 	swapcache = page;
@@ -3514,6 +3520,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
+	if (si)
+		put_swap_device(si);
 	return ret;
 out_nomap:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3525,6 +3533,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		unlock_page(swapcache);
 		put_page(swapcache);
 	}
+	if (si)
+		put_swap_device(si);
 	return ret;
 }
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 724173cd7d0c..01032c72ceae 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1280,18 +1280,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  *   CPU1				CPU2
  *   do_swap_page()
@@ -1319,21 +1313,21 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	si = swp_swap_info(entry);
 	if (!si)
 		goto bad_nofile;
-
-	rcu_read_lock();
 	if (data_race(!(si->flags & SWP_VALID)))
-		goto unlock_out;
+		goto out;
+	if (!percpu_ref_tryget_live(&si->users))
+		goto out;
 	offset = swp_offset(entry);
 	if (offset >= si->max)
-		goto unlock_out;
+		goto put_out;
 
 	return si;
 bad_nofile:
 	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
 out:
 	return NULL;
-unlock_out:
-	rcu_read_unlock();
+put_out:
+	percpu_ref_put(&si->users);
 	return NULL;
 }
 
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 72+ messages in thread

* [PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff
  2021-04-08 13:08 [PATCH 0/5] close various race windows for swap Miaohe Lin
  2021-04-08 13:08 ` [PATCH 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
  2021-04-08 13:08 ` [PATCH 2/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
@ 2021-04-08 13:08 ` Miaohe Lin
  2021-04-13  1:33     ` Huang, Ying
  2021-04-08 13:08 ` [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info() Miaohe Lin
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-08 13:08 UTC (permalink / raw)
  To: akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, tim.c.chen, linux-kernel,
	linux-mm, linmiaohe

The function get_shadow_from_swap_cache() can race with swapoff, though
it's only called by do_swap_page() now.

Fixes: aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 mm/swap_state.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 272ea2108c9d..709c260d644a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,11 +83,14 @@ void show_swap_cache_info(void)
 
 void *get_shadow_from_swap_cache(swp_entry_t entry)
 {
-	struct address_space *address_space = swap_address_space(entry);
-	pgoff_t idx = swp_offset(entry);
+	struct swap_info_struct *si;
 	struct page *page;
 
-	page = xa_load(&address_space->i_pages, idx);
+	si = get_swap_device(entry);
+	if (!si)
+		return NULL;
+	page = xa_load(&swap_address_space(entry)->i_pages, swp_offset(entry));
+	put_swap_device(si);
 	if (xa_is_value(page))
 		return page;
 	return NULL;
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 72+ messages in thread

* [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()
  2021-04-08 13:08 [PATCH 0/5] close various race windows for swap Miaohe Lin
                   ` (2 preceding siblings ...)
  2021-04-08 13:08 ` [PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() " Miaohe Lin
@ 2021-04-08 13:08 ` Miaohe Lin
  2021-04-09  8:50     ` Huang, Ying
  2021-04-08 13:08 ` [PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff Miaohe Lin
  2021-04-08 14:55 ` [PATCH 0/5] close various race windows for swap riteshh
  5 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-08 13:08 UTC (permalink / raw)
  To: akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, tim.c.chen, linux-kernel,
	linux-mm, linmiaohe

While we released the pte lock, somebody else might have faulted in this
pte. So we should first check whether it's a swap pte to guard against such
a race, or swp_type would be unexpected. And we can also possibly avoid
some unnecessary readahead cpu cycles.

Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 mm/swap_state.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 709c260d644a..3bf0d0c297bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long ra_val;
-	swp_entry_t entry;
+	swp_entry_t swap_entry;
 	unsigned long faddr, pfn, fpfn;
 	unsigned long start, end;
-	pte_t *pte, *orig_pte;
+	pte_t *pte, *orig_pte, entry;
 	unsigned int max_win, hits, prev_win, win, left;
 #ifndef CONFIG_64BIT
 	pte_t *tpte;
@@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
 
 	faddr = vmf->address;
 	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-	entry = pte_to_swp_entry(*pte);
-	if ((unlikely(non_swap_entry(entry)))) {
+	entry = *pte;
+	if (unlikely(!is_swap_pte(entry))) {
+		pte_unmap(orig_pte);
+		return;
+	}
+	swap_entry = pte_to_swp_entry(entry);
+	if ((unlikely(non_swap_entry(swap_entry)))) {
 		pte_unmap(orig_pte);
 		return;
 	}
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 72+ messages in thread

* [PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff
  2021-04-08 13:08 [PATCH 0/5] close various race windows for swap Miaohe Lin
                   ` (3 preceding siblings ...)
  2021-04-08 13:08 ` [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info() Miaohe Lin
@ 2021-04-08 13:08 ` Miaohe Lin
  2021-04-13  1:36     ` Huang, Ying
  2021-04-08 14:55 ` [PATCH 0/5] close various race windows for swap riteshh
  5 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-08 13:08 UTC (permalink / raw)
  To: akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, tim.c.chen, linux-kernel,
	linux-mm, linmiaohe

swap_cluster_readahead() could race with swapoff and might dereference
si->swap_file after it's released by swapoff. Close this race window by
using get/put_swap_device() pair.

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 mm/swap_state.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3bf0d0c297bc..eba6b0cf6cf9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -626,12 +626,17 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	unsigned long offset = entry_offset;
 	unsigned long start_offset, end_offset;
 	unsigned long mask;
-	struct swap_info_struct *si = swp_swap_info(entry);
+	struct swap_info_struct *si;
 	struct blk_plug plug;
 	bool do_poll = true, page_allocated;
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long addr = vmf->address;
 
+	si = get_swap_device(entry);
+	/* In case we raced with swapoff. */
+	if (!si)
+		return NULL;
+
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
 		goto skip;
@@ -673,7 +678,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 skip:
-	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+	page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+	put_swap_device(si);
+	return page;
 }
 
 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 72+ messages in thread

* Re: [PATCH 0/5] close various race windows for swap
  2021-04-08 13:08 [PATCH 0/5] close various race windows for swap Miaohe Lin
                   ` (4 preceding siblings ...)
  2021-04-08 13:08 ` [PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff Miaohe Lin
@ 2021-04-08 14:55 ` riteshh
  2021-04-09  8:01   ` Miaohe Lin
  5 siblings, 1 reply; 72+ messages in thread
From: riteshh @ 2021-04-08 14:55 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, ying.huang, hughd, tim.c.chen,
	linux-kernel, linux-mm

On 21/04/08 09:08AM, Miaohe Lin wrote:
> Hi all,
> When I was investigating the swap code, I found some possible race
> windows. This series aims to fix all these races. But using current
> get/put_swap_device() to guard against concurrent swapoff for
> swap_readpage() looks terrible because swap_readpage() may take really
> long time. And to reduce the performance overhead on the hot-path as
> much as possible, it appears we can use the percpu_ref to close this
> race window(as suggested by Huang, Ying). The patch 1 adds percpu_ref
> support for swap and the rest of the patches use this to close various
> race windows. More details can be found in the respective changelogs.
> Thanks!
>
> Miaohe Lin (5):
>   mm/swapfile: add percpu_ref support for swap
>   swap: fix do_swap_page() race with swapoff
>   mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff
>   mm/swap_state: fix potential faulted in race in swap_ra_info()
>   mm/swap_state: fix swap_cluster_readahead() race with swapoff

Somehow I see Patch-1 and Patch-2 are missing on linux-mm[1].
Also I wanted to ask if you have a way to trigger this in a more controlled
environment (consistently)?

[1]: https://patchwork.kernel.org/project/linux-mm/cover/20210408130820.48233-1-linmiaohe@huawei.com/

-ritesh

>
>  include/linux/swap.h |  4 +++-
>  mm/memory.c          | 10 +++++++++
>  mm/swap_state.c      | 33 +++++++++++++++++++++--------
>  mm/swapfile.c        | 50 +++++++++++++++++++++++++++-----------------
>  4 files changed, 68 insertions(+), 29 deletions(-)
>
> --
> 2.19.1
>
>

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-08 13:08 ` [PATCH 2/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
@ 2021-04-08 21:34   ` Tim Chen
  2021-04-09  8:42     ` Miaohe Lin
  2021-04-08 21:37   ` kernel test robot
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 72+ messages in thread
From: Tim Chen @ 2021-04-08 21:34 UTC (permalink / raw)
  To: Miaohe Lin, akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, linux-kernel, linux-mm



On 4/8/21 6:08 AM, Miaohe Lin wrote:
> When I was investigating the swap code, I found the below possible race
> window:
> 
> CPU 1					CPU 2
> -----					-----
> do_swap_page
>   synchronous swap_readpage
>     alloc_page_vma
> 					swapoff
> 					  release swap_file, bdev, or ...

Perhaps I'm missing something.  The release of swap_file, bdev etc
happens after we have cleared the SWP_VALID bit in si->flags in destroy_swap_extents
if I read the swapoff code correctly.
 

>       swap_readpage
> 	check sis->flags is ok
> 	  access swap_file, bdev...[oops!]
> 					    si->flags = 0

This happens after we clear the si->flags
					synchronize_rcu()
					release swap_file, bdev, in destroy_swap_extents()

So I think if we have get_swap_device/put_swap_device in do_swap_page,
it should fix the race you've pointed out here.  
Then synchronize_rcu() will wait till we have completed do_swap_page and
call put_swap_device.
					
> 
> Using current get/put_swap_device() to guard against concurrent swapoff for
> swap_readpage() looks terrible because swap_readpage() may take really long
> time. And this race may not be really pernicious because swapoff is usually
> done when system shutdown only. To reduce the performance overhead on the
> hot-path as much as possible, it appears we can use the percpu_ref to close
> this race window(as suggested by Huang, Ying).

I think it is better to break this patch into two.

One patch is to fix the race in do_swap_page and swapoff
by adding get_swap_device/put_swap_device in do_swap_page.

The second patch is to modify get_swap_device and put_swap_device
with percpu_ref. But swapoff is a relatively rare events.  

I am not sure making percpu_ref change for performance is really beneficial.
Did you encounter a real use case where you see a problem with swapoff?
The delay in swapoff is primarily in try_to_unuse to bring all
the swapped off pages back into memory.  Synchronizing with other
CPU for paging in probably is a small component in overall scheme
of things.

Thanks.

Tim


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-08 13:08 ` [PATCH 2/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
  2021-04-08 21:34   ` Tim Chen
@ 2021-04-08 21:37   ` kernel test robot
  2021-04-09  8:46     ` Miaohe Lin
  2021-04-08 22:56   ` kernel test robot
  2021-04-13  1:27     ` Huang, Ying
  3 siblings, 1 reply; 72+ messages in thread
From: kernel test robot @ 2021-04-08 21:37 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 11460 bytes --]

Hi Miaohe,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on linus/master hnaz-linux-mm/master v5.12-rc6 next-20210408]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Miaohe-Lin/close-various-race-windows-for-swap/20210408-211224
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 5e46d1b78a03d52306f21f77a4e4a144b6d31486
config: x86_64-randconfig-a012-20210408 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 56ea2e2fdd691136d5e6631fa0e447173694b82c)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # https://github.com/0day-ci/linux/commit/56e65e21c8c9858e36c3bca84006a15fe9b85efd
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Miaohe-Lin/close-various-race-windows-for-swap/20210408-211224
        git checkout 56e65e21c8c9858e36c3bca84006a15fe9b85efd
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

>> mm/memory.c:3300:7: error: implicit declaration of function 'get_swap_device' [-Werror,-Wimplicit-function-declaration]
           si = get_swap_device(entry);
                ^
   mm/memory.c:3300:7: note: did you mean 'get_cpu_device'?
   include/linux/cpu.h:38:23: note: 'get_cpu_device' declared here
   extern struct device *get_cpu_device(unsigned cpu);
                         ^
>> mm/memory.c:3300:5: warning: incompatible integer to pointer conversion assigning to 'struct swap_info_struct *' from 'int' [-Wint-conversion]
           si = get_swap_device(entry);
              ^ ~~~~~~~~~~~~~~~~~~~~~~
>> mm/memory.c:3483:3: error: implicit declaration of function 'put_swap_device' [-Werror,-Wimplicit-function-declaration]
                   put_swap_device(si);
                   ^
   mm/memory.c:3483:3: note: did you mean 'get_swap_device'?
   mm/memory.c:3300:7: note: 'get_swap_device' declared here
           si = get_swap_device(entry);
                ^
   1 warning and 2 errors generated.


vim +/get_swap_device +3300 mm/memory.c

  3258	
  3259	/*
  3260	 * We enter with non-exclusive mmap_lock (to exclude vma changes,
  3261	 * but allow concurrent faults), and pte mapped but not yet locked.
  3262	 * We return with pte unmapped and unlocked.
  3263	 *
  3264	 * We return with the mmap_lock locked or unlocked in the same cases
  3265	 * as does filemap_fault().
  3266	 */
  3267	vm_fault_t do_swap_page(struct vm_fault *vmf)
  3268	{
  3269		struct vm_area_struct *vma = vmf->vma;
  3270		struct page *page = NULL, *swapcache;
  3271		struct swap_info_struct *si = NULL;
  3272		swp_entry_t entry;
  3273		pte_t pte;
  3274		int locked;
  3275		int exclusive = 0;
  3276		vm_fault_t ret = 0;
  3277		void *shadow = NULL;
  3278	
  3279		if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
  3280			goto out;
  3281	
  3282		entry = pte_to_swp_entry(vmf->orig_pte);
  3283		if (unlikely(non_swap_entry(entry))) {
  3284			if (is_migration_entry(entry)) {
  3285				migration_entry_wait(vma->vm_mm, vmf->pmd,
  3286						     vmf->address);
  3287			} else if (is_device_private_entry(entry)) {
  3288				vmf->page = device_private_entry_to_page(entry);
  3289				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
  3290			} else if (is_hwpoison_entry(entry)) {
  3291				ret = VM_FAULT_HWPOISON;
  3292			} else {
  3293				print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
  3294				ret = VM_FAULT_SIGBUS;
  3295			}
  3296			goto out;
  3297		}
  3298	
  3299	
> 3300		si = get_swap_device(entry);
  3301		/* In case we raced with swapoff. */
  3302		if (unlikely(!si))
  3303			goto out;
  3304	
  3305		delayacct_set_flag(DELAYACCT_PF_SWAPIN);
  3306		page = lookup_swap_cache(entry, vma, vmf->address);
  3307		swapcache = page;
  3308	
  3309		if (!page) {
  3310			struct swap_info_struct *si = swp_swap_info(entry);
  3311	
  3312			if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
  3313			    __swap_count(entry) == 1) {
  3314				/* skip swapcache */
  3315				page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
  3316								vmf->address);
  3317				if (page) {
  3318					int err;
  3319	
  3320					__SetPageLocked(page);
  3321					__SetPageSwapBacked(page);
  3322					set_page_private(page, entry.val);
  3323	
  3324					/* Tell memcg to use swap ownership records */
  3325					SetPageSwapCache(page);
  3326					err = mem_cgroup_charge(page, vma->vm_mm,
  3327								GFP_KERNEL);
  3328					ClearPageSwapCache(page);
  3329					if (err) {
  3330						ret = VM_FAULT_OOM;
  3331						goto out_page;
  3332					}
  3333	
  3334					shadow = get_shadow_from_swap_cache(entry);
  3335					if (shadow)
  3336						workingset_refault(page, shadow);
  3337	
  3338					lru_cache_add(page);
  3339					swap_readpage(page, true);
  3340				}
  3341			} else {
  3342				page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  3343							vmf);
  3344				swapcache = page;
  3345			}
  3346	
  3347			if (!page) {
  3348				/*
  3349				 * Back out if somebody else faulted in this pte
  3350				 * while we released the pte lock.
  3351				 */
  3352				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  3353						vmf->address, &vmf->ptl);
  3354				if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
  3355					ret = VM_FAULT_OOM;
  3356				delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  3357				goto unlock;
  3358			}
  3359	
  3360			/* Had to read the page from swap area: Major fault */
  3361			ret = VM_FAULT_MAJOR;
  3362			count_vm_event(PGMAJFAULT);
  3363			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
  3364		} else if (PageHWPoison(page)) {
  3365			/*
  3366			 * hwpoisoned dirty swapcache pages are kept for killing
  3367			 * owner processes (which may be unknown at hwpoison time)
  3368			 */
  3369			ret = VM_FAULT_HWPOISON;
  3370			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  3371			goto out_release;
  3372		}
  3373	
  3374		locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
  3375	
  3376		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  3377		if (!locked) {
  3378			ret |= VM_FAULT_RETRY;
  3379			goto out_release;
  3380		}
  3381	
  3382		/*
  3383		 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
  3384		 * release the swapcache from under us.  The page pin, and pte_same
  3385		 * test below, are not enough to exclude that.  Even if it is still
  3386		 * swapcache, we need to check that the page's swap has not changed.
  3387		 */
  3388		if (unlikely((!PageSwapCache(page) ||
  3389				page_private(page) != entry.val)) && swapcache)
  3390			goto out_page;
  3391	
  3392		page = ksm_might_need_to_copy(page, vma, vmf->address);
  3393		if (unlikely(!page)) {
  3394			ret = VM_FAULT_OOM;
  3395			page = swapcache;
  3396			goto out_page;
  3397		}
  3398	
  3399		cgroup_throttle_swaprate(page, GFP_KERNEL);
  3400	
  3401		/*
  3402		 * Back out if somebody else already faulted in this pte.
  3403		 */
  3404		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
  3405				&vmf->ptl);
  3406		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
  3407			goto out_nomap;
  3408	
  3409		if (unlikely(!PageUptodate(page))) {
  3410			ret = VM_FAULT_SIGBUS;
  3411			goto out_nomap;
  3412		}
  3413	
  3414		/*
  3415		 * The page isn't present yet, go ahead with the fault.
  3416		 *
  3417		 * Be careful about the sequence of operations here.
  3418		 * To get its accounting right, reuse_swap_page() must be called
  3419		 * while the page is counted on swap but not yet in mapcount i.e.
  3420		 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
  3421		 * must be called after the swap_free(), or it will never succeed.
  3422		 */
  3423	
  3424		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
  3425		dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
  3426		pte = mk_pte(page, vma->vm_page_prot);
  3427		if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
  3428			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
  3429			vmf->flags &= ~FAULT_FLAG_WRITE;
  3430			ret |= VM_FAULT_WRITE;
  3431			exclusive = RMAP_EXCLUSIVE;
  3432		}
  3433		flush_icache_page(vma, page);
  3434		if (pte_swp_soft_dirty(vmf->orig_pte))
  3435			pte = pte_mksoft_dirty(pte);
  3436		if (pte_swp_uffd_wp(vmf->orig_pte)) {
  3437			pte = pte_mkuffd_wp(pte);
  3438			pte = pte_wrprotect(pte);
  3439		}
  3440		set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
  3441		arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
  3442		vmf->orig_pte = pte;
  3443	
  3444		/* ksm created a completely new copy */
  3445		if (unlikely(page != swapcache && swapcache)) {
  3446			page_add_new_anon_rmap(page, vma, vmf->address, false);
  3447			lru_cache_add_inactive_or_unevictable(page, vma);
  3448		} else {
  3449			do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
  3450		}
  3451	
  3452		swap_free(entry);
  3453		if (mem_cgroup_swap_full(page) ||
  3454		    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
  3455			try_to_free_swap(page);
  3456		unlock_page(page);
  3457		if (page != swapcache && swapcache) {
  3458			/*
  3459			 * Hold the lock to avoid the swap entry to be reused
  3460			 * until we take the PT lock for the pte_same() check
  3461			 * (to avoid false positives from pte_same). For
  3462			 * further safety release the lock after the swap_free
  3463			 * so that the swap count won't change under a
  3464			 * parallel locked swapcache.
  3465			 */
  3466			unlock_page(swapcache);
  3467			put_page(swapcache);
  3468		}
  3469	
  3470		if (vmf->flags & FAULT_FLAG_WRITE) {
  3471			ret |= do_wp_page(vmf);
  3472			if (ret & VM_FAULT_ERROR)
  3473				ret &= VM_FAULT_ERROR;
  3474			goto out;
  3475		}
  3476	
  3477		/* No need to invalidate - it was non-present before */
  3478		update_mmu_cache(vma, vmf->address, vmf->pte);
  3479	unlock:
  3480		pte_unmap_unlock(vmf->pte, vmf->ptl);
  3481	out:
  3482		if (si)
> 3483			put_swap_device(si);
  3484		return ret;
  3485	out_nomap:
  3486		pte_unmap_unlock(vmf->pte, vmf->ptl);
  3487	out_page:
  3488		unlock_page(page);
  3489	out_release:
  3490		put_page(page);
  3491		if (page != swapcache && swapcache) {
  3492			unlock_page(swapcache);
  3493			put_page(swapcache);
  3494		}
  3495		if (si)
  3496			put_swap_device(si);
  3497		return ret;
  3498	}
  3499	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 35830 bytes --]

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-08 13:08 ` [PATCH 2/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
  2021-04-08 21:34   ` Tim Chen
  2021-04-08 21:37   ` kernel test robot
@ 2021-04-08 22:56   ` kernel test robot
  2021-04-13  1:27     ` Huang, Ying
  3 siblings, 0 replies; 72+ messages in thread
From: kernel test robot @ 2021-04-08 22:56 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 11028 bytes --]

Hi Miaohe,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on linus/master hnaz-linux-mm/master v5.12-rc6 next-20210408]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Miaohe-Lin/close-various-race-windows-for-swap/20210408-211224
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 5e46d1b78a03d52306f21f77a4e4a144b6d31486
config: mips-randconfig-r016-20210408 (attached as .config)
compiler: mipsel-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/56e65e21c8c9858e36c3bca84006a15fe9b85efd
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Miaohe-Lin/close-various-race-windows-for-swap/20210408-211224
        git checkout 56e65e21c8c9858e36c3bca84006a15fe9b85efd
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=mips 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

   mm/memory.c: In function 'do_swap_page':
>> mm/memory.c:3300:7: error: implicit declaration of function 'get_swap_device'; did you mean 'get_cpu_device'? [-Werror=implicit-function-declaration]
    3300 |  si = get_swap_device(entry);
         |       ^~~~~~~~~~~~~~~
         |       get_cpu_device
>> mm/memory.c:3300:5: warning: assignment to 'struct swap_info_struct *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
    3300 |  si = get_swap_device(entry);
         |     ^
>> mm/memory.c:3483:3: error: implicit declaration of function 'put_swap_device'; did you mean 'put_swap_page'? [-Werror=implicit-function-declaration]
    3483 |   put_swap_device(si);
         |   ^~~~~~~~~~~~~~~
         |   put_swap_page
   cc1: some warnings being treated as errors


vim +3300 mm/memory.c

  3258	
  3259	/*
  3260	 * We enter with non-exclusive mmap_lock (to exclude vma changes,
  3261	 * but allow concurrent faults), and pte mapped but not yet locked.
  3262	 * We return with pte unmapped and unlocked.
  3263	 *
  3264	 * We return with the mmap_lock locked or unlocked in the same cases
  3265	 * as does filemap_fault().
  3266	 */
  3267	vm_fault_t do_swap_page(struct vm_fault *vmf)
  3268	{
  3269		struct vm_area_struct *vma = vmf->vma;
  3270		struct page *page = NULL, *swapcache;
  3271		struct swap_info_struct *si = NULL;
  3272		swp_entry_t entry;
  3273		pte_t pte;
  3274		int locked;
  3275		int exclusive = 0;
  3276		vm_fault_t ret = 0;
  3277		void *shadow = NULL;
  3278	
  3279		if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
  3280			goto out;
  3281	
  3282		entry = pte_to_swp_entry(vmf->orig_pte);
  3283		if (unlikely(non_swap_entry(entry))) {
  3284			if (is_migration_entry(entry)) {
  3285				migration_entry_wait(vma->vm_mm, vmf->pmd,
  3286						     vmf->address);
  3287			} else if (is_device_private_entry(entry)) {
  3288				vmf->page = device_private_entry_to_page(entry);
  3289				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
  3290			} else if (is_hwpoison_entry(entry)) {
  3291				ret = VM_FAULT_HWPOISON;
  3292			} else {
  3293				print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
  3294				ret = VM_FAULT_SIGBUS;
  3295			}
  3296			goto out;
  3297		}
  3298	
  3299	
> 3300		si = get_swap_device(entry);
  3301		/* In case we raced with swapoff. */
  3302		if (unlikely(!si))
  3303			goto out;
  3304	
  3305		delayacct_set_flag(DELAYACCT_PF_SWAPIN);
  3306		page = lookup_swap_cache(entry, vma, vmf->address);
  3307		swapcache = page;
  3308	
  3309		if (!page) {
  3310			struct swap_info_struct *si = swp_swap_info(entry);
  3311	
  3312			if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
  3313			    __swap_count(entry) == 1) {
  3314				/* skip swapcache */
  3315				page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
  3316								vmf->address);
  3317				if (page) {
  3318					int err;
  3319	
  3320					__SetPageLocked(page);
  3321					__SetPageSwapBacked(page);
  3322					set_page_private(page, entry.val);
  3323	
  3324					/* Tell memcg to use swap ownership records */
  3325					SetPageSwapCache(page);
  3326					err = mem_cgroup_charge(page, vma->vm_mm,
  3327								GFP_KERNEL);
  3328					ClearPageSwapCache(page);
  3329					if (err) {
  3330						ret = VM_FAULT_OOM;
  3331						goto out_page;
  3332					}
  3333	
  3334					shadow = get_shadow_from_swap_cache(entry);
  3335					if (shadow)
  3336						workingset_refault(page, shadow);
  3337	
  3338					lru_cache_add(page);
  3339					swap_readpage(page, true);
  3340				}
  3341			} else {
  3342				page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  3343							vmf);
  3344				swapcache = page;
  3345			}
  3346	
  3347			if (!page) {
  3348				/*
  3349				 * Back out if somebody else faulted in this pte
  3350				 * while we released the pte lock.
  3351				 */
  3352				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  3353						vmf->address, &vmf->ptl);
  3354				if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
  3355					ret = VM_FAULT_OOM;
  3356				delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  3357				goto unlock;
  3358			}
  3359	
  3360			/* Had to read the page from swap area: Major fault */
  3361			ret = VM_FAULT_MAJOR;
  3362			count_vm_event(PGMAJFAULT);
  3363			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
  3364		} else if (PageHWPoison(page)) {
  3365			/*
  3366			 * hwpoisoned dirty swapcache pages are kept for killing
  3367			 * owner processes (which may be unknown at hwpoison time)
  3368			 */
  3369			ret = VM_FAULT_HWPOISON;
  3370			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  3371			goto out_release;
  3372		}
  3373	
  3374		locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
  3375	
  3376		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  3377		if (!locked) {
  3378			ret |= VM_FAULT_RETRY;
  3379			goto out_release;
  3380		}
  3381	
  3382		/*
  3383		 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
  3384		 * release the swapcache from under us.  The page pin, and pte_same
  3385		 * test below, are not enough to exclude that.  Even if it is still
  3386		 * swapcache, we need to check that the page's swap has not changed.
  3387		 */
  3388		if (unlikely((!PageSwapCache(page) ||
  3389				page_private(page) != entry.val)) && swapcache)
  3390			goto out_page;
  3391	
  3392		page = ksm_might_need_to_copy(page, vma, vmf->address);
  3393		if (unlikely(!page)) {
  3394			ret = VM_FAULT_OOM;
  3395			page = swapcache;
  3396			goto out_page;
  3397		}
  3398	
  3399		cgroup_throttle_swaprate(page, GFP_KERNEL);
  3400	
  3401		/*
  3402		 * Back out if somebody else already faulted in this pte.
  3403		 */
  3404		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
  3405				&vmf->ptl);
  3406		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
  3407			goto out_nomap;
  3408	
  3409		if (unlikely(!PageUptodate(page))) {
  3410			ret = VM_FAULT_SIGBUS;
  3411			goto out_nomap;
  3412		}
  3413	
  3414		/*
  3415		 * The page isn't present yet, go ahead with the fault.
  3416		 *
  3417		 * Be careful about the sequence of operations here.
  3418		 * To get its accounting right, reuse_swap_page() must be called
  3419		 * while the page is counted on swap but not yet in mapcount i.e.
  3420		 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
  3421		 * must be called after the swap_free(), or it will never succeed.
  3422		 */
  3423	
  3424		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
  3425		dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
  3426		pte = mk_pte(page, vma->vm_page_prot);
  3427		if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
  3428			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
  3429			vmf->flags &= ~FAULT_FLAG_WRITE;
  3430			ret |= VM_FAULT_WRITE;
  3431			exclusive = RMAP_EXCLUSIVE;
  3432		}
  3433		flush_icache_page(vma, page);
  3434		if (pte_swp_soft_dirty(vmf->orig_pte))
  3435			pte = pte_mksoft_dirty(pte);
  3436		if (pte_swp_uffd_wp(vmf->orig_pte)) {
  3437			pte = pte_mkuffd_wp(pte);
  3438			pte = pte_wrprotect(pte);
  3439		}
  3440		set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
  3441		arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
  3442		vmf->orig_pte = pte;
  3443	
  3444		/* ksm created a completely new copy */
  3445		if (unlikely(page != swapcache && swapcache)) {
  3446			page_add_new_anon_rmap(page, vma, vmf->address, false);
  3447			lru_cache_add_inactive_or_unevictable(page, vma);
  3448		} else {
  3449			do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
  3450		}
  3451	
  3452		swap_free(entry);
  3453		if (mem_cgroup_swap_full(page) ||
  3454		    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
  3455			try_to_free_swap(page);
  3456		unlock_page(page);
  3457		if (page != swapcache && swapcache) {
  3458			/*
  3459			 * Hold the lock to avoid the swap entry to be reused
  3460			 * until we take the PT lock for the pte_same() check
  3461			 * (to avoid false positives from pte_same). For
  3462			 * further safety release the lock after the swap_free
  3463			 * so that the swap count won't change under a
  3464			 * parallel locked swapcache.
  3465			 */
  3466			unlock_page(swapcache);
  3467			put_page(swapcache);
  3468		}
  3469	
  3470		if (vmf->flags & FAULT_FLAG_WRITE) {
  3471			ret |= do_wp_page(vmf);
  3472			if (ret & VM_FAULT_ERROR)
  3473				ret &= VM_FAULT_ERROR;
  3474			goto out;
  3475		}
  3476	
  3477		/* No need to invalidate - it was non-present before */
  3478		update_mmu_cache(vma, vmf->address, vmf->pte);
  3479	unlock:
  3480		pte_unmap_unlock(vmf->pte, vmf->ptl);
  3481	out:
  3482		if (si)
> 3483			put_swap_device(si);
  3484		return ret;
  3485	out_nomap:
  3486		pte_unmap_unlock(vmf->pte, vmf->ptl);
  3487	out_page:
  3488		unlock_page(page);
  3489	out_release:
  3490		put_page(page);
  3491		if (page != swapcache && swapcache) {
  3492			unlock_page(swapcache);
  3493			put_page(swapcache);
  3494		}
  3495		if (si)
  3496			put_swap_device(si);
  3497		return ret;
  3498	}
  3499	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 28245 bytes --]

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 0/5] close various race windows for swap
  2021-04-08 14:55 ` [PATCH 0/5] close various race windows for swap riteshh
@ 2021-04-09  8:01   ` Miaohe Lin
  0 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-09  8:01 UTC (permalink / raw)
  To: riteshh
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, ying.huang, hughd, tim.c.chen,
	linux-kernel, linux-mm

On 2021/4/8 22:55, riteshh wrote:
> On 21/04/08 09:08AM, Miaohe Lin wrote:
>> Hi all,
>> When I was investigating the swap code, I found some possible race
>> windows. This series aims to fix all these races. But using current
>> get/put_swap_device() to guard against concurrent swapoff for
>> swap_readpage() looks terrible because swap_readpage() may take really
>> long time. And to reduce the performance overhead on the hot-path as
>> much as possible, it appears we can use the percpu_ref to close this
>> race window(as suggested by Huang, Ying). The patch 1 adds percpu_ref
>> support for swap and the rest of the patches use this to close various
>> race windows. More details can be found in the respective changelogs.
>> Thanks!
>>
>> Miaohe Lin (5):
>>   mm/swapfile: add percpu_ref support for swap
>>   swap: fix do_swap_page() race with swapoff
>>   mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff
>>   mm/swap_state: fix potential faulted in race in swap_ra_info()
>>   mm/swap_state: fix swap_cluster_readahead() race with swapoff
> 

Many thanks for the quick response.

> Somehow I see Patch-1 and Patch-2 are missing on linux-mm[1].

I have no idea why Patch-1 and Patch-2 are missing. But they could be found at:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2542188.html

> Also I wanted to ask if you have a way to trigger this in a more controlled
> environment (consistently)?
> 

This is a *theoretical* issue. The race window is very small but not impossible.
Please see the discussion:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2530094.html

> [1]: https://patchwork.kernel.org/project/linux-mm/cover/20210408130820.48233-1-linmiaohe@huawei.com/
> 

Thanks again.

> -ritesh
> 
>>
>>  include/linux/swap.h |  4 +++-
>>  mm/memory.c          | 10 +++++++++
>>  mm/swap_state.c      | 33 +++++++++++++++++++++--------
>>  mm/swapfile.c        | 50 +++++++++++++++++++++++++++-----------------
>>  4 files changed, 68 insertions(+), 29 deletions(-)
>>
>> --
>> 2.19.1
>>
>>
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-08 21:34   ` Tim Chen
@ 2021-04-09  8:42     ` Miaohe Lin
  2021-04-09 17:17       ` Tim Chen
  0 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-09  8:42 UTC (permalink / raw)
  To: Tim Chen, akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, linux-kernel, linux-mm

On 2021/4/9 5:34, Tim Chen wrote:
> 
> 
> On 4/8/21 6:08 AM, Miaohe Lin wrote:
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1					CPU 2
>> -----					-----
>> do_swap_page
>>   synchronous swap_readpage
>>     alloc_page_vma
>> 					swapoff
>> 					  release swap_file, bdev, or ...
> 

Many thanks for quick review and reply!

> Perhaps I'm missing something.  The release of swap_file, bdev etc
> happens after we have cleared the SWP_VALID bit in si->flags in destroy_swap_extents
> if I read the swapoff code correctly.
Agree. Let's look at this more closely:
CPU1								CPU2
-----								-----
swap_readpage
  if (data_race(sis->flags & SWP_FS_OPS)) {
								swapoff
								  p->swap_file = NULL;
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping;[oops!]
								  ...
								  p->flags = 0;
    ...

Does this make sense to you?

> >
>>       swap_readpage
>> 	check sis->flags is ok
>> 	  access swap_file, bdev...[oops!]
>> 					    si->flags = 0
> 
> This happens after we clear the si->flags
> 					synchronize_rcu()
> 					release swap_file, bdev, in destroy_swap_extents()
> 
> So I think if we have get_swap_device/put_swap_device in do_swap_page,
> it should fix the race you've pointed out here.  
> Then synchronize_rcu() will wait till we have completed do_swap_page and
> call put_swap_device.

Right, get_swap_device/put_swap_device could fix this race. __But__ rcu_read_lock()
in get_swap_device() could disable preempt and do_swap_page() may take a really long
time because it involves I/O. It may not be acceptable to disable preempt for such a
long time. :(

> 					
>>
>> Using current get/put_swap_device() to guard against concurrent swapoff for
>> swap_readpage() looks terrible because swap_readpage() may take really long
>> time. And this race may not be really pernicious because swapoff is usually
>> done when system shutdown only. To reduce the performance overhead on the
>> hot-path as much as possible, it appears we can use the percpu_ref to close
>> this race window(as suggested by Huang, Ying).
> 
> I think it is better to break this patch into two.
> > One patch is to fix the race in do_swap_page and swapoff
> by adding get_swap_device/put_swap_device in do_swap_page.
> 
> The second patch is to modify get_swap_device and put_swap_device
> with percpu_ref. But swapoff is a relatively rare events.  

Sounds reasonable. Will do it.

> 
> I am not sure making percpu_ref change for performance is really beneficial.
> Did you encounter a real use case where you see a problem with swapoff?
> The delay in swapoff is primarily in try_to_unuse to bring all
> the swapped off pages back into memory.  Synchronizing with other
> CPU for paging in probably is a small component in overall scheme
> of things.
> 

I can't find a simpler and more stable way to fix this potential and *theoretical* issue.
This could happen in the real world, but the race window should be very small. Since swapoff
is usually done only at system shutdown, I'm not really sure whether this effort is worth it.

But IMO, we should eliminate any potential trouble. :)

> Thanks.
> 

Thanks again.

> Tim
> 
> .
> 

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-08 21:37   ` kernel test robot
@ 2021-04-09  8:46     ` Miaohe Lin
  0 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-09  8:46 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 12132 bytes --]

On 2021/4/9 5:37, kernel test robot wrote:
> Hi Miaohe,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on linux/master]
> [also build test ERROR on linus/master hnaz-linux-mm/master v5.12-rc6 next-20210408]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
> 
> url:    https://github.com/0day-ci/linux/commits/Miaohe-Lin/close-various-race-windows-for-swap/20210408-211224
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 5e46d1b78a03d52306f21f77a4e4a144b6d31486
> config: x86_64-randconfig-a012-20210408 (attached as .config)
> compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 56ea2e2fdd691136d5e6631fa0e447173694b82c)
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # install x86_64 cross compiling tool for clang build
>         # apt-get install binutils-x86-64-linux-gnu
>         # https://github.com/0day-ci/linux/commit/56e65e21c8c9858e36c3bca84006a15fe9b85efd
>         git remote add linux-review https://github.com/0day-ci/linux
>         git fetch --no-tags linux-review Miaohe-Lin/close-various-race-windows-for-swap/20210408-211224
>         git checkout 56e65e21c8c9858e36c3bca84006a15fe9b85efd
>         # save the attached .config to linux build tree
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All error/warnings (new ones prefixed by >>):
> 
>>> mm/memory.c:3300:7: error: implicit declaration of function 'get_swap_device' [-Werror,-Wimplicit-function-declaration]
>            si = get_swap_device(entry);
>                 ^
>    mm/memory.c:3300:7: note: did you mean 'get_cpu_device'?
>    include/linux/cpu.h:38:23: note: 'get_cpu_device' declared here
>    extern struct device *get_cpu_device(unsigned cpu);
>                          ^
>>> mm/memory.c:3300:5: warning: incompatible integer to pointer conversion assigning to 'struct swap_info_struct *' from 'int' [-Wint-conversion]
>            si = get_swap_device(entry);
>               ^ ~~~~~~~~~~~~~~~~~~~~~~
>>> mm/memory.c:3483:3: error: implicit declaration of function 'put_swap_device' [-Werror,-Wimplicit-function-declaration]
>                    put_swap_device(si);
>                    ^
>    mm/memory.c:3483:3: note: did you mean 'get_swap_device'?
>    mm/memory.c:3300:7: note: 'get_swap_device' declared here
>            si = get_swap_device(entry);
>                 ^
>    1 warning and 2 errors generated.
> 

Many thanks. Will fix it.

> 
> vim +/get_swap_device +3300 mm/memory.c
> 
>   3258	
>   3259	/*
>   3260	 * We enter with non-exclusive mmap_lock (to exclude vma changes,
>   3261	 * but allow concurrent faults), and pte mapped but not yet locked.
>   3262	 * We return with pte unmapped and unlocked.
>   3263	 *
>   3264	 * We return with the mmap_lock locked or unlocked in the same cases
>   3265	 * as does filemap_fault().
>   3266	 */
>   3267	vm_fault_t do_swap_page(struct vm_fault *vmf)
>   3268	{
>   3269		struct vm_area_struct *vma = vmf->vma;
>   3270		struct page *page = NULL, *swapcache;
>   3271		struct swap_info_struct *si = NULL;
>   3272		swp_entry_t entry;
>   3273		pte_t pte;
>   3274		int locked;
>   3275		int exclusive = 0;
>   3276		vm_fault_t ret = 0;
>   3277		void *shadow = NULL;
>   3278	
>   3279		if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
>   3280			goto out;
>   3281	
>   3282		entry = pte_to_swp_entry(vmf->orig_pte);
>   3283		if (unlikely(non_swap_entry(entry))) {
>   3284			if (is_migration_entry(entry)) {
>   3285				migration_entry_wait(vma->vm_mm, vmf->pmd,
>   3286						     vmf->address);
>   3287			} else if (is_device_private_entry(entry)) {
>   3288				vmf->page = device_private_entry_to_page(entry);
>   3289				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>   3290			} else if (is_hwpoison_entry(entry)) {
>   3291				ret = VM_FAULT_HWPOISON;
>   3292			} else {
>   3293				print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
>   3294				ret = VM_FAULT_SIGBUS;
>   3295			}
>   3296			goto out;
>   3297		}
>   3298	
>   3299	
>> 3300		si = get_swap_device(entry);
>   3301		/* In case we raced with swapoff. */
>   3302		if (unlikely(!si))
>   3303			goto out;
>   3304	
>   3305		delayacct_set_flag(DELAYACCT_PF_SWAPIN);
>   3306		page = lookup_swap_cache(entry, vma, vmf->address);
>   3307		swapcache = page;
>   3308	
>   3309		if (!page) {
>   3310			struct swap_info_struct *si = swp_swap_info(entry);
>   3311	
>   3312			if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
>   3313			    __swap_count(entry) == 1) {
>   3314				/* skip swapcache */
>   3315				page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
>   3316								vmf->address);
>   3317				if (page) {
>   3318					int err;
>   3319	
>   3320					__SetPageLocked(page);
>   3321					__SetPageSwapBacked(page);
>   3322					set_page_private(page, entry.val);
>   3323	
>   3324					/* Tell memcg to use swap ownership records */
>   3325					SetPageSwapCache(page);
>   3326					err = mem_cgroup_charge(page, vma->vm_mm,
>   3327								GFP_KERNEL);
>   3328					ClearPageSwapCache(page);
>   3329					if (err) {
>   3330						ret = VM_FAULT_OOM;
>   3331						goto out_page;
>   3332					}
>   3333	
>   3334					shadow = get_shadow_from_swap_cache(entry);
>   3335					if (shadow)
>   3336						workingset_refault(page, shadow);
>   3337	
>   3338					lru_cache_add(page);
>   3339					swap_readpage(page, true);
>   3340				}
>   3341			} else {
>   3342				page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
>   3343							vmf);
>   3344				swapcache = page;
>   3345			}
>   3346	
>   3347			if (!page) {
>   3348				/*
>   3349				 * Back out if somebody else faulted in this pte
>   3350				 * while we released the pte lock.
>   3351				 */
>   3352				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>   3353						vmf->address, &vmf->ptl);
>   3354				if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
>   3355					ret = VM_FAULT_OOM;
>   3356				delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
>   3357				goto unlock;
>   3358			}
>   3359	
>   3360			/* Had to read the page from swap area: Major fault */
>   3361			ret = VM_FAULT_MAJOR;
>   3362			count_vm_event(PGMAJFAULT);
>   3363			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
>   3364		} else if (PageHWPoison(page)) {
>   3365			/*
>   3366			 * hwpoisoned dirty swapcache pages are kept for killing
>   3367			 * owner processes (which may be unknown at hwpoison time)
>   3368			 */
>   3369			ret = VM_FAULT_HWPOISON;
>   3370			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
>   3371			goto out_release;
>   3372		}
>   3373	
>   3374		locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
>   3375	
>   3376		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
>   3377		if (!locked) {
>   3378			ret |= VM_FAULT_RETRY;
>   3379			goto out_release;
>   3380		}
>   3381	
>   3382		/*
>   3383		 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
>   3384		 * release the swapcache from under us.  The page pin, and pte_same
>   3385		 * test below, are not enough to exclude that.  Even if it is still
>   3386		 * swapcache, we need to check that the page's swap has not changed.
>   3387		 */
>   3388		if (unlikely((!PageSwapCache(page) ||
>   3389				page_private(page) != entry.val)) && swapcache)
>   3390			goto out_page;
>   3391	
>   3392		page = ksm_might_need_to_copy(page, vma, vmf->address);
>   3393		if (unlikely(!page)) {
>   3394			ret = VM_FAULT_OOM;
>   3395			page = swapcache;
>   3396			goto out_page;
>   3397		}
>   3398	
>   3399		cgroup_throttle_swaprate(page, GFP_KERNEL);
>   3400	
>   3401		/*
>   3402		 * Back out if somebody else already faulted in this pte.
>   3403		 */
>   3404		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>   3405				&vmf->ptl);
>   3406		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
>   3407			goto out_nomap;
>   3408	
>   3409		if (unlikely(!PageUptodate(page))) {
>   3410			ret = VM_FAULT_SIGBUS;
>   3411			goto out_nomap;
>   3412		}
>   3413	
>   3414		/*
>   3415		 * The page isn't present yet, go ahead with the fault.
>   3416		 *
>   3417		 * Be careful about the sequence of operations here.
>   3418		 * To get its accounting right, reuse_swap_page() must be called
>   3419		 * while the page is counted on swap but not yet in mapcount i.e.
>   3420		 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
>   3421		 * must be called after the swap_free(), or it will never succeed.
>   3422		 */
>   3423	
>   3424		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
>   3425		dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
>   3426		pte = mk_pte(page, vma->vm_page_prot);
>   3427		if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
>   3428			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
>   3429			vmf->flags &= ~FAULT_FLAG_WRITE;
>   3430			ret |= VM_FAULT_WRITE;
>   3431			exclusive = RMAP_EXCLUSIVE;
>   3432		}
>   3433		flush_icache_page(vma, page);
>   3434		if (pte_swp_soft_dirty(vmf->orig_pte))
>   3435			pte = pte_mksoft_dirty(pte);
>   3436		if (pte_swp_uffd_wp(vmf->orig_pte)) {
>   3437			pte = pte_mkuffd_wp(pte);
>   3438			pte = pte_wrprotect(pte);
>   3439		}
>   3440		set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
>   3441		arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
>   3442		vmf->orig_pte = pte;
>   3443	
>   3444		/* ksm created a completely new copy */
>   3445		if (unlikely(page != swapcache && swapcache)) {
>   3446			page_add_new_anon_rmap(page, vma, vmf->address, false);
>   3447			lru_cache_add_inactive_or_unevictable(page, vma);
>   3448		} else {
>   3449			do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
>   3450		}
>   3451	
>   3452		swap_free(entry);
>   3453		if (mem_cgroup_swap_full(page) ||
>   3454		    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
>   3455			try_to_free_swap(page);
>   3456		unlock_page(page);
>   3457		if (page != swapcache && swapcache) {
>   3458			/*
>   3459			 * Hold the lock to avoid the swap entry to be reused
>   3460			 * until we take the PT lock for the pte_same() check
>   3461			 * (to avoid false positives from pte_same). For
>   3462			 * further safety release the lock after the swap_free
>   3463			 * so that the swap count won't change under a
>   3464			 * parallel locked swapcache.
>   3465			 */
>   3466			unlock_page(swapcache);
>   3467			put_page(swapcache);
>   3468		}
>   3469	
>   3470		if (vmf->flags & FAULT_FLAG_WRITE) {
>   3471			ret |= do_wp_page(vmf);
>   3472			if (ret & VM_FAULT_ERROR)
>   3473				ret &= VM_FAULT_ERROR;
>   3474			goto out;
>   3475		}
>   3476	
>   3477		/* No need to invalidate - it was non-present before */
>   3478		update_mmu_cache(vma, vmf->address, vmf->pte);
>   3479	unlock:
>   3480		pte_unmap_unlock(vmf->pte, vmf->ptl);
>   3481	out:
>   3482		if (si)
>> 3483			put_swap_device(si);
>   3484		return ret;
>   3485	out_nomap:
>   3486		pte_unmap_unlock(vmf->pte, vmf->ptl);
>   3487	out_page:
>   3488		unlock_page(page);
>   3489	out_release:
>   3490		put_page(page);
>   3491		if (page != swapcache && swapcache) {
>   3492			unlock_page(swapcache);
>   3493			put_page(swapcache);
>   3494		}
>   3495		if (si)
>   3496			put_swap_device(si);
>   3497		return ret;
>   3498	}
>   3499	
> 
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org
> 

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()
  2021-04-08 13:08 ` [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info() Miaohe Lin
@ 2021-04-09  8:50     ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-09  8:50 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> While we released the pte lock, somebody else might faulted in this pte.
> So we should check whether it's swap pte first to guard against such race
> or swp_type would be unexpected. And we can also avoid some unnecessary
> readahead cpu cycles possibly.
>
> Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/swap_state.c | 13 +++++++++----
>  1 file changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 709c260d644a..3bf0d0c297bc 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	unsigned long ra_val;
> -	swp_entry_t entry;
> +	swp_entry_t swap_entry;
>  	unsigned long faddr, pfn, fpfn;
>  	unsigned long start, end;
> -	pte_t *pte, *orig_pte;
> +	pte_t *pte, *orig_pte, entry;
>  	unsigned int max_win, hits, prev_win, win, left;
>  #ifndef CONFIG_64BIT
>  	pte_t *tpte;
> @@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
>  
>  	faddr = vmf->address;
>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
> -	entry = pte_to_swp_entry(*pte);
> -	if ((unlikely(non_swap_entry(entry)))) {
> +	entry = *pte;
> +	if (unlikely(!is_swap_pte(entry))) {
> +		pte_unmap(orig_pte);
> +		return;
> +	}
> +	swap_entry = pte_to_swp_entry(entry);
> +	if ((unlikely(non_swap_entry(swap_entry)))) {
>  		pte_unmap(orig_pte);
>  		return;
>  	}

This isn't a real issue.  entry or swap_entry isn't used in this
function.  And we have enough checking when we really operate the PTE
entries later.  But I admit it's confusing.  So I suggest to just remove
the checking.  We will check it when necessary.

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()
@ 2021-04-09  8:50     ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-09  8:50 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> While we released the pte lock, somebody else might faulted in this pte.
> So we should check whether it's swap pte first to guard against such race
> or swp_type would be unexpected. And we can also avoid some unnecessary
> readahead cpu cycles possibly.
>
> Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/swap_state.c | 13 +++++++++----
>  1 file changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 709c260d644a..3bf0d0c297bc 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	unsigned long ra_val;
> -	swp_entry_t entry;
> +	swp_entry_t swap_entry;
>  	unsigned long faddr, pfn, fpfn;
>  	unsigned long start, end;
> -	pte_t *pte, *orig_pte;
> +	pte_t *pte, *orig_pte, entry;
>  	unsigned int max_win, hits, prev_win, win, left;
>  #ifndef CONFIG_64BIT
>  	pte_t *tpte;
> @@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
>  
>  	faddr = vmf->address;
>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
> -	entry = pte_to_swp_entry(*pte);
> -	if ((unlikely(non_swap_entry(entry)))) {
> +	entry = *pte;
> +	if (unlikely(!is_swap_pte(entry))) {
> +		pte_unmap(orig_pte);
> +		return;
> +	}
> +	swap_entry = pte_to_swp_entry(entry);
> +	if ((unlikely(non_swap_entry(swap_entry)))) {
>  		pte_unmap(orig_pte);
>  		return;
>  	}

This isn't a real issue.  entry or swap_entry isn't used in this
function.  And we have enough checking when we really operate the PTE
entries later.  But I admit it's confusing.  So I suggest to just remove
the checking.  We will check it when necessary.

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()
  2021-04-09  8:50     ` Huang, Ying
  (?)
@ 2021-04-09  9:00     ` Miaohe Lin
  2021-04-12  0:55         ` Huang, Ying
  -1 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-09  9:00 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/9 16:50, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> While we released the pte lock, somebody else might faulted in this pte.
>> So we should check whether it's swap pte first to guard against such race
>> or swp_type would be unexpected. And we can also avoid some unnecessary
>> readahead cpu cycles possibly.
>>
>> Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  mm/swap_state.c | 13 +++++++++----
>>  1 file changed, 9 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index 709c260d644a..3bf0d0c297bc 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
>>  {
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	unsigned long ra_val;
>> -	swp_entry_t entry;
>> +	swp_entry_t swap_entry;
>>  	unsigned long faddr, pfn, fpfn;
>>  	unsigned long start, end;
>> -	pte_t *pte, *orig_pte;
>> +	pte_t *pte, *orig_pte, entry;
>>  	unsigned int max_win, hits, prev_win, win, left;
>>  #ifndef CONFIG_64BIT
>>  	pte_t *tpte;
>> @@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
>>  
>>  	faddr = vmf->address;
>>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
>> -	entry = pte_to_swp_entry(*pte);
>> -	if ((unlikely(non_swap_entry(entry)))) {
>> +	entry = *pte;
>> +	if (unlikely(!is_swap_pte(entry))) {
>> +		pte_unmap(orig_pte);
>> +		return;
>> +	}
>> +	swap_entry = pte_to_swp_entry(entry);
>> +	if ((unlikely(non_swap_entry(swap_entry)))) {
>>  		pte_unmap(orig_pte);
>>  		return;
>>  	}
> 
> This isn't a real issue.  entry or swap_entry isn't used in this

Agree. It seems the entry or swap_entry here is just used for check whether
pte is still valid swap_entry.

> function.  And we have enough checking when we really operate the PTE
> entries later.  But I admit it's confusing.  So I suggest to just remove
> the checking.  We will check it when necessary.

Sounds reasonable. Will do it in v2.

Many thanks for review and reply!

> 
> Best Regards,
> Huang, Ying
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-09  8:42     ` Miaohe Lin
@ 2021-04-09 17:17       ` Tim Chen
  2021-04-10  3:17         ` Miaohe Lin
  0 siblings, 1 reply; 72+ messages in thread
From: Tim Chen @ 2021-04-09 17:17 UTC (permalink / raw)
  To: Miaohe Lin, akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, linux-kernel, linux-mm



On 4/9/21 1:42 AM, Miaohe Lin wrote:
> On 2021/4/9 5:34, Tim Chen wrote:
>>
>>
>> On 4/8/21 6:08 AM, Miaohe Lin wrote:
>>> When I was investigating the swap code, I found the below possible race
>>> window:
>>>
>>> CPU 1					CPU 2
>>> -----					-----
>>> do_swap_page
>>>   synchronous swap_readpage
>>>     alloc_page_vma
>>> 					swapoff
>>> 					  release swap_file, bdev, or ...
>>
> 
> Many thanks for quick review and reply!
> 
>> Perhaps I'm missing something.  The release of swap_file, bdev etc
>> happens after we have cleared the SWP_VALID bit in si->flags in destroy_swap_extents
>> if I read the swapoff code correctly.
> Agree. Let's look this more close:
> CPU1								CPU2
> -----								-----
> swap_readpage
>   if (data_race(sis->flags & SWP_FS_OPS)) {
> 								swapoff
> 								  p->swap_file = NULL;
>     struct file *swap_file = sis->swap_file;
>     struct address_space *mapping = swap_file->f_mapping;[oops!]
> 								  ...
> 								  p->flags = 0;
>     ...
> 
> Does this make sense for you?

p->swap_file = NULL happens after the 
p->flags &= ~SWP_VALID, synchronize_rcu(), destroy_swap_extents() sequence in swapoff().

So I don't think the sequence you illustrated on CPU2 is in the right order.
That said, without get_swap_device/put_swap_device in swap_readpage, you could
potentially blow past synchronize_rcu() on CPU2 and cause a problem.  So I think
the problematic race looks something like the following:


CPU1								CPU2
-----								-----
swap_readpage
  if (data_race(sis->flags & SWP_FS_OPS)) {
								swapoff
								  p->flags &= ~SWP_VALID;
								  ..
								  synchronize_rcu();
								  ..
								  p->swap_file = NULL;
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping;[oops!]
								  ...
    ...

By adding get_swap_device/put_swap_device, then the race is fixed.


CPU1								CPU2
-----								-----
swap_readpage
  get_swap_device()
  ..
  if (data_race(sis->flags & SWP_FS_OPS)) {
								swapoff
								  p->flags &= ~SWP_VALID;
								  ..
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping;[valid value]
  ..
  put_swap_device()
								  synchronize_rcu();
								  ..
								  p->swap_file = NULL;


> 
>>>
>>>       swap_readpage
>>> 	check sis->flags is ok
>>> 	  access swap_file, bdev...[oops!]
>>> 					    si->flags = 0
>>
>> This happens after we clear the si->flags
>> 					synchronize_rcu()
>> 					release swap_file, bdev, in destroy_swap_extents()
>>
>> So I think if we have get_swap_device/put_swap_device in do_swap_page,
>> it should fix the race you've pointed out here.  
>> Then synchronize_rcu() will wait till we have completed do_swap_page and
>> call put_swap_device.
> 
> Right, get_swap_device/put_swap_device could fix this race. __But__ rcu_read_lock()
> in get_swap_device() could disable preempt and do_swap_page() may take a really long
> time because it involves I/O. It may not be acceptable to disable preempt for such a
> long time. :(

I can see that it is not a good idea to hold rcu read lock for a long
time over slow file I/O operation, which will be the side effect of
introducing get/put_swap_device to swap_readpage.  So using percpu_ref
will then be preferable for synchronization once we introduce 
get/put_swap_device into swap_readpage.

Tim

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-09 17:17       ` Tim Chen
@ 2021-04-10  3:17         ` Miaohe Lin
  2021-04-12  1:44             ` Huang, Ying
  0 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-10  3:17 UTC (permalink / raw)
  To: Tim Chen, akpm
  Cc: hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy, minchan,
	richard.weiyang, ying.huang, hughd, linux-kernel, linux-mm

On 2021/4/10 1:17, Tim Chen wrote:
> 
> 
> On 4/9/21 1:42 AM, Miaohe Lin wrote:
>> On 2021/4/9 5:34, Tim Chen wrote:
>>>
>>>
>>> On 4/8/21 6:08 AM, Miaohe Lin wrote:
>>>> When I was investigating the swap code, I found the below possible race
>>>> window:
>>>>
>>>> CPU 1					CPU 2
>>>> -----					-----
>>>> do_swap_page
>>>>   synchronous swap_readpage
>>>>     alloc_page_vma
>>>> 					swapoff
>>>> 					  release swap_file, bdev, or ...
>>>
>>
>> Many thanks for quick review and reply!
>>
>>> Perhaps I'm missing something.  The release of swap_file, bdev etc
>>> happens after we have cleared the SWP_VALID bit in si->flags in destroy_swap_extents
>>> if I read the swapoff code correctly.
>> Agree. Let's look this more close:
>> CPU1								CPU2
>> -----								-----
>> swap_readpage
>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>> 								swapoff
>> 								  p->swap_file = NULL;
>>     struct file *swap_file = sis->swap_file;
>>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>> 								  ...
>> 								  p->flags = 0;
>>     ...
>>
>> Does this make sense for you?
> 
> p->swapfile = NULL happens after the 
> p->flags &= ~SWP_VALID, synchronize_rcu(), destroy_swap_extents() sequence in swapoff().
> 
> So I don't think the sequence you illustrated on CPU2 is in the right order.
> That said, without get_swap_device/put_swap_device in swap_readpage, you could
> potentially blow pass synchronize_rcu() on CPU2 and causes a problem.  so I think
> the problematic race looks something like the following:
> 
> 
> CPU1								CPU2
> -----								-----
> swap_readpage
>   if (data_race(sis->flags & SWP_FS_OPS)) {
> 								swapoff
> 								  p->flags = &= ~SWP_VALID;
> 								  ..
> 								  synchronize_rcu();
> 								  ..
> 								  p->swap_file = NULL;
>     struct file *swap_file = sis->swap_file;
>     struct address_space *mapping = swap_file->f_mapping;[oops!]
> 								  ...
>     ...
> 

Agree. This is also what I meant to illustrate. And you provide a better one. Many thanks!

> By adding get_swap_device/put_swap_device, then the race is fixed.
> 
> 
> CPU1								CPU2
> -----								-----
> swap_readpage
>   get_swap_device()
>   ..
>   if (data_race(sis->flags & SWP_FS_OPS)) {
> 								swapoff
> 								  p->flags = &= ~SWP_VALID;
> 								  ..
>     struct file *swap_file = sis->swap_file;
>     struct address_space *mapping = swap_file->f_mapping;[valid value]
>   ..
>   put_swap_device()
> 								  synchronize_rcu();
> 								  ..
> 								  p->swap_file = NULL;
> 
> 
>>
>>>>
>>>>       swap_readpage
>>>> 	check sis->flags is ok
>>>> 	  access swap_file, bdev...[oops!]
>>>> 					    si->flags = 0
>>>
>>> This happens after we clear the si->flags
>>> 					synchronize_rcu()
>>> 					release swap_file, bdev, in destroy_swap_extents()
>>>
>>> So I think if we have get_swap_device/put_swap_device in do_swap_page,
>>> it should fix the race you've pointed out here.  
>>> Then synchronize_rcu() will wait till we have completed do_swap_page and
>>> call put_swap_device.
>>
>> Right, get_swap_device/put_swap_device could fix this race. __But__ rcu_read_lock()
>> in get_swap_device() could disable preempt and do_swap_page() may take a really long
>> time because it involves I/O. It may not be acceptable to disable preempt for such a
>> long time. :(
> 
> I can see that it is not a good idea to hold rcu read lock for a long
> time over slow file I/O operation, which will be the side effect of
> introducing get/put_swap_device to swap_readpage.  So using percpu_ref
> will then be preferable for synchronization once we introduce 
> get/put_swap_device into swap_readpage.
> 

The sis->bdev should also be protected by get/put_swap_device. It has the similar
issue. And swap_slot_free_notify (called from callback end_swap_bio_read) would
race with swapoff too. So I use get/put_swap_device to protect swap_readpage until
file I/O operation is completed.

Thanks again!

> Tim
> .
> 

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()
  2021-04-09  9:00     ` Miaohe Lin
@ 2021-04-12  0:55         ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-12  0:55 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/9 16:50, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> While we released the pte lock, somebody else might faulted in this pte.
>>> So we should check whether it's swap pte first to guard against such race
>>> or swp_type would be unexpected. And we can also avoid some unnecessary
>>> readahead cpu cycles possibly.
>>>
>>> Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>>  mm/swap_state.c | 13 +++++++++----
>>>  1 file changed, 9 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>>> index 709c260d644a..3bf0d0c297bc 100644
>>> --- a/mm/swap_state.c
>>> +++ b/mm/swap_state.c
>>> @@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
>>>  {
>>>  	struct vm_area_struct *vma = vmf->vma;
>>>  	unsigned long ra_val;
>>> -	swp_entry_t entry;
>>> +	swp_entry_t swap_entry;
>>>  	unsigned long faddr, pfn, fpfn;
>>>  	unsigned long start, end;
>>> -	pte_t *pte, *orig_pte;
>>> +	pte_t *pte, *orig_pte, entry;
>>>  	unsigned int max_win, hits, prev_win, win, left;
>>>  #ifndef CONFIG_64BIT
>>>  	pte_t *tpte;
>>> @@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
>>>  
>>>  	faddr = vmf->address;
>>>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
>>> -	entry = pte_to_swp_entry(*pte);
>>> -	if ((unlikely(non_swap_entry(entry)))) {
>>> +	entry = *pte;
>>> +	if (unlikely(!is_swap_pte(entry))) {
>>> +		pte_unmap(orig_pte);
>>> +		return;
>>> +	}
>>> +	swap_entry = pte_to_swp_entry(entry);
>>> +	if ((unlikely(non_swap_entry(swap_entry)))) {
>>>  		pte_unmap(orig_pte);
>>>  		return;
>>>  	}
>> 
>> This isn't a real issue.  entry or swap_entry isn't used in this
>
> Agree. It seems the entry or swap_entry here is just used for check whether
> pte is still valid swap_entry.

If you check the git history, you will find that the check has been
necessary before.  Because the function is used earlier in
do_swap_page() at that time.

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()
@ 2021-04-12  0:55         ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-12  0:55 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/9 16:50, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> While we released the pte lock, somebody else might faulted in this pte.
>>> So we should check whether it's swap pte first to guard against such race
>>> or swp_type would be unexpected. And we can also avoid some unnecessary
>>> readahead cpu cycles possibly.
>>>
>>> Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>>  mm/swap_state.c | 13 +++++++++----
>>>  1 file changed, 9 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>>> index 709c260d644a..3bf0d0c297bc 100644
>>> --- a/mm/swap_state.c
>>> +++ b/mm/swap_state.c
>>> @@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
>>>  {
>>>  	struct vm_area_struct *vma = vmf->vma;
>>>  	unsigned long ra_val;
>>> -	swp_entry_t entry;
>>> +	swp_entry_t swap_entry;
>>>  	unsigned long faddr, pfn, fpfn;
>>>  	unsigned long start, end;
>>> -	pte_t *pte, *orig_pte;
>>> +	pte_t *pte, *orig_pte, entry;
>>>  	unsigned int max_win, hits, prev_win, win, left;
>>>  #ifndef CONFIG_64BIT
>>>  	pte_t *tpte;
>>> @@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
>>>  
>>>  	faddr = vmf->address;
>>>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
>>> -	entry = pte_to_swp_entry(*pte);
>>> -	if ((unlikely(non_swap_entry(entry)))) {
>>> +	entry = *pte;
>>> +	if (unlikely(!is_swap_pte(entry))) {
>>> +		pte_unmap(orig_pte);
>>> +		return;
>>> +	}
>>> +	swap_entry = pte_to_swp_entry(entry);
>>> +	if ((unlikely(non_swap_entry(swap_entry)))) {
>>>  		pte_unmap(orig_pte);
>>>  		return;
>>>  	}
>> 
>> This isn't a real issue.  entry or swap_entry isn't used in this
>
> Agree. It seems the entry or swap_entry here is just used for check whether
> pte is still valid swap_entry.

If you check the git history, you will find that the check has been
necessary before.  Because the function is used earlier in
do_swap_page() at that time.

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-10  3:17         ` Miaohe Lin
@ 2021-04-12  1:44             ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-12  1:44 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: Tim Chen, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi,
	willy, minchan, richard.weiyang, hughd, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/10 1:17, Tim Chen wrote:
>> 
>> 
>> On 4/9/21 1:42 AM, Miaohe Lin wrote:
>>> On 2021/4/9 5:34, Tim Chen wrote:
>>>>
>>>>
>>>> On 4/8/21 6:08 AM, Miaohe Lin wrote:
>>>>> When I was investigating the swap code, I found the below possible race
>>>>> window:
>>>>>
>>>>> CPU 1					CPU 2
>>>>> -----					-----
>>>>> do_swap_page
>>>>>   synchronous swap_readpage
>>>>>     alloc_page_vma
>>>>> 					swapoff
>>>>> 					  release swap_file, bdev, or ...
>>>>
>>>
>>> Many thanks for quick review and reply!
>>>
>>>> Perhaps I'm missing something.  The release of swap_file, bdev etc
>>>> happens after we have cleared the SWP_VALID bit in si->flags in destroy_swap_extents
>>>> if I read the swapoff code correctly.
>>> Agree. Let's look this more close:
>>> CPU1								CPU2
>>> -----								-----
>>> swap_readpage
>>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>>> 								swapoff
>>> 								  p->swap_file = NULL;
>>>     struct file *swap_file = sis->swap_file;
>>>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>>> 								  ...
>>> 								  p->flags = 0;
>>>     ...
>>>
>>> Does this make sense for you?
>> 
>> p->swapfile = NULL happens after the 
>> p->flags &= ~SWP_VALID, synchronize_rcu(), destroy_swap_extents() sequence in swapoff().
>> 
>> So I don't think the sequence you illustrated on CPU2 is in the right order.
>> That said, without get_swap_device/put_swap_device in swap_readpage, you could
>> potentially blow pass synchronize_rcu() on CPU2 and causes a problem.  so I think
>> the problematic race looks something like the following:
>> 
>> 
>> CPU1								CPU2
>> -----								-----
>> swap_readpage
>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>> 								swapoff
>> 								  p->flags = &= ~SWP_VALID;
>> 								  ..
>> 								  synchronize_rcu();
>> 								  ..
>> 								  p->swap_file = NULL;
>>     struct file *swap_file = sis->swap_file;
>>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>> 								  ...
>>     ...
>> 
>
> Agree. This is also what I meant to illustrate. And you provide a better one. Many thanks!

For the pages that are swapped in through swap cache.  That isn't an
issue.  Because the page is locked, the swap entry will be marked with
SWAP_HAS_CACHE, so swapoff() cannot proceed until the page has been
unlocked.

So the race is for the fast path as follows,

		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
		    __swap_count(entry) == 1)

I found it in your original patch description.  But please make it more
explicit to reduce the potential confusion.

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
@ 2021-04-12  1:44             ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-12  1:44 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: Tim Chen, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi,
	willy, minchan, richard.weiyang, hughd, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/10 1:17, Tim Chen wrote:
>> 
>> 
>> On 4/9/21 1:42 AM, Miaohe Lin wrote:
>>> On 2021/4/9 5:34, Tim Chen wrote:
>>>>
>>>>
>>>> On 4/8/21 6:08 AM, Miaohe Lin wrote:
>>>>> When I was investigating the swap code, I found the below possible race
>>>>> window:
>>>>>
>>>>> CPU 1					CPU 2
>>>>> -----					-----
>>>>> do_swap_page
>>>>>   synchronous swap_readpage
>>>>>     alloc_page_vma
>>>>> 					swapoff
>>>>> 					  release swap_file, bdev, or ...
>>>>
>>>
>>> Many thanks for quick review and reply!
>>>
>>>> Perhaps I'm missing something.  The release of swap_file, bdev etc
>>>> happens after we have cleared the SWP_VALID bit in si->flags in destroy_swap_extents
>>>> if I read the swapoff code correctly.
>>> Agree. Let's look this more close:
>>> CPU1								CPU2
>>> -----								-----
>>> swap_readpage
>>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>>> 								swapoff
>>> 								  p->swap_file = NULL;
>>>     struct file *swap_file = sis->swap_file;
>>>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>>> 								  ...
>>> 								  p->flags = 0;
>>>     ...
>>>
>>> Does this make sense for you?
>> 
>> p->swapfile = NULL happens after the 
>> p->flags &= ~SWP_VALID, synchronize_rcu(), destroy_swap_extents() sequence in swapoff().
>> 
>> So I don't think the sequence you illustrated on CPU2 is in the right order.
>> That said, without get_swap_device/put_swap_device in swap_readpage, you could
>> potentially blow pass synchronize_rcu() on CPU2 and causes a problem.  so I think
>> the problematic race looks something like the following:
>> 
>> 
>> CPU1								CPU2
>> -----								-----
>> swap_readpage
>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>> 								swapoff
>> 								  p->flags = &= ~SWP_VALID;
>> 								  ..
>> 								  synchronize_rcu();
>> 								  ..
>> 								  p->swap_file = NULL;
>>     struct file *swap_file = sis->swap_file;
>>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>> 								  ...
>>     ...
>> 
>
> Agree. This is also what I meant to illustrate. And you provide a better one. Many thanks!

For the pages that are swapped in through swap cache.  That isn't an
issue.  Because the page is locked, the swap entry will be marked with
SWAP_HAS_CACHE, so swapoff() cannot proceed until the page has been
unlocked.

So the race is for the fast path as follows,

		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
		    __swap_count(entry) == 1)

I found it in your original patch description.  But please make it more
explicit to reduce the potential confusion.

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()
  2021-04-12  0:55         ` Huang, Ying
  (?)
@ 2021-04-12  3:17         ` Miaohe Lin
  -1 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-12  3:17 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/12 8:55, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2021/4/9 16:50, Huang, Ying wrote:
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> While we released the pte lock, somebody else might faulted in this pte.
>>>> So we should check whether it's swap pte first to guard against such race
>>>> or swp_type would be unexpected. And we can also avoid some unnecessary
>>>> readahead cpu cycles possibly.
>>>>
>>>> Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>> ---
>>>>  mm/swap_state.c | 13 +++++++++----
>>>>  1 file changed, 9 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>>>> index 709c260d644a..3bf0d0c297bc 100644
>>>> --- a/mm/swap_state.c
>>>> +++ b/mm/swap_state.c
>>>> @@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
>>>>  {
>>>>  	struct vm_area_struct *vma = vmf->vma;
>>>>  	unsigned long ra_val;
>>>> -	swp_entry_t entry;
>>>> +	swp_entry_t swap_entry;
>>>>  	unsigned long faddr, pfn, fpfn;
>>>>  	unsigned long start, end;
>>>> -	pte_t *pte, *orig_pte;
>>>> +	pte_t *pte, *orig_pte, entry;
>>>>  	unsigned int max_win, hits, prev_win, win, left;
>>>>  #ifndef CONFIG_64BIT
>>>>  	pte_t *tpte;
>>>> @@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
>>>>  
>>>>  	faddr = vmf->address;
>>>>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
>>>> -	entry = pte_to_swp_entry(*pte);
>>>> -	if ((unlikely(non_swap_entry(entry)))) {
>>>> +	entry = *pte;
>>>> +	if (unlikely(!is_swap_pte(entry))) {
>>>> +		pte_unmap(orig_pte);
>>>> +		return;
>>>> +	}
>>>> +	swap_entry = pte_to_swp_entry(entry);
>>>> +	if ((unlikely(non_swap_entry(swap_entry)))) {
>>>>  		pte_unmap(orig_pte);
>>>>  		return;
>>>>  	}
>>>
>>> This isn't a real issue.  entry or swap_entry isn't used in this
>>
>> Agree. It seems the entry or swap_entry here is just used for check whether
>> pte is still valid swap_entry.
> 
> If you check the git history, you will find that the check has been
> necessary before.  Because the function is used earlier in
> do_swap_page() at that time.
> 

I see. Many thanks for explanation. :)

> Best Regards,
> Huang, Ying
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-12  1:44             ` Huang, Ying
  (?)
@ 2021-04-12  3:24             ` Miaohe Lin
  -1 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-12  3:24 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Tim Chen, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi,
	willy, minchan, richard.weiyang, hughd, linux-kernel, linux-mm

On 2021/4/12 9:44, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2021/4/10 1:17, Tim Chen wrote:
>>>
>>>
>>> On 4/9/21 1:42 AM, Miaohe Lin wrote:
>>>> On 2021/4/9 5:34, Tim Chen wrote:
>>>>>
>>>>>
>>>>> On 4/8/21 6:08 AM, Miaohe Lin wrote:
>>>>>> When I was investigating the swap code, I found the below possible race
>>>>>> window:
>>>>>>
>>>>>> CPU 1					CPU 2
>>>>>> -----					-----
>>>>>> do_swap_page
>>>>>>   synchronous swap_readpage
>>>>>>     alloc_page_vma
>>>>>> 					swapoff
>>>>>> 					  release swap_file, bdev, or ...
>>>>>
>>>>
>>>> Many thanks for quick review and reply!
>>>>
>>>>> Perhaps I'm missing something.  The release of swap_file, bdev etc
>>>>> happens after we have cleared the SWP_VALID bit in si->flags in destroy_swap_extents
>>>>> if I read the swapoff code correctly.
>>>> Agree. Let's look this more close:
>>>> CPU1								CPU2
>>>> -----								-----
>>>> swap_readpage
>>>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>>>> 								swapoff
>>>> 								  p->swap_file = NULL;
>>>>     struct file *swap_file = sis->swap_file;
>>>>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>>>> 								  ...
>>>> 								  p->flags = 0;
>>>>     ...
>>>>
>>>> Does this make sense for you?
>>>
>>> p->swapfile = NULL happens after the 
>>> p->flags &= ~SWP_VALID, synchronize_rcu(), destroy_swap_extents() sequence in swapoff().
>>>
>>> So I don't think the sequence you illustrated on CPU2 is in the right order.
>>> That said, without get_swap_device/put_swap_device in swap_readpage, you could
>>> potentially blow pass synchronize_rcu() on CPU2 and causes a problem.  so I think
>>> the problematic race looks something like the following:
>>>
>>>
>>> CPU1								CPU2
>>> -----								-----
>>> swap_readpage
>>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>>> 								swapoff
>>> 								  p->flags = &= ~SWP_VALID;
>>> 								  ..
>>> 								  synchronize_rcu();
>>> 								  ..
>>> 								  p->swap_file = NULL;
>>>     struct file *swap_file = sis->swap_file;
>>>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>>> 								  ...
>>>     ...
>>>
>>
>> Agree. This is also what I meant to illustrate. And you provide a better one. Many thanks!
> 
> For the pages that are swapped in through swap cache.  That isn't an
> issue.  Because the page is locked, the swap entry will be marked with
> SWAP_HAS_CACHE, so swapoff() cannot proceed until the page has been
> unlocked.
> 
> So the race is for the fast path as follows,
> 
> 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
> 		    __swap_count(entry) == 1)
> 
> I found it in your original patch description.  But please make it more
> explicit to reduce the potential confusing.

Sure. Should I rephrase the commit log to clarify this or add a comment in the code?

Thanks.

> 
> Best Regards,
> Huang, Ying
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-08 13:08 ` [PATCH 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
@ 2021-04-12  3:30     ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-12  3:30 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> We will use percpu-refcount to serialize against concurrent swapoff. This
> patch adds the percpu_ref support for later fixup.
>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h |  2 ++
>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>  2 files changed, 24 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 144727041e78..849ba5265c11 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>   * The in-memory structure used to track swap areas.
>   */
>  struct swap_info_struct {
> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>  	unsigned long	flags;		/* SWP_USED etc: see above */
>  	signed short	prio;		/* swap priority of this type */
>  	struct plist_node list;		/* entry in swap_active_head */
> @@ -260,6 +261,7 @@ struct swap_info_struct {
>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>  	struct file *swap_file;		/* seldom referenced */
>  	unsigned int old_block_size;	/* seldom referenced */
> +	struct completion comp;		/* seldom referenced */
>  #ifdef CONFIG_FRONTSWAP
>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 149e77454e3c..724173cd7d0c 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -39,6 +39,7 @@
>  #include <linux/export.h>
>  #include <linux/swap_slots.h>
>  #include <linux/sort.h>
> +#include <linux/completion.h>
>  
>  #include <asm/tlbflush.h>
>  #include <linux/swapops.h>
> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>  	spin_unlock(&si->lock);
>  }
>  
> +static void swap_users_ref_free(struct percpu_ref *ref)
> +{
> +	struct swap_info_struct *si;
> +
> +	si = container_of(ref, struct swap_info_struct, users);
> +	complete(&si->comp);
> +	percpu_ref_exit(&si->users);

Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
get_swap_device(), better to add comments there.

> +}
> +
>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>  {
>  	struct swap_cluster_info *ci = si->cluster_info;
> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>  	 * between get/put_swap_device() if SWP_VALID bit is set
>  	 */
> -	synchronize_rcu();
> +	percpu_ref_reinit(&p->users);

Although the effect is same, I think it's better to use
percpu_ref_resurrect() here to improve code readability.

>  	spin_lock(&swap_lock);
>  	spin_lock(&p->lock);
>  	_enable_swap_info(p);
> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>  	spin_unlock(&p->lock);
>  	spin_unlock(&swap_lock);
> +
> +	percpu_ref_kill(&p->users);
>  	/*
>  	 * wait for swap operations protected by get/put_swap_device()
>  	 * to complete
>  	 */
> -	synchronize_rcu();
> +	wait_for_completion(&p->comp);

Better to move percpu_ref_kill() after the comments.  And maybe revise
the comments.

>  
>  	flush_work(&p->discard_work);
>  
> @@ -3132,7 +3144,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  {
>  	struct swap_info_struct *p;
> -	struct filename *name;
> +	struct filename *name = NULL;
>  	struct file *swap_file = NULL;
>  	struct address_space *mapping;
>  	int prio;
> @@ -3163,6 +3175,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  
>  	INIT_WORK(&p->discard_work, swap_discard_work);
>  
> +	init_completion(&p->comp);
> +	error = percpu_ref_init(&p->users, swap_users_ref_free,
> +				PERCPU_REF_INIT_DEAD, GFP_KERNEL);
> +	if (unlikely(error))
> +		goto bad_swap;
> +
>  	name = getname(specialfile);
>  	if (IS_ERR(name)) {
>  		error = PTR_ERR(name);
> @@ -3356,6 +3374,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  bad_swap_unlock_inode:
>  	inode_unlock(inode);
>  bad_swap:
> +	percpu_ref_exit(&p->users);

Usually the resource freeing order matches their allocating order
reversely.  So, if there's no special reason, please follow that rule.

Best Regards,
Huang, Ying

>  	free_percpu(p->percpu_cluster);
>  	p->percpu_cluster = NULL;
>  	free_percpu(p->cluster_next_cpu);

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-12  3:30     ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-12  3:30 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> We will use percpu-refcount to serialize against concurrent swapoff. This
> patch adds the percpu_ref support for later fixup.
>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h |  2 ++
>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>  2 files changed, 24 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 144727041e78..849ba5265c11 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>   * The in-memory structure used to track swap areas.
>   */
>  struct swap_info_struct {
> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>  	unsigned long	flags;		/* SWP_USED etc: see above */
>  	signed short	prio;		/* swap priority of this type */
>  	struct plist_node list;		/* entry in swap_active_head */
> @@ -260,6 +261,7 @@ struct swap_info_struct {
>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>  	struct file *swap_file;		/* seldom referenced */
>  	unsigned int old_block_size;	/* seldom referenced */
> +	struct completion comp;		/* seldom referenced */
>  #ifdef CONFIG_FRONTSWAP
>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 149e77454e3c..724173cd7d0c 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -39,6 +39,7 @@
>  #include <linux/export.h>
>  #include <linux/swap_slots.h>
>  #include <linux/sort.h>
> +#include <linux/completion.h>
>  
>  #include <asm/tlbflush.h>
>  #include <linux/swapops.h>
> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>  	spin_unlock(&si->lock);
>  }
>  
> +static void swap_users_ref_free(struct percpu_ref *ref)
> +{
> +	struct swap_info_struct *si;
> +
> +	si = container_of(ref, struct swap_info_struct, users);
> +	complete(&si->comp);
> +	percpu_ref_exit(&si->users);

Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
get_swap_device(), better to add comments there.

> +}
> +
>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>  {
>  	struct swap_cluster_info *ci = si->cluster_info;
> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>  	 * between get/put_swap_device() if SWP_VALID bit is set
>  	 */
> -	synchronize_rcu();
> +	percpu_ref_reinit(&p->users);

Although the effect is same, I think it's better to use
percpu_ref_resurrect() here to improve code readability.

>  	spin_lock(&swap_lock);
>  	spin_lock(&p->lock);
>  	_enable_swap_info(p);
> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>  	spin_unlock(&p->lock);
>  	spin_unlock(&swap_lock);
> +
> +	percpu_ref_kill(&p->users);
>  	/*
>  	 * wait for swap operations protected by get/put_swap_device()
>  	 * to complete
>  	 */
> -	synchronize_rcu();
> +	wait_for_completion(&p->comp);

Better to move percpu_ref_kill() after the comments.  And maybe revise
the comments.

>  
>  	flush_work(&p->discard_work);
>  
> @@ -3132,7 +3144,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  {
>  	struct swap_info_struct *p;
> -	struct filename *name;
> +	struct filename *name = NULL;
>  	struct file *swap_file = NULL;
>  	struct address_space *mapping;
>  	int prio;
> @@ -3163,6 +3175,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  
>  	INIT_WORK(&p->discard_work, swap_discard_work);
>  
> +	init_completion(&p->comp);
> +	error = percpu_ref_init(&p->users, swap_users_ref_free,
> +				PERCPU_REF_INIT_DEAD, GFP_KERNEL);
> +	if (unlikely(error))
> +		goto bad_swap;
> +
>  	name = getname(specialfile);
>  	if (IS_ERR(name)) {
>  		error = PTR_ERR(name);
> @@ -3356,6 +3374,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  bad_swap_unlock_inode:
>  	inode_unlock(inode);
>  bad_swap:
> +	percpu_ref_exit(&p->users);

Usually the resource freeing order matches their allocating order
reversely.  So, if there's no special reason, please follow that rule.

Best Regards,
Huang, Ying

>  	free_percpu(p->percpu_cluster);
>  	p->percpu_cluster = NULL;
>  	free_percpu(p->cluster_next_cpu);


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-12  3:30     ` Huang, Ying
  (?)
@ 2021-04-12  6:59     ` Miaohe Lin
  -1 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-12  6:59 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/12 11:30, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> patch adds the percpu_ref support for later fixup.
>>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  include/linux/swap.h |  2 ++
>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 144727041e78..849ba5265c11 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>   * The in-memory structure used to track swap areas.
>>   */
>>  struct swap_info_struct {
>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>  	signed short	prio;		/* swap priority of this type */
>>  	struct plist_node list;		/* entry in swap_active_head */
>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>  	struct file *swap_file;		/* seldom referenced */
>>  	unsigned int old_block_size;	/* seldom referenced */
>> +	struct completion comp;		/* seldom referenced */
>>  #ifdef CONFIG_FRONTSWAP
>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 149e77454e3c..724173cd7d0c 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -39,6 +39,7 @@
>>  #include <linux/export.h>
>>  #include <linux/swap_slots.h>
>>  #include <linux/sort.h>
>> +#include <linux/completion.h>
>>  
>>  #include <asm/tlbflush.h>
>>  #include <linux/swapops.h>
>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>  	spin_unlock(&si->lock);
>>  }
>>  
>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> +{
>> +	struct swap_info_struct *si;
>> +
>> +	si = container_of(ref, struct swap_info_struct, users);
>> +	complete(&si->comp);
>> +	percpu_ref_exit(&si->users);
> 
> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
> get_swap_device(), better to add comments there.

Will do.

> 
>> +}
>> +
>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>  {
>>  	struct swap_cluster_info *ci = si->cluster_info;
>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>  	 */
>> -	synchronize_rcu();
>> +	percpu_ref_reinit(&p->users);
> 
> Although the effect is same, I think it's better to use
> percpu_ref_resurrect() here to improve code readability.
> 

Agree.

>>  	spin_lock(&swap_lock);
>>  	spin_lock(&p->lock);
>>  	_enable_swap_info(p);
>> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>  	spin_unlock(&p->lock);
>>  	spin_unlock(&swap_lock);
>> +
>> +	percpu_ref_kill(&p->users);
>>  	/*
>>  	 * wait for swap operations protected by get/put_swap_device()
>>  	 * to complete
>>  	 */
>> -	synchronize_rcu();
>> +	wait_for_completion(&p->comp);
> 
> Better to move percpu_ref_kill() after the comments.  And maybe revise
> the comments.

Will do.

> 
>>  
>>  	flush_work(&p->discard_work);
>>  
>> @@ -3132,7 +3144,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>  {
>>  	struct swap_info_struct *p;
>> -	struct filename *name;
>> +	struct filename *name = NULL;
>>  	struct file *swap_file = NULL;
>>  	struct address_space *mapping;
>>  	int prio;
>> @@ -3163,6 +3175,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>  
>>  	INIT_WORK(&p->discard_work, swap_discard_work);
>>  
>> +	init_completion(&p->comp);
>> +	error = percpu_ref_init(&p->users, swap_users_ref_free,
>> +				PERCPU_REF_INIT_DEAD, GFP_KERNEL);
>> +	if (unlikely(error))
>> +		goto bad_swap;
>> +
>>  	name = getname(specialfile);
>>  	if (IS_ERR(name)) {
>>  		error = PTR_ERR(name);
>> @@ -3356,6 +3374,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>  bad_swap_unlock_inode:
>>  	inode_unlock(inode);
>>  bad_swap:
>> +	percpu_ref_exit(&p->users);
> 
> Usually the resource freeing order matches their allocating order
> reversely.  So, if there's no special reason, please follow that rule.
> 

My oversight. Will fix it in V2.

> Best Regards,
> Huang, Ying
> 
>>  	free_percpu(p->percpu_cluster);
>>  	p->percpu_cluster = NULL;
>>  	free_percpu(p->cluster_next_cpu);
> .
> 

Many thanks for review and nice suggestion! :)

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-12  3:30     ` Huang, Ying
@ 2021-04-12  7:24       ` Huang, Ying
  -1 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-12  7:24 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

"Huang, Ying" <ying.huang@intel.com> writes:

> Miaohe Lin <linmiaohe@huawei.com> writes:
>
>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> patch adds the percpu_ref support for later fixup.
>>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  include/linux/swap.h |  2 ++
>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 144727041e78..849ba5265c11 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>   * The in-memory structure used to track swap areas.
>>   */
>>  struct swap_info_struct {
>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>  	signed short	prio;		/* swap priority of this type */
>>  	struct plist_node list;		/* entry in swap_active_head */
>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>  	struct file *swap_file;		/* seldom referenced */
>>  	unsigned int old_block_size;	/* seldom referenced */
>> +	struct completion comp;		/* seldom referenced */
>>  #ifdef CONFIG_FRONTSWAP
>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 149e77454e3c..724173cd7d0c 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -39,6 +39,7 @@
>>  #include <linux/export.h>
>>  #include <linux/swap_slots.h>
>>  #include <linux/sort.h>
>> +#include <linux/completion.h>
>>  
>>  #include <asm/tlbflush.h>
>>  #include <linux/swapops.h>
>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>  	spin_unlock(&si->lock);
>>  }
>>  
>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> +{
>> +	struct swap_info_struct *si;
>> +
>> +	si = container_of(ref, struct swap_info_struct, users);
>> +	complete(&si->comp);
>> +	percpu_ref_exit(&si->users);
>
> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
> get_swap_device(), better to add comments there.

I just noticed that the comments of percpu_ref_tryget_live() says,

 * This function is safe to call as long as @ref is between init and exit.

While we need to call get_swap_device() almost at any time, so it's
better to avoid calling percpu_ref_exit() at all.  This will waste some
memory, but we need to follow the API definition to avoid potential
issues in the long term.

And we need to call percpu_ref_init() before inserting the swap_info_struct
into the swap_info[].

>> +}
>> +
>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>  {
>>  	struct swap_cluster_info *ci = si->cluster_info;
>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>  	 */
>> -	synchronize_rcu();
>> +	percpu_ref_reinit(&p->users);
>
> Although the effect is same, I think it's better to use
> percpu_ref_resurrect() here to improve code readability.

Check the original commit description for commit eb085574a752 "mm, swap:
fix race between swapoff and some swap operations" and discussion email
thread as follows again,

https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/

I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
smp_load_acquire() in get_swap_device().  Now we will use
percpu_ref_tryget_live() in get_swap_device(), so we will need to add
the necessary memory barrier, or make sure percpu_ref_tryget_live() has
ACQUIRE semantics.  Per my understanding, we need to change
percpu_ref_tryget_live() for that.

>>  	spin_lock(&swap_lock);
>>  	spin_lock(&p->lock);
>>  	_enable_swap_info(p);
>> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>  	spin_unlock(&p->lock);
>>  	spin_unlock(&swap_lock);
>> +
>> +	percpu_ref_kill(&p->users);
>>  	/*
>>  	 * wait for swap operations protected by get/put_swap_device()
>>  	 * to complete
>>  	 */
>> -	synchronize_rcu();
>> +	wait_for_completion(&p->comp);
>
> Better to move percpu_ref_kill() after the comments.  And maybe revise
> the comments.

After reading the original commit description as above, I found that we
need synchronize_rcu() here to protect the accessing to the swap cache
data structure.  Because there's call_rcu() during percpu_ref_kill(), it
appears OK to keep the synchronize_rcu() here.  And we need to revise
the comments to make it clear what is protected by which operation.

Best Regards,
Huang, Ying

[snip]

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-12  7:24       ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-12  7:24 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

"Huang, Ying" <ying.huang@intel.com> writes:

> Miaohe Lin <linmiaohe@huawei.com> writes:
>
>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> patch adds the percpu_ref support for later fixup.
>>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  include/linux/swap.h |  2 ++
>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 144727041e78..849ba5265c11 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>   * The in-memory structure used to track swap areas.
>>   */
>>  struct swap_info_struct {
>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>  	signed short	prio;		/* swap priority of this type */
>>  	struct plist_node list;		/* entry in swap_active_head */
>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>  	struct file *swap_file;		/* seldom referenced */
>>  	unsigned int old_block_size;	/* seldom referenced */
>> +	struct completion comp;		/* seldom referenced */
>>  #ifdef CONFIG_FRONTSWAP
>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 149e77454e3c..724173cd7d0c 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -39,6 +39,7 @@
>>  #include <linux/export.h>
>>  #include <linux/swap_slots.h>
>>  #include <linux/sort.h>
>> +#include <linux/completion.h>
>>  
>>  #include <asm/tlbflush.h>
>>  #include <linux/swapops.h>
>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>  	spin_unlock(&si->lock);
>>  }
>>  
>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> +{
>> +	struct swap_info_struct *si;
>> +
>> +	si = container_of(ref, struct swap_info_struct, users);
>> +	complete(&si->comp);
>> +	percpu_ref_exit(&si->users);
>
> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
> get_swap_device(), better to add comments there.

I just noticed that the comments of percpu_ref_tryget_live() says,

 * This function is safe to call as long as @ref is between init and exit.

While we need to call get_swap_device() almost at any time, so it's
better to avoid calling percpu_ref_exit() at all.  This will waste some
memory, but we need to follow the API definition to avoid potential
issues in the long term.

And we need to call percpu_ref_init() before inserting the swap_info_struct
into the swap_info[].

>> +}
>> +
>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>  {
>>  	struct swap_cluster_info *ci = si->cluster_info;
>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>  	 */
>> -	synchronize_rcu();
>> +	percpu_ref_reinit(&p->users);
>
> Although the effect is same, I think it's better to use
> percpu_ref_resurrect() here to improve code readability.

Check the original commit description for commit eb085574a752 "mm, swap:
fix race between swapoff and some swap operations" and discussion email
thread as follows again,

https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/

I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
smp_load_acquire() in get_swap_device().  Now we will use
percpu_ref_tryget_live() in get_swap_device(), so we will need to add
the necessary memory barrier, or make sure percpu_ref_tryget_live() has
ACQUIRE semantics.  Per my understanding, we need to change
percpu_ref_tryget_live() for that.

>>  	spin_lock(&swap_lock);
>>  	spin_lock(&p->lock);
>>  	_enable_swap_info(p);
>> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>  	spin_unlock(&p->lock);
>>  	spin_unlock(&swap_lock);
>> +
>> +	percpu_ref_kill(&p->users);
>>  	/*
>>  	 * wait for swap operations protected by get/put_swap_device()
>>  	 * to complete
>>  	 */
>> -	synchronize_rcu();
>> +	wait_for_completion(&p->comp);
>
> Better to move percpu_ref_kill() after the comments.  And maybe revise
> the comments.

After reading the original commit description as above, I found that we
need synchronize_rcu() here to protect the accessing to the swap cache
data structure.  Because there's call_rcu() during percpu_ref_kill(), it
appears OK to keep the synchronize_rcu() here.  And we need to revise
the comments to make it clear what is protected by which operation.

Best Regards,
Huang, Ying

[snip]


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-08 13:08 ` [PATCH 2/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
@ 2021-04-13  1:27     ` Huang, Ying
  2021-04-08 21:37   ` kernel test robot
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-13  1:27 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm, Matthew Wilcox

Miaohe Lin <linmiaohe@huawei.com> writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1					CPU 2
> -----					-----
> do_swap_page
>   synchronous swap_readpage
>     alloc_page_vma
> 					swapoff
> 					  release swap_file, bdev, or ...
>       swap_readpage
> 	check sis->flags is ok
> 	  access swap_file, bdev...[oops!]
> 					    si->flags = 0
>
> Using current get/put_swap_device() to guard against concurrent swapoff for
> swap_readpage() looks terrible because swap_readpage() may take really long
> time. And this race may not be really pernicious because swapoff is usually
> done when system shutdown only. To reduce the performance overhead on the
> hot-path as much as possible, it appears we can use the percpu_ref to close
> this race window(as suggested by Huang, Ying).
>
> Fixes: 235b62176712 ("mm/swap: add cluster lock")

This isn't the commit that introduces the race.  You can use `git blame`
to find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
swap: skip swapcache for swapin of synchronous device".

And I suggest to merge 1/5 and 2/5 to make it easy to get the full
picture.

> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h |  2 +-
>  mm/memory.c          | 10 ++++++++++
>  mm/swapfile.c        | 28 +++++++++++-----------------
>  3 files changed, 22 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 849ba5265c11..9066addb57fd 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>  
>  static inline void put_swap_device(struct swap_info_struct *si)
>  {
> -	rcu_read_unlock();
> +	percpu_ref_put(&si->users);
>  }
>  
>  #else /* CONFIG_SWAP */
> diff --git a/mm/memory.c b/mm/memory.c
> index cc71a445c76c..8543c47b955c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = NULL, *swapcache;
> +	struct swap_info_struct *si = NULL;
>  	swp_entry_t entry;
>  	pte_t pte;
>  	int locked;
> @@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	}
>  
>

I suggest to add comments here as follows (words copy from Matthew Wilcox)

	/* Prevent swapoff from happening to us */

> +	si = get_swap_device(entry);
> +	/* In case we raced with swapoff. */
> +	if (unlikely(!si))
> +		goto out;
> +

Because we wrap the whole do_swap_page() with get/put_swap_device()
now.  We can remove several get/put_swap_device() for function called by
do_swap_page().  That can be another optimization patch.

>  	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
>  	page = lookup_swap_cache(entry, vma, vmf->address);
>  	swapcache = page;
> @@ -3514,6 +3520,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  unlock:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>  out:
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  out_nomap:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -3525,6 +3533,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  		unlock_page(swapcache);
>  		put_page(swapcache);
>  	}
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  }
>  
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 724173cd7d0c..01032c72ceae 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1280,18 +1280,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
>   * via preventing the swap device from being swapoff, until
>   * put_swap_device() is called.  Otherwise return NULL.
>   *
> - * The entirety of the RCU read critical section must come before the
> - * return from or after the call to synchronize_rcu() in
> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
> - * true, the si->map, si->cluster_info, etc. must be valid in the
> - * critical section.
> - *
>   * Notice that swapoff or swapoff+swapon can still happen before the
> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
> - * in put_swap_device() if there isn't any other way to prevent
> - * swapoff, such as page lock, page table lock, etc.  The caller must
> - * be prepared for that.  For example, the following situation is
> - * possible.
> + * percpu_ref_tryget_live() in get_swap_device() or after the
> + * percpu_ref_put() in put_swap_device() if there isn't any other way
> + * to prevent swapoff, such as page lock, page table lock, etc.  The
> + * caller must be prepared for that.  For example, the following
> + * situation is possible.
>   *
>   *   CPU1				CPU2
>   *   do_swap_page()
> @@ -1319,21 +1313,21 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>  	si = swp_swap_info(entry);
>  	if (!si)
>  		goto bad_nofile;
> -
> -	rcu_read_lock();
>  	if (data_race(!(si->flags & SWP_VALID)))

We can delete SWP_VALID, that is used together with RCU solution.

> -		goto unlock_out;
> +		goto out;
> +	if (!percpu_ref_tryget_live(&si->users))
> +		goto out;
>  	offset = swp_offset(entry);
>  	if (offset >= si->max)
> -		goto unlock_out;
> +		goto put_out;
>  
>  	return si;
>  bad_nofile:
>  	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>  out:
>  	return NULL;
> -unlock_out:
> -	rcu_read_unlock();
> +put_out:
> +	percpu_ref_put(&si->users);
>  	return NULL;
>  }

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
@ 2021-04-13  1:27     ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-13  1:27 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm, Matthew Wilcox

Miaohe Lin <linmiaohe@huawei.com> writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1					CPU 2
> -----					-----
> do_swap_page
>   synchronous swap_readpage
>     alloc_page_vma
> 					swapoff
> 					  release swap_file, bdev, or ...
>       swap_readpage
> 	check sis->flags is ok
> 	  access swap_file, bdev...[oops!]
> 					    si->flags = 0
>
> Using current get/put_swap_device() to guard against concurrent swapoff for
> swap_readpage() looks terrible because swap_readpage() may take a really long
> time. And this race may not be really pernicious because swapoff is usually
> done when system shutdown only. To reduce the performance overhead on the
> hot-path as much as possible, it appears we can use the percpu_ref to close
> this race window (as suggested by Huang, Ying).
>
> Fixes: 235b62176712 ("mm/swap: add cluster lock")

This isn't the commit that introduces the race.  You can use `git blame`
find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
swap: skip swapcache for swapin of synchronous device".

And I suggest to merge 1/5 and 2/5 to make it easy to get the full
picture.

> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h |  2 +-
>  mm/memory.c          | 10 ++++++++++
>  mm/swapfile.c        | 28 +++++++++++-----------------
>  3 files changed, 22 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 849ba5265c11..9066addb57fd 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>  
>  static inline void put_swap_device(struct swap_info_struct *si)
>  {
> -	rcu_read_unlock();
> +	percpu_ref_put(&si->users);
>  }
>  
>  #else /* CONFIG_SWAP */
> diff --git a/mm/memory.c b/mm/memory.c
> index cc71a445c76c..8543c47b955c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = NULL, *swapcache;
> +	struct swap_info_struct *si = NULL;
>  	swp_entry_t entry;
>  	pte_t pte;
>  	int locked;
> @@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	}
>  
>

I suggest to add comments here as follows (words copy from Matthew Wilcox)

	/* Prevent swapoff from happening to us */

> +	si = get_swap_device(entry);
> +	/* In case we raced with swapoff. */
> +	if (unlikely(!si))
> +		goto out;
> +

Because we wrap the whole do_swap_page() with get/put_swap_device()
now.  We can remove several get/put_swap_device() for function called by
do_swap_page().  That can be another optimization patch.

>  	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
>  	page = lookup_swap_cache(entry, vma, vmf->address);
>  	swapcache = page;
> @@ -3514,6 +3520,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  unlock:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>  out:
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  out_nomap:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -3525,6 +3533,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  		unlock_page(swapcache);
>  		put_page(swapcache);
>  	}
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  }
>  
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 724173cd7d0c..01032c72ceae 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1280,18 +1280,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
>   * via preventing the swap device from being swapoff, until
>   * put_swap_device() is called.  Otherwise return NULL.
>   *
> - * The entirety of the RCU read critical section must come before the
> - * return from or after the call to synchronize_rcu() in
> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
> - * true, the si->map, si->cluster_info, etc. must be valid in the
> - * critical section.
> - *
>   * Notice that swapoff or swapoff+swapon can still happen before the
> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
> - * in put_swap_device() if there isn't any other way to prevent
> - * swapoff, such as page lock, page table lock, etc.  The caller must
> - * be prepared for that.  For example, the following situation is
> - * possible.
> + * percpu_ref_tryget_live() in get_swap_device() or after the
> + * percpu_ref_put() in put_swap_device() if there isn't any other way
> + * to prevent swapoff, such as page lock, page table lock, etc.  The
> + * caller must be prepared for that.  For example, the following
> + * situation is possible.
>   *
>   *   CPU1				CPU2
>   *   do_swap_page()
> @@ -1319,21 +1313,21 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>  	si = swp_swap_info(entry);
>  	if (!si)
>  		goto bad_nofile;
> -
> -	rcu_read_lock();
>  	if (data_race(!(si->flags & SWP_VALID)))

We can delete SWP_VALID, that is used together with RCU solution.

> -		goto unlock_out;
> +		goto out;
> +	if (!percpu_ref_tryget_live(&si->users))
> +		goto out;
>  	offset = swp_offset(entry);
>  	if (offset >= si->max)
> -		goto unlock_out;
> +		goto put_out;
>  
>  	return si;
>  bad_nofile:
>  	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>  out:
>  	return NULL;
> -unlock_out:
> -	rcu_read_unlock();
> +put_out:
> +	percpu_ref_put(&si->users);
>  	return NULL;
>  }

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff
  2021-04-08 13:08 ` [PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() " Miaohe Lin
@ 2021-04-13  1:33     ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-13  1:33 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> The function get_shadow_from_swap_cache() can race with swapoff, though
> it's only called by do_swap_page() now.
>
> Fixes: aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU")
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>

This is unnecessary.  The only caller has protected the swap device
from swapoff.

Best Regards,
Huang, Ying

> ---
>  mm/swap_state.c | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 272ea2108c9d..709c260d644a 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -83,11 +83,14 @@ void show_swap_cache_info(void)
>  
>  void *get_shadow_from_swap_cache(swp_entry_t entry)
>  {
> -	struct address_space *address_space = swap_address_space(entry);
> -	pgoff_t idx = swp_offset(entry);
> +	struct swap_info_struct *si;
>  	struct page *page;
>  
> -	page = xa_load(&address_space->i_pages, idx);
> +	si = get_swap_device(entry);
> +	if (!si)
> +		return NULL;
> +	page = xa_load(&swap_address_space(entry)->i_pages, swp_offset(entry));
> +	put_swap_device(si);
>  	if (xa_is_value(page))
>  		return page;
>  	return NULL;

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff
@ 2021-04-13  1:33     ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-13  1:33 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> The function get_shadow_from_swap_cache() can race with swapoff, though
> it's only called by do_swap_page() now.
>
> Fixes: aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU")
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>

This is unnecessary.  The only caller has protected the swap device
from swapoff.

Best Regards,
Huang, Ying

> ---
>  mm/swap_state.c | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 272ea2108c9d..709c260d644a 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -83,11 +83,14 @@ void show_swap_cache_info(void)
>  
>  void *get_shadow_from_swap_cache(swp_entry_t entry)
>  {
> -	struct address_space *address_space = swap_address_space(entry);
> -	pgoff_t idx = swp_offset(entry);
> +	struct swap_info_struct *si;
>  	struct page *page;
>  
> -	page = xa_load(&address_space->i_pages, idx);
> +	si = get_swap_device(entry);
> +	if (!si)
> +		return NULL;
> +	page = xa_load(&swap_address_space(entry)->i_pages, swp_offset(entry));
> +	put_swap_device(si);
>  	if (xa_is_value(page))
>  		return page;
>  	return NULL;


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff
  2021-04-08 13:08 ` [PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff Miaohe Lin
@ 2021-04-13  1:36     ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-13  1:36 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> swap_cluster_readahead() could race with swapoff and might dereference
> si->swap_file after it's released by swapoff. Close this race window by
> using get/put_swap_device() pair.

I think we should fix the callers instead to reduce the overhead.  Now,
do_swap_page() has been fixed.  We need to fix shmem_swapin().

Best Regards,
Huang, Ying

> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/swap_state.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 3bf0d0c297bc..eba6b0cf6cf9 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -626,12 +626,17 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>  	unsigned long offset = entry_offset;
>  	unsigned long start_offset, end_offset;
>  	unsigned long mask;
> -	struct swap_info_struct *si = swp_swap_info(entry);
> +	struct swap_info_struct *si;
>  	struct blk_plug plug;
>  	bool do_poll = true, page_allocated;
>  	struct vm_area_struct *vma = vmf->vma;
>  	unsigned long addr = vmf->address;
>  
> +	si = get_swap_device(entry);
> +	/* In case we raced with swapoff. */
> +	if (!si)
> +		return NULL;
> +
>  	mask = swapin_nr_pages(offset) - 1;
>  	if (!mask)
>  		goto skip;
> @@ -673,7 +678,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>  
>  	lru_add_drain();	/* Push any new pages onto the LRU now */
>  skip:
> -	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
> +	page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
> +	put_swap_device(si);
> +	return page;
>  }
>  
>  int init_swap_address_space(unsigned int type, unsigned long nr_pages)

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff
@ 2021-04-13  1:36     ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-13  1:36 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> swap_cluster_readahead() could race with swapoff and might dereference
> si->swap_file after it's released by swapoff. Close this race window by
> using get/put_swap_device() pair.

I think we should fix the callers instead to reduce the overhead.  Now,
do_swap_page() has been fixed.  We need to fix shmem_swapin().

Best Regards,
Huang, Ying

> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/swap_state.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 3bf0d0c297bc..eba6b0cf6cf9 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -626,12 +626,17 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>  	unsigned long offset = entry_offset;
>  	unsigned long start_offset, end_offset;
>  	unsigned long mask;
> -	struct swap_info_struct *si = swp_swap_info(entry);
> +	struct swap_info_struct *si;
>  	struct blk_plug plug;
>  	bool do_poll = true, page_allocated;
>  	struct vm_area_struct *vma = vmf->vma;
>  	unsigned long addr = vmf->address;
>  
> +	si = get_swap_device(entry);
> +	/* In case we raced with swapoff. */
> +	if (!si)
> +		return NULL;
> +
>  	mask = swapin_nr_pages(offset) - 1;
>  	if (!mask)
>  		goto skip;
> @@ -673,7 +678,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>  
>  	lru_add_drain();	/* Push any new pages onto the LRU now */
>  skip:
> -	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
> +	page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
> +	put_swap_device(si);
> +	return page;
>  }
>  
>  int init_swap_address_space(unsigned int type, unsigned long nr_pages)


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-12  7:24       ` Huang, Ying
  (?)
@ 2021-04-13 12:39       ` Miaohe Lin
  2021-04-14  1:17           ` Huang, Ying
  -1 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-13 12:39 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/12 15:24, Huang, Ying wrote:
> "Huang, Ying" <ying.huang@intel.com> writes:
> 
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>
>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>> patch adds the percpu_ref support for later fixup.
>>>
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>>  include/linux/swap.h |  2 ++
>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index 144727041e78..849ba5265c11 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>   * The in-memory structure used to track swap areas.
>>>   */
>>>  struct swap_info_struct {
>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>  	signed short	prio;		/* swap priority of this type */
>>>  	struct plist_node list;		/* entry in swap_active_head */
>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>  	struct file *swap_file;		/* seldom referenced */
>>>  	unsigned int old_block_size;	/* seldom referenced */
>>> +	struct completion comp;		/* seldom referenced */
>>>  #ifdef CONFIG_FRONTSWAP
>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>> index 149e77454e3c..724173cd7d0c 100644
>>> --- a/mm/swapfile.c
>>> +++ b/mm/swapfile.c
>>> @@ -39,6 +39,7 @@
>>>  #include <linux/export.h>
>>>  #include <linux/swap_slots.h>
>>>  #include <linux/sort.h>
>>> +#include <linux/completion.h>
>>>  
>>>  #include <asm/tlbflush.h>
>>>  #include <linux/swapops.h>
>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>  	spin_unlock(&si->lock);
>>>  }
>>>  
>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>> +{
>>> +	struct swap_info_struct *si;
>>> +
>>> +	si = container_of(ref, struct swap_info_struct, users);
>>> +	complete(&si->comp);
>>> +	percpu_ref_exit(&si->users);
>>
>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> get_swap_device(), better to add comments there.
> 
> I just noticed that the comments of percpu_ref_tryget_live() says,
> 
>  * This function is safe to call as long as @ref is between init and exit.
> 
> While we need to call get_swap_device() almost at any time, so it's
> better to avoid to call percpu_ref_exit() at all.  This will waste some
> memory, but we need to follow the API definition to avoid potential
> issues in the long term.

I have to admit that I'm not really familiar with percpu_ref. So I read the
implementation code of the percpu_ref and found percpu_ref_tryget_live() could
be called after exit now. But you're right we need to follow the API definition
to avoid potential issues in the long term.

> 
> And we need to call percpu_ref_init() before insert the swap_info_struct
> into the swap_info[].

If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
or percpu_ref_resurrect() will do the work.

One more thing, how could I distinguish the killed percpu_ref from a newly allocated one?
It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
Maybe I could do this in alloc_swap_info()?

> 
>>> +}
>>> +
>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>  {
>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>  	 */
>>> -	synchronize_rcu();
>>> +	percpu_ref_reinit(&p->users);
>>
>> Although the effect is same, I think it's better to use
>> percpu_ref_resurrect() here to improve code readability.
> 
> Check the original commit description for commit eb085574a752 "mm, swap:
> fix race between swapoff and some swap operations" and discussion email
> thread as follows again,
> 
> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
> 
> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
> smp_load_acquire() in get_swap_device().  Now we will use
> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
> ACQUIRE semantics.  Per my understanding, we need to change
> percpu_ref_tryget_live() for that.
> 

Do you mean the below scene is possible?

cpu1
swapon()
  ...
  percpu_ref_init
  ...
  setup_swap_info
  /* smp_store_release() is inside percpu_ref_reinit */
  percpu_ref_reinit
  ...

cpu2
get_swap_device()
  /* ignored  smp_rmb() */
  percpu_ref_tryget_live
  ...

There is indeed a missing smp_rmb() in percpu_ref_tryget_live. So I think the above
scene is possible and we should fix this.

>>>  	spin_lock(&swap_lock);
>>>  	spin_lock(&p->lock);
>>>  	_enable_swap_info(p);
>>> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>  	spin_unlock(&p->lock);
>>>  	spin_unlock(&swap_lock);
>>> +
>>> +	percpu_ref_kill(&p->users);
>>>  	/*
>>>  	 * wait for swap operations protected by get/put_swap_device()
>>>  	 * to complete
>>>  	 */
>>> -	synchronize_rcu();
>>> +	wait_for_completion(&p->comp);
>>
>> Better to move percpu_ref_kill() after the comments.  And maybe revise
>> the comments.
> 
> After reading the original commit description as above, I found that we
> need synchronize_rcu() here to protect the accessing to the swap cache
> data structure.  Because there's call_rcu() during percpu_ref_kill(), it
> appears OK to keep the synchronize_rcu() here.  And we need to revise
> the comments to make it clear what is protected by which operation.
> 

Per my understanding, percpu_ref->data->release is called only after the refcnt
reaches 0, including a full grace period has elapsed or refcnt won't be 0.
wait_for_completion() is used for waiting the last refcnt being released. So
synchronize_rcu() is not necessary here?

> Best Regards,

Many thanks for precious comments! :)

> Huang, Ying
> 
> [snip]
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-13  1:27     ` Huang, Ying
  (?)
@ 2021-04-13 19:24     ` Tim Chen
  2021-04-14  1:04         ` Huang, Ying
  -1 siblings, 1 reply; 72+ messages in thread
From: Tim Chen @ 2021-04-13 19:24 UTC (permalink / raw)
  To: Huang, Ying, Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, linux-kernel, linux-mm



On 4/12/21 6:27 PM, Huang, Ying wrote:

> 
> This isn't the commit that introduces the race.  You can use `git blame`
> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
> swap: skip swapcache for swapin of synchronous device".
> 
> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
> picture.

I'll suggest make fix to do_swap_page race with get/put_swap_device
as a first patch. Then the per_cpu_ref stuff in patch 1 and patch 2 can
be combined together.

Tim

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-13 19:24     ` Tim Chen
@ 2021-04-14  1:04         ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  1:04 UTC (permalink / raw)
  To: Tim Chen
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, linux-kernel,
	linux-mm

Tim Chen <tim.c.chen@linux.intel.com> writes:

> On 4/12/21 6:27 PM, Huang, Ying wrote:
>
>> 
>> This isn't the commit that introduces the race.  You can use `git blame`
>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>> swap: skip swapcache for swapin of synchronous device".
>> 
>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>> picture.
>
> I'll suggest make fix to do_swap_page race with get/put_swap_device
> as a first patch. Then the per_cpu_ref stuff in patch 1 and patch 2 can
> be combined together.

The original get/put_swap_device() use rcu_read_lock/unlock().  I don't
think it's good to wrap swap_read_page() with it.  After all, some
complex operations are done in swap_read_page(), including
blk_io_schedule().

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
@ 2021-04-14  1:04         ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  1:04 UTC (permalink / raw)
  To: Tim Chen
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, linux-kernel,
	linux-mm

Tim Chen <tim.c.chen@linux.intel.com> writes:

> On 4/12/21 6:27 PM, Huang, Ying wrote:
>
>> 
>> This isn't the commit that introduces the race.  You can use `git blame`
>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>> swap: skip swapcache for swapin of synchronous device".
>> 
>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>> picture.
>
> I'll suggest make fix to do_swap_page race with get/put_swap_device
> as a first patch. Then the per_cpu_ref stuff in patch 1 and patch 2 can
> be combined together.

The original get/put_swap_device() use rcu_read_lock/unlock().  I don't
think it's good to wrap swap_read_page() with it.  After all, some
complex operations are done in swap_read_page(), including
blk_io_schedule().

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-13 12:39       ` Miaohe Lin
@ 2021-04-14  1:17           ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  1:17 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/12 15:24, Huang, Ying wrote:
>> "Huang, Ying" <ying.huang@intel.com> writes:
>> 
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>> patch adds the percpu_ref support for later fixup.
>>>>
>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>> ---
>>>>  include/linux/swap.h |  2 ++
>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>> index 144727041e78..849ba5265c11 100644
>>>> --- a/include/linux/swap.h
>>>> +++ b/include/linux/swap.h
>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>   * The in-memory structure used to track swap areas.
>>>>   */
>>>>  struct swap_info_struct {
>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>  	signed short	prio;		/* swap priority of this type */
>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>> +	struct completion comp;		/* seldom referenced */
>>>>  #ifdef CONFIG_FRONTSWAP
>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>> index 149e77454e3c..724173cd7d0c 100644
>>>> --- a/mm/swapfile.c
>>>> +++ b/mm/swapfile.c
>>>> @@ -39,6 +39,7 @@
>>>>  #include <linux/export.h>
>>>>  #include <linux/swap_slots.h>
>>>>  #include <linux/sort.h>
>>>> +#include <linux/completion.h>
>>>>  
>>>>  #include <asm/tlbflush.h>
>>>>  #include <linux/swapops.h>
>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>  	spin_unlock(&si->lock);
>>>>  }
>>>>  
>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>> +{
>>>> +	struct swap_info_struct *si;
>>>> +
>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>> +	complete(&si->comp);
>>>> +	percpu_ref_exit(&si->users);
>>>
>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>> get_swap_device(), better to add comments there.
>> 
>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> 
>>  * This function is safe to call as long as @ref is between init and exit.
>> 
>> While we need to call get_swap_device() almost at any time, so it's
>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> memory, but we need to follow the API definition to avoid potential
>> issues in the long term.
>
> I have to admit that I'm not really familiar with percpu_ref. So I read the
> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
> be called after exit now. But you're right we need to follow the API definition
> to avoid potential issues in the long term.
>
>> 
>> And we need to call percpu_ref_init() before insert the swap_info_struct
>> into the swap_info[].
>
> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
> or percpu_ref_resurrect() will do the work.
>
> One more thing, how could I distinguish the killed percpu_ref from a newly allocated one?
> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
> Maybe I could do this in alloc_swap_info()?

Yes.  In alloc_swap_info(), you can distinguish newly allocated and
reused swap_info_struct.

>> 
>>>> +}
>>>> +
>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>  {
>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>  	 */
>>>> -	synchronize_rcu();
>>>> +	percpu_ref_reinit(&p->users);
>>>
>>> Although the effect is same, I think it's better to use
>>> percpu_ref_resurrect() here to improve code readability.
>> 
>> Check the original commit description for commit eb085574a752 "mm, swap:
>> fix race between swapoff and some swap operations" and discussion email
>> thread as follows again,
>> 
>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> 
>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> smp_load_acquire() in get_swap_device().  Now we will use
>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> ACQUIRE semantics.  Per my understanding, we need to change
>> percpu_ref_tryget_live() for that.
>> 
>
> Do you mean the below scene is possible?
>
> cpu1
> swapon()
>   ...
>   percpu_ref_init
>   ...
>   setup_swap_info
>   /* smp_store_release() is inside percpu_ref_reinit */
>   percpu_ref_reinit

spin_unlock() has RELEASE semantics already.

>   ...
>
> cpu2
> get_swap_device()
>   /* ignored  smp_rmb() */
>   percpu_ref_tryget_live

Some kind of ACQUIRE is required here to guarantee the refcount is
checked before fetching the other fields of swap_info_struct.  I have
sent out a RFC patch to mailing list to discuss this.

>   ...
>
> There is indeed a missing smp_rmb() in percpu_ref_tryget_live. So I think the above
> scene is possible and we should fix this.
>
>>>>  	spin_lock(&swap_lock);
>>>>  	spin_lock(&p->lock);
>>>>  	_enable_swap_info(p);
>>>> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>>  	spin_unlock(&p->lock);
>>>>  	spin_unlock(&swap_lock);
>>>> +
>>>> +	percpu_ref_kill(&p->users);
>>>>  	/*
>>>>  	 * wait for swap operations protected by get/put_swap_device()
>>>>  	 * to complete
>>>>  	 */
>>>> -	synchronize_rcu();
>>>> +	wait_for_completion(&p->comp);
>>>
>>> Better to move percpu_ref_kill() after the comments.  And maybe revise
>>> the comments.
>> 
>> After reading the original commit description as above, I found that we
>> need synchronize_rcu() here to protect the accessing to the swap cache
>> data structure.  Because there's call_rcu() during percpu_ref_kill(), it
>> appears OK to keep the synchronize_rcu() here.  And we need to revise
>> the comments to make it clear what is protected by which operation.
>> 
>
> Per my understanding, percpu_ref->data->release is called only after the refcnt
> reaches 0, including a full grace period has elapsed or refcnt won't be 0.
> wait_for_completion() is used for waiting the last refcnt being released. So
> synchronize_rcu() is not necessary here?

Then we will depends on the implementation of percpu_ref.  If it changed
its implementation, it may take long to find out we need to change the
code here.  I guess in most cases, even adding a synchronize_rcu() here,
we still only need to wait for one grace period.  So the overhead to
call synchronize_rcu() is low here.  And the code is easier to be
maintained.

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-14  1:17           ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  1:17 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/12 15:24, Huang, Ying wrote:
>> "Huang, Ying" <ying.huang@intel.com> writes:
>> 
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>> patch adds the percpu_ref support for later fixup.
>>>>
>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>> ---
>>>>  include/linux/swap.h |  2 ++
>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>> index 144727041e78..849ba5265c11 100644
>>>> --- a/include/linux/swap.h
>>>> +++ b/include/linux/swap.h
>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>   * The in-memory structure used to track swap areas.
>>>>   */
>>>>  struct swap_info_struct {
>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>  	signed short	prio;		/* swap priority of this type */
>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>> +	struct completion comp;		/* seldom referenced */
>>>>  #ifdef CONFIG_FRONTSWAP
>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>> index 149e77454e3c..724173cd7d0c 100644
>>>> --- a/mm/swapfile.c
>>>> +++ b/mm/swapfile.c
>>>> @@ -39,6 +39,7 @@
>>>>  #include <linux/export.h>
>>>>  #include <linux/swap_slots.h>
>>>>  #include <linux/sort.h>
>>>> +#include <linux/completion.h>
>>>>  
>>>>  #include <asm/tlbflush.h>
>>>>  #include <linux/swapops.h>
>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>  	spin_unlock(&si->lock);
>>>>  }
>>>>  
>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>> +{
>>>> +	struct swap_info_struct *si;
>>>> +
>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>> +	complete(&si->comp);
>>>> +	percpu_ref_exit(&si->users);
>>>
>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>> get_swap_device(), better to add comments there.
>> 
>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> 
>>  * This function is safe to call as long as @ref is between init and exit.
>> 
>> While we need to call get_swap_device() almost at any time, so it's
>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> memory, but we need to follow the API definition to avoid potential
>> issues in the long term.
>
> I have to admit that I'am not really familiar with percpu_ref. So I read the
> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
> be called after exit now. But you're right we need to follow the API definition
> to avoid potential issues in the long term.
>
>> 
>> And we need to call percpu_ref_init() before insert the swap_info_struct
>> into the swap_info[].
>
> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
> or percpu_ref_resurrect() will do the work.
>
> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
> Maybe I could do this in alloc_swap_info()?

Yes.  In alloc_swap_info(), you can distinguish newly allocated and
reused swap_info_struct.

>> 
>>>> +}
>>>> +
>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>  {
>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>  	 */
>>>> -	synchronize_rcu();
>>>> +	percpu_ref_reinit(&p->users);
>>>
>>> Although the effect is same, I think it's better to use
>>> percpu_ref_resurrect() here to improve code readability.
>> 
>> Check the original commit description for commit eb085574a752 "mm, swap:
>> fix race between swapoff and some swap operations" and discussion email
>> thread as follows again,
>> 
>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> 
>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> smp_load_acquire() in get_swap_device().  Now we will use
>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> ACQUIRE semantics.  Per my understanding, we need to change
>> percpu_ref_tryget_live() for that.
>> 
>
> Do you mean the below scene is possible?
>
> cpu1
> swapon()
>   ...
>   percpu_ref_init
>   ...
>   setup_swap_info
>   /* smp_store_release() is inside percpu_ref_reinit */
>   percpu_ref_reinit

spin_unlock() has RELEASE semantics already.

>   ...
>
> cpu2
> get_swap_device()
>   /* ignored  smp_rmb() */
>   percpu_ref_tryget_live

Some kind of ACQUIRE is required here to guarantee the refcount is
checked before fetching the other fields of swap_info_struct.  I have
sent out an RFC patch to the mailing list to discuss this.

>   ...
>
> There is indeed missing smp_rmb() in percpu_ref_tryget_live. So I think the above
> scene possible and we should fix this.
>
>>>>  	spin_lock(&swap_lock);
>>>>  	spin_lock(&p->lock);
>>>>  	_enable_swap_info(p);
>>>> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>>  	spin_unlock(&p->lock);
>>>>  	spin_unlock(&swap_lock);
>>>> +
>>>> +	percpu_ref_kill(&p->users);
>>>>  	/*
>>>>  	 * wait for swap operations protected by get/put_swap_device()
>>>>  	 * to complete
>>>>  	 */
>>>> -	synchronize_rcu();
>>>> +	wait_for_completion(&p->comp);
>>>
>>> Better to move percpu_ref_kill() after the comments.  And maybe revise
>>> the comments.
>> 
>> After reading the original commit description as above, I found that we
>> need synchronize_rcu() here to protect the accessing to the swap cache
>> data structure.  Because there's call_rcu() during percpu_ref_kill(), it
>> appears OK to keep the synchronize_rcu() here.  And we need to revise
>> the comments to make it clear what is protected by which operation.
>> 
>
> Per my understanding, percpu_ref->data->release is called only after the refcnt
> reaches 0, including a full grace period has elapsed or refcnt won't be 0.
> wait_for_completion() is used for waiting the last refcnt being released. So
> synchronize_rcu() is not necessary here?

Then we will depend on the implementation of percpu_ref.  If it changed
its implementation, it may take a long time to find out that we need to
change the code here.  I guess in most cases, even adding a
synchronize_rcu() here, we still only need to wait for one grace period.
So the overhead of calling synchronize_rcu() is low here.  And the code
is easier to maintain.

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-14  1:17           ` Huang, Ying
  (?)
@ 2021-04-14  1:58           ` Miaohe Lin
  2021-04-14  2:06               ` Huang, Ying
  -1 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-14  1:58 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/14 9:17, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2021/4/12 15:24, Huang, Ying wrote:
>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>
>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>
>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>> patch adds the percpu_ref support for later fixup.
>>>>>
>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>> ---
>>>>>  include/linux/swap.h |  2 ++
>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>> index 144727041e78..849ba5265c11 100644
>>>>> --- a/include/linux/swap.h
>>>>> +++ b/include/linux/swap.h
>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>   * The in-memory structure used to track swap areas.
>>>>>   */
>>>>>  struct swap_info_struct {
>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>> --- a/mm/swapfile.c
>>>>> +++ b/mm/swapfile.c
>>>>> @@ -39,6 +39,7 @@
>>>>>  #include <linux/export.h>
>>>>>  #include <linux/swap_slots.h>
>>>>>  #include <linux/sort.h>
>>>>> +#include <linux/completion.h>
>>>>>  
>>>>>  #include <asm/tlbflush.h>
>>>>>  #include <linux/swapops.h>
>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>>  	spin_unlock(&si->lock);
>>>>>  }
>>>>>  
>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>> +{
>>>>> +	struct swap_info_struct *si;
>>>>> +
>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>> +	complete(&si->comp);
>>>>> +	percpu_ref_exit(&si->users);
>>>>
>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>> get_swap_device(), better to add comments there.
>>>
>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>
>>>  * This function is safe to call as long as @ref is between init and exit.
>>>
>>> While we need to call get_swap_device() almost at any time, so it's
>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>> memory, but we need to follow the API definition to avoid potential
>>> issues in the long term.
>>
>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>> be called after exit now. But you're right we need to follow the API definition
>> to avoid potential issues in the long term.
>>
>>>
>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>>> into the swap_info[].
>>
>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>> or percpu_ref_resurrect() will do the work.
>>
>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>> Maybe I could do this in alloc_swap_info()?
> 
> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
> reused swap_info_struct.
> 
>>>
>>>>> +}
>>>>> +
>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>  {
>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>  	 */
>>>>> -	synchronize_rcu();
>>>>> +	percpu_ref_reinit(&p->users);
>>>>
>>>> Although the effect is same, I think it's better to use
>>>> percpu_ref_resurrect() here to improve code readability.
>>>
>>> Check the original commit description for commit eb085574a752 "mm, swap:
>>> fix race between swapoff and some swap operations" and discussion email
>>> thread as follows again,
>>>
>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>>>
>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>>> smp_load_acquire() in get_swap_device().  Now we will use
>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>>> ACQUIRE semantics.  Per my understanding, we need to change
>>> percpu_ref_tryget_live() for that.
>>>
>>
>> Do you mean the below scene is possible?
>>
>> cpu1
>> swapon()
>>   ...
>>   percpu_ref_init
>>   ...
>>   setup_swap_info
>>   /* smp_store_release() is inside percpu_ref_reinit */
>>   percpu_ref_reinit
> 
> spin_unlock() has RELEASE semantics already.
> 
>>   ...
>>
>> cpu2
>> get_swap_device()
>>   /* ignored  smp_rmb() */
>>   percpu_ref_tryget_live
> 
> Some kind of ACQUIRE is required here to guarantee the refcount is
> checked before fetching the other fields of swap_info_struct.  I have
> sent out a RFC patch to mailing list to discuss this.

Many thanks.
But we may still need to add a smp_rmb() in get_swap_device() in case
we can't add ACQUIRE for refcount.

> >>   ...
>>
>> There is indeed missing smp_rmb() in percpu_ref_tryget_live. So I think the above
>> scene possible and we should fix this.
>>
>>>>>  	spin_lock(&swap_lock);
>>>>>  	spin_lock(&p->lock);
>>>>>  	_enable_swap_info(p);
>>>>> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>>>  	spin_unlock(&p->lock);
>>>>>  	spin_unlock(&swap_lock);
>>>>> +
>>>>> +	percpu_ref_kill(&p->users);
>>>>>  	/*
>>>>>  	 * wait for swap operations protected by get/put_swap_device()
>>>>>  	 * to complete
>>>>>  	 */
>>>>> -	synchronize_rcu();
>>>>> +	wait_for_completion(&p->comp);
>>>>
>>>> Better to move percpu_ref_kill() after the comments.  And maybe revise
>>>> the comments.
>>>
>>> After reading the original commit description as above, I found that we
>>> need synchronize_rcu() here to protect the accessing to the swap cache
>>> data structure.  Because there's call_rcu() during percpu_ref_kill(), it
>>> appears OK to keep the synchronize_rcu() here.  And we need to revise
>>> the comments to make it clear what is protected by which operation.
>>>
>>
>> Per my understanding, percpu_ref->data->release is called only after the refcnt
>> reaches 0, including a full grace period has elapsed or refcnt won't be 0.
>> wait_for_completion() is used for waiting the last refcnt being released. So
>> synchronize_rcu() is not necessary here?
> 
> Then we will depends on the implementation of percpu_ref.  If it changed
> its implementation, it may take long to find out we need to change the
> code here.  I guess in most cases, even adding a synchronize_rcu() here,
> we still only need to wait for one grace period.  So the overhead to
> call synchronize_rcu() is low here.  And the code is easier to be
> maintained.
> 

Sounds reasonable. Will do. Thanks.

> Best Regards,
> Huang, Ying
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-14  1:58           ` Miaohe Lin
@ 2021-04-14  2:06               ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  2:06 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/14 9:17, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>
>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>
>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>
>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>> ---
>>>>>>  include/linux/swap.h |  2 ++
>>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>> --- a/include/linux/swap.h
>>>>>> +++ b/include/linux/swap.h
>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>   */
>>>>>>  struct swap_info_struct {
>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>> --- a/mm/swapfile.c
>>>>>> +++ b/mm/swapfile.c
>>>>>> @@ -39,6 +39,7 @@
>>>>>>  #include <linux/export.h>
>>>>>>  #include <linux/swap_slots.h>
>>>>>>  #include <linux/sort.h>
>>>>>> +#include <linux/completion.h>
>>>>>>  
>>>>>>  #include <asm/tlbflush.h>
>>>>>>  #include <linux/swapops.h>
>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>>>  	spin_unlock(&si->lock);
>>>>>>  }
>>>>>>  
>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>> +{
>>>>>> +	struct swap_info_struct *si;
>>>>>> +
>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>>> +	complete(&si->comp);
>>>>>> +	percpu_ref_exit(&si->users);
>>>>>
>>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>>> get_swap_device(), better to add comments there.
>>>>
>>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>>
>>>>  * This function is safe to call as long as @ref is between init and exit.
>>>>
>>>> While we need to call get_swap_device() almost at any time, so it's
>>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>>> memory, but we need to follow the API definition to avoid potential
>>>> issues in the long term.
>>>
>>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>>> be called after exit now. But you're right we need to follow the API definition
>>> to avoid potential issues in the long term.
>>>
>>>>
>>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>>>> into the swap_info[].
>>>
>>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>>> or percpu_ref_resurrect() will do the work.
>>>
>>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>>> Maybe I could do this in alloc_swap_info()?
>> 
>> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> reused swap_info_struct.
>> 
>>>>
>>>>>> +}
>>>>>> +
>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>>  {
>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>  	 */
>>>>>> -	synchronize_rcu();
>>>>>> +	percpu_ref_reinit(&p->users);
>>>>>
>>>>> Although the effect is same, I think it's better to use
>>>>> percpu_ref_resurrect() here to improve code readability.
>>>>
>>>> Check the original commit description for commit eb085574a752 "mm, swap:
>>>> fix race between swapoff and some swap operations" and discussion email
>>>> thread as follows again,
>>>>
>>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>>>>
>>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>>>> smp_load_acquire() in get_swap_device().  Now we will use
>>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>>>> ACQUIRE semantics.  Per my understanding, we need to change
>>>> percpu_ref_tryget_live() for that.
>>>>
>>>
>>> Do you mean the below scene is possible?
>>>
>>> cpu1
>>> swapon()
>>>   ...
>>>   percpu_ref_init
>>>   ...
>>>   setup_swap_info
>>>   /* smp_store_release() is inside percpu_ref_reinit */
>>>   percpu_ref_reinit
>> 
>> spin_unlock() has RELEASE semantics already.
>> 
>>>   ...
>>>
>>> cpu2
>>> get_swap_device()
>>>   /* ignored  smp_rmb() */
>>>   percpu_ref_tryget_live
>> 
>> Some kind of ACQUIRE is required here to guarantee the refcount is
>> checked before fetching the other fields of swap_info_struct.  I have
>> sent out a RFC patch to mailing list to discuss this.
>
> Many thanks.
> But We may still need to add a smp_rmb() in get_swap_device() in case
> we can't add ACQUIRE for refcount.

Yes.

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-14  2:06               ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  2:06 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/14 9:17, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>
>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>
>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>
>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>> ---
>>>>>>  include/linux/swap.h |  2 ++
>>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>> --- a/include/linux/swap.h
>>>>>> +++ b/include/linux/swap.h
>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>   */
>>>>>>  struct swap_info_struct {
>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>> --- a/mm/swapfile.c
>>>>>> +++ b/mm/swapfile.c
>>>>>> @@ -39,6 +39,7 @@
>>>>>>  #include <linux/export.h>
>>>>>>  #include <linux/swap_slots.h>
>>>>>>  #include <linux/sort.h>
>>>>>> +#include <linux/completion.h>
>>>>>>  
>>>>>>  #include <asm/tlbflush.h>
>>>>>>  #include <linux/swapops.h>
>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>>>  	spin_unlock(&si->lock);
>>>>>>  }
>>>>>>  
>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>> +{
>>>>>> +	struct swap_info_struct *si;
>>>>>> +
>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>>> +	complete(&si->comp);
>>>>>> +	percpu_ref_exit(&si->users);
>>>>>
>>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>>> get_swap_device(), better to add comments there.
>>>>
>>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>>
>>>>  * This function is safe to call as long as @ref is between init and exit.
>>>>
>>>> While we need to call get_swap_device() almost at any time, so it's
>>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>>> memory, but we need to follow the API definition to avoid potential
>>>> issues in the long term.
>>>
>>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>>> be called after exit now. But you're right we need to follow the API definition
>>> to avoid potential issues in the long term.
>>>
>>>>
>>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>>>> into the swap_info[].
>>>
>>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>>> or percpu_ref_resurrect() will do the work.
>>>
>>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>>> Maybe I could do this in alloc_swap_info()?
>> 
>> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> reused swap_info_struct.
>> 
>>>>
>>>>>> +}
>>>>>> +
>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>>  {
>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>  	 */
>>>>>> -	synchronize_rcu();
>>>>>> +	percpu_ref_reinit(&p->users);
>>>>>
>>>>> Although the effect is same, I think it's better to use
>>>>> percpu_ref_resurrect() here to improve code readability.
>>>>
>>>> Check the original commit description for commit eb085574a752 "mm, swap:
>>>> fix race between swapoff and some swap operations" and discussion email
>>>> thread as follows again,
>>>>
>>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>>>>
>>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>>>> smp_load_acquire() in get_swap_device().  Now we will use
>>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>>>> ACQUIRE semantics.  Per my understanding, we need to change
>>>> percpu_ref_tryget_live() for that.
>>>>
>>>
>>> Do you mean the below scene is possible?
>>>
>>> cpu1
>>> swapon()
>>>   ...
>>>   percpu_ref_init
>>>   ...
>>>   setup_swap_info
>>>   /* smp_store_release() is inside percpu_ref_reinit */
>>>   percpu_ref_reinit
>> 
>> spin_unlock() has RELEASE semantics already.
>> 
>>>   ...
>>>
>>> cpu2
>>> get_swap_device()
>>>   /* ignored  smp_rmb() */
>>>   percpu_ref_tryget_live
>> 
>> Some kind of ACQUIRE is required here to guarantee the refcount is
>> checked before fetching the other fields of swap_info_struct.  I have
>> sent out a RFC patch to mailing list to discuss this.
>
> Many thanks.
> But We may still need to add a smp_rmb() in get_swap_device() in case
> we can't add ACQUIRE for refcount.

Yes.

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-14  1:04         ` Huang, Ying
  (?)
@ 2021-04-14  2:20         ` Miaohe Lin
  -1 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-14  2:20 UTC (permalink / raw)
  To: Huang, Ying, Tim Chen
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, linux-kernel, linux-mm

On 2021/4/14 9:04, Huang, Ying wrote:
> Tim Chen <tim.c.chen@linux.intel.com> writes:
> 
>> On 4/12/21 6:27 PM, Huang, Ying wrote:
>>
>>>
>>> This isn't the commit that introduces the race.  You can use `git blame`
>>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>>> swap: skip swapcache for swapin of synchronous device".
>>>
>>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>>> picture.
>>
>> I'll suggest make fix to do_swap_page race with get/put_swap_device
>> as a first patch. Then the per_cpu_ref stuff in patch 1 and patch 2 can
>> be combined together.
> 
> The original get/put_swap_device() use rcu_read_lock/unlock().  I don't
> think it's good to wrap swap_read_page() with it.  After all, some
> complex operations are done in swap_read_page(), including
> blk_io_schedule().
> 

The patch was split to make it easier to review originally, i.e. 1/5 introduces
the percpu_ref to swap and 2/5 uses it to fix the race between do_swap_page()
and swapoff.
Btw, I have no preference for merging 1/5 and 2/5 or not.

> Best Regards,
> Huang, Ying
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff
  2021-04-13  1:33     ` Huang, Ying
  (?)
@ 2021-04-14  2:42     ` Miaohe Lin
  -1 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-14  2:42 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/13 9:33, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> The function get_shadow_from_swap_cache() can race with swapoff, though
>> it's only called by do_swap_page() now.
>>
>> Fixes: aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU")
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> 
> This is unnecessary.  The only caller has guaranteed the swap device
> from swapoff

Ok. This patch was just in case get_shadow_from_swap_cache() is
called from elsewhere someday.

It's unnecessary and can be dropped now.

.
> 
> Best Regards,
> Huang, Ying
> 
>> ---
>>  mm/swap_state.c | 9 ++++++---
>>  1 file changed, 6 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index 272ea2108c9d..709c260d644a 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -83,11 +83,14 @@ void show_swap_cache_info(void)
>>  
>>  void *get_shadow_from_swap_cache(swp_entry_t entry)
>>  {
>> -	struct address_space *address_space = swap_address_space(entry);
>> -	pgoff_t idx = swp_offset(entry);
>> +	struct swap_info_struct *si;
>>  	struct page *page;
>>  
>> -	page = xa_load(&address_space->i_pages, idx);
>> +	si = get_swap_device(entry);
>> +	if (!si)
>> +		return NULL;
>> +	page = xa_load(&swap_address_space(entry)->i_pages, swp_offset(entry));
>> +	put_swap_device(si);
>>  	if (xa_is_value(page))
>>  		return page;
>>  	return NULL;
> 
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff
  2021-04-13  1:36     ` Huang, Ying
  (?)
@ 2021-04-14  2:43     ` Miaohe Lin
  -1 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-14  2:43 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/13 9:36, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> swap_cluster_readahead() could race with swapoff and might dereference
>> si->swap_file after it's released by swapoff. Close this race window by
>> using get/put_swap_device() pair.
> 
> I think we should fix the callers instead to reduce the overhead.  Now,
> do_swap_page() has been fixed.  We need to fix shmem_swapin().
> 

Will do. Many thanks.

> Best Regards,
> Huang, Ying
> 
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  mm/swap_state.c | 11 +++++++++--
>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index 3bf0d0c297bc..eba6b0cf6cf9 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -626,12 +626,17 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>>  	unsigned long offset = entry_offset;
>>  	unsigned long start_offset, end_offset;
>>  	unsigned long mask;
>> -	struct swap_info_struct *si = swp_swap_info(entry);
>> +	struct swap_info_struct *si;
>>  	struct blk_plug plug;
>>  	bool do_poll = true, page_allocated;
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	unsigned long addr = vmf->address;
>>  
>> +	si = get_swap_device(entry);
>> +	/* In case we raced with swapoff. */
>> +	if (!si)
>> +		return NULL;
>> +
>>  	mask = swapin_nr_pages(offset) - 1;
>>  	if (!mask)
>>  		goto skip;
>> @@ -673,7 +678,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>>  
>>  	lru_add_drain();	/* Push any new pages onto the LRU now */
>>  skip:
>> -	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
>> +	page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
>> +	put_swap_device(si);
>> +	return page;
>>  }
>>  
>>  int init_swap_address_space(unsigned int type, unsigned long nr_pages)
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-13  1:27     ` Huang, Ying
  (?)
  (?)
@ 2021-04-14  2:55     ` Miaohe Lin
  2021-04-14  3:07         ` Huang, Ying
  -1 siblings, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-14  2:55 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/13 9:27, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1					CPU 2
>> -----					-----
>> do_swap_page
>>   synchronous swap_readpage
>>     alloc_page_vma
>> 					swapoff
>> 					  release swap_file, bdev, or ...
>>       swap_readpage
>> 	check sis->flags is ok
>> 	  access swap_file, bdev...[oops!]
>> 					    si->flags = 0
>>
>> Using current get/put_swap_device() to guard against concurrent swapoff for
>> swap_readpage() looks terrible because swap_readpage() may take really long
>> time. And this race may not be really pernicious because swapoff is usually
>> done when system shutdown only. To reduce the performance overhead on the
>> hot-path as much as possible, it appears we can use the percpu_ref to close
>> this race window(as suggested by Huang, Ying).
>>
>> Fixes: 235b62176712 ("mm/swap: add cluster lock")
> 
> This isn't the commit that introduces the race.  You can use `git blame`
> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
> swap: skip swapcache for swapin of synchronous device".
> 

Sorry about that! What I was referring to is commit eb085574a752 ("mm, swap: fix race between
swapoff and some swap operations"). I think that commit does not fix the race
condition completely, so I reused its Fixes tag.

> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
> picture.
> 
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  include/linux/swap.h |  2 +-
>>  mm/memory.c          | 10 ++++++++++
>>  mm/swapfile.c        | 28 +++++++++++-----------------
>>  3 files changed, 22 insertions(+), 18 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 849ba5265c11..9066addb57fd 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>>  
>>  static inline void put_swap_device(struct swap_info_struct *si)
>>  {
>> -	rcu_read_unlock();
>> +	percpu_ref_put(&si->users);
>>  }
>>  
>>  #else /* CONFIG_SWAP */
>> diff --git a/mm/memory.c b/mm/memory.c
>> index cc71a445c76c..8543c47b955c 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  {
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	struct page *page = NULL, *swapcache;
>> +	struct swap_info_struct *si = NULL;
>>  	swp_entry_t entry;
>>  	pte_t pte;
>>  	int locked;
>> @@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  	}
>>  
>>
> 
> I suggest to add comments here as follows (words copy from Matthew Wilcox)
> 
> 	/* Prevent swapoff from happening to us */

Ok.

> 
>> +	si = get_swap_device(entry);
>> +	/* In case we raced with swapoff. */
>> +	if (unlikely(!si))
>> +		goto out;
>> +
> 
> Because we wrap the whole do_swap_page() with get/put_swap_device()
> now.  We can remove several get/put_swap_device() for function called by
> do_swap_page().  That can be another optimization patch.

I tried to remove several get/put_swap_device() pairs for functions called
by do_swap_page() before I sent this series. But it seems they have
other callers without proper get/put_swap_device().

> 
>>  	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
>>  	page = lookup_swap_cache(entry, vma, vmf->address);
>>  	swapcache = page;
>> @@ -3514,6 +3520,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  unlock:
>>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>>  out:
>> +	if (si)
>> +		put_swap_device(si);
>>  	return ret;
>>  out_nomap:
>>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>> @@ -3525,6 +3533,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  		unlock_page(swapcache);
>>  		put_page(swapcache);
>>  	}
>> +	if (si)
>> +		put_swap_device(si);
>>  	return ret;
>>  }
>>  
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 724173cd7d0c..01032c72ceae 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -1280,18 +1280,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
>>   * via preventing the swap device from being swapoff, until
>>   * put_swap_device() is called.  Otherwise return NULL.
>>   *
>> - * The entirety of the RCU read critical section must come before the
>> - * return from or after the call to synchronize_rcu() in
>> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
>> - * true, the si->map, si->cluster_info, etc. must be valid in the
>> - * critical section.
>> - *
>>   * Notice that swapoff or swapoff+swapon can still happen before the
>> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
>> - * in put_swap_device() if there isn't any other way to prevent
>> - * swapoff, such as page lock, page table lock, etc.  The caller must
>> - * be prepared for that.  For example, the following situation is
>> - * possible.
>> + * percpu_ref_tryget_live() in get_swap_device() or after the
>> + * percpu_ref_put() in put_swap_device() if there isn't any other way
>> + * to prevent swapoff, such as page lock, page table lock, etc.  The
>> + * caller must be prepared for that.  For example, the following
>> + * situation is possible.
>>   *
>>   *   CPU1				CPU2
>>   *   do_swap_page()
>> @@ -1319,21 +1313,21 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>>  	si = swp_swap_info(entry);
>>  	if (!si)
>>  		goto bad_nofile;
>> -
>> -	rcu_read_lock();
>>  	if (data_race(!(si->flags & SWP_VALID)))
> 
> We can delete SWP_VALID, that is used together with RCU solution.

Will do.

> 
>> -		goto unlock_out;
>> +		goto out;
>> +	if (!percpu_ref_tryget_live(&si->users))
>> +		goto out;
>>  	offset = swp_offset(entry);
>>  	if (offset >= si->max)
>> -		goto unlock_out;
>> +		goto put_out;
>>  
>>  	return si;
>>  bad_nofile:
>>  	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>>  out:
>>  	return NULL;
>> -unlock_out:
>> -	rcu_read_unlock();
>> +put_out:
>> +	percpu_ref_put(&si->users);
>>  	return NULL;
>>  }
> 

Many thanks.

> Best Regards,
> Huang, Ying
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-14  2:55     ` Miaohe Lin
@ 2021-04-14  3:07         ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  3:07 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/13 9:27, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> When I was investigating the swap code, I found the below possible race
>>> window:
>>>
>>> CPU 1					CPU 2
>>> -----					-----
>>> do_swap_page
>>>   synchronous swap_readpage
>>>     alloc_page_vma
>>> 					swapoff
>>> 					  release swap_file, bdev, or ...
>>>       swap_readpage
>>> 	check sis->flags is ok
>>> 	  access swap_file, bdev...[oops!]
>>> 					    si->flags = 0
>>>
>>> Using current get/put_swap_device() to guard against concurrent swapoff for
>>> swap_readpage() looks terrible because swap_readpage() may take really long
>>> time. And this race may not be really pernicious because swapoff is usually
>>> done when system shutdown only. To reduce the performance overhead on the
>>> hot-path as much as possible, it appears we can use the percpu_ref to close
>>> this race window(as suggested by Huang, Ying).
>>>
>>> Fixes: 235b62176712 ("mm/swap: add cluster lock")
>> 
>> This isn't the commit that introduces the race.  You can use `git blame`
>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>> swap: skip swapcache for swapin of synchronous device".
>> 
>
> Sorry about it! What I refer to is commit eb085574a752 ("mm, swap: fix race between
> swapoff and some swap operations"). And I think this commit does not fix the race
> condition completely, so I reuse the Fixes tag inside it.
>
>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>> picture.
>> 
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>>  include/linux/swap.h |  2 +-
>>>  mm/memory.c          | 10 ++++++++++
>>>  mm/swapfile.c        | 28 +++++++++++-----------------
>>>  3 files changed, 22 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index 849ba5265c11..9066addb57fd 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>>>  
>>>  static inline void put_swap_device(struct swap_info_struct *si)
>>>  {
>>> -	rcu_read_unlock();
>>> +	percpu_ref_put(&si->users);
>>>  }
>>>  
>>>  #else /* CONFIG_SWAP */
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index cc71a445c76c..8543c47b955c 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>  {
>>>  	struct vm_area_struct *vma = vmf->vma;
>>>  	struct page *page = NULL, *swapcache;
>>> +	struct swap_info_struct *si = NULL;
>>>  	swp_entry_t entry;
>>>  	pte_t pte;
>>>  	int locked;
>>> @@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>  	}
>>>  
>>>
>> 
>> I suggest to add comments here as follows (words copy from Matthew Wilcox)
>> 
>> 	/* Prevent swapoff from happening to us */
>
> Ok.
>
>> 
>>> +	si = get_swap_device(entry);
>>> +	/* In case we raced with swapoff. */
>>> +	if (unlikely(!si))
>>> +		goto out;
>>> +
>> 
>> Because we wrap the whole do_swap_page() with get/put_swap_device()
>> now.  We can remove several get/put_swap_device() for function called by
>> do_swap_page().  That can be another optimization patch.
>
> I tried to remove several get/put_swap_device() for function called
> by do_swap_page() only before I send this series. But it seems they have
> other callers without proper get/put_swap_device().

Then we need to revise these callers instead.  Anyway, can be another
series.

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
@ 2021-04-14  3:07         ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  3:07 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/13 9:27, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> When I was investigating the swap code, I found the below possible race
>>> window:
>>>
>>> CPU 1					CPU 2
>>> -----					-----
>>> do_swap_page
>>>   synchronous swap_readpage
>>>     alloc_page_vma
>>> 					swapoff
>>> 					  release swap_file, bdev, or ...
>>>       swap_readpage
>>> 	check sis->flags is ok
>>> 	  access swap_file, bdev...[oops!]
>>> 					    si->flags = 0
>>>
>>> Using current get/put_swap_device() to guard against concurrent swapoff for
>>> swap_readpage() looks terrible because swap_readpage() may take really long
>>> time. And this race may not be really pernicious because swapoff is usually
>>> done when system shutdown only. To reduce the performance overhead on the
>>> hot-path as much as possible, it appears we can use the percpu_ref to close
>>> this race window(as suggested by Huang, Ying).
>>>
>>> Fixes: 235b62176712 ("mm/swap: add cluster lock")
>> 
>> This isn't the commit that introduces the race.  You can use `git blame`
>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>> swap: skip swapcache for swapin of synchronous device".
>> 
>
> Sorry about it! What I refer to is commit eb085574a752 ("mm, swap: fix race between
> swapoff and some swap operations"). And I think this commit does not fix the race
> condition completely, so I reuse the Fixes tag inside it.
>
>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>> picture.
>> 
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>>  include/linux/swap.h |  2 +-
>>>  mm/memory.c          | 10 ++++++++++
>>>  mm/swapfile.c        | 28 +++++++++++-----------------
>>>  3 files changed, 22 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index 849ba5265c11..9066addb57fd 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>>>  
>>>  static inline void put_swap_device(struct swap_info_struct *si)
>>>  {
>>> -	rcu_read_unlock();
>>> +	percpu_ref_put(&si->users);
>>>  }
>>>  
>>>  #else /* CONFIG_SWAP */
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index cc71a445c76c..8543c47b955c 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>  {
>>>  	struct vm_area_struct *vma = vmf->vma;
>>>  	struct page *page = NULL, *swapcache;
>>> +	struct swap_info_struct *si = NULL;
>>>  	swp_entry_t entry;
>>>  	pte_t pte;
>>>  	int locked;
>>> @@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>  	}
>>>  
>>>
>> 
>> I suggest to add comments here as follows (words copy from Matthew Wilcox)
>> 
>> 	/* Prevent swapoff from happening to us */
>
> Ok.
>
>> 
>>> +	si = get_swap_device(entry);
>>> +	/* In case we raced with swapoff. */
>>> +	if (unlikely(!si))
>>> +		goto out;
>>> +
>> 
>> Because we wrap the whole do_swap_page() with get/put_swap_device()
>> now.  We can remove several get/put_swap_device() for function called by
>> do_swap_page().  That can be another optimization patch.
>
> I tried to remove several get/put_swap_device() for function called
> by do_swap_page() only before I send this series. But it seems they have
> other callers without proper get/put_swap_device().

Then we need to revise these callers instead.  Anyway, can be another
series.

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-14  3:07         ` Huang, Ying
  (?)
@ 2021-04-14  3:27         ` Miaohe Lin
  -1 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-14  3:27 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/14 11:07, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2021/4/13 9:27, Huang, Ying wrote:
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> When I was investigating the swap code, I found the below possible race
>>>> window:
>>>>
>>>> CPU 1					CPU 2
>>>> -----					-----
>>>> do_swap_page
>>>>   synchronous swap_readpage
>>>>     alloc_page_vma
>>>> 					swapoff
>>>> 					  release swap_file, bdev, or ...
>>>>       swap_readpage
>>>> 	check sis->flags is ok
>>>> 	  access swap_file, bdev...[oops!]
>>>> 					    si->flags = 0
>>>>
>>>> Using current get/put_swap_device() to guard against concurrent swapoff for
>>>> swap_readpage() looks terrible because swap_readpage() may take really long
>>>> time. And this race may not be really pernicious because swapoff is usually
>>>> done when system shutdown only. To reduce the performance overhead on the
>>>> hot-path as much as possible, it appears we can use the percpu_ref to close
>>>> this race window(as suggested by Huang, Ying).
>>>>
>>>> Fixes: 235b62176712 ("mm/swap: add cluster lock")
>>>
>>> This isn't the commit that introduces the race.  You can use `git blame`
>>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>>> swap: skip swapcache for swapin of synchronous device".
>>>
>>
>> Sorry about it! What I refer to is commit eb085574a752 ("mm, swap: fix race between
>> swapoff and some swap operations"). And I think this commit does not fix the race
>> condition completely, so I reuse the Fixes tag inside it.
>>
>>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>>> picture.
>>>
>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>> ---
>>>>  include/linux/swap.h |  2 +-
>>>>  mm/memory.c          | 10 ++++++++++
>>>>  mm/swapfile.c        | 28 +++++++++++-----------------
>>>>  3 files changed, 22 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>> index 849ba5265c11..9066addb57fd 100644
>>>> --- a/include/linux/swap.h
>>>> +++ b/include/linux/swap.h
>>>> @@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>>>>  
>>>>  static inline void put_swap_device(struct swap_info_struct *si)
>>>>  {
>>>> -	rcu_read_unlock();
>>>> +	percpu_ref_put(&si->users);
>>>>  }
>>>>  
>>>>  #else /* CONFIG_SWAP */
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index cc71a445c76c..8543c47b955c 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>  {
>>>>  	struct vm_area_struct *vma = vmf->vma;
>>>>  	struct page *page = NULL, *swapcache;
>>>> +	struct swap_info_struct *si = NULL;
>>>>  	swp_entry_t entry;
>>>>  	pte_t pte;
>>>>  	int locked;
>>>> @@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>  	}
>>>>  
>>>>
>>>
>>> I suggest to add comments here as follows (words copy from Matthew Wilcox)
>>>
>>> 	/* Prevent swapoff from happening to us */
>>
>> Ok.
>>
>>>
>>>> +	si = get_swap_device(entry);
>>>> +	/* In case we raced with swapoff. */
>>>> +	if (unlikely(!si))
>>>> +		goto out;
>>>> +
>>>
>>> Because we wrap the whole do_swap_page() with get/put_swap_device()
>>> now.  We can remove several get/put_swap_device() for function called by
>>> do_swap_page().  That can be another optimization patch.
>>
>> I tried to remove several get/put_swap_device() for function called
>> by do_swap_page() only before I send this series. But it seems they have
>> other callers without proper get/put_swap_device().
> 
> Then we need to revise these callers instead.  Anyway, can be another
> series.

Yes, it can be another series.
Thanks.

> 
> Best Regards,
> Huang, Ying
> 
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-14  2:06               ` Huang, Ying
  (?)
@ 2021-04-14  3:44               ` Dennis Zhou
  2021-04-14  3:59                   ` Huang, Ying
  -1 siblings, 1 reply; 72+ messages in thread
From: Dennis Zhou @ 2021-04-14  3:44 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Hello,

On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
> > On 2021/4/14 9:17, Huang, Ying wrote:
> >> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> 
> >>> On 2021/4/12 15:24, Huang, Ying wrote:
> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
> >>>>
> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
> >>>>>
> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
> >>>>>> patch adds the percpu_ref support for later fixup.
> >>>>>>
> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> >>>>>> ---
> >>>>>>  include/linux/swap.h |  2 ++
> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
> >>>>>>
> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
> >>>>>> index 144727041e78..849ba5265c11 100644
> >>>>>> --- a/include/linux/swap.h
> >>>>>> +++ b/include/linux/swap.h
> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
> >>>>>>   * The in-memory structure used to track swap areas.
> >>>>>>   */
> >>>>>>  struct swap_info_struct {
> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
> >>>>>>  	signed short	prio;		/* swap priority of this type */
> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
> >>>>>>  	struct file *swap_file;		/* seldom referenced */
> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
> >>>>>> +	struct completion comp;		/* seldom referenced */
> >>>>>>  #ifdef CONFIG_FRONTSWAP
> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
> >>>>>> index 149e77454e3c..724173cd7d0c 100644
> >>>>>> --- a/mm/swapfile.c
> >>>>>> +++ b/mm/swapfile.c
> >>>>>> @@ -39,6 +39,7 @@
> >>>>>>  #include <linux/export.h>
> >>>>>>  #include <linux/swap_slots.h>
> >>>>>>  #include <linux/sort.h>
> >>>>>> +#include <linux/completion.h>
> >>>>>>  
> >>>>>>  #include <asm/tlbflush.h>
> >>>>>>  #include <linux/swapops.h>
> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
> >>>>>>  	spin_unlock(&si->lock);
> >>>>>>  }
> >>>>>>  
> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
> >>>>>> +{
> >>>>>> +	struct swap_info_struct *si;
> >>>>>> +
> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
> >>>>>> +	complete(&si->comp);
> >>>>>> +	percpu_ref_exit(&si->users);
> >>>>>
> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
> >>>>> get_swap_device(), better to add comments there.
> >>>>
> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
> >>>>
> >>>>  * This function is safe to call as long as @ref is between init and exit.
> >>>>
> >>>> While we need to call get_swap_device() almost at any time, so it's
> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
> >>>> memory, but we need to follow the API definition to avoid potential
> >>>> issues in the long term.
> >>>
> >>> I have to admit that I'am not really familiar with percpu_ref. So I read the
> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
> >>> be called after exit now. But you're right we need to follow the API definition
> >>> to avoid potential issues in the long term.
> >>>
> >>>>
> >>>> And we need to call percpu_ref_init() before insert the swap_info_struct
> >>>> into the swap_info[].
> >>>
> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
> >>> or percpu_ref_resurrect() will do the work.
> >>>
> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
> >>> Maybe I could do this in alloc_swap_info()?
> >> 
> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
> >> reused swap_info_struct.
> >> 
> >>>>
> >>>>>> +}
> >>>>>> +
> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >>>>>>  {
> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
> >>>>>>  	 */
> >>>>>> -	synchronize_rcu();
> >>>>>> +	percpu_ref_reinit(&p->users);
> >>>>>
> >>>>> Although the effect is same, I think it's better to use
> >>>>> percpu_ref_resurrect() here to improve code readability.
> >>>>
> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
> >>>> fix race between swapoff and some swap operations" and discussion email
> >>>> thread as follows again,
> >>>>
> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
> >>>>
> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
> >>>> smp_load_acquire() in get_swap_device().  Now we will use
> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
> >>>> ACQUIRE semantics.  Per my understanding, we need to change
> >>>> percpu_ref_tryget_live() for that.
> >>>>
> >>>
> >>> Do you mean the below scene is possible?
> >>>
> >>> cpu1
> >>> swapon()
> >>>   ...
> >>>   percpu_ref_init
> >>>   ...
> >>>   setup_swap_info
> >>>   /* smp_store_release() is inside percpu_ref_reinit */
> >>>   percpu_ref_reinit
> >> 
> >> spin_unlock() has RELEASE semantics already.
> >> 
> >>>   ...
> >>>
> >>> cpu2
> >>> get_swap_device()
> >>>   /* ignored  smp_rmb() */
> >>>   percpu_ref_tryget_live
> >> 
> >> Some kind of ACQUIRE is required here to guarantee the refcount is
> >> checked before fetching the other fields of swap_info_struct.  I have
> >> sent out a RFC patch to mailing list to discuss this.

I'm just catching up and following along a little bit. I apologize I
haven't read the swap code, but my understanding is you are trying to
narrow a race condition with swapoff. That makes sense to me. I'm not
sure I follow the need to race with reinitializing the ref though? Is it
not possible to wait out the dying swap info and then create a new one
rather than push acquire semantics?

> >
> > Many thanks.
> > But We may still need to add a smp_rmb() in get_swap_device() in case
> > we can't add ACQUIRE for refcount.
> 
> Yes.
> 
> Best Regards,
> Huang, Ying
> 

Thanks,
Dennis

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-14  3:44               ` Dennis Zhou
@ 2021-04-14  3:59                   ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  3:59 UTC (permalink / raw)
  To: Dennis Zhou
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Dennis Zhou <dennis@kernel.org> writes:

> Hello,
>
> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>> > On 2021/4/14 9:17, Huang, Ying wrote:
>> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> 
>> >>> On 2021/4/12 15:24, Huang, Ying wrote:
>> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
>> >>>>
>> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >>>>>
>> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> >>>>>> patch adds the percpu_ref support for later fixup.
>> >>>>>>
>> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> >>>>>> ---
>> >>>>>>  include/linux/swap.h |  2 ++
>> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>> >>>>>>
>> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >>>>>> index 144727041e78..849ba5265c11 100644
>> >>>>>> --- a/include/linux/swap.h
>> >>>>>> +++ b/include/linux/swap.h
>> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>> >>>>>>   * The in-memory structure used to track swap areas.
>> >>>>>>   */
>> >>>>>>  struct swap_info_struct {
>> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>> >>>>>>  	signed short	prio;		/* swap priority of this type */
>> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>> >>>>>>  	struct file *swap_file;		/* seldom referenced */
>> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>> >>>>>> +	struct completion comp;		/* seldom referenced */
>> >>>>>>  #ifdef CONFIG_FRONTSWAP
>> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> >>>>>> index 149e77454e3c..724173cd7d0c 100644
>> >>>>>> --- a/mm/swapfile.c
>> >>>>>> +++ b/mm/swapfile.c
>> >>>>>> @@ -39,6 +39,7 @@
>> >>>>>>  #include <linux/export.h>
>> >>>>>>  #include <linux/swap_slots.h>
>> >>>>>>  #include <linux/sort.h>
>> >>>>>> +#include <linux/completion.h>
>> >>>>>>  
>> >>>>>>  #include <asm/tlbflush.h>
>> >>>>>>  #include <linux/swapops.h>
>> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>> >>>>>>  	spin_unlock(&si->lock);
>> >>>>>>  }
>> >>>>>>  
>> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> >>>>>> +{
>> >>>>>> +	struct swap_info_struct *si;
>> >>>>>> +
>> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>> >>>>>> +	complete(&si->comp);
>> >>>>>> +	percpu_ref_exit(&si->users);
>> >>>>>
>> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> >>>>> get_swap_device(), better to add comments there.
>> >>>>
>> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> >>>>
>> >>>>  * This function is safe to call as long as @ref is between init and exit.
>> >>>>
>> >>>> While we need to call get_swap_device() almost at any time, so it's
>> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> >>>> memory, but we need to follow the API definition to avoid potential
>> >>>> issues in the long term.
>> >>>
>> >>> I have to admit that I'm not really familiar with percpu_ref. So I read the
>> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>> >>> be called after exit now. But you're right we need to follow the API definition
>> >>> to avoid potential issues in the long term.
>> >>>
>> >>>>
>> >>>> And we need to call percpu_ref_init() before inserting the swap_info_struct
>> >>>> into the swap_info[].
>> >>>
>> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>> >>> or percpu_ref_resurrect() will do the work.
>> >>>
>> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>> >>> Maybe I could do this in alloc_swap_info()?
>> >> 
>> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> >> reused swap_info_struct.
>> >> 
>> >>>>
>> >>>>>> +}
>> >>>>>> +
>> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> >>>>>>  {
>> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>> >>>>>>  	 */
>> >>>>>> -	synchronize_rcu();
>> >>>>>> +	percpu_ref_reinit(&p->users);
>> >>>>>
>> >>>>> Although the effect is same, I think it's better to use
>> >>>>> percpu_ref_resurrect() here to improve code readability.
>> >>>>
>> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
>> >>>> fix race between swapoff and some swap operations" and discussion email
>> >>>> thread as follows again,
>> >>>>
>> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> >>>>
>> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> >>>> smp_load_acquire() in get_swap_device().  Now we will use
>> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> >>>> ACQUIRE semantics.  Per my understanding, we need to change
>> >>>> percpu_ref_tryget_live() for that.
>> >>>>
>> >>>
>> >>> Do you mean the below scene is possible?
>> >>>
>> >>> cpu1
>> >>> swapon()
>> >>>   ...
>> >>>   percpu_ref_init
>> >>>   ...
>> >>>   setup_swap_info
>> >>>   /* smp_store_release() is inside percpu_ref_reinit */
>> >>>   percpu_ref_reinit
>> >> 
>> >> spin_unlock() has RELEASE semantics already.
>> >> 
>> >>>   ...
>> >>>
>> >>> cpu2
>> >>> get_swap_device()
>> >>>   /* ignored  smp_rmb() */
>> >>>   percpu_ref_tryget_live
>> >> 
>> >> Some kind of ACQUIRE is required here to guarantee the refcount is
>> >> checked before fetching the other fields of swap_info_struct.  I have
>> >> sent out a RFC patch to mailing list to discuss this.
>
> I'm just catching up and following along a little bit. I apologize I
> haven't read the swap code, but my understanding is you are trying to
> narrow a race condition with swapoff. That makes sense to me. I'm not
> sure I follow the need to race with reinitializing the ref though? Is it
> not possible to wait out the dying swap info and then create a new one
> rather than push acquire semantics?

We want to check whether the swap entry is valid (that is, the swap
device isn't swapped off now), prevent it from swapping off, then access
the swap_info_struct data structure.  When accessing swap_info_struct,
we want to guarantee the ordering, so that we will not reference
uninitialized fields of swap_info_struct.

Best Regards,
Huang, Ying

>> >
>> > Many thanks.
>> > But we may still need to add an smp_rmb() in get_swap_device() in case
>> > we can't add ACQUIRE for refcount.
>> 
>> Yes.
>> 
>> Best Regards,
>> Huang, Ying
>> 
>
> Thanks,
> Dennis

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-14  3:59                   ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  3:59 UTC (permalink / raw)
  To: Dennis Zhou
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Dennis Zhou <dennis@kernel.org> writes:

> Hello,
>
> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>> > On 2021/4/14 9:17, Huang, Ying wrote:
>> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> 
>> >>> On 2021/4/12 15:24, Huang, Ying wrote:
>> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
>> >>>>
>> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >>>>>
>> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> >>>>>> patch adds the percpu_ref support for later fixup.
>> >>>>>>
>> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> >>>>>> ---
>> >>>>>>  include/linux/swap.h |  2 ++
>> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>> >>>>>>
>> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >>>>>> index 144727041e78..849ba5265c11 100644
>> >>>>>> --- a/include/linux/swap.h
>> >>>>>> +++ b/include/linux/swap.h
>> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>> >>>>>>   * The in-memory structure used to track swap areas.
>> >>>>>>   */
>> >>>>>>  struct swap_info_struct {
>> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>> >>>>>>  	signed short	prio;		/* swap priority of this type */
>> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>> >>>>>>  	struct file *swap_file;		/* seldom referenced */
>> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>> >>>>>> +	struct completion comp;		/* seldom referenced */
>> >>>>>>  #ifdef CONFIG_FRONTSWAP
>> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> >>>>>> index 149e77454e3c..724173cd7d0c 100644
>> >>>>>> --- a/mm/swapfile.c
>> >>>>>> +++ b/mm/swapfile.c
>> >>>>>> @@ -39,6 +39,7 @@
>> >>>>>>  #include <linux/export.h>
>> >>>>>>  #include <linux/swap_slots.h>
>> >>>>>>  #include <linux/sort.h>
>> >>>>>> +#include <linux/completion.h>
>> >>>>>>  
>> >>>>>>  #include <asm/tlbflush.h>
>> >>>>>>  #include <linux/swapops.h>
>> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>> >>>>>>  	spin_unlock(&si->lock);
>> >>>>>>  }
>> >>>>>>  
>> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> >>>>>> +{
>> >>>>>> +	struct swap_info_struct *si;
>> >>>>>> +
>> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>> >>>>>> +	complete(&si->comp);
>> >>>>>> +	percpu_ref_exit(&si->users);
>> >>>>>
>> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> >>>>> get_swap_device(), better to add comments there.
>> >>>>
>> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> >>>>
>> >>>>  * This function is safe to call as long as @ref is between init and exit.
>> >>>>
>> >>>> While we need to call get_swap_device() almost at any time, so it's
>> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> >>>> memory, but we need to follow the API definition to avoid potential
>> >>>> issues in the long term.
>> >>>
>> >>> I have to admit that I'm not really familiar with percpu_ref. So I read the
>> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>> >>> be called after exit now. But you're right we need to follow the API definition
>> >>> to avoid potential issues in the long term.
>> >>>
>> >>>>
>> >>>> And we need to call percpu_ref_init() before inserting the swap_info_struct
>> >>>> into the swap_info[].
>> >>>
>> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>> >>> or percpu_ref_resurrect() will do the work.
>> >>>
>> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>> >>> Maybe I could do this in alloc_swap_info()?
>> >> 
>> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> >> reused swap_info_struct.
>> >> 
>> >>>>
>> >>>>>> +}
>> >>>>>> +
>> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> >>>>>>  {
>> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>> >>>>>>  	 */
>> >>>>>> -	synchronize_rcu();
>> >>>>>> +	percpu_ref_reinit(&p->users);
>> >>>>>
>> >>>>> Although the effect is same, I think it's better to use
>> >>>>> percpu_ref_resurrect() here to improve code readability.
>> >>>>
>> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
>> >>>> fix race between swapoff and some swap operations" and discussion email
>> >>>> thread as follows again,
>> >>>>
>> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> >>>>
>> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> >>>> smp_load_acquire() in get_swap_device().  Now we will use
>> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> >>>> ACQUIRE semantics.  Per my understanding, we need to change
>> >>>> percpu_ref_tryget_live() for that.
>> >>>>
>> >>>
>> >>> Do you mean the below scene is possible?
>> >>>
>> >>> cpu1
>> >>> swapon()
>> >>>   ...
>> >>>   percpu_ref_init
>> >>>   ...
>> >>>   setup_swap_info
>> >>>   /* smp_store_release() is inside percpu_ref_reinit */
>> >>>   percpu_ref_reinit
>> >> 
>> >> spin_unlock() has RELEASE semantics already.
>> >> 
>> >>>   ...
>> >>>
>> >>> cpu2
>> >>> get_swap_device()
>> >>>   /* ignored  smp_rmb() */
>> >>>   percpu_ref_tryget_live
>> >> 
>> >> Some kind of ACQUIRE is required here to guarantee the refcount is
>> >> checked before fetching the other fields of swap_info_struct.  I have
>> >> sent out a RFC patch to mailing list to discuss this.
>
> I'm just catching up and following along a little bit. I apologize I
> haven't read the swap code, but my understanding is you are trying to
> narrow a race condition with swapoff. That makes sense to me. I'm not
> sure I follow the need to race with reinitializing the ref though? Is it
> not possible to wait out the dying swap info and then create a new one
> rather than push acquire semantics?

We want to check whether the swap entry is valid (that is, the swap
device isn't swapped off now), prevent it from swapping off, then access
the swap_info_struct data structure.  When accessing swap_info_struct,
we want to guarantee the ordering, so that we will not reference
uninitialized fields of swap_info_struct.

Best Regards,
Huang, Ying

>> >
>> > Many thanks.
>> > But we may still need to add an smp_rmb() in get_swap_device() in case
>> > we can't add ACQUIRE for refcount.
>> 
>> Yes.
>> 
>> Best Regards,
>> Huang, Ying
>> 
>
> Thanks,
> Dennis


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-14  3:59                   ` Huang, Ying
  (?)
@ 2021-04-14  4:05                   ` Dennis Zhou
  2021-04-14  5:44                       ` Huang, Ying
  -1 siblings, 1 reply; 72+ messages in thread
From: Dennis Zhou @ 2021-04-14  4:05 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
> Dennis Zhou <dennis@kernel.org> writes:
> 
> > Hello,
> >
> > On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
> >> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> 
> >> > On 2021/4/14 9:17, Huang, Ying wrote:
> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> >> 
> >> >>> On 2021/4/12 15:24, Huang, Ying wrote:
> >> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
> >> >>>>
> >> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> >>>>>
> >> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
> >> >>>>>> patch adds the percpu_ref support for later fixup.
> >> >>>>>>
> >> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> >> >>>>>> ---
> >> >>>>>>  include/linux/swap.h |  2 ++
> >> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
> >> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
> >> >>>>>>
> >> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
> >> >>>>>> index 144727041e78..849ba5265c11 100644
> >> >>>>>> --- a/include/linux/swap.h
> >> >>>>>> +++ b/include/linux/swap.h
> >> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
> >> >>>>>>   * The in-memory structure used to track swap areas.
> >> >>>>>>   */
> >> >>>>>>  struct swap_info_struct {
> >> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
> >> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
> >> >>>>>>  	signed short	prio;		/* swap priority of this type */
> >> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
> >> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
> >> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
> >> >>>>>>  	struct file *swap_file;		/* seldom referenced */
> >> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
> >> >>>>>> +	struct completion comp;		/* seldom referenced */
> >> >>>>>>  #ifdef CONFIG_FRONTSWAP
> >> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
> >> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> >> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
> >> >>>>>> index 149e77454e3c..724173cd7d0c 100644
> >> >>>>>> --- a/mm/swapfile.c
> >> >>>>>> +++ b/mm/swapfile.c
> >> >>>>>> @@ -39,6 +39,7 @@
> >> >>>>>>  #include <linux/export.h>
> >> >>>>>>  #include <linux/swap_slots.h>
> >> >>>>>>  #include <linux/sort.h>
> >> >>>>>> +#include <linux/completion.h>
> >> >>>>>>  
> >> >>>>>>  #include <asm/tlbflush.h>
> >> >>>>>>  #include <linux/swapops.h>
> >> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
> >> >>>>>>  	spin_unlock(&si->lock);
> >> >>>>>>  }
> >> >>>>>>  
> >> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
> >> >>>>>> +{
> >> >>>>>> +	struct swap_info_struct *si;
> >> >>>>>> +
> >> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
> >> >>>>>> +	complete(&si->comp);
> >> >>>>>> +	percpu_ref_exit(&si->users);
> >> >>>>>
> >> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
> >> >>>>> get_swap_device(), better to add comments there.
> >> >>>>
> >> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
> >> >>>>
> >> >>>>  * This function is safe to call as long as @ref is between init and exit.
> >> >>>>
> >> >>>> While we need to call get_swap_device() almost at any time, so it's
> >> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
> >> >>>> memory, but we need to follow the API definition to avoid potential
> >> >>>> issues in the long term.
> >> >>>
> >> >>> I have to admit that I'm not really familiar with percpu_ref. So I read the
> >> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
> >> >>> be called after exit now. But you're right we need to follow the API definition
> >> >>> to avoid potential issues in the long term.
> >> >>>
> >> >>>>
> >> >>>> And we need to call percpu_ref_init() before inserting the swap_info_struct
> >> >>>> into the swap_info[].
> >> >>>
> >> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
> >> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
> >> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
> >> >>> or percpu_ref_resurrect() will do the work.
> >> >>>
> >> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
> >> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
> >> >>> Maybe I could do this in alloc_swap_info()?
> >> >> 
> >> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
> >> >> reused swap_info_struct.
> >> >> 
> >> >>>>
> >> >>>>>> +}
> >> >>>>>> +
> >> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >> >>>>>>  {
> >> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
> >> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
> >> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
> >> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
> >> >>>>>>  	 */
> >> >>>>>> -	synchronize_rcu();
> >> >>>>>> +	percpu_ref_reinit(&p->users);
> >> >>>>>
> >> >>>>> Although the effect is same, I think it's better to use
> >> >>>>> percpu_ref_resurrect() here to improve code readability.
> >> >>>>
> >> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
> >> >>>> fix race between swapoff and some swap operations" and discussion email
> >> >>>> thread as follows again,
> >> >>>>
> >> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
> >> >>>>
> >> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
> >> >>>> smp_load_acquire() in get_swap_device().  Now we will use
> >> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
> >> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
> >> >>>> ACQUIRE semantics.  Per my understanding, we need to change
> >> >>>> percpu_ref_tryget_live() for that.
> >> >>>>
> >> >>>
> >> >>> Do you mean the below scene is possible?
> >> >>>
> >> >>> cpu1
> >> >>> swapon()
> >> >>>   ...
> >> >>>   percpu_ref_init
> >> >>>   ...
> >> >>>   setup_swap_info
> >> >>>   /* smp_store_release() is inside percpu_ref_reinit */
> >> >>>   percpu_ref_reinit
> >> >> 
> >> >> spin_unlock() has RELEASE semantics already.
> >> >> 
> >> >>>   ...
> >> >>>
> >> >>> cpu2
> >> >>> get_swap_device()
> >> >>>   /* ignored  smp_rmb() */
> >> >>>   percpu_ref_tryget_live
> >> >> 
> >> >> Some kind of ACQUIRE is required here to guarantee the refcount is
> >> >> checked before fetching the other fields of swap_info_struct.  I have
> >> >> sent out a RFC patch to mailing list to discuss this.
> >
> > I'm just catching up and following along a little bit. I apologize I
> > haven't read the swap code, but my understanding is you are trying to
> > narrow a race condition with swapoff. That makes sense to me. I'm not
> > sure I follow the need to race with reinitializing the ref though? Is it
> > not possible to wait out the dying swap info and then create a new one
> > rather than push acquire semantics?
> 
> We want to check whether the swap entry is valid (that is, the swap
> device isn't swapped off now), prevent it from swapping off, then access
> the swap_info_struct data structure.  When accessing swap_info_struct,
> we want to guarantee the ordering, so that we will not reference
> uninitialized fields of swap_info_struct.
> 

So in the normal context of percpu_ref, once someone can access it, the
elements that it is protecting are expected to be initialized. In the
basic case for swap off, I'm seeing the goal as to prevent destruction
until anyone currently accessing swap is done. In this case wouldn't we
always be protecting a live struct?

I'm maybe not understanding what conditions you're trying to revive the
percpu_ref?

> Best Regards,
> Huang, Ying
> 
> >> >
> >> > Many thanks.
> >> > But we may still need to add an smp_rmb() in get_swap_device() in case
> >> > we can't add ACQUIRE for refcount.
> >> 
> >> Yes.
> >> 
> >> Best Regards,
> >> Huang, Ying
> >> 
> >
> > Thanks,
> > Dennis

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-14  4:05                   ` Dennis Zhou
@ 2021-04-14  5:44                       ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  5:44 UTC (permalink / raw)
  To: Dennis Zhou
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Dennis Zhou <dennis@kernel.org> writes:

> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>> Dennis Zhou <dennis@kernel.org> writes:
>> 
>> > Hello,
>> >
>> > On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> 
>> >> > On 2021/4/14 9:17, Huang, Ying wrote:
>> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> 
>> >> >>> On 2021/4/12 15:24, Huang, Ying wrote:
>> >> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
>> >> >>>>
>> >> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >>>>>
>> >> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> >> >>>>>> patch adds the percpu_ref support for later fixup.
>> >> >>>>>>
>> >> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> >> >>>>>> ---
>> >> >>>>>>  include/linux/swap.h |  2 ++
>> >> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>> >> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>> >> >>>>>>
>> >> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >> >>>>>> index 144727041e78..849ba5265c11 100644
>> >> >>>>>> --- a/include/linux/swap.h
>> >> >>>>>> +++ b/include/linux/swap.h
>> >> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>> >> >>>>>>   * The in-memory structure used to track swap areas.
>> >> >>>>>>   */
>> >> >>>>>>  struct swap_info_struct {
>> >> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> >> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>> >> >>>>>>  	signed short	prio;		/* swap priority of this type */
>> >> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>> >> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>> >> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>> >> >>>>>>  	struct file *swap_file;		/* seldom referenced */
>> >> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>> >> >>>>>> +	struct completion comp;		/* seldom referenced */
>> >> >>>>>>  #ifdef CONFIG_FRONTSWAP
>> >> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>> >> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> >> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> >> >>>>>> index 149e77454e3c..724173cd7d0c 100644
>> >> >>>>>> --- a/mm/swapfile.c
>> >> >>>>>> +++ b/mm/swapfile.c
>> >> >>>>>> @@ -39,6 +39,7 @@
>> >> >>>>>>  #include <linux/export.h>
>> >> >>>>>>  #include <linux/swap_slots.h>
>> >> >>>>>>  #include <linux/sort.h>
>> >> >>>>>> +#include <linux/completion.h>
>> >> >>>>>>  
>> >> >>>>>>  #include <asm/tlbflush.h>
>> >> >>>>>>  #include <linux/swapops.h>
>> >> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>> >> >>>>>>  	spin_unlock(&si->lock);
>> >> >>>>>>  }
>> >> >>>>>>  
>> >> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> >> >>>>>> +{
>> >> >>>>>> +	struct swap_info_struct *si;
>> >> >>>>>> +
>> >> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>> >> >>>>>> +	complete(&si->comp);
>> >> >>>>>> +	percpu_ref_exit(&si->users);
>> >> >>>>>
>> >> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> >> >>>>> get_swap_device(), better to add comments there.
>> >> >>>>
>> >> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> >> >>>>
>> >> >>>>  * This function is safe to call as long as @ref is between init and exit.
>> >> >>>>
>> >> >>>> While we need to call get_swap_device() almost at any time, so it's
>> >> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> >> >>>> memory, but we need to follow the API definition to avoid potential
>> >> >>>> issues in the long term.
>> >> >>>
>> >> >>> I have to admit that I'm not really familiar with percpu_ref. So I read the
>> >> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>> >> >>> be called after exit now. But you're right we need to follow the API definition
>> >> >>> to avoid potential issues in the long term.
>> >> >>>
>> >> >>>>
>> >> >>>> And we need to call percpu_ref_init() before inserting the swap_info_struct
>> >> >>>> into the swap_info[].
>> >> >>>
>> >> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>> >> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>> >> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>> >> >>> or percpu_ref_resurrect() will do the work.
>> >> >>>
>> >> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>> >> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>> >> >>> Maybe I could do this in alloc_swap_info()?
>> >> >> 
>> >> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> >> >> reused swap_info_struct.
>> >> >> 
>> >> >>>>
>> >> >>>>>> +}
>> >> >>>>>> +
>> >> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> >> >>>>>>  {
>> >> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>> >> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>> >> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> >> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>> >> >>>>>>  	 */
>> >> >>>>>> -	synchronize_rcu();
>> >> >>>>>> +	percpu_ref_reinit(&p->users);
>> >> >>>>>
>> >> >>>>> Although the effect is same, I think it's better to use
>> >> >>>>> percpu_ref_resurrect() here to improve code readability.
>> >> >>>>
>> >> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
>> >> >>>> fix race between swapoff and some swap operations" and discussion email
>> >> >>>> thread as follows again,
>> >> >>>>
>> >> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> >> >>>>
>> >> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> >> >>>> smp_load_acquire() in get_swap_device().  Now we will use
>> >> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> >> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> >> >>>> ACQUIRE semantics.  Per my understanding, we need to change
>> >> >>>> percpu_ref_tryget_live() for that.
>> >> >>>>
>> >> >>>
>> >> >>> Do you mean the below scene is possible?
>> >> >>>
>> >> >>> cpu1
>> >> >>> swapon()
>> >> >>>   ...
>> >> >>>   percpu_ref_init
>> >> >>>   ...
>> >> >>>   setup_swap_info
>> >> >>>   /* smp_store_release() is inside percpu_ref_reinit */
>> >> >>>   percpu_ref_reinit
>> >> >> 
>> >> >> spin_unlock() has RELEASE semantics already.
>> >> >> 
>> >> >>>   ...
>> >> >>>
>> >> >>> cpu2
>> >> >>> get_swap_device()
>> >> >>>   /* ignored  smp_rmb() */
>> >> >>>   percpu_ref_tryget_live
>> >> >> 
>> >> >> Some kind of ACQUIRE is required here to guarantee the refcount is
>> >> >> checked before fetching the other fields of swap_info_struct.  I have
>> >> >> sent out a RFC patch to mailing list to discuss this.
>> >
>> > I'm just catching up and following along a little bit. I apologize I
>> > haven't read the swap code, but my understanding is you are trying to
>> > narrow a race condition with swapoff. That makes sense to me. I'm not
>> > sure I follow the need to race with reinitializing the ref though? Is it
>> > not possible to wait out the dying swap info and then create a new one
>> > rather than push acquire semantics?
>> 
>> We want to check whether the swap entry is valid (that is, the swap
>> device isn't swapped off now), prevent it from swapping off, then access
>> the swap_info_struct data structure.  When accessing swap_info_struct,
>> we want to guarantee the ordering, so that we will not reference
>> uninitialized fields of swap_info_struct.
>> 
>
> So in the normal context of percpu_ref, once someone can access it, the
> elements that it is protecting are expected to be initialized.

If we can make sure that all elements being initialized fully, why not
just use percpu_ref_get() instead of percpu_ref_tryget*()?

> In the basic case for swap off, I'm seeing the goal as to prevent
> destruction until anyone currently accessing swap is done. In this
> case wouldn't we always be protecting a live struct?
>
> I'm maybe not understanding what conditions you're trying to revive the
> percpu_ref?

A swap entry is like an indirect pointer to a swap device.  We may hold a
swap entry for a long time, during which the swap device may be swapped off and on again.
Then we need to make sure the swap device are fully initialized before
accessing the swap device via the swap entry.

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-14  5:44                       ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-14  5:44 UTC (permalink / raw)
  To: Dennis Zhou
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Dennis Zhou <dennis@kernel.org> writes:

> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>> Dennis Zhou <dennis@kernel.org> writes:
>> 
>> > Hello,
>> >
>> > On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> 
>> >> > On 2021/4/14 9:17, Huang, Ying wrote:
>> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> 
>> >> >>> On 2021/4/12 15:24, Huang, Ying wrote:
>> >> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
>> >> >>>>
>> >> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >>>>>
>> >> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> >> >>>>>> patch adds the percpu_ref support for later fixup.
>> >> >>>>>>
>> >> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> >> >>>>>> ---
>> >> >>>>>>  include/linux/swap.h |  2 ++
>> >> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>> >> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>> >> >>>>>>
>> >> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >> >>>>>> index 144727041e78..849ba5265c11 100644
>> >> >>>>>> --- a/include/linux/swap.h
>> >> >>>>>> +++ b/include/linux/swap.h
>> >> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>> >> >>>>>>   * The in-memory structure used to track swap areas.
>> >> >>>>>>   */
>> >> >>>>>>  struct swap_info_struct {
>> >> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> >> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>> >> >>>>>>  	signed short	prio;		/* swap priority of this type */
>> >> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>> >> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>> >> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>> >> >>>>>>  	struct file *swap_file;		/* seldom referenced */
>> >> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>> >> >>>>>> +	struct completion comp;		/* seldom referenced */
>> >> >>>>>>  #ifdef CONFIG_FRONTSWAP
>> >> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>> >> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> >> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> >> >>>>>> index 149e77454e3c..724173cd7d0c 100644
>> >> >>>>>> --- a/mm/swapfile.c
>> >> >>>>>> +++ b/mm/swapfile.c
>> >> >>>>>> @@ -39,6 +39,7 @@
>> >> >>>>>>  #include <linux/export.h>
>> >> >>>>>>  #include <linux/swap_slots.h>
>> >> >>>>>>  #include <linux/sort.h>
>> >> >>>>>> +#include <linux/completion.h>
>> >> >>>>>>  
>> >> >>>>>>  #include <asm/tlbflush.h>
>> >> >>>>>>  #include <linux/swapops.h>
>> >> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>> >> >>>>>>  	spin_unlock(&si->lock);
>> >> >>>>>>  }
>> >> >>>>>>  
>> >> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> >> >>>>>> +{
>> >> >>>>>> +	struct swap_info_struct *si;
>> >> >>>>>> +
>> >> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>> >> >>>>>> +	complete(&si->comp);
>> >> >>>>>> +	percpu_ref_exit(&si->users);
>> >> >>>>>
>> >> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> >> >>>>> get_swap_device(), better to add comments there.
>> >> >>>>
>> >> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> >> >>>>
>> >> >>>>  * This function is safe to call as long as @ref is between init and exit.
>> >> >>>>
>> >> >>>> While we need to call get_swap_device() almost at any time, so it's
>> >> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> >> >>>> memory, but we need to follow the API definition to avoid potential
>> >> >>>> issues in the long term.
>> >> >>>
>> >> >>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>> >> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>> >> >>> be called after exit now. But you're right we need to follow the API definition
>> >> >>> to avoid potential issues in the long term.
>> >> >>>
>> >> >>>>
>> >> >>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>> >> >>>> into the swap_info[].
>> >> >>>
>> >> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>> >> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>> >> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>> >> >>> or percpu_ref_resurrect() will do the work.
>> >> >>>
>> >> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>> >> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>> >> >>> Maybe I could do this in alloc_swap_info()?
>> >> >> 
>> >> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> >> >> reused swap_info_struct.
>> >> >> 
>> >> >>>>
>> >> >>>>>> +}
>> >> >>>>>> +
>> >> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> >> >>>>>>  {
>> >> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>> >> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>> >> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> >> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>> >> >>>>>>  	 */
>> >> >>>>>> -	synchronize_rcu();
>> >> >>>>>> +	percpu_ref_reinit(&p->users);
>> >> >>>>>
>> >> >>>>> Although the effect is same, I think it's better to use
>> >> >>>>> percpu_ref_resurrect() here to improve code readability.
>> >> >>>>
>> >> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
>> >> >>>> fix race between swapoff and some swap operations" and discussion email
>> >> >>>> thread as follows again,
>> >> >>>>
>> >> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> >> >>>>
>> >> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> >> >>>> smp_load_acquire() in get_swap_device().  Now we will use
>> >> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> >> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> >> >>>> ACQUIRE semantics.  Per my understanding, we need to change
>> >> >>>> percpu_ref_tryget_live() for that.
>> >> >>>>
>> >> >>>
>> >> >>> Do you mean the below scene is possible?
>> >> >>>
>> >> >>> cpu1
>> >> >>> swapon()
>> >> >>>   ...
>> >> >>>   percpu_ref_init
>> >> >>>   ...
>> >> >>>   setup_swap_info
>> >> >>>   /* smp_store_release() is inside percpu_ref_reinit */
>> >> >>>   percpu_ref_reinit
>> >> >> 
>> >> >> spin_unlock() has RELEASE semantics already.
>> >> >> 
>> >> >>>   ...
>> >> >>>
>> >> >>> cpu2
>> >> >>> get_swap_device()
>> >> >>>   /* ignored  smp_rmb() */
>> >> >>>   percpu_ref_tryget_live
>> >> >> 
>> >> >> Some kind of ACQUIRE is required here to guarantee the refcount is
>> >> >> checked before fetching the other fields of swap_info_struct.  I have
>> >> >> sent out a RFC patch to mailing list to discuss this.
>> >
>> > I'm just catching up and following along a little bit. I apologize I
>> > haven't read the swap code, but my understanding is you are trying to
>> > narrow a race condition with swapoff. That makes sense to me. I'm not
>> > sure I follow the need to race with reinitializing the ref though? Is it
>> > not possible to wait out the dying swap info and then create a new one
>> > rather than push acquire semantics?
>> 
>> We want to check whether the swap entry is valid (that is, the swap
>> device isn't swapped off now), prevent it from swapping off, then access
>> the swap_info_struct data structure.  When accessing swap_info_struct,
>> we want to guarantee the ordering, so that we will not reference
>> uninitialized fields of swap_info_struct.
>> 
>
> So in the normal context of percpu_ref, once someone can access it, the
> elements that it is protecting are expected to be initialized.

If we can make sure that all elements are fully initialized, why not
just use percpu_ref_get() instead of percpu_ref_tryget*()?

> In the basic case for swap off, I'm seeing the goal as to prevent
> destruction until anyone currently accessing swap is done. In this
> case wouldn't we always be protecting a live struct?
>
> I'm maybe not understanding what conditions you're trying to revive the
> percpu_ref?

A swap entry is like an indirect pointer to a swap device.  We may hold
a swap entry for a long time, during which the swap device may be
swapped off and on again.  Then we need to make sure the swap device is
fully initialized before accessing the swap device via the swap entry.

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-14  5:44                       ` Huang, Ying
  (?)
@ 2021-04-14 14:53                       ` Dennis Zhou
  2021-04-15  3:16                         ` Miaohe Lin
  2021-04-15  5:24                           ` Huang, Ying
  -1 siblings, 2 replies; 72+ messages in thread
From: Dennis Zhou @ 2021-04-14 14:53 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
> Dennis Zhou <dennis@kernel.org> writes:
> 
> > On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
> >> Dennis Zhou <dennis@kernel.org> writes:
> >> 
> >> > Hello,
> >> >
> >> > On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> >> 
> >> >> > On 2021/4/14 9:17, Huang, Ying wrote:
> >> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> >> >> 
> >> >> >>> On 2021/4/12 15:24, Huang, Ying wrote:
> >> >> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
> >> >> >>>>
> >> >> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> >> >>>>>
> >> >> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
> >> >> >>>>>> patch adds the percpu_ref support for later fixup.
> >> >> >>>>>>
> >> >> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> >> >> >>>>>> ---
> >> >> >>>>>>  include/linux/swap.h |  2 ++
> >> >> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
> >> >> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
> >> >> >>>>>>
> >> >> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
> >> >> >>>>>> index 144727041e78..849ba5265c11 100644
> >> >> >>>>>> --- a/include/linux/swap.h
> >> >> >>>>>> +++ b/include/linux/swap.h
> >> >> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
> >> >> >>>>>>   * The in-memory structure used to track swap areas.
> >> >> >>>>>>   */
> >> >> >>>>>>  struct swap_info_struct {
> >> >> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
> >> >> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
> >> >> >>>>>>  	signed short	prio;		/* swap priority of this type */
> >> >> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
> >> >> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
> >> >> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
> >> >> >>>>>>  	struct file *swap_file;		/* seldom referenced */
> >> >> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
> >> >> >>>>>> +	struct completion comp;		/* seldom referenced */
> >> >> >>>>>>  #ifdef CONFIG_FRONTSWAP
> >> >> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
> >> >> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> >> >> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
> >> >> >>>>>> index 149e77454e3c..724173cd7d0c 100644
> >> >> >>>>>> --- a/mm/swapfile.c
> >> >> >>>>>> +++ b/mm/swapfile.c
> >> >> >>>>>> @@ -39,6 +39,7 @@
> >> >> >>>>>>  #include <linux/export.h>
> >> >> >>>>>>  #include <linux/swap_slots.h>
> >> >> >>>>>>  #include <linux/sort.h>
> >> >> >>>>>> +#include <linux/completion.h>
> >> >> >>>>>>  
> >> >> >>>>>>  #include <asm/tlbflush.h>
> >> >> >>>>>>  #include <linux/swapops.h>
> >> >> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
> >> >> >>>>>>  	spin_unlock(&si->lock);
> >> >> >>>>>>  }
> >> >> >>>>>>  
> >> >> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
> >> >> >>>>>> +{
> >> >> >>>>>> +	struct swap_info_struct *si;
> >> >> >>>>>> +
> >> >> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
> >> >> >>>>>> +	complete(&si->comp);
> >> >> >>>>>> +	percpu_ref_exit(&si->users);
> >> >> >>>>>
> >> >> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
> >> >> >>>>> get_swap_device(), better to add comments there.
> >> >> >>>>
> >> >> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
> >> >> >>>>
> >> >> >>>>  * This function is safe to call as long as @ref is between init and exit.
> >> >> >>>>
> >> >> >>>> While we need to call get_swap_device() almost at any time, so it's
> >> >> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
> >> >> >>>> memory, but we need to follow the API definition to avoid potential
> >> >> >>>> issues in the long term.
> >> >> >>>
> >> >> >>> I have to admit that I'am not really familiar with percpu_ref. So I read the
> >> >> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
> >> >> >>> be called after exit now. But you're right we need to follow the API definition
> >> >> >>> to avoid potential issues in the long term.
> >> >> >>>
> >> >> >>>>
> >> >> >>>> And we need to call percpu_ref_init() before insert the swap_info_struct
> >> >> >>>> into the swap_info[].
> >> >> >>>
> >> >> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
> >> >> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
> >> >> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
> >> >> >>> or percpu_ref_resurrect() will do the work.
> >> >> >>>
> >> >> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
> >> >> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
> >> >> >>> Maybe I could do this in alloc_swap_info()?
> >> >> >> 
> >> >> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
> >> >> >> reused swap_info_struct.
> >> >> >> 
> >> >> >>>>
> >> >> >>>>>> +}
> >> >> >>>>>> +
> >> >> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >> >> >>>>>>  {
> >> >> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
> >> >> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
> >> >> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
> >> >> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
> >> >> >>>>>>  	 */
> >> >> >>>>>> -	synchronize_rcu();
> >> >> >>>>>> +	percpu_ref_reinit(&p->users);
> >> >> >>>>>
> >> >> >>>>> Although the effect is same, I think it's better to use
> >> >> >>>>> percpu_ref_resurrect() here to improve code readability.
> >> >> >>>>
> >> >> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
> >> >> >>>> fix race between swapoff and some swap operations" and discussion email
> >> >> >>>> thread as follows again,
> >> >> >>>>
> >> >> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
> >> >> >>>>
> >> >> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
> >> >> >>>> smp_load_acquire() in get_swap_device().  Now we will use
> >> >> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
> >> >> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
> >> >> >>>> ACQUIRE semantics.  Per my understanding, we need to change
> >> >> >>>> percpu_ref_tryget_live() for that.
> >> >> >>>>
> >> >> >>>
> >> >> >>> Do you mean the below scene is possible?
> >> >> >>>
> >> >> >>> cpu1
> >> >> >>> swapon()
> >> >> >>>   ...
> >> >> >>>   percpu_ref_init
> >> >> >>>   ...
> >> >> >>>   setup_swap_info
> >> >> >>>   /* smp_store_release() is inside percpu_ref_reinit */
> >> >> >>>   percpu_ref_reinit
> >> >> >> 
> >> >> >> spin_unlock() has RELEASE semantics already.
> >> >> >> 
> >> >> >>>   ...
> >> >> >>>
> >> >> >>> cpu2
> >> >> >>> get_swap_device()
> >> >> >>>   /* ignored  smp_rmb() */
> >> >> >>>   percpu_ref_tryget_live
> >> >> >> 
> >> >> >> Some kind of ACQUIRE is required here to guarantee the refcount is
> >> >> >> checked before fetching the other fields of swap_info_struct.  I have
> >> >> >> sent out a RFC patch to mailing list to discuss this.
> >> >
> >> > I'm just catching up and following along a little bit. I apologize I
> >> > haven't read the swap code, but my understanding is you are trying to
> >> > narrow a race condition with swapoff. That makes sense to me. I'm not
> >> > sure I follow the need to race with reinitializing the ref though? Is it
> >> > not possible to wait out the dying swap info and then create a new one
> >> > rather than push acquire semantics?
> >> 
> >> We want to check whether the swap entry is valid (that is, the swap
> >> device isn't swapped off now), prevent it from swapping off, then access
> >> the swap_info_struct data structure.  When accessing swap_info_struct,
> >> we want to guarantee the ordering, so that we will not reference
> >> uninitialized fields of swap_info_struct.
> >> 
> >
> > So in the normal context of percpu_ref, once someone can access it, the
> > elements that it is protecting are expected to be initialized.
> 
> If we can make sure that all elements being initialized fully, why not
> just use percpu_ref_get() instead of percpu_ref_tryget*()?
> 

Generally, the lookup is protected with rcu and then
percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
only good if you already have a ref as it increments regardless of being
0.

What I mean is if you can get a ref, that means the object hasn't been
destroyed. This differs from the semantics you are looking for which I
understand to be: I have long lived pointers to objects. The object may
die, but I may resurrect it and I want the old pointers to still be
valid.

When is it possible for someone to have a pointer to the swap device and
the refcount goes to 0? It might be better to avoid this situation than
add acquire semantics.

> > In the basic case for swap off, I'm seeing the goal as to prevent
> > destruction until anyone currently accessing swap is done. In this
> > case wouldn't we always be protecting a live struct?
> >
> > I'm maybe not understanding what conditions you're trying to revive the
> > percpu_ref?
> 
> A swap entry likes an indirect pointer to a swap device.  We may hold a
> swap entry for long time, so that the swap device is swapoff/swapon.
> Then we need to make sure the swap device are fully initialized before
> accessing the swap device via the swap entry.
> 

So if I have some number of outstanding references, and then
percpu_ref_kill() is called, then only those that have the pointer will
be able to use the swap device as those references are still good. Prior
to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
data structure.

My personal understanding of tryget() vs tryget_live() is that it
provides a 2-phase cleanup and bounds the ability for new users to come
in (cgroup destruction is a primary user). As tryget() might inevitably
let a cgroup live long past its removal, tryget_live() will say: you're
in the process of dying, do something else.

Thanks,
Dennis

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-14  1:04         ` Huang, Ying
  (?)
  (?)
@ 2021-04-14 16:13         ` Tim Chen
  2021-04-15  3:19           ` Miaohe Lin
  -1 siblings, 1 reply; 72+ messages in thread
From: Tim Chen @ 2021-04-14 16:13 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, linux-kernel,
	linux-mm



On 4/13/21 6:04 PM, Huang, Ying wrote:
> Tim Chen <tim.c.chen@linux.intel.com> writes:
> 
>> On 4/12/21 6:27 PM, Huang, Ying wrote:
>>
>>>
>>> This isn't the commit that introduces the race.  You can use `git blame`
>>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>>> swap: skip swapcache for swapin of synchronous device".
>>>
>>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>>> picture.
>>
>> I'll suggest make fix to do_swap_page race with get/put_swap_device
>> as a first patch. Then the per_cpu_ref stuff in patch 1 and patch 2 can
>> be combined together.
> 
> The original get/put_swap_device() use rcu_read_lock/unlock().  I don't
> think it's good to wrap swap_read_page() with it.  After all, some
> complex operations are done in swap_read_page(), including
> blk_io_schedule().
> 

In that case, have the patches that make get/put_swap_device() use
percpu_ref come first.  Then do the fix for the race in do_swap_page()
later in another patch.

Patch 2 is mixing the two.

Tim

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-14 14:53                       ` Dennis Zhou
@ 2021-04-15  3:16                         ` Miaohe Lin
  2021-04-15  4:20                           ` Dennis Zhou
  2021-04-15  5:24                           ` Huang, Ying
  1 sibling, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-15  3:16 UTC (permalink / raw)
  To: Dennis Zhou, Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/14 22:53, Dennis Zhou wrote:
> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>> Dennis Zhou <dennis@kernel.org> writes:
>>
>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>
>>>>> Hello,
>>>>>
>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>
>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>
>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>>>>>
>>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>>
>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>   */
>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>>>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>>>>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>>>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>>>>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>>>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>>>>>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>>>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>>>>>>>> --- a/mm/swapfile.c
>>>>>>>>>>>> +++ b/mm/swapfile.c
>>>>>>>>>>>> @@ -39,6 +39,7 @@
>>>>>>>>>>>>  #include <linux/export.h>
>>>>>>>>>>>>  #include <linux/swap_slots.h>
>>>>>>>>>>>>  #include <linux/sort.h>
>>>>>>>>>>>> +#include <linux/completion.h>
>>>>>>>>>>>>  
>>>>>>>>>>>>  #include <asm/tlbflush.h>
>>>>>>>>>>>>  #include <linux/swapops.h>
>>>>>>>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>>>>>>>>>  	spin_unlock(&si->lock);
>>>>>>>>>>>>  }
>>>>>>>>>>>>  
>>>>>>>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +	struct swap_info_struct *si;
>>>>>>>>>>>> +
>>>>>>>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>>>>>>>>> +	complete(&si->comp);
>>>>>>>>>>>> +	percpu_ref_exit(&si->users);
>>>>>>>>>>>
>>>>>>>>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>>>>>>>>> get_swap_device(), better to add comments there.
>>>>>>>>>>
>>>>>>>>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>>>>>>>>
>>>>>>>>>>  * This function is safe to call as long as @ref is between init and exit.
>>>>>>>>>>
>>>>>>>>>> While we need to call get_swap_device() almost at any time, so it's
>>>>>>>>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>>>>>>>>> memory, but we need to follow the API definition to avoid potential
>>>>>>>>>> issues in the long term.
>>>>>>>>>
>>>>>>>>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>>>>>>>>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>>>>>>>>> be called after exit now. But you're right we need to follow the API definition
>>>>>>>>> to avoid potential issues in the long term.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>>>>>>>>>> into the swap_info[].
>>>>>>>>>
>>>>>>>>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>>>>>>>>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>>>>>>>>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>>>>>>>>> or percpu_ref_resurrect() will do the work.
>>>>>>>>>
>>>>>>>>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>>>>>>>>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>>>>>>>>> Maybe I could do this in alloc_swap_info()?
>>>>>>>>
>>>>>>>> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>>>>>>>> reused swap_info_struct.
>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>>>>>>>>  {
>>>>>>>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>>>>>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>>>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>>>>>>>  	 */
>>>>>>>>>>>> -	synchronize_rcu();
>>>>>>>>>>>> +	percpu_ref_reinit(&p->users);
>>>>>>>>>>>
>>>>>>>>>>> Although the effect is same, I think it's better to use
>>>>>>>>>>> percpu_ref_resurrect() here to improve code readability.
>>>>>>>>>>
>>>>>>>>>> Check the original commit description for commit eb085574a752 "mm, swap:
>>>>>>>>>> fix race between swapoff and some swap operations" and discussion email
>>>>>>>>>> thread as follows again,
>>>>>>>>>>
>>>>>>>>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>>>>>>>>>>
>>>>>>>>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>>>>>>>>>> smp_load_acquire() in get_swap_device().  Now we will use
>>>>>>>>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>>>>>>>>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>>>>>>>>>> ACQUIRE semantics.  Per my understanding, we need to change
>>>>>>>>>> percpu_ref_tryget_live() for that.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Do you mean the below scene is possible?
>>>>>>>>>
>>>>>>>>> cpu1
>>>>>>>>> swapon()
>>>>>>>>>   ...
>>>>>>>>>   percpu_ref_init
>>>>>>>>>   ...
>>>>>>>>>   setup_swap_info
>>>>>>>>>   /* smp_store_release() is inside percpu_ref_reinit */
>>>>>>>>>   percpu_ref_reinit
>>>>>>>>
>>>>>>>> spin_unlock() has RELEASE semantics already.
>>>>>>>>
>>>>>>>>>   ...
>>>>>>>>>
>>>>>>>>> cpu2
>>>>>>>>> get_swap_device()
>>>>>>>>>   /* ignored  smp_rmb() */
>>>>>>>>>   percpu_ref_tryget_live
>>>>>>>>
>>>>>>>> Some kind of ACQUIRE is required here to guarantee the refcount is
>>>>>>>> checked before fetching the other fields of swap_info_struct.  I have
>>>>>>>> sent out a RFC patch to mailing list to discuss this.
>>>>>
>>>>> I'm just catching up and following along a little bit. I apologize I
>>>>> haven't read the swap code, but my understanding is you are trying to
>>>>> narrow a race condition with swapoff. That makes sense to me. I'm not
>>>>> sure I follow the need to race with reinitializing the ref though? Is it
>>>>> not possible to wait out the dying swap info and then create a new one
>>>>> rather than push acquire semantics?
>>>>
>>>> We want to check whether the swap entry is valid (that is, the swap
>>>> device isn't swapped off now), prevent it from swapping off, then access
>>>> the swap_info_struct data structure.  When accessing swap_info_struct,
>>>> we want to guarantee the ordering, so that we will not reference
>>>> uninitialized fields of swap_info_struct.
>>>>
>>>
>>> So in the normal context of percpu_ref, once someone can access it, the
>>> elements that it is protecting are expected to be initialized.
>>
>> If we can make sure that all elements being initialized fully, why not
>> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>>
> 
> Generally, the lookup is protected with rcu and then
> percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
> only good if you already have a ref as it increments regardless of being
> 0.
> 
> What I mean is if you can get a ref, that means the object hasn't been
> destroyed. This differs from the semantics you are looking for which I

This assumption might not hold for swap. If we can get a ref, that means
the object either hasn't been destroyed, or has been destroyed and created
again. This is because a swp_entry can be held for a really long time, during
which swapoff+swapon may have happened. So we may get a ref to a newly
swapon-ed swap device using an old swap_entry. Therefore we must guarantee
that we will not reference uninitialized fields of the newly swapon-ed swap device.

Does this make sense for you? Thanks.

> understand to be: I have long lived pointers to objects. The object may
> die, but I may resurrect it and I want the old pointers to still be
> valid.
> 
> When is it possible for someone to have a pointer to the swap device and
> the refcount goes to 0? It might be better to avoid this situation than
> add acquire semantics.>
>>> In the basic case for swap off, I'm seeing the goal as to prevent
>>> destruction until anyone currently accessing swap is done. In this
>>> case wouldn't we always be protecting a live struct?
>>>
>>> I'm maybe not understanding what conditions you're trying to revive the
>>> percpu_ref?
>>
>> A swap entry likes an indirect pointer to a swap device.  We may hold a
>> swap entry for long time, so that the swap device is swapoff/swapon.
>> Then we need to make sure the swap device are fully initialized before
>> accessing the swap device via the swap entry.
>>
> 
> So if I have some number of outstanding references, and then
> percpu_ref_kill() is called, then only those that have the pointer will
> be able to use the swap device as those references are still good. Prior
> to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
> data structure.
> 
> My personal understanding of tryget() vs tryget_live() is that it
> provides a 2 phase clean up and bounds the ability for new users to come
> in (cgroup destruction is a primary user). As tryget() might inevitably
> let a cgroup live long past its removal, tryget_live() will say oh
> you're in the process of dying do something else.
> 
> Thanks,
> Dennis
> 
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff
  2021-04-14 16:13         ` Tim Chen
@ 2021-04-15  3:19           ` Miaohe Lin
  0 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-15  3:19 UTC (permalink / raw)
  To: Tim Chen, Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, linux-kernel, linux-mm

On 2021/4/15 0:13, Tim Chen wrote:
> 
> 
> On 4/13/21 6:04 PM, Huang, Ying wrote:
>> Tim Chen <tim.c.chen@linux.intel.com> writes:
>>
>>> On 4/12/21 6:27 PM, Huang, Ying wrote:
>>>
>>>>
>>>> This isn't the commit that introduces the race.  You can use `git blame`
>>>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>>>> swap: skip swapcache for swapin of synchronous device".
>>>>
>>>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>>>> picture.
>>>
>>> I'll suggest make fix to do_swap_page race with get/put_swap_device
>>> as a first patch. Then the per_cpu_ref stuff in patch 1 and patch 2 can
>>> be combined together.
>>
>> The original get/put_swap_device() use rcu_read_lock/unlock().  I don't
>> think it's good to wrap swap_read_page() with it.  After all, some
>> complex operations are done in swap_read_page(), including
>> blk_io_schedule().
>>
> 
> In that case then have the patches to make get/put_swap_device to use
> percpu_ref first.  And the patch to to fix the race in do_swap_page
> later in another patch.
> 
> Patch 2 is mixing the two.
> 

Looks like a good way to organize this patch series. Many thanks!

> Tim
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-15  3:16                         ` Miaohe Lin
@ 2021-04-15  4:20                           ` Dennis Zhou
  2021-04-15  9:17                             ` Miaohe Lin
  0 siblings, 1 reply; 72+ messages in thread
From: Dennis Zhou @ 2021-04-15  4:20 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: Huang, Ying, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

On Thu, Apr 15, 2021 at 11:16:42AM +0800, Miaohe Lin wrote:
> On 2021/4/14 22:53, Dennis Zhou wrote:
> > On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
> >> Dennis Zhou <dennis@kernel.org> writes:
> >>
> >>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
> >>>> Dennis Zhou <dennis@kernel.org> writes:
> >>>>
> >>>>> Hello,
> >>>>>
> >>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
> >>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
> >>>>>>
> >>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
> >>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
> >>>>>>>>
> >>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
> >>>>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
> >>>>>>>>>>
> >>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
> >>>>>>>>>>>
> >>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
> >>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> >>>>>>>>>>>> ---
> >>>>>>>>>>>>  include/linux/swap.h |  2 ++
> >>>>>>>>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
> >>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
> >>>>>>>>>>>>
> >>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
> >>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
> >>>>>>>>>>>> --- a/include/linux/swap.h
> >>>>>>>>>>>> +++ b/include/linux/swap.h
> >>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
> >>>>>>>>>>>>   * The in-memory structure used to track swap areas.
> >>>>>>>>>>>>   */
> >>>>>>>>>>>>  struct swap_info_struct {
> >>>>>>>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
> >>>>>>>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
> >>>>>>>>>>>>  	signed short	prio;		/* swap priority of this type */
> >>>>>>>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
> >>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
> >>>>>>>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
> >>>>>>>>>>>>  	struct file *swap_file;		/* seldom referenced */
> >>>>>>>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
> >>>>>>>>>>>> +	struct completion comp;		/* seldom referenced */
> >>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
> >>>>>>>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
> >>>>>>>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> >>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
> >>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
> >>>>>>>>>>>> --- a/mm/swapfile.c
> >>>>>>>>>>>> +++ b/mm/swapfile.c
> >>>>>>>>>>>> @@ -39,6 +39,7 @@
> >>>>>>>>>>>>  #include <linux/export.h>
> >>>>>>>>>>>>  #include <linux/swap_slots.h>
> >>>>>>>>>>>>  #include <linux/sort.h>
> >>>>>>>>>>>> +#include <linux/completion.h>
> >>>>>>>>>>>>  
> >>>>>>>>>>>>  #include <asm/tlbflush.h>
> >>>>>>>>>>>>  #include <linux/swapops.h>
> >>>>>>>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
> >>>>>>>>>>>>  	spin_unlock(&si->lock);
> >>>>>>>>>>>>  }
> >>>>>>>>>>>>  
> >>>>>>>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
> >>>>>>>>>>>> +{
> >>>>>>>>>>>> +	struct swap_info_struct *si;
> >>>>>>>>>>>> +
> >>>>>>>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
> >>>>>>>>>>>> +	complete(&si->comp);
> >>>>>>>>>>>> +	percpu_ref_exit(&si->users);
> >>>>>>>>>>>
> >>>>>>>>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
> >>>>>>>>>>> get_swap_device(), better to add comments there.
> >>>>>>>>>>
> >>>>>>>>>> I just noticed that the comments of percpu_ref_tryget_live() says,
> >>>>>>>>>>
> >>>>>>>>>>  * This function is safe to call as long as @ref is between init and exit.
> >>>>>>>>>>
> >>>>>>>>>> While we need to call get_swap_device() almost at any time, so it's
> >>>>>>>>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
> >>>>>>>>>> memory, but we need to follow the API definition to avoid potential
> >>>>>>>>>> issues in the long term.
> >>>>>>>>>
> >>>>>>>>> I have to admit that I'am not really familiar with percpu_ref. So I read the
> >>>>>>>>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
> >>>>>>>>> be called after exit now. But you're right we need to follow the API definition
> >>>>>>>>> to avoid potential issues in the long term.
> >>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> And we need to call percpu_ref_init() before insert the swap_info_struct
> >>>>>>>>>> into the swap_info[].
> >>>>>>>>>
> >>>>>>>>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
> >>>>>>>>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
> >>>>>>>>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
> >>>>>>>>> or percpu_ref_resurrect() will do the work.
> >>>>>>>>>
> >>>>>>>>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
> >>>>>>>>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
> >>>>>>>>> Maybe I could do this in alloc_swap_info()?
> >>>>>>>>
> >>>>>>>> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
> >>>>>>>> reused swap_info_struct.
> >>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>>>> +}
> >>>>>>>>>>>> +
> >>>>>>>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >>>>>>>>>>>>  {
> >>>>>>>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
> >>>>>>>>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
> >>>>>>>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
> >>>>>>>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
> >>>>>>>>>>>>  	 */
> >>>>>>>>>>>> -	synchronize_rcu();
> >>>>>>>>>>>> +	percpu_ref_reinit(&p->users);
> >>>>>>>>>>>
> >>>>>>>>>>> Although the effect is same, I think it's better to use
> >>>>>>>>>>> percpu_ref_resurrect() here to improve code readability.
> >>>>>>>>>>
> >>>>>>>>>> Check the original commit description for commit eb085574a752 "mm, swap:
> >>>>>>>>>> fix race between swapoff and some swap operations" and discussion email
> >>>>>>>>>> thread as follows again,
> >>>>>>>>>>
> >>>>>>>>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
> >>>>>>>>>>
> >>>>>>>>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
> >>>>>>>>>> smp_load_acquire() in get_swap_device().  Now we will use
> >>>>>>>>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
> >>>>>>>>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
> >>>>>>>>>> ACQUIRE semantics.  Per my understanding, we need to change
> >>>>>>>>>> percpu_ref_tryget_live() for that.
> >>>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> Do you mean the below scene is possible?
> >>>>>>>>>
> >>>>>>>>> cpu1
> >>>>>>>>> swapon()
> >>>>>>>>>   ...
> >>>>>>>>>   percpu_ref_init
> >>>>>>>>>   ...
> >>>>>>>>>   setup_swap_info
> >>>>>>>>>   /* smp_store_release() is inside percpu_ref_reinit */
> >>>>>>>>>   percpu_ref_reinit
> >>>>>>>>
> >>>>>>>> spin_unlock() has RELEASE semantics already.
> >>>>>>>>
> >>>>>>>>>   ...
> >>>>>>>>>
> >>>>>>>>> cpu2
> >>>>>>>>> get_swap_device()
> >>>>>>>>>   /* ignored  smp_rmb() */
> >>>>>>>>>   percpu_ref_tryget_live
> >>>>>>>>
> >>>>>>>> Some kind of ACQUIRE is required here to guarantee the refcount is
> >>>>>>>> checked before fetching the other fields of swap_info_struct.  I have
> >>>>>>>> sent out a RFC patch to mailing list to discuss this.
> >>>>>
> >>>>> I'm just catching up and following along a little bit. I apologize I
> >>>>> haven't read the swap code, but my understanding is you are trying to
> >>>>> narrow a race condition with swapoff. That makes sense to me. I'm not
> >>>>> sure I follow the need to race with reinitializing the ref though? Is it
> >>>>> not possible to wait out the dying swap info and then create a new one
> >>>>> rather than push acquire semantics?
> >>>>
> >>>> We want to check whether the swap entry is valid (that is, the swap
> >>>> device isn't swapped off now), prevent it from swapping off, then access
> >>>> the swap_info_struct data structure.  When accessing swap_info_struct,
> >>>> we want to guarantee the ordering, so that we will not reference
> >>>> uninitialized fields of swap_info_struct.
> >>>>
> >>>
> >>> So in the normal context of percpu_ref, once someone can access it, the
> >>> elements that it is protecting are expected to be initialized.
> >>
> >> If we can make sure that all elements being initialized fully, why not
> >> just use percpu_ref_get() instead of percpu_ref_tryget*()?
> >>
> > 
> > Generally, the lookup is protected with rcu and then
> > percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
> > only good if you already have a ref as it increments regardless of being
> > 0.
> > 
> > What I mean is if you can get a ref, that means the object hasn't been
> > destroyed. This differs from the semantics you are looking for which I
> 
> This assumption might not be held for swap. If we can get a ref, that means
> the object hasn't been destroyed or the object has been destroyed and created
> again. It's because swp_entry can hold a really long time while swapoff+swapon
> happened. So we may get a ref to a newly swapon-ed swap device using old swap_entry.
> So we must guarantee that we will not reference uninitialized fields of newly
> swapon-ed swap device.
> 
> Does this make sense for you? Thanks.
> 

Okay, if I understand this right, the need arises because:

struct swap_info_struct *swap_info[MAX_SWAPFILES];

swap_info[type] is recreated in place. And a swap_entry keeps a
swap_type and that is how it gets the value.

An alternative to that approach is to adopt something similar to how
cgroups does it which is with rcu and not constructing the object in
place.

rcu_read_lock();
swap_info_struct *info = swap_info[type];
got_ref = percpu_ref_tryget_live(&info->refcnt);
rcu_read_unlock();


However, I do not have a good sense of the cost of rcu + this vs
an acquire + release barrier.

<snip>

Thanks,
Dennis

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-14 14:53                       ` Dennis Zhou
@ 2021-04-15  5:24                           ` Huang, Ying
  2021-04-15  5:24                           ` Huang, Ying
  1 sibling, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-15  5:24 UTC (permalink / raw)
  To: Dennis Zhou
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Dennis Zhou <dennis@kernel.org> writes:

> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>> Dennis Zhou <dennis@kernel.org> writes:
>> 
>> > On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>> >> Dennis Zhou <dennis@kernel.org> writes:
>> >> 
>> >> > Hello,
>> >> >
>> >> > On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> 
>> >> >> > On 2021/4/14 9:17, Huang, Ying wrote:
>> >> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >> 
>> >> >> >>> On 2021/4/12 15:24, Huang, Ying wrote:
>> >> >> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
>> >> >> >>>>
>> >> >> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >>>>>
>> >> >> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> >> >> >>>>>> patch adds the percpu_ref support for later fixup.
>> >> >> >>>>>>
>> >> >> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> >> >> >>>>>> ---
>> >> >> >>>>>>  include/linux/swap.h |  2 ++
>> >> >> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>> >> >> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>> >> >> >>>>>>
>> >> >> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >> >> >>>>>> index 144727041e78..849ba5265c11 100644
>> >> >> >>>>>> --- a/include/linux/swap.h
>> >> >> >>>>>> +++ b/include/linux/swap.h
>> >> >> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>> >> >> >>>>>>   * The in-memory structure used to track swap areas.
>> >> >> >>>>>>   */
>> >> >> >>>>>>  struct swap_info_struct {
>> >> >> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> >> >> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>> >> >> >>>>>>  	signed short	prio;		/* swap priority of this type */
>> >> >> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>> >> >> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>> >> >> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>> >> >> >>>>>>  	struct file *swap_file;		/* seldom referenced */
>> >> >> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>> >> >> >>>>>> +	struct completion comp;		/* seldom referenced */
>> >> >> >>>>>>  #ifdef CONFIG_FRONTSWAP
>> >> >> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>> >> >> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> >> >> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> >> >> >>>>>> index 149e77454e3c..724173cd7d0c 100644
>> >> >> >>>>>> --- a/mm/swapfile.c
>> >> >> >>>>>> +++ b/mm/swapfile.c
>> >> >> >>>>>> @@ -39,6 +39,7 @@
>> >> >> >>>>>>  #include <linux/export.h>
>> >> >> >>>>>>  #include <linux/swap_slots.h>
>> >> >> >>>>>>  #include <linux/sort.h>
>> >> >> >>>>>> +#include <linux/completion.h>
>> >> >> >>>>>>  
>> >> >> >>>>>>  #include <asm/tlbflush.h>
>> >> >> >>>>>>  #include <linux/swapops.h>
>> >> >> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>> >> >> >>>>>>  	spin_unlock(&si->lock);
>> >> >> >>>>>>  }
>> >> >> >>>>>>  
>> >> >> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> >> >> >>>>>> +{
>> >> >> >>>>>> +	struct swap_info_struct *si;
>> >> >> >>>>>> +
>> >> >> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>> >> >> >>>>>> +	complete(&si->comp);
>> >> >> >>>>>> +	percpu_ref_exit(&si->users);
>> >> >> >>>>>
>> >> >> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> >> >> >>>>> get_swap_device(), better to add comments there.
>> >> >> >>>>
>> >> >> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> >> >> >>>>
>> >> >> >>>>  * This function is safe to call as long as @ref is between init and exit.
>> >> >> >>>>
>> >> >> >>>> While we need to call get_swap_device() almost at any time, so it's
>> >> >> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> >> >> >>>> memory, but we need to follow the API definition to avoid potential
>> >> >> >>>> issues in the long term.
>> >> >> >>>
>> >> >> >>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>> >> >> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>> >> >> >>> be called after exit now. But you're right we need to follow the API definition
>> >> >> >>> to avoid potential issues in the long term.
>> >> >> >>>
>> >> >> >>>>
>> >> >> >>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>> >> >> >>>> into the swap_info[].
>> >> >> >>>
>> >> >> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>> >> >> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>> >> >> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>> >> >> >>> or percpu_ref_resurrect() will do the work.
>> >> >> >>>
>> >> >> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>> >> >> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>> >> >> >>> Maybe I could do this in alloc_swap_info()?
>> >> >> >> 
>> >> >> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> >> >> >> reused swap_info_struct.
>> >> >> >> 
>> >> >> >>>>
>> >> >> >>>>>> +}
>> >> >> >>>>>> +
>> >> >> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> >> >> >>>>>>  {
>> >> >> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>> >> >> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>> >> >> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> >> >> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>> >> >> >>>>>>  	 */
>> >> >> >>>>>> -	synchronize_rcu();
>> >> >> >>>>>> +	percpu_ref_reinit(&p->users);
>> >> >> >>>>>
>> >> >> >>>>> Although the effect is same, I think it's better to use
>> >> >> >>>>> percpu_ref_resurrect() here to improve code readability.
>> >> >> >>>>
>> >> >> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
>> >> >> >>>> fix race between swapoff and some swap operations" and discussion email
>> >> >> >>>> thread as follows again,
>> >> >> >>>>
>> >> >> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> >> >> >>>>
>> >> >> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> >> >> >>>> smp_load_acquire() in get_swap_device().  Now we will use
>> >> >> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> >> >> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> >> >> >>>> ACQUIRE semantics.  Per my understanding, we need to change
>> >> >> >>>> percpu_ref_tryget_live() for that.
>> >> >> >>>>
>> >> >> >>>
>> >> >> >>> Do you mean the below scene is possible?
>> >> >> >>>
>> >> >> >>> cpu1
>> >> >> >>> swapon()
>> >> >> >>>   ...
>> >> >> >>>   percpu_ref_init
>> >> >> >>>   ...
>> >> >> >>>   setup_swap_info
>> >> >> >>>   /* smp_store_release() is inside percpu_ref_reinit */
>> >> >> >>>   percpu_ref_reinit
>> >> >> >> 
>> >> >> >> spin_unlock() has RELEASE semantics already.
>> >> >> >> 
>> >> >> >>>   ...
>> >> >> >>>
>> >> >> >>> cpu2
>> >> >> >>> get_swap_device()
>> >> >> >>>   /* ignored  smp_rmb() */
>> >> >> >>>   percpu_ref_tryget_live
>> >> >> >> 
>> >> >> >> Some kind of ACQUIRE is required here to guarantee the refcount is
>> >> >> >> checked before fetching the other fields of swap_info_struct.  I have
>> >> >> >> sent out a RFC patch to mailing list to discuss this.
>> >> >
>> >> > I'm just catching up and following along a little bit. I apologize I
>> >> > haven't read the swap code, but my understanding is you are trying to
>> >> > narrow a race condition with swapoff. That makes sense to me. I'm not
>> >> > sure I follow the need to race with reinitializing the ref though? Is it
>> >> > not possible to wait out the dying swap info and then create a new one
>> >> > rather than push acquire semantics?
>> >> 
>> >> We want to check whether the swap entry is valid (that is, the swap
>> >> device isn't swapped off now), prevent it from swapping off, then access
>> >> the swap_info_struct data structure.  When accessing swap_info_struct,
>> >> we want to guarantee the ordering, so that we will not reference
>> >> uninitialized fields of swap_info_struct.
>> >> 
>> >
>> > So in the normal context of percpu_ref, once someone can access it, the
>> > elements that it is protecting are expected to be initialized.
>> 
>> If we can make sure that all elements being initialized fully, why not
>> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>> 
>
> Generally, the lookup is protected with rcu and then
> percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
> only good if you already have a ref as it increments regardless of being
> 0.
>
> What I mean is if you can get a ref, that means the object hasn't been
> destroyed. This differs from the semantics you are looking for which I
> understand to be: I have long lived pointers to objects. The object may
> die, but I may resurrect it and I want the old pointers to still be
> valid.
>
> When is it possible for someone to have a pointer to the swap device and
> the refcount goes to 0? It might be better to avoid this situation than
> add acquire semantics.
>
>> > In the basic case for swap off, I'm seeing the goal as to prevent
>> > destruction until anyone currently accessing swap is done. In this
>> > case wouldn't we always be protecting a live struct?
>> >
>> > I'm maybe not understanding what conditions you're trying to revive the
>> > percpu_ref?
>> 
>> A swap entry likes an indirect pointer to a swap device.  We may hold a
>> swap entry for long time, so that the swap device is swapoff/swapon.
>> Then we need to make sure the swap device are fully initialized before
>> accessing the swap device via the swap entry.
>> 
>
> So if I have some number of outstanding references, and then
> percpu_ref_kill() is called, then only those that have the pointer will
> be able to use the swap device as those references are still good. Prior
> to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
> data structure.
>
> My personal understanding of tryget() vs tryget_live() is that it
> provides a 2 phase clean up and bounds the ability for new users to come
> in (cgroup destruction is a primary user). As tryget() might inevitably
> let a cgroup live long past its removal, tryget_live() will say oh
> you're in the process of dying do something else.

OK.  I think that I understand your typical use case now.  The resource
producer code may look like,

  obj = kmalloc();
  /* Initialize obj fields */
  percpu_ref_init(&obj->ref);
  rcu_assign_pointer(global_p, obj);

The resource reclaimer looks like,

  p = global_p;
  global_p = NULL;
  percpu_ref_kill(&p->ref);
  /* wait until percpu_ref_is_zero(&p->ref) */
  /* free resources pointed by obj fields */
  kfree(p);

The resource user (consumer) looks like,

  rcu_read_lock();
  p = rcu_dereference(global_p);
  if (!p || !percpu_ref_tryget_live(&p->ref)) {
          /* Invalid pointer, go out */
  }
  rcu_read_unlock();
  /* use p */
  percpu_ref_put(&p->ref);

For this use case, it's not necessary to make percpu_ref_tryget_live()
ACQUIRE operation.  Because refcount doesn't act as a flag to indicate
whether the object has been fully initialized, global_p does.  And
the data dependency guaranteed the required ordering.

The use case of swap is different.  Here, global_p always points to
the obj (never freed) even if the resources pointed to by the obj fields have
been freed.  And we want to use the refcount as a flag to indicate whether
the object is fully initialized.  This is hard to change, because
global_p is used to distinguish the stale pointer from a totally
invalid pointer.

If all other users follow the typical use case above, we may find some
other way to resolve the problem inside swap code, such as adding
smp_rmb() after percpu_ref_tryget_live().

Best Regards,
Huang, Ying

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-15  5:24                           ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-15  5:24 UTC (permalink / raw)
  To: Dennis Zhou
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Dennis Zhou <dennis@kernel.org> writes:

> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>> Dennis Zhou <dennis@kernel.org> writes:
>> 
>> > On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>> >> Dennis Zhou <dennis@kernel.org> writes:
>> >> 
>> >> > Hello,
>> >> >
>> >> > On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> 
>> >> >> > On 2021/4/14 9:17, Huang, Ying wrote:
>> >> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >> 
>> >> >> >>> On 2021/4/12 15:24, Huang, Ying wrote:
>> >> >> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
>> >> >> >>>>
>> >> >> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >>>>>
>> >> >> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> >> >> >>>>>> patch adds the percpu_ref support for later fixup.
>> >> >> >>>>>>
>> >> >> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> >> >> >>>>>> ---
>> >> >> >>>>>>  include/linux/swap.h |  2 ++
>> >> >> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>> >> >> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>> >> >> >>>>>>
>> >> >> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >> >> >>>>>> index 144727041e78..849ba5265c11 100644
>> >> >> >>>>>> --- a/include/linux/swap.h
>> >> >> >>>>>> +++ b/include/linux/swap.h
>> >> >> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>> >> >> >>>>>>   * The in-memory structure used to track swap areas.
>> >> >> >>>>>>   */
>> >> >> >>>>>>  struct swap_info_struct {
>> >> >> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> >> >> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>> >> >> >>>>>>  	signed short	prio;		/* swap priority of this type */
>> >> >> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>> >> >> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>> >> >> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>> >> >> >>>>>>  	struct file *swap_file;		/* seldom referenced */
>> >> >> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>> >> >> >>>>>> +	struct completion comp;		/* seldom referenced */
>> >> >> >>>>>>  #ifdef CONFIG_FRONTSWAP
>> >> >> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>> >> >> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> >> >> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> >> >> >>>>>> index 149e77454e3c..724173cd7d0c 100644
>> >> >> >>>>>> --- a/mm/swapfile.c
>> >> >> >>>>>> +++ b/mm/swapfile.c
>> >> >> >>>>>> @@ -39,6 +39,7 @@
>> >> >> >>>>>>  #include <linux/export.h>
>> >> >> >>>>>>  #include <linux/swap_slots.h>
>> >> >> >>>>>>  #include <linux/sort.h>
>> >> >> >>>>>> +#include <linux/completion.h>
>> >> >> >>>>>>  
>> >> >> >>>>>>  #include <asm/tlbflush.h>
>> >> >> >>>>>>  #include <linux/swapops.h>
>> >> >> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>> >> >> >>>>>>  	spin_unlock(&si->lock);
>> >> >> >>>>>>  }
>> >> >> >>>>>>  
>> >> >> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> >> >> >>>>>> +{
>> >> >> >>>>>> +	struct swap_info_struct *si;
>> >> >> >>>>>> +
>> >> >> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>> >> >> >>>>>> +	complete(&si->comp);
>> >> >> >>>>>> +	percpu_ref_exit(&si->users);
>> >> >> >>>>>
>> >> >> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> >> >> >>>>> get_swap_device(), better to add comments there.
>> >> >> >>>>
>> >> >> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> >> >> >>>>
>> >> >> >>>>  * This function is safe to call as long as @ref is between init and exit.
>> >> >> >>>>
>> >> >> >>>> While we need to call get_swap_device() almost at any time, so it's
>> >> >> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> >> >> >>>> memory, but we need to follow the API definition to avoid potential
>> >> >> >>>> issues in the long term.
>> >> >> >>>
>> >> >> >>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>> >> >> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>> >> >> >>> be called after exit now. But you're right we need to follow the API definition
>> >> >> >>> to avoid potential issues in the long term.
>> >> >> >>>
>> >> >> >>>>
>> >> >> >>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>> >> >> >>>> into the swap_info[].
>> >> >> >>>
>> >> >> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>> >> >> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>> >> >> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>> >> >> >>> or percpu_ref_resurrect() will do the work.
>> >> >> >>>
>> >> >> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>> >> >> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>> >> >> >>> Maybe I could do this in alloc_swap_info()?
>> >> >> >> 
>> >> >> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> >> >> >> reused swap_info_struct.
>> >> >> >> 
>> >> >> >>>>
>> >> >> >>>>>> +}
>> >> >> >>>>>> +
>> >> >> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> >> >> >>>>>>  {
>> >> >> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>> >> >> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>> >> >> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> >> >> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>> >> >> >>>>>>  	 */
>> >> >> >>>>>> -	synchronize_rcu();
>> >> >> >>>>>> +	percpu_ref_reinit(&p->users);
>> >> >> >>>>>
>> >> >> >>>>> Although the effect is same, I think it's better to use
>> >> >> >>>>> percpu_ref_resurrect() here to improve code readability.
>> >> >> >>>>
>> >> >> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
>> >> >> >>>> fix race between swapoff and some swap operations" and discussion email
>> >> >> >>>> thread as follows again,
>> >> >> >>>>
>> >> >> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> >> >> >>>>
>> >> >> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> >> >> >>>> smp_load_acquire() in get_swap_device().  Now we will use
>> >> >> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> >> >> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> >> >> >>>> ACQUIRE semantics.  Per my understanding, we need to change
>> >> >> >>>> percpu_ref_tryget_live() for that.
>> >> >> >>>>
>> >> >> >>>
>> >> >> >>> Do you mean the below scene is possible?
>> >> >> >>>
>> >> >> >>> cpu1
>> >> >> >>> swapon()
>> >> >> >>>   ...
>> >> >> >>>   percpu_ref_init
>> >> >> >>>   ...
>> >> >> >>>   setup_swap_info
>> >> >> >>>   /* smp_store_release() is inside percpu_ref_reinit */
>> >> >> >>>   percpu_ref_reinit
>> >> >> >> 
>> >> >> >> spin_unlock() has RELEASE semantics already.
>> >> >> >> 
>> >> >> >>>   ...
>> >> >> >>>
>> >> >> >>> cpu2
>> >> >> >>> get_swap_device()
>> >> >> >>>   /* ignored  smp_rmb() */
>> >> >> >>>   percpu_ref_tryget_live
>> >> >> >> 
>> >> >> >> Some kind of ACQUIRE is required here to guarantee the refcount is
>> >> >> >> checked before fetching the other fields of swap_info_struct.  I have
>> >> >> >> sent out a RFC patch to mailing list to discuss this.
>> >> >
>> >> > I'm just catching up and following along a little bit. I apologize I
>> >> > haven't read the swap code, but my understanding is you are trying to
>> >> > narrow a race condition with swapoff. That makes sense to me. I'm not
>> >> > sure I follow the need to race with reinitializing the ref though? Is it
>> >> > not possible to wait out the dying swap info and then create a new one
>> >> > rather than push acquire semantics?
>> >> 
>> >> We want to check whether the swap entry is valid (that is, the swap
>> >> device isn't swapped off now), prevent it from swapping off, then access
>> >> the swap_info_struct data structure.  When accessing swap_info_struct,
>> >> we want to guarantee the ordering, so that we will not reference
>> >> uninitialized fields of swap_info_struct.
>> >> 
>> >
>> > So in the normal context of percpu_ref, once someone can access it, the
>> > elements that it is protecting are expected to be initialized.
>> 
>> If we can make sure that all elements being initialized fully, why not
>> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>> 
>
> Generally, the lookup is protected with rcu and then
> percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
> only good if you already have a ref as it increments regardless of being
> 0.
>
> What I mean is if you can get a ref, that means the object hasn't been
> destroyed. This differs from the semantics you are looking for which I
> understand to be: I have long lived pointers to objects. The object may
> die, but I may resurrect it and I want the old pointers to still be
> valid.
>
> When is it possible for someone to have a pointer to the swap device and
> the refcount goes to 0? It might be better to avoid this situation than
> add acquire semantics.
>
>> > In the basic case for swap off, I'm seeing the goal as to prevent
>> > destruction until anyone currently accessing swap is done. In this
>> > case wouldn't we always be protecting a live struct?
>> >
>> > I'm maybe not understanding what conditions you're trying to revive the
>> > percpu_ref?
>> 
>> A swap entry likes an indirect pointer to a swap device.  We may hold a
>> swap entry for long time, so that the swap device is swapoff/swapon.
>> Then we need to make sure the swap device are fully initialized before
>> accessing the swap device via the swap entry.
>> 
>
> So if I have some number of outstanding references, and then
> percpu_ref_kill() is called, then only those that have the pointer will
> be able to use the swap device as those references are still good. Prior
> to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
> data structure.
>
> My personal understanding of tryget() vs tryget_live() is that it
> provides a 2 phase clean up and bounds the ability for new users to come
> in (cgroup destruction is a primary user). As tryget() might inevitably
> let a cgroup live long past its removal, tryget_live() will say oh
> you're in the process of dying do something else.

OK.  I think that I understand your typical use case now.  The resource
producer code may look like,

  obj = kmalloc();
  /* Initialize obj fields */
  percpu_ref_init(&obj->ref);
  rcu_assign_pointer(global_p, obj);

The resource reclaimer looks like,

  p = global_p;
  global_p = NULL;
  percpu_ref_kill(&p->ref);
  /* wait until percpu_ref_is_zero(&p->ref) */
  /* free resources pointed by obj fields */
  kfree(p);

The resource producer looks like,

  rcu_read_lock();
  p = rcu_dereference(global_p);
  if (!p || !percpu_ref_tryget_live(&p->ref)) {
          /* Invalid pointer, go out */
  }
  rcu_read_unlock();
  /* use p */
  percpu_ref_put(&p->ref);

For this use case, it's not necessary to make percpu_ref_tryget_live()
ACQUIRE operation.  Because refcount doesn't act as a flag to indicate
whether the object has been fully initialized, global_p does.  And
the data dependency guaranteed the required ordering.

The use case of swap is different.  Where global_p always points to
the obj (never freed) even if the resources pointed to by obj fields
have been freed.  And we want to use refcount as a flag to indicate
whether the object is fully initialized.  This is hard to change,
because global_p is used to distinguish the stale pointer from the
totally invalid pointer.

If all other users follow the typical use case above, we may find some
other way to resolve the problem inside swap code, such as adding
smp_rmb() after percpu_ref_tryget_live().

Best Regards,
Huang, Ying


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-15  4:20                           ` Dennis Zhou
@ 2021-04-15  9:17                             ` Miaohe Lin
  0 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-15  9:17 UTC (permalink / raw)
  To: Dennis Zhou
  Cc: Huang, Ying, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

On 2021/4/15 12:20, Dennis Zhou wrote:
> On Thu, Apr 15, 2021 at 11:16:42AM +0800, Miaohe Lin wrote:
>> On 2021/4/14 22:53, Dennis Zhou wrote:
>>> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>
>>>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>>>
>>>>>>> Hello,
>>>>>>>
>>>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>
>>>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>
>>>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>>>>>>>
>>>>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>>>>
>>>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>>>   */
>>>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>>>>>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>>>>>>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>>>>>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>>>>>>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>>>>>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>>>>>>>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>>>>>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>>>>>>>>>> --- a/mm/swapfile.c
>>>>>>>>>>>>>> +++ b/mm/swapfile.c
>>>>>>>>>>>>>> @@ -39,6 +39,7 @@
>>>>>>>>>>>>>>  #include <linux/export.h>
>>>>>>>>>>>>>>  #include <linux/swap_slots.h>
>>>>>>>>>>>>>>  #include <linux/sort.h>
>>>>>>>>>>>>>> +#include <linux/completion.h>
>>>>>>>>>>>>>>  
>>>>>>>>>>>>>>  #include <asm/tlbflush.h>
>>>>>>>>>>>>>>  #include <linux/swapops.h>
>>>>>>>>>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>>>>>>>>>>>  	spin_unlock(&si->lock);
>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>  
>>>>>>>>>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>> +	struct swap_info_struct *si;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>>>>>>>>>>> +	complete(&si->comp);
>>>>>>>>>>>>>> +	percpu_ref_exit(&si->users);
>>>>>>>>>>>>>
>>>>>>>>>>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>>>>>>>>>>> get_swap_device(), better to add comments there.
>>>>>>>>>>>>
>>>>>>>>>>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>>>>>>>>>>
>>>>>>>>>>>>  * This function is safe to call as long as @ref is between init and exit.
>>>>>>>>>>>>
>>>>>>>>>>>> While we need to call get_swap_device() almost at any time, so it's
>>>>>>>>>>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>>>>>>>>>>> memory, but we need to follow the API definition to avoid potential
>>>>>>>>>>>> issues in the long term.
>>>>>>>>>>>
>>>>>>>>>>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>>>>>>>>>>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>>>>>>>>>>> be called after exit now. But you're right we need to follow the API definition
>>>>>>>>>>> to avoid potential issues in the long term.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>>>>>>>>>>>> into the swap_info[].
>>>>>>>>>>>
>>>>>>>>>>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>>>>>>>>>>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>>>>>>>>>>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>>>>>>>>>>> or percpu_ref_resurrect() will do the work.
>>>>>>>>>>>
>>>>>>>>>>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>>>>>>>>>>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>>>>>>>>>>> Maybe I could do this in alloc_swap_info()?
>>>>>>>>>>
>>>>>>>>>> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>>>>>>>>>> reused swap_info_struct.
>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>>>>>>>>>>  {
>>>>>>>>>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>>>>>>>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>>>>>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>>>>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>> -	synchronize_rcu();
>>>>>>>>>>>>>> +	percpu_ref_reinit(&p->users);
>>>>>>>>>>>>>
>>>>>>>>>>>>> Although the effect is same, I think it's better to use
>>>>>>>>>>>>> percpu_ref_resurrect() here to improve code readability.
>>>>>>>>>>>>
>>>>>>>>>>>> Check the original commit description for commit eb085574a752 "mm, swap:
>>>>>>>>>>>> fix race between swapoff and some swap operations" and discussion email
>>>>>>>>>>>> thread as follows again,
>>>>>>>>>>>>
>>>>>>>>>>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>>>>>>>>>>>>
>>>>>>>>>>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>>>>>>>>>>>> smp_load_acquire() in get_swap_device().  Now we will use
>>>>>>>>>>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>>>>>>>>>>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>>>>>>>>>>>> ACQUIRE semantics.  Per my understanding, we need to change
>>>>>>>>>>>> percpu_ref_tryget_live() for that.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Do you mean the below scene is possible?
>>>>>>>>>>>
>>>>>>>>>>> cpu1
>>>>>>>>>>> swapon()
>>>>>>>>>>>   ...
>>>>>>>>>>>   percpu_ref_init
>>>>>>>>>>>   ...
>>>>>>>>>>>   setup_swap_info
>>>>>>>>>>>   /* smp_store_release() is inside percpu_ref_reinit */
>>>>>>>>>>>   percpu_ref_reinit
>>>>>>>>>>
>>>>>>>>>> spin_unlock() has RELEASE semantics already.
>>>>>>>>>>
>>>>>>>>>>>   ...
>>>>>>>>>>>
>>>>>>>>>>> cpu2
>>>>>>>>>>> get_swap_device()
>>>>>>>>>>>   /* ignored  smp_rmb() */
>>>>>>>>>>>   percpu_ref_tryget_live
>>>>>>>>>>
>>>>>>>>>> Some kind of ACQUIRE is required here to guarantee the refcount is
>>>>>>>>>> checked before fetching the other fields of swap_info_struct.  I have
>>>>>>>>>> sent out a RFC patch to mailing list to discuss this.
>>>>>>>
>>>>>>> I'm just catching up and following along a little bit. I apologize I
>>>>>>> haven't read the swap code, but my understanding is you are trying to
>>>>>>> narrow a race condition with swapoff. That makes sense to me. I'm not
>>>>>>> sure I follow the need to race with reinitializing the ref though? Is it
>>>>>>> not possible to wait out the dying swap info and then create a new one
>>>>>>> rather than push acquire semantics?
>>>>>>
>>>>>> We want to check whether the swap entry is valid (that is, the swap
>>>>>> device isn't swapped off now), prevent it from swapping off, then access
>>>>>> the swap_info_struct data structure.  When accessing swap_info_struct,
>>>>>> we want to guarantee the ordering, so that we will not reference
>>>>>> uninitialized fields of swap_info_struct.
>>>>>>
>>>>>
>>>>> So in the normal context of percpu_ref, once someone can access it, the
>>>>> elements that it is protecting are expected to be initialized.
>>>>
>>>> If we can make sure that all elements being initialized fully, why not
>>>> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>>>>
>>>
>>> Generally, the lookup is protected with rcu and then
>>> percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
>>> only good if you already have a ref as it increments regardless of being
>>> 0.
>>>
>>> What I mean is if you can get a ref, that means the object hasn't been
>>> destroyed. This differs from the semantics you are looking for which I
>>
>> This assumption might not be held for swap. If we can get a ref, that means
>> the object hasn't been destroyed or the object has been destroyed and created
>> again. It's because swp_entry can hold a really long time while swapoff+swapon
>> happened. So we may get a ref to a newly swapon-ed swap device using old swap_entry.
>> So we must guarantee that we will not reference uninitialized fields of newly
>> swapon-ed swap device.
>>
>> Does this make sense for you? Thanks.
>>
> 
> Okay if I understand this right. The need is because:
> 
> struct swap_info_struct *swap_info[MAX_SWAPFILES];
> 
> swap_info[type] is recreated in place. And a swap_entry keeps a
> swap_type and that is how it gets the value.
> 
> An alternative to that approach is to adopt something similar to how
> cgroups does it which is with rcu and not constructing the object in
> place.
> 
> rcu_read_lock();
> swap_info_struct *info = swap_info[type];
> got_ref = percpu_ref_tryget_live(&info->refcnt);
> rcu_read_unlock();
> 

Looks like a good alternative. But per my understanding, if we use rcu_read_lock
and synchronize_rcu to provide the acquire + release barrier, all references
to the fields of the swap device should be inside the RCU critical section. This
could not fix the do_swap_page() race with swapoff properly, as patch 2/5 pointed out.

Please see below discussion provided by Huang, Ying previously:
https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/

> 
> However, I do not have a good sense of the cost of rcu + this vs
> an acquire + release barrier.
> 
> <snip>
> 
> Thanks,
> Dennis
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-15  5:24                           ` Huang, Ying
  (?)
@ 2021-04-15 14:31                           ` Dennis Zhou
  2021-04-16  0:54                               ` Huang, Ying
  2021-04-16  2:27                             ` Miaohe Lin
  -1 siblings, 2 replies; 72+ messages in thread
From: Dennis Zhou @ 2021-04-15 14:31 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

On Thu, Apr 15, 2021 at 01:24:31PM +0800, Huang, Ying wrote:
> Dennis Zhou <dennis@kernel.org> writes:
> 
> > On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
> >> Dennis Zhou <dennis@kernel.org> writes:
> >> 
> >> > On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
> >> >> Dennis Zhou <dennis@kernel.org> writes:
> >> >> 
> >> >> > Hello,
> >> >> >
> >> >> > On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
> >> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> >> >> 
> >> >> >> > On 2021/4/14 9:17, Huang, Ying wrote:
> >> >> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> >> >> >> 
> >> >> >> >>> On 2021/4/12 15:24, Huang, Ying wrote:
> >> >> >> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
> >> >> >> >>>>
> >> >> >> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
> >> >> >> >>>>>
> >> >> >> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
> >> >> >> >>>>>> patch adds the percpu_ref support for later fixup.
> >> >> >> >>>>>>
> >> >> >> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> >> >> >> >>>>>> ---
> >> >> >> >>>>>>  include/linux/swap.h |  2 ++
> >> >> >> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
> >> >> >> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
> >> >> >> >>>>>>
> >> >> >> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
> >> >> >> >>>>>> index 144727041e78..849ba5265c11 100644
> >> >> >> >>>>>> --- a/include/linux/swap.h
> >> >> >> >>>>>> +++ b/include/linux/swap.h
> >> >> >> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
> >> >> >> >>>>>>   * The in-memory structure used to track swap areas.
> >> >> >> >>>>>>   */
> >> >> >> >>>>>>  struct swap_info_struct {
> >> >> >> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
> >> >> >> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
> >> >> >> >>>>>>  	signed short	prio;		/* swap priority of this type */
> >> >> >> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
> >> >> >> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
> >> >> >> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
> >> >> >> >>>>>>  	struct file *swap_file;		/* seldom referenced */
> >> >> >> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
> >> >> >> >>>>>> +	struct completion comp;		/* seldom referenced */
> >> >> >> >>>>>>  #ifdef CONFIG_FRONTSWAP
> >> >> >> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
> >> >> >> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> >> >> >> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
> >> >> >> >>>>>> index 149e77454e3c..724173cd7d0c 100644
> >> >> >> >>>>>> --- a/mm/swapfile.c
> >> >> >> >>>>>> +++ b/mm/swapfile.c
> >> >> >> >>>>>> @@ -39,6 +39,7 @@
> >> >> >> >>>>>>  #include <linux/export.h>
> >> >> >> >>>>>>  #include <linux/swap_slots.h>
> >> >> >> >>>>>>  #include <linux/sort.h>
> >> >> >> >>>>>> +#include <linux/completion.h>
> >> >> >> >>>>>>  
> >> >> >> >>>>>>  #include <asm/tlbflush.h>
> >> >> >> >>>>>>  #include <linux/swapops.h>
> >> >> >> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
> >> >> >> >>>>>>  	spin_unlock(&si->lock);
> >> >> >> >>>>>>  }
> >> >> >> >>>>>>  
> >> >> >> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
> >> >> >> >>>>>> +{
> >> >> >> >>>>>> +	struct swap_info_struct *si;
> >> >> >> >>>>>> +
> >> >> >> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
> >> >> >> >>>>>> +	complete(&si->comp);
> >> >> >> >>>>>> +	percpu_ref_exit(&si->users);
> >> >> >> >>>>>
> >> >> >> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
> >> >> >> >>>>> get_swap_device(), better to add comments there.
> >> >> >> >>>>
> >> >> >> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
> >> >> >> >>>>
> >> >> >> >>>>  * This function is safe to call as long as @ref is between init and exit.
> >> >> >> >>>>
> >> >> >> >>>> While we need to call get_swap_device() almost at any time, so it's
> >> >> >> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
> >> >> >> >>>> memory, but we need to follow the API definition to avoid potential
> >> >> >> >>>> issues in the long term.
> >> >> >> >>>
> >> >> >> >>> I have to admit that I'am not really familiar with percpu_ref. So I read the
> >> >> >> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
> >> >> >> >>> be called after exit now. But you're right we need to follow the API definition
> >> >> >> >>> to avoid potential issues in the long term.
> >> >> >> >>>
> >> >> >> >>>>
> >> >> >> >>>> And we need to call percpu_ref_init() before insert the swap_info_struct
> >> >> >> >>>> into the swap_info[].
> >> >> >> >>>
> >> >> >> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
> >> >> >> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
> >> >> >> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
> >> >> >> >>> or percpu_ref_resurrect() will do the work.
> >> >> >> >>>
> >> >> >> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
> >> >> >> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
> >> >> >> >>> Maybe I could do this in alloc_swap_info()?
> >> >> >> >> 
> >> >> >> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
> >> >> >> >> reused swap_info_struct.
> >> >> >> >> 
> >> >> >> >>>>
> >> >> >> >>>>>> +}
> >> >> >> >>>>>> +
> >> >> >> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
> >> >> >> >>>>>>  {
> >> >> >> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
> >> >> >> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
> >> >> >> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
> >> >> >> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
> >> >> >> >>>>>>  	 */
> >> >> >> >>>>>> -	synchronize_rcu();
> >> >> >> >>>>>> +	percpu_ref_reinit(&p->users);
> >> >> >> >>>>>
> >> >> >> >>>>> Although the effect is same, I think it's better to use
> >> >> >> >>>>> percpu_ref_resurrect() here to improve code readability.
> >> >> >> >>>>
> >> >> >> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
> >> >> >> >>>> fix race between swapoff and some swap operations" and discussion email
> >> >> >> >>>> thread as follows again,
> >> >> >> >>>>
> >> >> >> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
> >> >> >> >>>>
> >> >> >> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
> >> >> >> >>>> smp_load_acquire() in get_swap_device().  Now we will use
> >> >> >> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
> >> >> >> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
> >> >> >> >>>> ACQUIRE semantics.  Per my understanding, we need to change
> >> >> >> >>>> percpu_ref_tryget_live() for that.
> >> >> >> >>>>
> >> >> >> >>>
> >> >> >> >>> Do you mean the below scene is possible?
> >> >> >> >>>
> >> >> >> >>> cpu1
> >> >> >> >>> swapon()
> >> >> >> >>>   ...
> >> >> >> >>>   percpu_ref_init
> >> >> >> >>>   ...
> >> >> >> >>>   setup_swap_info
> >> >> >> >>>   /* smp_store_release() is inside percpu_ref_reinit */
> >> >> >> >>>   percpu_ref_reinit
> >> >> >> >> 
> >> >> >> >> spin_unlock() has RELEASE semantics already.
> >> >> >> >> 
> >> >> >> >>>   ...
> >> >> >> >>>
> >> >> >> >>> cpu2
> >> >> >> >>> get_swap_device()
> >> >> >> >>>   /* ignored  smp_rmb() */
> >> >> >> >>>   percpu_ref_tryget_live
> >> >> >> >> 
> >> >> >> >> Some kind of ACQUIRE is required here to guarantee the refcount is
> >> >> >> >> checked before fetching the other fields of swap_info_struct.  I have
> >> >> >> >> sent out a RFC patch to mailing list to discuss this.
> >> >> >
> >> >> > I'm just catching up and following along a little bit. I apologize I
> >> >> > haven't read the swap code, but my understanding is you are trying to
> >> >> > narrow a race condition with swapoff. That makes sense to me. I'm not
> >> >> > sure I follow the need to race with reinitializing the ref though? Is it
> >> >> > not possible to wait out the dying swap info and then create a new one
> >> >> > rather than push acquire semantics?
> >> >> 
> >> >> We want to check whether the swap entry is valid (that is, the swap
> >> >> device isn't swapped off now), prevent it from swapping off, then access
> >> >> the swap_info_struct data structure.  When accessing swap_info_struct,
> >> >> we want to guarantee the ordering, so that we will not reference
> >> >> uninitialized fields of swap_info_struct.
> >> >> 
> >> >
> >> > So in the normal context of percpu_ref, once someone can access it, the
> >> > elements that it is protecting are expected to be initialized.
> >> 
> >> If we can make sure that all elements being initialized fully, why not
> >> just use percpu_ref_get() instead of percpu_ref_tryget*()?
> >> 
> >
> > Generally, the lookup is protected with rcu and then
> > percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
> > only good if you already have a ref as it increments regardless of being
> > 0.
> >
> > What I mean is if you can get a ref, that means the object hasn't been
> > destroyed. This differs from the semantics you are looking for which I
> > understand to be: I have long lived pointers to objects. The object may
> > die, but I may resurrect it and I want the old pointers to still be
> > valid.
> >
> > When is it possible for someone to have a pointer to the swap device and
> > the refcount goes to 0? It might be better to avoid this situation than
> > add acquire semantics.
> >
> >> > In the basic case for swap off, I'm seeing the goal as to prevent
> >> > destruction until anyone currently accessing swap is done. In this
> >> > case wouldn't we always be protecting a live struct?
> >> >
> >> > I'm maybe not understanding what conditions you're trying to revive the
> >> > percpu_ref?
> >> 
> >> A swap entry is like an indirect pointer to a swap device.  We may hold a
> >> swap entry for a long time, so that the swap device is swapoff/swapon.
> >> Then we need to make sure the swap device is fully initialized before
> >> accessing the swap device via the swap entry.
> >> 
> >
> > So if I have some number of outstanding references, and then
> > percpu_ref_kill() is called, then only those that have the pointer will
> > be able to use the swap device as those references are still good. Prior
> > to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
> > data structure.
> >
> > My personal understanding of tryget() vs tryget_live() is that it
> > provides a 2 phase clean up and bounds the ability for new users to come
> > in (cgroup destruction is a primary user). As tryget() might inevitably
> > let a cgroup live long past its removal, tryget_live() will say oh
> > you're in the process of dying do something else.
> 
> OK.  I think that I understand your typical use case now.  The resource
> producer code may look like,
> 
>   obj = kmalloc();
>   /* Initialize obj fields */
>   percpu_ref_init(&obj->ref);
>   rcu_assign_pointer(global_p, obj);
> 
> The resource reclaimer looks like,
> 
>   p = global_p;
>   global_p = NULL;
>   percpu_ref_kill(&p->ref);
>   /* wait until percpu_ref_is_zero(&p->ref) */
>   /* free resources pointed by obj fields */
>   kfree(p);
> 
> The resource consumer looks like,
> 
>   rcu_read_lock();
>   p = rcu_dereference(global_p);
>   if (!p || !percpu_ref_tryget_live(&p->ref)) {
>           /* Invalid pointer, go out */
>   }
>   rcu_read_unlock();
>   /* use p */
>   percpu_ref_put(&p->ref);
> 
> For this use case, it's not necessary to make percpu_ref_tryget_live()
> ACQUIRE operation.  Because refcount doesn't act as a flag to indicate
> whether the object has been fully initialized, global_p does.  And
> the data dependency guaranteed the required ordering.
> 

Yes this is spot on.

> The use case of swap is different.  Here global_p always points to
> the obj (never freed) even if the resources pointed to by obj fields have
> been freed.  And we want to use refcount as a flag to indicate whether
> the object is fully initialized.  This is hard to change, because
> the global_p is used to distinguish the stale pointer from the totally
> invalid pointer.
> 

Apologies ahead of time for this possibly dumb question. Is it possible
to have swapon swap out the global_p with
old_obj = rcu_access_pointer(global_p);
rcu_assign_pointer(global_p, obj);
kfree_rcu(remove_old_obj) or call_rcu();

Then the obj pointed to by global_p would always be valid, but only
would be alive again if it got the new pointer?

> If all other users follow the typical use case above, we may find some
> other way to resolve the problem inside swap code, such as adding
> smp_rmb() after percpu_ref_tryget_live().
> 

I would prefer it.

> Best Regards,
> Huang, Ying

Thanks,
Dennis

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-15 14:31                           ` Dennis Zhou
@ 2021-04-16  0:54                               ` Huang, Ying
  2021-04-16  2:27                             ` Miaohe Lin
  1 sibling, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-16  0:54 UTC (permalink / raw)
  To: Dennis Zhou
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Dennis Zhou <dennis@kernel.org> writes:

> On Thu, Apr 15, 2021 at 01:24:31PM +0800, Huang, Ying wrote:
>> Dennis Zhou <dennis@kernel.org> writes:
>> 
>> > On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>> >> Dennis Zhou <dennis@kernel.org> writes:
>> >> 
>> >> > On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>> >> >> Dennis Zhou <dennis@kernel.org> writes:
>> >> >> 
>> >> >> > Hello,
>> >> >> >
>> >> >> > On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>> >> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >> 
>> >> >> >> > On 2021/4/14 9:17, Huang, Ying wrote:
>> >> >> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >> >> 
>> >> >> >> >>> On 2021/4/12 15:24, Huang, Ying wrote:
>> >> >> >> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
>> >> >> >> >>>>
>> >> >> >> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >> >>>>>
>> >> >> >> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> >> >> >> >>>>>> patch adds the percpu_ref support for later fixup.
>> >> >> >> >>>>>>
>> >> >> >> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> >> >> >> >>>>>> ---
>> >> >> >> >>>>>>  include/linux/swap.h |  2 ++
>> >> >> >> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>> >> >> >> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>> >> >> >> >>>>>>
>> >> >> >> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >> >> >> >>>>>> index 144727041e78..849ba5265c11 100644
>> >> >> >> >>>>>> --- a/include/linux/swap.h
>> >> >> >> >>>>>> +++ b/include/linux/swap.h
>> >> >> >> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>> >> >> >> >>>>>>   * The in-memory structure used to track swap areas.
>> >> >> >> >>>>>>   */
>> >> >> >> >>>>>>  struct swap_info_struct {
>> >> >> >> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> >> >> >> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>> >> >> >> >>>>>>  	signed short	prio;		/* swap priority of this type */
>> >> >> >> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>> >> >> >> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>> >> >> >> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>> >> >> >> >>>>>>  	struct file *swap_file;		/* seldom referenced */
>> >> >> >> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>> >> >> >> >>>>>> +	struct completion comp;		/* seldom referenced */
>> >> >> >> >>>>>>  #ifdef CONFIG_FRONTSWAP
>> >> >> >> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>> >> >> >> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> >> >> >> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> >> >> >> >>>>>> index 149e77454e3c..724173cd7d0c 100644
>> >> >> >> >>>>>> --- a/mm/swapfile.c
>> >> >> >> >>>>>> +++ b/mm/swapfile.c
>> >> >> >> >>>>>> @@ -39,6 +39,7 @@
>> >> >> >> >>>>>>  #include <linux/export.h>
>> >> >> >> >>>>>>  #include <linux/swap_slots.h>
>> >> >> >> >>>>>>  #include <linux/sort.h>
>> >> >> >> >>>>>> +#include <linux/completion.h>
>> >> >> >> >>>>>>  
>> >> >> >> >>>>>>  #include <asm/tlbflush.h>
>> >> >> >> >>>>>>  #include <linux/swapops.h>
>> >> >> >> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>> >> >> >> >>>>>>  	spin_unlock(&si->lock);
>> >> >> >> >>>>>>  }
>> >> >> >> >>>>>>  
>> >> >> >> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> >> >> >> >>>>>> +{
>> >> >> >> >>>>>> +	struct swap_info_struct *si;
>> >> >> >> >>>>>> +
>> >> >> >> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>> >> >> >> >>>>>> +	complete(&si->comp);
>> >> >> >> >>>>>> +	percpu_ref_exit(&si->users);
>> >> >> >> >>>>>
>> >> >> >> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> >> >> >> >>>>> get_swap_device(), better to add comments there.
>> >> >> >> >>>>
>> >> >> >> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> >> >> >> >>>>
>> >> >> >> >>>>  * This function is safe to call as long as @ref is between init and exit.
>> >> >> >> >>>>
>> >> >> >> >>>> While we need to call get_swap_device() almost at any time, so it's
>> >> >> >> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> >> >> >> >>>> memory, but we need to follow the API definition to avoid potential
>> >> >> >> >>>> issues in the long term.
>> >> >> >> >>>
> >> >> >> >> >>> I have to admit that I'm not really familiar with percpu_ref. So I read the
>> >> >> >> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>> >> >> >> >>> be called after exit now. But you're right we need to follow the API definition
>> >> >> >> >>> to avoid potential issues in the long term.
>> >> >> >> >>>
>> >> >> >> >>>>
>> >> >> >> >>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>> >> >> >> >>>> into the swap_info[].
>> >> >> >> >>>
>> >> >> >> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>> >> >> >> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>> >> >> >> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>> >> >> >> >>> or percpu_ref_resurrect() will do the work.
>> >> >> >> >>>
>> >> >> >> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>> >> >> >> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>> >> >> >> >>> Maybe I could do this in alloc_swap_info()?
>> >> >> >> >> 
>> >> >> >> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> >> >> >> >> reused swap_info_struct.
>> >> >> >> >> 
>> >> >> >> >>>>
>> >> >> >> >>>>>> +}
>> >> >> >> >>>>>> +
>> >> >> >> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> >> >> >> >>>>>>  {
>> >> >> >> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>> >> >> >> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>> >> >> >> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> >> >> >> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>> >> >> >> >>>>>>  	 */
>> >> >> >> >>>>>> -	synchronize_rcu();
>> >> >> >> >>>>>> +	percpu_ref_reinit(&p->users);
>> >> >> >> >>>>>
>> >> >> >> >>>>> Although the effect is same, I think it's better to use
>> >> >> >> >>>>> percpu_ref_resurrect() here to improve code readability.
>> >> >> >> >>>>
>> >> >> >> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
>> >> >> >> >>>> fix race between swapoff and some swap operations" and discussion email
>> >> >> >> >>>> thread as follows again,
>> >> >> >> >>>>
>> >> >> >> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> >> >> >> >>>>
>> >> >> >> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> >> >> >> >>>> smp_load_acquire() in get_swap_device().  Now we will use
>> >> >> >> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> >> >> >> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> >> >> >> >>>> ACQUIRE semantics.  Per my understanding, we need to change
>> >> >> >> >>>> percpu_ref_tryget_live() for that.
>> >> >> >> >>>>
>> >> >> >> >>>
>> >> >> >> >>> Do you mean the below scene is possible?
>> >> >> >> >>>
>> >> >> >> >>> cpu1
>> >> >> >> >>> swapon()
>> >> >> >> >>>   ...
>> >> >> >> >>>   percpu_ref_init
>> >> >> >> >>>   ...
>> >> >> >> >>>   setup_swap_info
>> >> >> >> >>>   /* smp_store_release() is inside percpu_ref_reinit */
>> >> >> >> >>>   percpu_ref_reinit
>> >> >> >> >> 
>> >> >> >> >> spin_unlock() has RELEASE semantics already.
>> >> >> >> >> 
>> >> >> >> >>>   ...
>> >> >> >> >>>
>> >> >> >> >>> cpu2
>> >> >> >> >>> get_swap_device()
>> >> >> >> >>>   /* ignored  smp_rmb() */
>> >> >> >> >>>   percpu_ref_tryget_live
>> >> >> >> >> 
>> >> >> >> >> Some kind of ACQUIRE is required here to guarantee the refcount is
>> >> >> >> >> checked before fetching the other fields of swap_info_struct.  I have
>> >> >> >> >> sent out a RFC patch to mailing list to discuss this.
>> >> >> >
>> >> >> > I'm just catching up and following along a little bit. I apologize I
>> >> >> > haven't read the swap code, but my understanding is you are trying to
>> >> >> > narrow a race condition with swapoff. That makes sense to me. I'm not
>> >> >> > sure I follow the need to race with reinitializing the ref though? Is it
>> >> >> > not possible to wait out the dying swap info and then create a new one
>> >> >> > rather than push acquire semantics?
>> >> >> 
>> >> >> We want to check whether the swap entry is valid (that is, the swap
>> >> >> device isn't swapped off now), prevent it from swapping off, then access
>> >> >> the swap_info_struct data structure.  When accessing swap_info_struct,
>> >> >> we want to guarantee the ordering, so that we will not reference
>> >> >> uninitialized fields of swap_info_struct.
>> >> >> 
>> >> >
>> >> > So in the normal context of percpu_ref, once someone can access it, the
>> >> > elements that it is protecting are expected to be initialized.
>> >> 
>> >> If we can make sure that all elements being initialized fully, why not
>> >> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>> >> 
>> >
>> > Generally, the lookup is protected with rcu and then
>> > percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
>> > only good if you already have a ref as it increments regardless of being
>> > 0.
>> >
>> > What I mean is if you can get a ref, that means the object hasn't been
>> > destroyed. This differs from the semantics you are looking for which I
>> > understand to be: I have long lived pointers to objects. The object may
>> > die, but I may resurrect it and I want the old pointers to still be
>> > valid.
>> >
>> > When is it possible for someone to have a pointer to the swap device and
>> > the refcount goes to 0? It might be better to avoid this situation than
>> > add acquire semantics.
>> >
>> >> > In the basic case for swap off, I'm seeing the goal as to prevent
>> >> > destruction until anyone currently accessing swap is done. In this
>> >> > case wouldn't we always be protecting a live struct?
>> >> >
>> >> > I'm maybe not understanding what conditions you're trying to revive the
>> >> > percpu_ref?
>> >> 
>> >> A swap entry likes an indirect pointer to a swap device.  We may hold a
>> >> swap entry for long time, so that the swap device is swapoff/swapon.
>> >> Then we need to make sure the swap device are fully initialized before
>> >> accessing the swap device via the swap entry.
>> >> 
>> >
>> > So if I have some number of outstanding references, and then
>> > percpu_ref_kill() is called, then only those that have the pointer will
>> > be able to use the swap device as those references are still good. Prior
>> > to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
>> > data structure.
>> >
>> > My personal understanding of tryget() vs tryget_live() is that it
>> > provides a 2 phase clean up and bounds the ability for new users to come
>> > in (cgroup destruction is a primary user). As tryget() might inevitably
>> > let a cgroup live long past its removal, tryget_live() will say oh
>> > you're in the process of dying do something else.
>> 
>> OK.  I think that I understand your typical use case now.  The resource
>> producer code may look like,
>> 
>>   obj = kmalloc();
>>   /* Initialize obj fields */
>>   percpu_ref_init(&obj->ref);
>>   rcu_assign_pointer(global_p, obj);
>> 
>> The resource reclaimer looks like,
>> 
>>   p = global_p;
>>   global_p = NULL;
>>   percpu_ref_kill(&p->ref);
>>   /* wait until percpu_ref_is_zero(&p->ref) */
>>   /* free resources pointed by obj fields */
>>   kfree(p);
>> 
>> The resource producer looks like,
>> 
>>   rcu_read_lock();
>>   p = rcu_dereference(global_p);
>>   if (!p || !percpu_ref_tryget_live(&p->ref)) {
>>           /* Invalid pointer, go out */
>>   }
>>   rcu_read_unlock();
>>   /* use p */
>>   percpu_ref_put(&p->ref);
>> 
>> For this use case, it's not necessary to make percpu_ref_tryget_live()
>> ACQUIRE operation.  Because refcount doesn't act as a flag to indicate
>> whether the object has been fully initialized, global_p does.  And
>> the data dependency guaranteed the required ordering.
>> 
>
> Yes this is spot on.
>
>> The use case of swap is different.  Where global_p always points to
>> the obj (never freed) even if the resources pointed by obj fields has
>> been freed.  And we want to use refcount as a flag to indicate whether
>> the object is fully initialized.  This is hard to be changed, because
>> the global_p is used to identify the stalled pointer from the totally
>> invalid pointer.
>> 
>
> Apologies ahead of time for this possibly dumb question. Is it possible
> to have swapon swap out the global_p with
> old_obj = rcu_access_pointer(global_p);
> rcu_assign_pointer(global_p, obj);
> kfree_rcu(remove_old_obj) or call_rcu();
>
> Then the obj pointed to by global_p would always be valid, but only
> would be alive again if it got the new pointer?

Yes.  This looks good!  Thanks a lot!

Best Regards,
Huang, Ying

>> If all other users follow the typical use case above, we may find some
>> other way to resolve the problem inside swap code, such as adding
>> smp_rmb() after percpu_ref_tryget_live().
>> 
>
> I would prefer it.
>
>> Best Regards,
>> Huang, Ying
>
> Thanks,
> Dennis

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-16  0:54                               ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-16  0:54 UTC (permalink / raw)
  To: Dennis Zhou
  Cc: Miaohe Lin, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Dennis Zhou <dennis@kernel.org> writes:

> On Thu, Apr 15, 2021 at 01:24:31PM +0800, Huang, Ying wrote:
>> Dennis Zhou <dennis@kernel.org> writes:
>> 
>> > On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>> >> Dennis Zhou <dennis@kernel.org> writes:
>> >> 
>> >> > On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>> >> >> Dennis Zhou <dennis@kernel.org> writes:
>> >> >> 
>> >> >> > Hello,
>> >> >> >
>> >> >> > On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>> >> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >> 
>> >> >> >> > On 2021/4/14 9:17, Huang, Ying wrote:
>> >> >> >> >> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >> >> 
>> >> >> >> >>> On 2021/4/12 15:24, Huang, Ying wrote:
>> >> >> >> >>>> "Huang, Ying" <ying.huang@intel.com> writes:
>> >> >> >> >>>>
>> >> >> >> >>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> >> >> >> >>>>>
>> >> >> >> >>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> >> >> >> >>>>>> patch adds the percpu_ref support for later fixup.
>> >> >> >> >>>>>>
>> >> >> >> >>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> >> >> >> >>>>>> ---
>> >> >> >> >>>>>>  include/linux/swap.h |  2 ++
>> >> >> >> >>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>> >> >> >> >>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>> >> >> >> >>>>>>
>> >> >> >> >>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >> >> >> >>>>>> index 144727041e78..849ba5265c11 100644
>> >> >> >> >>>>>> --- a/include/linux/swap.h
>> >> >> >> >>>>>> +++ b/include/linux/swap.h
>> >> >> >> >>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>> >> >> >> >>>>>>   * The in-memory structure used to track swap areas.
>> >> >> >> >>>>>>   */
>> >> >> >> >>>>>>  struct swap_info_struct {
>> >> >> >> >>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> >> >> >> >>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>> >> >> >> >>>>>>  	signed short	prio;		/* swap priority of this type */
>> >> >> >> >>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>> >> >> >> >>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>> >> >> >> >>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>> >> >> >> >>>>>>  	struct file *swap_file;		/* seldom referenced */
>> >> >> >> >>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>> >> >> >> >>>>>> +	struct completion comp;		/* seldom referenced */
>> >> >> >> >>>>>>  #ifdef CONFIG_FRONTSWAP
>> >> >> >> >>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>> >> >> >> >>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> >> >> >> >>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> >> >> >> >>>>>> index 149e77454e3c..724173cd7d0c 100644
>> >> >> >> >>>>>> --- a/mm/swapfile.c
>> >> >> >> >>>>>> +++ b/mm/swapfile.c
>> >> >> >> >>>>>> @@ -39,6 +39,7 @@
>> >> >> >> >>>>>>  #include <linux/export.h>
>> >> >> >> >>>>>>  #include <linux/swap_slots.h>
>> >> >> >> >>>>>>  #include <linux/sort.h>
>> >> >> >> >>>>>> +#include <linux/completion.h>
>> >> >> >> >>>>>>  
>> >> >> >> >>>>>>  #include <asm/tlbflush.h>
>> >> >> >> >>>>>>  #include <linux/swapops.h>
>> >> >> >> >>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>> >> >> >> >>>>>>  	spin_unlock(&si->lock);
>> >> >> >> >>>>>>  }
>> >> >> >> >>>>>>  
>> >> >> >> >>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> >> >> >> >>>>>> +{
>> >> >> >> >>>>>> +	struct swap_info_struct *si;
>> >> >> >> >>>>>> +
>> >> >> >> >>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>> >> >> >> >>>>>> +	complete(&si->comp);
>> >> >> >> >>>>>> +	percpu_ref_exit(&si->users);
>> >> >> >> >>>>>
>> >> >> >> >>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> >> >> >> >>>>> get_swap_device(), better to add comments there.
>> >> >> >> >>>>
>> >> >> >> >>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>> >> >> >> >>>>
>> >> >> >> >>>>  * This function is safe to call as long as @ref is between init and exit.
>> >> >> >> >>>>
>> >> >> >> >>>> While we need to call get_swap_device() almost at any time, so it's
>> >> >> >> >>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>> >> >> >> >>>> memory, but we need to follow the API definition to avoid potential
>> >> >> >> >>>> issues in the long term.
>> >> >> >> >>>
>> >> >> >> >>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>> >> >> >> >>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>> >> >> >> >>> be called after exit now. But you're right we need to follow the API definition
>> >> >> >> >>> to avoid potential issues in the long term.
>> >> >> >> >>>
>> >> >> >> >>>>
>> >> >> >> >>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>> >> >> >> >>>> into the swap_info[].
>> >> >> >> >>>
>> >> >> >> >>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>> >> >> >> >>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>> >> >> >> >>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>> >> >> >> >>> or percpu_ref_resurrect() will do the work.
>> >> >> >> >>>
>> >> >> >> >>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>> >> >> >> >>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>> >> >> >> >>> Maybe I could do this in alloc_swap_info()?
>> >> >> >> >> 
>> >> >> >> >> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>> >> >> >> >> reused swap_info_struct.
>> >> >> >> >> 
>> >> >> >> >>>>
>> >> >> >> >>>>>> +}
>> >> >> >> >>>>>> +
>> >> >> >> >>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>> >> >> >> >>>>>>  {
>> >> >> >> >>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>> >> >> >> >>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>> >> >> >> >>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> >> >> >> >>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>> >> >> >> >>>>>>  	 */
>> >> >> >> >>>>>> -	synchronize_rcu();
>> >> >> >> >>>>>> +	percpu_ref_reinit(&p->users);
>> >> >> >> >>>>>
>> >> >> >> >>>>> Although the effect is same, I think it's better to use
>> >> >> >> >>>>> percpu_ref_resurrect() here to improve code readability.
>> >> >> >> >>>>
>> >> >> >> >>>> Check the original commit description for commit eb085574a752 "mm, swap:
>> >> >> >> >>>> fix race between swapoff and some swap operations" and discussion email
>> >> >> >> >>>> thread as follows again,
>> >> >> >> >>>>
>> >> >> >> >>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>> >> >> >> >>>>
>> >> >> >> >>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>> >> >> >> >>>> smp_load_acquire() in get_swap_device().  Now we will use
>> >> >> >> >>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>> >> >> >> >>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>> >> >> >> >>>> ACQUIRE semantics.  Per my understanding, we need to change
>> >> >> >> >>>> percpu_ref_tryget_live() for that.
>> >> >> >> >>>>
>> >> >> >> >>>
>> >> >> >> >>> Do you mean the below scene is possible?
>> >> >> >> >>>
>> >> >> >> >>> cpu1
>> >> >> >> >>> swapon()
>> >> >> >> >>>   ...
>> >> >> >> >>>   percpu_ref_init
>> >> >> >> >>>   ...
>> >> >> >> >>>   setup_swap_info
>> >> >> >> >>>   /* smp_store_release() is inside percpu_ref_reinit */
>> >> >> >> >>>   percpu_ref_reinit
>> >> >> >> >> 
>> >> >> >> >> spin_unlock() has RELEASE semantics already.
>> >> >> >> >> 
>> >> >> >> >>>   ...
>> >> >> >> >>>
>> >> >> >> >>> cpu2
>> >> >> >> >>> get_swap_device()
>> >> >> >> >>>   /* ignored  smp_rmb() */
>> >> >> >> >>>   percpu_ref_tryget_live
>> >> >> >> >> 
>> >> >> >> >> Some kind of ACQUIRE is required here to guarantee the refcount is
>> >> >> >> >> checked before fetching the other fields of swap_info_struct.  I have
>> >> >> >> >> sent out a RFC patch to mailing list to discuss this.
>> >> >> >
>> >> >> > I'm just catching up and following along a little bit. I apologize I
>> >> >> > haven't read the swap code, but my understanding is you are trying to
>> >> >> > narrow a race condition with swapoff. That makes sense to me. I'm not
>> >> >> > sure I follow the need to race with reinitializing the ref though? Is it
>> >> >> > not possible to wait out the dying swap info and then create a new one
>> >> >> > rather than push acquire semantics?
>> >> >> 
>> >> >> We want to check whether the swap entry is valid (that is, the swap
>> >> >> device isn't swapped off now), prevent it from swapping off, then access
>> >> >> the swap_info_struct data structure.  When accessing swap_info_struct,
>> >> >> we want to guarantee the ordering, so that we will not reference
>> >> >> uninitialized fields of swap_info_struct.
>> >> >> 
>> >> >
>> >> > So in the normal context of percpu_ref, once someone can access it, the
>> >> > elements that it is protecting are expected to be initialized.
>> >> 
>> >> If we can make sure that all elements being initialized fully, why not
>> >> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>> >> 
>> >
>> > Generally, the lookup is protected with rcu and then
>> > percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
>> > only good if you already have a ref as it increments regardless of being
>> > 0.
>> >
>> > What I mean is if you can get a ref, that means the object hasn't been
>> > destroyed. This differs from the semantics you are looking for which I
>> > understand to be: I have long lived pointers to objects. The object may
>> > die, but I may resurrect it and I want the old pointers to still be
>> > valid.
>> >
>> > When is it possible for someone to have a pointer to the swap device and
>> > the refcount goes to 0? It might be better to avoid this situation than
>> > add acquire semantics.
>> >
>> >> > In the basic case for swap off, I'm seeing the goal as to prevent
>> >> > destruction until anyone currently accessing swap is done. In this
>> >> > case wouldn't we always be protecting a live struct?
>> >> >
>> >> > I'm maybe not understanding what conditions you're trying to revive the
>> >> > percpu_ref?
>> >> 
>> >> A swap entry likes an indirect pointer to a swap device.  We may hold a
>> >> swap entry for long time, so that the swap device is swapoff/swapon.
>> >> Then we need to make sure the swap device are fully initialized before
>> >> accessing the swap device via the swap entry.
>> >> 
>> >
>> > So if I have some number of outstanding references, and then
>> > percpu_ref_kill() is called, then only those that have the pointer will
>> > be able to use the swap device as those references are still good. Prior
>> > to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
>> > data structure.
>> >
>> > My personal understanding of tryget() vs tryget_live() is that it
>> > provides a 2 phase clean up and bounds the ability for new users to come
>> > in (cgroup destruction is a primary user). As tryget() might inevitably
>> > let a cgroup live long past its removal, tryget_live() will say oh
>> > you're in the process of dying do something else.
>> 
>> OK.  I think that I understand your typical use case now.  The resource
>> producer code may look like,
>> 
>>   obj = kmalloc();
>>   /* Initialize obj fields */
>>   percpu_ref_init(&obj->ref);
>>   rcu_assign_pointer(global_p, obj);
>> 
>> The resource reclaimer looks like,
>> 
>>   p = global_p;
>>   global_p = NULL;
>>   percpu_ref_kill(&p->ref);
>>   /* wait until percpu_ref_is_zero(&p->ref) */
>>   /* free resources pointed by obj fields */
>>   kfree(p);
>> 
>> The resource producer looks like,
>> 
>>   rcu_read_lock();
>>   p = rcu_dereference(global_p);
>>   if (!p || !percpu_ref_tryget_live(&p->ref)) {
>>           /* Invalid pointer, go out */
>>   }
>>   rcu_read_unlock();
>>   /* use p */
>>   percpu_ref_put(&p->ref);
>> 
>> For this use case, it's not necessary to make percpu_ref_tryget_live()
>> ACQUIRE operation.  Because refcount doesn't act as a flag to indicate
>> whether the object has been fully initialized, global_p does.  And
>> the data dependency guaranteed the required ordering.
>> 
>
> Yes this is spot on.
>
>> The use case of swap is different.  Where global_p always points to
>> the obj (never freed) even if the resources pointed by obj fields has
>> been freed.  And we want to use refcount as a flag to indicate whether
>> the object is fully initialized.  This is hard to be changed, because
>> the global_p is used to identify the stalled pointer from the totally
>> invalid pointer.
>> 
>
> Apologies ahead of time for this possibly dumb question. Is it possible
> to have swapon swap out the global_p with
> old_obj = rcu_access_pointer(global_p);
> rcu_assign_pointer(global_p, obj);
> kfree_rcu(remove_old_obj) or call_rcu();
>
> Then the obj pointed to by global_p would always be valid, but only
> would be alive again if it got the new pointer?

Yes.  This looks good!  Thanks a lot!

Best Regards,
Huang, Ying

>> If all other users follow the typical use case above, we may find some
>> other way to resolve the problem inside swap code, such as adding
>> smp_rmb() after percpu_ref_tryget_live().
>> 
>
> I would prefer it.
>
>> Best Regards,
>> Huang, Ying
>
> Thanks,
> Dennis


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-15 14:31                           ` Dennis Zhou
  2021-04-16  0:54                               ` Huang, Ying
@ 2021-04-16  2:27                             ` Miaohe Lin
  2021-04-16  6:25                                 ` Huang, Ying
  1 sibling, 1 reply; 72+ messages in thread
From: Miaohe Lin @ 2021-04-16  2:27 UTC (permalink / raw)
  To: Dennis Zhou, Huang, Ying
  Cc: akpm, hannes, mhocko, iamjoonsoo.kim, vbabka, alex.shi, willy,
	minchan, richard.weiyang, hughd, tim.c.chen, linux-kernel,
	linux-mm

On 2021/4/15 22:31, Dennis Zhou wrote:
> On Thu, Apr 15, 2021 at 01:24:31PM +0800, Huang, Ying wrote:
>> Dennis Zhou <dennis@kernel.org> writes:
>>
>>> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>
>>>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>>>
>>>>>>> Hello,
>>>>>>>
>>>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>
>>>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>
>>>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>>>>>>>
>>>>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>>>>
>>>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>>>   */
>>>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>>>>>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>>>>>>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>>>>>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>>>>>>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>>>>>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>>>>>>>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>>>>>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>>>>>>>>>> --- a/mm/swapfile.c
>>>>>>>>>>>>>> +++ b/mm/swapfile.c
>>>>>>>>>>>>>> @@ -39,6 +39,7 @@
>>>>>>>>>>>>>>  #include <linux/export.h>
>>>>>>>>>>>>>>  #include <linux/swap_slots.h>
>>>>>>>>>>>>>>  #include <linux/sort.h>
>>>>>>>>>>>>>> +#include <linux/completion.h>
>>>>>>>>>>>>>>  
>>>>>>>>>>>>>>  #include <asm/tlbflush.h>
>>>>>>>>>>>>>>  #include <linux/swapops.h>
>>>>>>>>>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>>>>>>>>>>>  	spin_unlock(&si->lock);
>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>  
>>>>>>>>>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>> +	struct swap_info_struct *si;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>>>>>>>>>>> +	complete(&si->comp);
>>>>>>>>>>>>>> +	percpu_ref_exit(&si->users);
>>>>>>>>>>>>>
>>>>>>>>>>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>>>>>>>>>>> get_swap_device(), better to add comments there.
>>>>>>>>>>>>
>>>>>>>>>>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>>>>>>>>>>
>>>>>>>>>>>>  * This function is safe to call as long as @ref is between init and exit.
>>>>>>>>>>>>
>>>>>>>>>>>> While we need to call get_swap_device() almost at any time, so it's
>>>>>>>>>>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>>>>>>>>>>> memory, but we need to follow the API definition to avoid potential
>>>>>>>>>>>> issues in the long term.
>>>>>>>>>>>
>>>>>>>>>>> I have to admit that I'm not really familiar with percpu_ref. So I read the
>>>>>>>>>>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>>>>>>>>>>> be called after exit now. But you're right we need to follow the API definition
>>>>>>>>>>> to avoid potential issues in the long term.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> And we need to call percpu_ref_init() before inserting the swap_info_struct
>>>>>>>>>>>> into the swap_info[].
>>>>>>>>>>>
>>>>>>>>>>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>>>>>>>>>>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>>>>>>>>>>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>>>>>>>>>>> or percpu_ref_resurrect() will do the work.
>>>>>>>>>>>
>>>>>>>>>>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>>>>>>>>>>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>>>>>>>>>>> Maybe I could do this in alloc_swap_info()?
>>>>>>>>>>
>>>>>>>>>> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>>>>>>>>>> reused swap_info_struct.
>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>>>>>>>>>>  {
>>>>>>>>>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>>>>>>>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>>>>>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>>>>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>> -	synchronize_rcu();
>>>>>>>>>>>>>> +	percpu_ref_reinit(&p->users);
>>>>>>>>>>>>>
>>>>>>>>>>>>> Although the effect is same, I think it's better to use
>>>>>>>>>>>>> percpu_ref_resurrect() here to improve code readability.
>>>>>>>>>>>>
>>>>>>>>>>>> Check the original commit description for commit eb085574a752 "mm, swap:
>>>>>>>>>>>> fix race between swapoff and some swap operations" and discussion email
>>>>>>>>>>>> thread as follows again,
>>>>>>>>>>>>
>>>>>>>>>>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>>>>>>>>>>>>
>>>>>>>>>>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>>>>>>>>>>>> smp_load_acquire() in get_swap_device().  Now we will use
>>>>>>>>>>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>>>>>>>>>>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>>>>>>>>>>>> ACQUIRE semantics.  Per my understanding, we need to change
>>>>>>>>>>>> percpu_ref_tryget_live() for that.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Do you mean the below scene is possible?
>>>>>>>>>>>
>>>>>>>>>>> cpu1
>>>>>>>>>>> swapon()
>>>>>>>>>>>   ...
>>>>>>>>>>>   percpu_ref_init
>>>>>>>>>>>   ...
>>>>>>>>>>>   setup_swap_info
>>>>>>>>>>>   /* smp_store_release() is inside percpu_ref_reinit */
>>>>>>>>>>>   percpu_ref_reinit
>>>>>>>>>>
>>>>>>>>>> spin_unlock() has RELEASE semantics already.
>>>>>>>>>>
>>>>>>>>>>>   ...
>>>>>>>>>>>
>>>>>>>>>>> cpu2
>>>>>>>>>>> get_swap_device()
>>>>>>>>>>>   /* ignored  smp_rmb() */
>>>>>>>>>>>   percpu_ref_tryget_live
>>>>>>>>>>
>>>>>>>>>> Some kind of ACQUIRE is required here to guarantee the refcount is
>>>>>>>>>> checked before fetching the other fields of swap_info_struct.  I have
>>>>>>>>>> sent out a RFC patch to mailing list to discuss this.
>>>>>>>
>>>>>>> I'm just catching up and following along a little bit. I apologize I
>>>>>>> haven't read the swap code, but my understanding is you are trying to
>>>>>>> narrow a race condition with swapoff. That makes sense to me. I'm not
>>>>>>> sure I follow the need to race with reinitializing the ref though? Is it
>>>>>>> not possible to wait out the dying swap info and then create a new one
>>>>>>> rather than push acquire semantics?
>>>>>>
>>>>>> We want to check whether the swap entry is valid (that is, the swap
>>>>>> device isn't swapped off now), prevent it from swapping off, then access
>>>>>> the swap_info_struct data structure.  When accessing swap_info_struct,
>>>>>> we want to guarantee the ordering, so that we will not reference
>>>>>> uninitialized fields of swap_info_struct.
>>>>>>
>>>>>
>>>>> So in the normal context of percpu_ref, once someone can access it, the
>>>>> elements that it is protecting are expected to be initialized.
>>>>
>>>> If we can make sure that all elements being initialized fully, why not
>>>> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>>>>
>>>
>>> Generally, the lookup is protected with rcu and then
>>> percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
>>> only good if you already have a ref as it increments regardless of being
>>> 0.
>>>
>>> What I mean is if you can get a ref, that means the object hasn't been
>>> destroyed. This differs from the semantics you are looking for which I
>>> understand to be: I have long lived pointers to objects. The object may
>>> die, but I may resurrect it and I want the old pointers to still be
>>> valid.
>>>
>>> When is it possible for someone to have a pointer to the swap device and
>>> the refcount goes to 0? It might be better to avoid this situation than
>>> add acquire semantics.
>>>
>>>>> In the basic case for swap off, I'm seeing the goal as to prevent
>>>>> destruction until anyone currently accessing swap is done. In this
>>>>> case wouldn't we always be protecting a live struct?
>>>>>
>>>>> I'm maybe not understanding what conditions you're trying to revive the
>>>>> percpu_ref?
>>>>
>>>> A swap entry is like an indirect pointer to a swap device.  We may hold
>>>> a swap entry for a long time, so the swap device may be swapped off and
>>>> on again.  Then we need to make sure the swap device is fully
>>>> initialized before accessing the swap device via the swap entry.
>>>>
>>>
>>> So if I have some number of outstanding references, and then
>>> percpu_ref_kill() is called, then only those that have the pointer will
>>> be able to use the swap device as those references are still good. Prior
>>> to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
>>> data structure.
>>>
>>> My personal understanding of tryget() vs tryget_live() is that it
>>> provides a 2 phase clean up and bounds the ability for new users to come
>>> in (cgroup destruction is a primary user). As tryget() might inevitably
>>> let a cgroup live long past its removal, tryget_live() will say oh
>>> you're in the process of dying do something else.
>>
>> OK.  I think that I understand your typical use case now.  The resource
>> producer code may look like,
>>
>>   obj = kmalloc();
>>   /* Initialize obj fields */
>>   percpu_ref_init(&obj->ref);
>>   rcu_assign_pointer(global_p, obj);
>>
>> The resource reclaimer looks like,
>>
>>   p = global_p;
>>   global_p = NULL;
>>   percpu_ref_kill(&p->ref);
>>   /* wait until percpu_ref_is_zero(&p->ref) */
>>   /* free resources pointed by obj fields */
>>   kfree(p);
>>
>> The resource producer looks like,
>>
>>   rcu_read_lock();
>>   p = rcu_dereference(global_p);
>>   if (!p || !percpu_ref_tryget_live(&p->ref)) {
>>           /* Invalid pointer, go out */
>>   }
>>   rcu_read_unlock();
>>   /* use p */
>>   percpu_ref_put(&p->ref);
>>
>> For this use case, it's not necessary to make percpu_ref_tryget_live()
>> ACQUIRE operation.  Because refcount doesn't act as a flag to indicate
>> whether the object has been fully initialized, global_p does.  And
>> the data dependency guaranteed the required ordering.
>>
> 
> Yes this is spot on.
> 
>> The use case of swap is different.  There, global_p always points to
>> the obj (never freed) even if the resources pointed to by obj fields
>> have been freed.  And we want to use the refcount as a flag to indicate
>> whether the object is fully initialized.  This is hard to change,
>> because global_p is used to distinguish the stale pointer from a
>> totally invalid pointer.
>>
> 
> Apologies ahead of time for this possibly dumb question. Is it possible
> to have swapon swap out the global_p with
> old_obj = rcu_access_pointer(global_p);
> rcu_assign_pointer(global_p, obj);
> kfree_rcu(remove_old_obj) or call_rcu();
> 
> Then the obj pointed to by global_p would always be valid, but only
> would be alive again if it got the new pointer?

Many thanks for both of you! Looks like a nice solution! Will try to do it in v2.
Thanks again! :)

> 
>> If all other users follow the typical use case above, we may find some
>> other way to resolve the problem inside swap code, such as adding
>> smp_rmb() after percpu_ref_tryget_live().
>>
> 
> I would prefer it.
> 
>> Best Regards,
>> Huang, Ying
> 
> Thanks,
> Dennis
> 
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-16  2:27                             ` Miaohe Lin
@ 2021-04-16  6:25                                 ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-16  6:25 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: Dennis Zhou, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/15 22:31, Dennis Zhou wrote:
>> On Thu, Apr 15, 2021 at 01:24:31PM +0800, Huang, Ying wrote:
>>> Dennis Zhou <dennis@kernel.org> writes:
>>>
>>>> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>>
>>>>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>>>>
>>>>>>>> Hello,
>>>>>>>>
>>>>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>
>>>>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>>
>>>>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>>>>   */
>>>>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>>>>>>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>>>>>>>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>>>>>>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>>>>>>>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>>>>>>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>>>>>>>>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>>>>>>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>>>>>>>>>>> --- a/mm/swapfile.c
>>>>>>>>>>>>>>> +++ b/mm/swapfile.c
>>>>>>>>>>>>>>> @@ -39,6 +39,7 @@
>>>>>>>>>>>>>>>  #include <linux/export.h>
>>>>>>>>>>>>>>>  #include <linux/swap_slots.h>
>>>>>>>>>>>>>>>  #include <linux/sort.h>
>>>>>>>>>>>>>>> +#include <linux/completion.h>
>>>>>>>>>>>>>>>  
>>>>>>>>>>>>>>>  #include <asm/tlbflush.h>
>>>>>>>>>>>>>>>  #include <linux/swapops.h>
>>>>>>>>>>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>>>>>>>>>>>>  	spin_unlock(&si->lock);
>>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>>  
>>>>>>>>>>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>> +	struct swap_info_struct *si;
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>>>>>>>>>>>> +	complete(&si->comp);
>>>>>>>>>>>>>>> +	percpu_ref_exit(&si->users);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>>>>>>>>>>>> get_swap_device(), better to add comments there.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>>>>>>>>>>>
>>>>>>>>>>>>>  * This function is safe to call as long as @ref is between init and exit.
>>>>>>>>>>>>>
>>>>>>>>>>>>> While we need to call get_swap_device() almost at any time, so it's
>>>>>>>>>>>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>>>>>>>>>>>> memory, but we need to follow the API definition to avoid potential
>>>>>>>>>>>>> issues in the long term.
>>>>>>>>>>>>
>>>>>>>>>>>> I have to admit that I'm not really familiar with percpu_ref. So I read the
>>>>>>>>>>>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>>>>>>>>>>>> be called after exit now. But you're right we need to follow the API definition
>>>>>>>>>>>> to avoid potential issues in the long term.
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>>>>>>>>>>>>> into the swap_info[].
>>>>>>>>>>>>
>>>>>>>>>>>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>>>>>>>>>>>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>>>>>>>>>>>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>>>>>>>>>>>> or percpu_ref_resurrect() will do the work.
>>>>>>>>>>>>
>>>>>>>>>>>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>>>>>>>>>>>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>>>>>>>>>>>> Maybe I could do this in alloc_swap_info()?
>>>>>>>>>>>
>>>>>>>>>>> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>>>>>>>>>>> reused swap_info_struct.
>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>>>>>>>>>>>  {
>>>>>>>>>>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>>>>>>>>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>>>>>>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>>>>>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>> -	synchronize_rcu();
>>>>>>>>>>>>>>> +	percpu_ref_reinit(&p->users);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Although the effect is same, I think it's better to use
>>>>>>>>>>>>>> percpu_ref_resurrect() here to improve code readability.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Check the original commit description for commit eb085574a752 "mm, swap:
>>>>>>>>>>>>> fix race between swapoff and some swap operations" and discussion email
>>>>>>>>>>>>> thread as follows again,
>>>>>>>>>>>>>
>>>>>>>>>>>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>>>>>>>>>>>>>
>>>>>>>>>>>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>>>>>>>>>>>>> smp_load_acquire() in get_swap_device().  Now we will use
>>>>>>>>>>>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>>>>>>>>>>>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>>>>>>>>>>>>> ACQUIRE semantics.  Per my understanding, we need to change
>>>>>>>>>>>>> percpu_ref_tryget_live() for that.
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Do you mean the below scene is possible?
>>>>>>>>>>>>
>>>>>>>>>>>> cpu1
>>>>>>>>>>>> swapon()
>>>>>>>>>>>>   ...
>>>>>>>>>>>>   percpu_ref_init
>>>>>>>>>>>>   ...
>>>>>>>>>>>>   setup_swap_info
>>>>>>>>>>>>   /* smp_store_release() is inside percpu_ref_reinit */
>>>>>>>>>>>>   percpu_ref_reinit
>>>>>>>>>>>
>>>>>>>>>>> spin_unlock() has RELEASE semantics already.
>>>>>>>>>>>
>>>>>>>>>>>>   ...
>>>>>>>>>>>>
>>>>>>>>>>>> cpu2
>>>>>>>>>>>> get_swap_device()
>>>>>>>>>>>>   /* ignored  smp_rmb() */
>>>>>>>>>>>>   percpu_ref_tryget_live
>>>>>>>>>>>
>>>>>>>>>>> Some kind of ACQUIRE is required here to guarantee the refcount is
>>>>>>>>>>> checked before fetching the other fields of swap_info_struct.  I have
>>>>>>>>>>> sent out a RFC patch to mailing list to discuss this.
>>>>>>>>
>>>>>>>> I'm just catching up and following along a little bit. I apologize I
>>>>>>>> haven't read the swap code, but my understanding is you are trying to
>>>>>>>> narrow a race condition with swapoff. That makes sense to me. I'm not
>>>>>>>> sure I follow the need to race with reinitializing the ref though? Is it
>>>>>>>> not possible to wait out the dying swap info and then create a new one
>>>>>>>> rather than push acquire semantics?
>>>>>>>
>>>>>>> We want to check whether the swap entry is valid (that is, the swap
>>>>>>> device isn't swapped off now), prevent it from swapping off, then access
>>>>>>> the swap_info_struct data structure.  When accessing swap_info_struct,
>>>>>>> we want to guarantee the ordering, so that we will not reference
>>>>>>> uninitialized fields of swap_info_struct.
>>>>>>>
>>>>>>
>>>>>> So in the normal context of percpu_ref, once someone can access it, the
>>>>>> elements that it is protecting are expected to be initialized.
>>>>>
>>>>> If we can make sure that all elements being initialized fully, why not
>>>>> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>>>>>
>>>>
>>>> Generally, the lookup is protected with rcu and then
>>>> percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
>>>> only good if you already have a ref as it increments regardless of being
>>>> 0.
>>>>
>>>> What I mean is if you can get a ref, that means the object hasn't been
>>>> destroyed. This differs from the semantics you are looking for which I
>>>> understand to be: I have long lived pointers to objects. The object may
>>>> die, but I may resurrect it and I want the old pointers to still be
>>>> valid.
>>>>
>>>> When is it possible for someone to have a pointer to the swap device and
>>>> the refcount goes to 0? It might be better to avoid this situation than
>>>> add acquire semantics.
>>>>
>>>>>> In the basic case for swap off, I'm seeing the goal as to prevent
>>>>>> destruction until anyone currently accessing swap is done. In this
>>>>>> case wouldn't we always be protecting a live struct?
>>>>>>
>>>>>> I'm maybe not understanding what conditions you're trying to revive the
>>>>>> percpu_ref?
>>>>>
>>>>> A swap entry is like an indirect pointer to a swap device.  We may hold
>>>>> a swap entry for a long time, so the swap device may be swapped off and
>>>>> on again.  Then we need to make sure the swap device is fully
>>>>> initialized before accessing the swap device via the swap entry.
>>>>>
>>>>
>>>> So if I have some number of outstanding references, and then
>>>> percpu_ref_kill() is called, then only those that have the pointer will
>>>> be able to use the swap device as those references are still good. Prior
>>>> to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
>>>> data structure.
>>>>
>>>> My personal understanding of tryget() vs tryget_live() is that it
>>>> provides a 2 phase clean up and bounds the ability for new users to come
>>>> in (cgroup destruction is a primary user). As tryget() might inevitably
>>>> let a cgroup live long past its removal, tryget_live() will say oh
>>>> you're in the process of dying do something else.
>>>
>>> OK.  I think that I understand your typical use case now.  The resource
>>> producer code may look like,
>>>
>>>   obj = kmalloc();
>>>   /* Initialize obj fields */
>>>   percpu_ref_init(&obj->ref);
>>>   rcu_assign_pointer(global_p, obj);
>>>
>>> The resource reclaimer looks like,
>>>
>>>   p = global_p;
>>>   global_p = NULL;
>>>   percpu_ref_kill(&p->ref);
>>>   /* wait until percpu_ref_is_zero(&p->ref) */
>>>   /* free resources pointed by obj fields */
>>>   kfree(p);
>>>
>>> The resource producer looks like,
>>>
>>>   rcu_read_lock();
>>>   p = rcu_dereference(global_p);
>>>   if (!p || !percpu_ref_tryget_live(&p->ref)) {
>>>           /* Invalid pointer, go out */
>>>   }
>>>   rcu_read_unlock();
>>>   /* use p */
>>>   percpu_ref_put(&p->ref);
>>>
>>> For this use case, it's not necessary to make percpu_ref_tryget_live()
>>> ACQUIRE operation.  Because refcount doesn't act as a flag to indicate
>>> whether the object has been fully initialized, global_p does.  And
>>> the data dependency guaranteed the required ordering.
>>>
>> 
>> Yes this is spot on.
>> 
>>> The use case of swap is different.  There, global_p always points to
>>> the obj (never freed) even if the resources pointed to by obj fields
>>> have been freed.  And we want to use the refcount as a flag to indicate
>>> whether the object is fully initialized.  This is hard to change,
>>> because global_p is used to distinguish the stale pointer from a
>>> totally invalid pointer.
>>>
>> 
>> Apologies ahead of time for this possibly dumb question. Is it possible
>> to have swapon swap out the global_p with
>> old_obj = rcu_access_pointer(global_p);
>> rcu_assign_pointer(global_p, obj);
>> kfree_rcu(remove_old_obj) or call_rcu();
>> 
>> Then the obj pointed to by global_p would always be valid, but only
>> would be alive again if it got the new pointer?
>
> Many thanks for both of you! Looks like a nice solution! Will try to do it in v2.
> Thanks again! :)

Thinking about this again: this means that we need to free the old
swap_info_struct at some time.  So something like RCU is needed to
enclose the accessor.  But some accessors don't follow this, and it
appears overkill to change all these accessors.  So I think, at least as
the first step, smp_rmb() appears more appropriate.

Best Regards,
Huang, Ying

>> 
>>> If all other users follow the typical use case above, we may find some
>>> other way to resolve the problem inside swap code, such as adding
>>> smp_rmb() after percpu_ref_tryget_live().
>>>
>> 
>> I would prefer it.
>> 
>>> Best Regards,
>>> Huang, Ying
>> 
>> Thanks,
>> Dennis
>> 
>> .
>> 

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-16  6:25                                 ` Huang, Ying
  0 siblings, 0 replies; 72+ messages in thread
From: Huang, Ying @ 2021-04-16  6:25 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: Dennis Zhou, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/15 22:31, Dennis Zhou wrote:
>> On Thu, Apr 15, 2021 at 01:24:31PM +0800, Huang, Ying wrote:
>>> Dennis Zhou <dennis@kernel.org> writes:
>>>
>>>> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>>
>>>>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>>>>
>>>>>>>> Hello,
>>>>>>>>
>>>>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>
>>>>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>>
>>>>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>>>>   */
>>>>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>>>>>>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>>>>>>>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>>>>>>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>>>>>>>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>>>>>>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>>>>>>>>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>>>>>>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>>>>>>>>>>> --- a/mm/swapfile.c
>>>>>>>>>>>>>>> +++ b/mm/swapfile.c
>>>>>>>>>>>>>>> @@ -39,6 +39,7 @@
>>>>>>>>>>>>>>>  #include <linux/export.h>
>>>>>>>>>>>>>>>  #include <linux/swap_slots.h>
>>>>>>>>>>>>>>>  #include <linux/sort.h>
>>>>>>>>>>>>>>> +#include <linux/completion.h>
>>>>>>>>>>>>>>>  
>>>>>>>>>>>>>>>  #include <asm/tlbflush.h>
>>>>>>>>>>>>>>>  #include <linux/swapops.h>
>>>>>>>>>>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>>>>>>>>>>>>  	spin_unlock(&si->lock);
>>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>>  
>>>>>>>>>>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>> +	struct swap_info_struct *si;
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>>>>>>>>>>>> +	complete(&si->comp);
>>>>>>>>>>>>>>> +	percpu_ref_exit(&si->users);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>>>>>>>>>>>> get_swap_device(), better to add comments there.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>>>>>>>>>>>
>>>>>>>>>>>>>  * This function is safe to call as long as @ref is between init and exit.
>>>>>>>>>>>>>
>>>>>>>>>>>>> While we need to call get_swap_device() almost at any time, so it's
>>>>>>>>>>>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>>>>>>>>>>>> memory, but we need to follow the API definition to avoid potential
>>>>>>>>>>>>> issues in the long term.
>>>>>>>>>>>>
>>>>>>>>>>>> I have to admit that I'm not really familiar with percpu_ref. So I read the
>>>>>>>>>>>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>>>>>>>>>>>> be called after exit now. But you're right we need to follow the API definition
>>>>>>>>>>>> to avoid potential issues in the long term.
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> And we need to call percpu_ref_init() before inserting the swap_info_struct
>>>>>>>>>>>>> into the swap_info[].
>>>>>>>>>>>>
>>>>>>>>>>>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>>>>>>>>>>>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>>>>>>>>>>>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>>>>>>>>>>>> or percpu_ref_resurrect() will do the work.
>>>>>>>>>>>>
>>>>>>>>>>>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>>>>>>>>>>>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>>>>>>>>>>>> Maybe I could do this in alloc_swap_info()?
>>>>>>>>>>>
>>>>>>>>>>> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>>>>>>>>>>> reused swap_info_struct.
>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>>>>>>>>>>>  {
>>>>>>>>>>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>>>>>>>>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>>>>>>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>>>>>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>> -	synchronize_rcu();
>>>>>>>>>>>>>>> +	percpu_ref_reinit(&p->users);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Although the effect is same, I think it's better to use
>>>>>>>>>>>>>> percpu_ref_resurrect() here to improve code readability.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Check the original commit description for commit eb085574a752 "mm, swap:
>>>>>>>>>>>>> fix race between swapoff and some swap operations" and discussion email
>>>>>>>>>>>>> thread as follows again,
>>>>>>>>>>>>>
>>>>>>>>>>>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>>>>>>>>>>>>>
>>>>>>>>>>>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>>>>>>>>>>>>> smp_load_acquire() in get_swap_device().  Now we will use
>>>>>>>>>>>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>>>>>>>>>>>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>>>>>>>>>>>>> ACQUIRE semantics.  Per my understanding, we need to change
>>>>>>>>>>>>> percpu_ref_tryget_live() for that.
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Do you mean the below scene is possible?
>>>>>>>>>>>>
>>>>>>>>>>>> cpu1
>>>>>>>>>>>> swapon()
>>>>>>>>>>>>   ...
>>>>>>>>>>>>   percpu_ref_init
>>>>>>>>>>>>   ...
>>>>>>>>>>>>   setup_swap_info
>>>>>>>>>>>>   /* smp_store_release() is inside percpu_ref_reinit */
>>>>>>>>>>>>   percpu_ref_reinit
>>>>>>>>>>>
>>>>>>>>>>> spin_unlock() has RELEASE semantics already.
>>>>>>>>>>>
>>>>>>>>>>>>   ...
>>>>>>>>>>>>
>>>>>>>>>>>> cpu2
>>>>>>>>>>>> get_swap_device()
>>>>>>>>>>>>   /* ignored  smp_rmb() */
>>>>>>>>>>>>   percpu_ref_tryget_live
>>>>>>>>>>>
>>>>>>>>>>> Some kind of ACQUIRE is required here to guarantee the refcount is
>>>>>>>>>>> checked before fetching the other fields of swap_info_struct.  I have
>>>>>>>>>>> sent out a RFC patch to mailing list to discuss this.
>>>>>>>>
>>>>>>>> I'm just catching up and following along a little bit. I apologize I
>>>>>>>> haven't read the swap code, but my understanding is you are trying to
>>>>>>>> narrow a race condition with swapoff. That makes sense to me. I'm not
>>>>>>>> sure I follow the need to race with reinitializing the ref though? Is it
>>>>>>>> not possible to wait out the dying swap info and then create a new one
>>>>>>>> rather than push acquire semantics?
>>>>>>>
>>>>>>> We want to check whether the swap entry is valid (that is, the swap
>>>>>>> device isn't swapped off now), prevent it from swapping off, then access
>>>>>>> the swap_info_struct data structure.  When accessing swap_info_struct,
>>>>>>> we want to guarantee the ordering, so that we will not reference
>>>>>>> uninitialized fields of swap_info_struct.
>>>>>>>
>>>>>>
>>>>>> So in the normal context of percpu_ref, once someone can access it, the
>>>>>> elements that it is protecting are expected to be initialized.
>>>>>
>>>>> If we can make sure that all elements being initialized fully, why not
>>>>> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>>>>>
>>>>
>>>> Generally, the lookup is protected with rcu and then
>>>> percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
>>>> only good if you already have a ref as it increments regardless of being
>>>> 0.
>>>>
>>>> What I mean is if you can get a ref, that means the object hasn't been
>>>> destroyed. This differs from the semantics you are looking for which I
>>>> understand to be: I have long lived pointers to objects. The object may
>>>> die, but I may resurrect it and I want the old pointers to still be
>>>> valid.
>>>>
>>>> When is it possible for someone to have a pointer to the swap device and
>>>> the refcount goes to 0? It might be better to avoid this situation than
>>>> add acquire semantics.
>>>>
>>>>>> In the basic case for swap off, I'm seeing the goal as to prevent
>>>>>> destruction until anyone currently accessing swap is done. In this
>>>>>> case wouldn't we always be protecting a live struct?
>>>>>>
>>>>>> I'm maybe not understanding what conditions you're trying to revive the
>>>>>> percpu_ref?
>>>>>
>>>>> A swap entry likes an indirect pointer to a swap device.  We may hold a
>>>>> swap entry for long time, so that the swap device is swapoff/swapon.
>>>>> Then we need to make sure the swap device are fully initialized before
>>>>> accessing the swap device via the swap entry.
>>>>>
>>>>
>>>> So if I have some number of outstanding references, and then
>>>> percpu_ref_kill() is called, then only those that have the pointer will
>>>> be able to use the swap device as those references are still good. Prior
>>>> to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
>>>> data structure.
>>>>
>>>> My personal understanding of tryget() vs tryget_live() is that it
>>>> provides a 2 phase clean up and bounds the ability for new users to come
>>>> in (cgroup destruction is a primary user). As tryget() might inevitably
>>>> let a cgroup live long past its removal, tryget_live() will say oh
>>>> you're in the process of dying do something else.
>>>
>>> OK.  I think that I understand your typical use case now.  The resource
>>> producer code may look like,
>>>
>>>   obj = kmalloc();
>>>   /* Initialize obj fields */
>>>   percpu_ref_init(&obj->ref);
>>>   rcu_assign_pointer(global_p, obj);
>>>
>>> The resource reclaimer looks like,
>>>
>>>   p = global_p;
>>>   global_p = NULL;
>>>   percpu_ref_kill(&p->ref);
>>>   /* wait until percpu_ref_is_zero(&p->ref) */
>>>   /* free resources pointed by obj fields */
>>>   kfree(p);
>>>
>>> The resource producer looks like,
>>>
>>>   rcu_read_lock();
>>>   p = rcu_dereference(global_p);
>>>   if (!p || !percpu_ref_tryget_live(&p->ref)) {
>>>           /* Invalid pointer, go out */
>>>   }
>>>   rcu_read_unlock();
>>>   /* use p */
>>>   percpu_ref_put(&p->ref);
>>>
>>> For this use case, it's not necessary to make percpu_ref_tryget_live()
>>> ACQUIRE operation.  Because refcount doesn't act as a flag to indicate
>>> whether the object has been fully initialized, global_p does.  And
>>> the data dependency guaranteed the required ordering.
>>>
>> 
>> Yes this is spot on.
>> 
>>> The use case of swap is different.  Where global_p always points to
>>> the obj (never freed) even if the resources pointed by obj fields has
>>> been freed.  And we want to use refcount as a flag to indicate whether
>>> the object is fully initialized.  This is hard to be changed, because
>>> the global_p is used to identify the stale pointer from the totally
>>> invalid pointer.
>>>
>> 
>> Apologies ahead of time for this possibly dumb question. Is it possible
>> to have swapon swap out the global_p with
>> old_obj = rcu_access_pointer(global_p);
>> rcu_assign_pointer(global_p, obj);
>> kfree_rcu(remove_old_obj) or call_rcu();
>> 
>> Then the obj pointed to by global_p would always be valid, but only
>> would be alive again if it got the new pointer?
>
> Many thanks for both of you! Looks like a nice solution! Will try to do it in v2.
> Thanks again! :)

Think about this again.  This means that we need to free the old
swap_info_struct at some time.  So something like RCU is needed to
enclose the accessor.  But some accessor doesn't follow this, and it
appears overkill to change all these accessors.  So I think at least as
the first step, smp_rmb() appears more appropriate.

Best Regards,
Huang, Ying

>> 
>>> If all other users follow the typical use case above, we may find some
>>> other way to resolve the problem inside swap code, such as adding
>>> smp_rmb() after percpu_ref_tryget_live().
>>>
>> 
>> I would prefer it.
>> 
>>> Best Regards,
>>> Huang, Ying
>> 
>> Thanks,
>> Dennis
>> 
>> .
>> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-16  6:25                                 ` Huang, Ying
  (?)
@ 2021-04-16  8:30                                 ` Miaohe Lin
  -1 siblings, 0 replies; 72+ messages in thread
From: Miaohe Lin @ 2021-04-16  8:30 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Dennis Zhou, akpm, hannes, mhocko, iamjoonsoo.kim, vbabka,
	alex.shi, willy, minchan, richard.weiyang, hughd, tim.c.chen,
	linux-kernel, linux-mm

On 2021/4/16 14:25, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2021/4/15 22:31, Dennis Zhou wrote:
>>> On Thu, Apr 15, 2021 at 01:24:31PM +0800, Huang, Ying wrote:
>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>
>>>>> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>>>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>>>
>>>>>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>>>>>> Dennis Zhou <dennis@kernel.org> writes:
>>>>>>>>
>>>>>>>>> Hello,
>>>>>>>>>
>>>>>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>
>>>>>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>>>
>>>>>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>>>>>> "Huang, Ying" <ying.huang@intel.com> writes:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>>>>>  mm/swapfile.c        | 25 ++++++++++++++++++++++---
>>>>>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>>>>>   */
>>>>>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>>>>>>>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>>>>>>>>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>>>>>>>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>>>>>>>>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>>>>>>>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>>>>>>>>>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>>>>>>>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>>>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>>>>>>>>>>>> --- a/mm/swapfile.c
>>>>>>>>>>>>>>>> +++ b/mm/swapfile.c
>>>>>>>>>>>>>>>> @@ -39,6 +39,7 @@
>>>>>>>>>>>>>>>>  #include <linux/export.h>
>>>>>>>>>>>>>>>>  #include <linux/swap_slots.h>
>>>>>>>>>>>>>>>>  #include <linux/sort.h>
>>>>>>>>>>>>>>>> +#include <linux/completion.h>
>>>>>>>>>>>>>>>>  
>>>>>>>>>>>>>>>>  #include <asm/tlbflush.h>
>>>>>>>>>>>>>>>>  #include <linux/swapops.h>
>>>>>>>>>>>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>>>>>>>>>>>>>>>  	spin_unlock(&si->lock);
>>>>>>>>>>>>>>>>  }
>>>>>>>>>>>>>>>>  
>>>>>>>>>>>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>> +	struct swap_info_struct *si;
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>>>>>>>>>>>>> +	complete(&si->comp);
>>>>>>>>>>>>>>>> +	percpu_ref_exit(&si->users);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>>>>>>>>>>>>> get_swap_device(), better to add comments there.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>  * This function is safe to call as long as @ref is between init and exit.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> While we need to call get_swap_device() almost at any time, so it's
>>>>>>>>>>>>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>>>>>>>>>>>>> memory, but we need to follow the API definition to avoid potential
>>>>>>>>>>>>>> issues in the long term.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I have to admit that I'm not really familiar with percpu_ref. So I read the
>>>>>>>>>>>>> implementation code of the percpu_ref and found percpu_ref_tryget_live() could
>>>>>>>>>>>>> be called after exit now. But you're right we need to follow the API definition
>>>>>>>>>>>>> to avoid potential issues in the long term.
>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> And we need to call percpu_ref_init() before inserting the swap_info_struct
>>>>>>>>>>>>>> into the swap_info[].
>>>>>>>>>>>>>
>>>>>>>>>>>>> If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
>>>>>>>>>>>>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() while
>>>>>>>>>>>>> this is not the case as we do not call percpu_ref_exit(). Maybe percpu_ref_reinit()
>>>>>>>>>>>>> or percpu_ref_resurrect() will do the work.
>>>>>>>>>>>>>
>>>>>>>>>>>>> One more thing, how could I distinguish the killed percpu_ref from newly allocated one?
>>>>>>>>>>>>> It seems percpu_ref_is_dying is only safe to call when @ref is between init and exit.
>>>>>>>>>>>>> Maybe I could do this in alloc_swap_info()?
>>>>>>>>>>>>
>>>>>>>>>>>> Yes.  In alloc_swap_info(), you can distinguish newly allocated and
>>>>>>>>>>>> reused swap_info_struct.
>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>>>>>>>>>>>>  {
>>>>>>>>>>>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>>>>>>>>>>>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>>>>>>>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>>>>>>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>>> -	synchronize_rcu();
>>>>>>>>>>>>>>>> +	percpu_ref_reinit(&p->users);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Although the effect is same, I think it's better to use
>>>>>>>>>>>>>>> percpu_ref_resurrect() here to improve code readability.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Check the original commit description for commit eb085574a752 "mm, swap:
>>>>>>>>>>>>>> fix race between swapoff and some swap operations" and discussion email
>>>>>>>>>>>>>> thread as follows again,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> https://lore.kernel.org/linux-mm/20171219053650.GB7829@linux.vnet.ibm.com/
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I found that the synchronize_rcu() here is to avoid to call smp_rmb() or
>>>>>>>>>>>>>> smp_load_acquire() in get_swap_device().  Now we will use
>>>>>>>>>>>>>> percpu_ref_tryget_live() in get_swap_device(), so we will need to add
>>>>>>>>>>>>>> the necessary memory barrier, or make sure percpu_ref_tryget_live() has
>>>>>>>>>>>>>> ACQUIRE semantics.  Per my understanding, we need to change
>>>>>>>>>>>>>> percpu_ref_tryget_live() for that.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Do you mean the below scene is possible?
>>>>>>>>>>>>>
>>>>>>>>>>>>> cpu1
>>>>>>>>>>>>> swapon()
>>>>>>>>>>>>>   ...
>>>>>>>>>>>>>   percpu_ref_init
>>>>>>>>>>>>>   ...
>>>>>>>>>>>>>   setup_swap_info
>>>>>>>>>>>>>   /* smp_store_release() is inside percpu_ref_reinit */
>>>>>>>>>>>>>   percpu_ref_reinit
>>>>>>>>>>>>
>>>>>>>>>>>> spin_unlock() has RELEASE semantics already.
>>>>>>>>>>>>
>>>>>>>>>>>>>   ...
>>>>>>>>>>>>>
>>>>>>>>>>>>> cpu2
>>>>>>>>>>>>> get_swap_device()
>>>>>>>>>>>>>   /* ignored  smp_rmb() */
>>>>>>>>>>>>>   percpu_ref_tryget_live
>>>>>>>>>>>>
>>>>>>>>>>>> Some kind of ACQUIRE is required here to guarantee the refcount is
>>>>>>>>>>>> checked before fetching the other fields of swap_info_struct.  I have
>>>>>>>>>>>> sent out a RFC patch to mailing list to discuss this.
>>>>>>>>>
>>>>>>>>> I'm just catching up and following along a little bit. I apologize I
>>>>>>>>> haven't read the swap code, but my understanding is you are trying to
>>>>>>>>> narrow a race condition with swapoff. That makes sense to me. I'm not
>>>>>>>>> sure I follow the need to race with reinitializing the ref though? Is it
>>>>>>>>> not possible to wait out the dying swap info and then create a new one
>>>>>>>>> rather than push acquire semantics?
>>>>>>>>
>>>>>>>> We want to check whether the swap entry is valid (that is, the swap
>>>>>>>> device isn't swapped off now), prevent it from swapping off, then access
>>>>>>>> the swap_info_struct data structure.  When accessing swap_info_struct,
>>>>>>>> we want to guarantee the ordering, so that we will not reference
>>>>>>>> uninitialized fields of swap_info_struct.
>>>>>>>>
>>>>>>>
>>>>>>> So in the normal context of percpu_ref, once someone can access it, the
>>>>>>> elements that it is protecting are expected to be initialized.
>>>>>>
>>>>>> If we can make sure that all elements being initialized fully, why not
>>>>>> just use percpu_ref_get() instead of percpu_ref_tryget*()?
>>>>>>
>>>>>
>>>>> Generally, the lookup is protected with rcu and then
>>>>> percpu_ref_tryget*() is used to obtain a reference. percpu_ref_get() is
>>>>> only good if you already have a ref as it increments regardless of being
>>>>> 0.
>>>>>
>>>>> What I mean is if you can get a ref, that means the object hasn't been
>>>>> destroyed. This differs from the semantics you are looking for which I
>>>>> understand to be: I have long lived pointers to objects. The object may
>>>>> die, but I may resurrect it and I want the old pointers to still be
>>>>> valid.
>>>>>
>>>>> When is it possible for someone to have a pointer to the swap device and
>>>>> the refcount goes to 0? It might be better to avoid this situation than
>>>>> add acquire semantics.
>>>>>
>>>>>>> In the basic case for swap off, I'm seeing the goal as to prevent
>>>>>>> destruction until anyone currently accessing swap is done. In this
>>>>>>> case wouldn't we always be protecting a live struct?
>>>>>>>
>>>>>>> I'm maybe not understanding what conditions you're trying to revive the
>>>>>>> percpu_ref?
>>>>>>
>>>>>> A swap entry likes an indirect pointer to a swap device.  We may hold a
>>>>>> swap entry for long time, so that the swap device is swapoff/swapon.
>>>>>> Then we need to make sure the swap device are fully initialized before
>>>>>> accessing the swap device via the swap entry.
>>>>>>
>>>>>
>>>>> So if I have some number of outstanding references, and then
>>>>> percpu_ref_kill() is called, then only those that have the pointer will
>>>>> be able to use the swap device as those references are still good. Prior
>>>>> to calling percpu_ref_kill(), call_rcu() needs to be called on lookup
>>>>> data structure.
>>>>>
>>>>> My personal understanding of tryget() vs tryget_live() is that it
>>>>> provides a 2 phase clean up and bounds the ability for new users to come
>>>>> in (cgroup destruction is a primary user). As tryget() might inevitably
>>>>> let a cgroup live long past its removal, tryget_live() will say oh
>>>>> you're in the process of dying do something else.
>>>>
>>>> OK.  I think that I understand your typical use case now.  The resource
>>>> producer code may look like,
>>>>
>>>>   obj = kmalloc();
>>>>   /* Initialize obj fields */
>>>>   percpu_ref_init(&obj->ref);
>>>>   rcu_assign_pointer(global_p, obj);
>>>>
>>>> The resource reclaimer looks like,
>>>>
>>>>   p = global_p;
>>>>   global_p = NULL;
>>>>   percpu_ref_kill(&p->ref);
>>>>   /* wait until percpu_ref_is_zero(&p->ref) */
>>>>   /* free resources pointed by obj fields */
>>>>   kfree(p);
>>>>
>>>> The resource producer looks like,
>>>>
>>>>   rcu_read_lock();
>>>>   p = rcu_dereference(global_p);
>>>>   if (!p || !percpu_ref_tryget_live(&p->ref)) {
>>>>           /* Invalid pointer, go out */
>>>>   }
>>>>   rcu_read_unlock();
>>>>   /* use p */
>>>>   percpu_ref_put(&p->ref);
>>>>
>>>> For this use case, it's not necessary to make percpu_ref_tryget_live()
>>>> ACQUIRE operation.  Because refcount doesn't act as a flag to indicate
>>>> whether the object has been fully initialized, global_p does.  And
>>>> the data dependency guaranteed the required ordering.
>>>>
>>>
>>> Yes this is spot on.
>>>
>>>> The use case of swap is different.  Where global_p always points to
>>>> the obj (never freed) even if the resources pointed by obj fields has
>>>> been freed.  And we want to use refcount as a flag to indicate whether
>>>> the object is fully initialized.  This is hard to be changed, because
>>>> the global_p is used to identify the stale pointer from the totally
>>>> invalid pointer.
>>>>
>>>
>>> Apologies ahead of time for this possibly dumb question. Is it possible
>>> to have swapon swap out the global_p with
>>> old_obj = rcu_access_pointer(global_p);
>>> rcu_assign_pointer(global_p, obj);
>>> kfree_rcu(remove_old_obj) or call_rcu();
>>>
>>> Then the obj pointed to by global_p would always be valid, but only
>>> would be alive again if it got the new pointer?
>>
>> Many thanks for both of you! Looks like a nice solution! Will try to do it in v2.
>> Thanks again! :)
> 
> Think about this again.  This means that we need to free the old
> swap_info_struct at some time.  So something like RCU is needed to
> enclose the accessor.  But some accessor doesn't follow this, and it
> appears overkill to change all these accessors.  So I think at least as
> the first step, smp_rmb() appears more appropriate.
> 

Agree. Thanks!

> Best Regards,
> Huang, Ying
> 
>>>
>>>> If all other users follow the typical use case above, we may find some
>>>> other way to resolve the problem inside swap code, such as adding
>>>> smp_rmb() after percpu_ref_tryget_live().
>>>>
>>>
>>> I would prefer it.
>>>
>>>> Best Regards,
>>>> Huang, Ying
>>>
>>> Thanks,
>>> Dennis
>>>
>>> .
>>>
> 
> .
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

end of thread, other threads:[~2021-04-16  8:30 UTC | newest]

Thread overview: 72+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-08 13:08 [PATCH 0/5] close various race windows for swap Miaohe Lin
2021-04-08 13:08 ` [PATCH 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
2021-04-12  3:30   ` Huang, Ying
2021-04-12  3:30     ` Huang, Ying
2021-04-12  6:59     ` Miaohe Lin
2021-04-12  7:24     ` Huang, Ying
2021-04-12  7:24       ` Huang, Ying
2021-04-13 12:39       ` Miaohe Lin
2021-04-14  1:17         ` Huang, Ying
2021-04-14  1:17           ` Huang, Ying
2021-04-14  1:58           ` Miaohe Lin
2021-04-14  2:06             ` Huang, Ying
2021-04-14  2:06               ` Huang, Ying
2021-04-14  3:44               ` Dennis Zhou
2021-04-14  3:59                 ` Huang, Ying
2021-04-14  3:59                   ` Huang, Ying
2021-04-14  4:05                   ` Dennis Zhou
2021-04-14  5:44                     ` Huang, Ying
2021-04-14  5:44                       ` Huang, Ying
2021-04-14 14:53                       ` Dennis Zhou
2021-04-15  3:16                         ` Miaohe Lin
2021-04-15  4:20                           ` Dennis Zhou
2021-04-15  9:17                             ` Miaohe Lin
2021-04-15  5:24                         ` Huang, Ying
2021-04-15  5:24                           ` Huang, Ying
2021-04-15 14:31                           ` Dennis Zhou
2021-04-16  0:54                             ` Huang, Ying
2021-04-16  0:54                               ` Huang, Ying
2021-04-16  2:27                             ` Miaohe Lin
2021-04-16  6:25                               ` Huang, Ying
2021-04-16  6:25                                 ` Huang, Ying
2021-04-16  8:30                                 ` Miaohe Lin
2021-04-08 13:08 ` [PATCH 2/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
2021-04-08 21:34   ` Tim Chen
2021-04-09  8:42     ` Miaohe Lin
2021-04-09 17:17       ` Tim Chen
2021-04-10  3:17         ` Miaohe Lin
2021-04-12  1:44           ` Huang, Ying
2021-04-12  1:44             ` Huang, Ying
2021-04-12  3:24             ` Miaohe Lin
2021-04-08 21:37   ` kernel test robot
2021-04-09  8:46     ` Miaohe Lin
2021-04-08 22:56   ` kernel test robot
2021-04-13  1:27   ` Huang, Ying
2021-04-13  1:27     ` Huang, Ying
2021-04-13 19:24     ` Tim Chen
2021-04-14  1:04       ` Huang, Ying
2021-04-14  1:04         ` Huang, Ying
2021-04-14  2:20         ` Miaohe Lin
2021-04-14 16:13         ` Tim Chen
2021-04-15  3:19           ` Miaohe Lin
2021-04-14  2:55     ` Miaohe Lin
2021-04-14  3:07       ` Huang, Ying
2021-04-14  3:07         ` Huang, Ying
2021-04-14  3:27         ` Miaohe Lin
2021-04-08 13:08 ` [PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() " Miaohe Lin
2021-04-13  1:33   ` Huang, Ying
2021-04-13  1:33     ` Huang, Ying
2021-04-14  2:42     ` Miaohe Lin
2021-04-08 13:08 ` [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info() Miaohe Lin
2021-04-09  8:50   ` Huang, Ying
2021-04-09  8:50     ` Huang, Ying
2021-04-09  9:00     ` Miaohe Lin
2021-04-12  0:55       ` Huang, Ying
2021-04-12  0:55         ` Huang, Ying
2021-04-12  3:17         ` Miaohe Lin
2021-04-08 13:08 ` [PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff Miaohe Lin
2021-04-13  1:36   ` Huang, Ying
2021-04-13  1:36     ` Huang, Ying
2021-04-14  2:43     ` Miaohe Lin
2021-04-08 14:55 ` [PATCH 0/5] close various race windows for swap riteshh
2021-04-09  8:01   ` Miaohe Lin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.