linux-mm.kvack.org archive mirror
* [PATCH v3 0/4] close various race windows for swap
@ 2021-04-20 13:30 Miaohe Lin
  2021-04-20 13:30 ` [PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff Miaohe Lin
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Miaohe Lin @ 2021-04-20 13:30 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, willy, minchan, richard.weiyang,
	shy828301, linux-kernel, linux-mm, linmiaohe

Hi all,
While investigating the swap code, I found some possible race windows.
This series aims to fix all of them. But using the current
get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible, because swap_readpage() may take a
really long time. To reduce the performance overhead on the hot path
as much as possible, we can use a percpu_ref to close this race window
(as suggested by Huang, Ying). Patch 1 adds percpu_ref support for
swap, and most of the remaining patches use it to close various race
windows. More details can be found in the respective changelogs.
Thanks!
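
For reference, the caller-side pattern the series converges on is
roughly the following (a minimal sketch against the API added in
patch 1; error paths elided):

	struct swap_info_struct *si;

	/* Pin the swap device so concurrent swapoff cannot free its structures. */
	si = get_swap_device(entry);
	if (unlikely(!si))
		return;		/* raced with swapoff, the entry is stale */

	/* ... si->swap_map, si->swap_file, etc. are safe to dereference here ... */

	put_swap_device(si);	/* drop the reference, swapoff may now proceed */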

v2->v3:
  commit log and comment enhancements per Huang, Ying
  remove ref_initialized field
  squash PATCH 1-2

v1->v2:
  reorganize the patch-2/5
  various enhancements and fixups per Huang, Ying
  Many thanks for the comments from Huang, Ying, Dennis Zhou and Tim Chen.

Miaohe Lin (4):
  mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  swap: fix do_swap_page() race with swapoff
  mm/swap: remove confusing checking for non_swap_entry() in
    swap_ra_info()
  mm/shmem: fix shmem_swapin() race with swapoff

 include/linux/swap.h | 14 ++++++--
 mm/memory.c          |  9 +++++
 mm/shmem.c           |  6 ++++
 mm/swap_state.c      |  6 ----
 mm/swapfile.c        | 79 +++++++++++++++++++++++++++-----------------
 5 files changed, 76 insertions(+), 38 deletions(-)

-- 
2.19.1



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  2021-04-20 13:30 [PATCH v3 0/4] close various race windows for swap Miaohe Lin
@ 2021-04-20 13:30 ` Miaohe Lin
  2021-04-21  1:05   ` Huang, Ying
  2021-04-20 13:30 ` [PATCH v3 2/4] swap: fix do_swap_page() race with swapoff Miaohe Lin
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Miaohe Lin @ 2021-04-20 13:30 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, willy, minchan, richard.weiyang,
	shy828301, linux-kernel, linux-mm, linmiaohe

Using the current get/put_swap_device() to guard against concurrent
swapoff for some swap ops, e.g. swap_readpage(), looks terrible because
they might take a really long time. This patch adds percpu_ref support
to serialize against concurrent swapoff. Also remove the SWP_VALID
flag, because it was only used together with the RCU solution.
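
In outline, the lifecycle this patch introduces is the following (a
sketch of the call sequence only; locking and error handling elided):

	/* swapon: the ref starts dead and only goes live once setup is done */
	percpu_ref_init(&p->users, swap_users_ref_free,
			PERCPU_REF_INIT_DEAD, GFP_KERNEL);	/* alloc_swap_info() */
	percpu_ref_resurrect(&p->users);			/* enable_swap_info() */

	/* reader side: get_swap_device() / put_swap_device() */
	percpu_ref_tryget_live(&si->users);	/* fails once the ref is killed */
	percpu_ref_put(&si->users);

	/* swapoff: stop new readers, then wait for existing ones to drain */
	percpu_ref_kill(&p->users);
	wait_for_completion(&p->comp);	/* completed by swap_users_ref_free() */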

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 include/linux/swap.h |  5 +--
 mm/swapfile.c        | 79 +++++++++++++++++++++++++++-----------------
 2 files changed, 52 insertions(+), 32 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..c9e7fea10b83 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -177,7 +177,6 @@ enum {
 	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
-	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
 					/* add others here before... */
 	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
 };
@@ -240,6 +239,7 @@ struct swap_cluster_list {
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
+	struct percpu_ref users;	/* indicate and keep swap device valid. */
 	unsigned long	flags;		/* SWP_USED etc: see above */
 	signed short	prio;		/* swap priority of this type */
 	struct plist_node list;		/* entry in swap_active_head */
@@ -260,6 +260,7 @@ struct swap_info_struct {
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	unsigned int old_block_size;	/* seldom referenced */
+	struct completion comp;		/* seldom referenced */
 #ifdef CONFIG_FRONTSWAP
 	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
 	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
@@ -511,7 +512,7 @@ sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-	rcu_read_unlock();
+	percpu_ref_put(&si->users);
 }
 
 #else /* CONFIG_SWAP */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..a640fc84be5b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
 #include <linux/export.h>
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
+#include <linux/completion.h>
 
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
@@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
 	spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+	struct swap_info_struct *si;
+
+	si = container_of(ref, struct swap_info_struct, users);
+	complete(&si->comp);
+}
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
 	struct swap_cluster_info *ci = si->cluster_info;
@@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  *   CPU1				CPU2
  *   do_swap_page()
@@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	si = swp_swap_info(entry);
 	if (!si)
 		goto bad_nofile;
-
-	rcu_read_lock();
-	if (data_race(!(si->flags & SWP_VALID)))
-		goto unlock_out;
+	if (!percpu_ref_tryget_live(&si->users))
+		goto out;
+	/*
+	 * Guarantee the si->users are checked before accessing other
+	 * fields of swap_info_struct.
+	 *
+	 * Paired with the spin_unlock() after setup_swap_info() in
+	 * enable_swap_info().
+	 */
+	smp_rmb();
 	offset = swp_offset(entry);
 	if (offset >= si->max)
-		goto unlock_out;
+		goto put_out;
 
 	return si;
 bad_nofile:
 	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
 out:
 	return NULL;
-unlock_out:
-	rcu_read_unlock();
+put_out:
+	percpu_ref_put(&si->users);
 	return NULL;
 }
 
@@ -2466,7 +2475,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
 
 static void _enable_swap_info(struct swap_info_struct *p)
 {
-	p->flags |= SWP_WRITEOK | SWP_VALID;
+	p->flags |= SWP_WRITEOK;
 	atomic_long_add(p->pages, &nr_swap_pages);
 	total_swap_pages += p->pages;
 
@@ -2497,10 +2506,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 	/*
-	 * Guarantee swap_map, cluster_info, etc. fields are valid
-	 * between get/put_swap_device() if SWP_VALID bit is set
+	 * Finished initialized swap device, now it's safe to reference it.
 	 */
-	synchronize_rcu();
+	percpu_ref_resurrect(&p->users);
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
 	_enable_swap_info(p);
@@ -2616,16 +2624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	reenable_swap_slots_cache_unlock();
 
-	spin_lock(&swap_lock);
-	spin_lock(&p->lock);
-	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
-	spin_unlock(&p->lock);
-	spin_unlock(&swap_lock);
 	/*
-	 * wait for swap operations protected by get/put_swap_device()
-	 * to complete
+	 * Wait for swap operations protected by get/put_swap_device()
+	 * to complete.
+	 *
+	 * We need synchronize_rcu() here to protect the access to
+	 * the swap cache data structure.
 	 */
+	percpu_ref_kill(&p->users);
 	synchronize_rcu();
+	wait_for_completion(&p->comp);
 
 	flush_work(&p->discard_work);
 
@@ -2857,6 +2865,12 @@ static struct swap_info_struct *alloc_swap_info(void)
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
+	if (percpu_ref_init(&p->users, swap_users_ref_free,
+			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
+		kvfree(p);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	spin_lock(&swap_lock);
 	for (type = 0; type < nr_swapfiles; type++) {
 		if (!(swap_info[type]->flags & SWP_USED))
@@ -2864,6 +2878,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 	}
 	if (type >= MAX_SWAPFILES) {
 		spin_unlock(&swap_lock);
+		percpu_ref_exit(&p->users);
 		kvfree(p);
 		return ERR_PTR(-EPERM);
 	}
@@ -2891,9 +2906,13 @@ static struct swap_info_struct *alloc_swap_info(void)
 		plist_node_init(&p->avail_lists[i], 0);
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
-	kvfree(defer);
+	if (defer) {
+		percpu_ref_exit(&defer->users);
+		kvfree(defer);
+	}
 	spin_lock_init(&p->lock);
 	spin_lock_init(&p->cont_lock);
+	init_completion(&p->comp);
 
 	return p;
 }
-- 
2.19.1



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v3 2/4] swap: fix do_swap_page() race with swapoff
  2021-04-20 13:30 [PATCH v3 0/4] close various race windows for swap Miaohe Lin
  2021-04-20 13:30 ` [PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff Miaohe Lin
@ 2021-04-20 13:30 ` Miaohe Lin
  2021-04-21  0:52   ` Huang, Ying
  2021-04-20 13:30 ` [PATCH v3 3/4] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() Miaohe Lin
  2021-04-20 13:30 ` [PATCH v3 4/4] mm/shmem: fix shmem_swapin() race with swapoff Miaohe Lin
  3 siblings, 1 reply; 10+ messages in thread
From: Miaohe Lin @ 2021-04-20 13:30 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, willy, minchan, richard.weiyang,
	shy828301, linux-kernel, linux-mm, linmiaohe

When I was investigating the swap code, I found the below possible race
window:

CPU 1                                   	CPU 2
-----                                   	-----
do_swap_page
  if (data_race(si->flags & SWP_SYNCHRONOUS_IO)
  swap_readpage
    if (data_race(sis->flags & SWP_FS_OPS)) {
                                        	swapoff
					  	  p->flags &= ~SWP_VALID;
					  	  ..
					  	  synchronize_rcu();
					  	  ..
					  	  p->swap_file = NULL;
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping;[oops!]

Note that for pages swapped in through the swap cache, this isn't an
issue: the page is locked, and the swap entry is marked with
SWAP_HAS_CACHE, so swapoff() cannot proceed until the page has been
unlocked.

Using the current get/put_swap_device() to guard against concurrent
swapoff for swap_readpage() looks terrible because swap_readpage() may
take a really long time. And this race may not be really pernicious,
because swapoff is usually only done at system shutdown. To reduce the
performance overhead on the hot path as much as possible, we can use a
percpu_ref to close this race window (as suggested by Huang, Ying).
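
The hot-path cost of this scheme is small: while the ref is in percpu
mode, a tryget is just a per-CPU counter increment under RCU, and only
after swapoff kills the ref does it fall back to an atomic operation.
Simplified, percpu_ref_tryget_live() boils down to something like this
(a sketch, not the exact upstream implementation):

	static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
	{
		unsigned long __percpu *percpu_count;
		bool ret = false;

		rcu_read_lock();
		if (__ref_is_percpu(ref, &percpu_count)) {
			/* Fast path: ref is live and in percpu mode. */
			this_cpu_inc(*percpu_count);
			ret = true;
		} else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
			/* Ref is in atomic mode but not yet killed. */
			ret = atomic_long_inc_not_zero(&ref->data->count);
		}
		rcu_read_unlock();

		return ret;
	}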

Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
Reported-by: kernel test robot <lkp@intel.com> (auto build test ERROR)
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 include/linux/swap.h | 9 +++++++++
 mm/memory.c          | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c9e7fea10b83..46d51d058d05 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -527,6 +527,15 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
 	return NULL;
 }
 
+static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+	return NULL;
+}
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+}
+
 #define swap_address_space(entry)		(NULL)
 #define get_nr_swap_pages()			0L
 #define total_swap_pages			0L
diff --git a/mm/memory.c b/mm/memory.c
index 27014c3bde9f..7a2fe12cf641 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = NULL, *swapcache;
+	struct swap_info_struct *si = NULL;
 	swp_entry_t entry;
 	pte_t pte;
 	int locked;
@@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		goto out;
 	}
 
+	/* Prevent swapoff from happening to us. */
+	si = get_swap_device(entry);
+	if (unlikely(!si))
+		goto out;
 
 	delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry, vma, vmf->address);
@@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
+	if (si)
+		put_swap_device(si);
 	return ret;
 out_nomap:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		unlock_page(swapcache);
 		put_page(swapcache);
 	}
+	if (si)
+		put_swap_device(si);
 	return ret;
 }
 
-- 
2.19.1



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v3 3/4] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()
  2021-04-20 13:30 [PATCH v3 0/4] close various race windows for swap Miaohe Lin
  2021-04-20 13:30 ` [PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff Miaohe Lin
  2021-04-20 13:30 ` [PATCH v3 2/4] swap: fix do_swap_page() race with swapoff Miaohe Lin
@ 2021-04-20 13:30 ` Miaohe Lin
  2021-04-21  0:58   ` Huang, Ying
  2021-04-20 13:30 ` [PATCH v3 4/4] mm/shmem: fix shmem_swapin() race with swapoff Miaohe Lin
  3 siblings, 1 reply; 10+ messages in thread
From: Miaohe Lin @ 2021-04-20 13:30 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, willy, minchan, richard.weiyang,
	shy828301, linux-kernel, linux-mm, linmiaohe

The non_swap_entry() check was introduced for VMA based swap readahead
via commit ec560175c0b6 ("mm, swap: VMA based swap readahead"), and was
then moved into swap_ra_info() by commit eaf649ebc3ac ("mm: swap: clean
up swap readahead"). But this makes the code confusing. The
non_swap_entry() check looks racy: while we released the pte lock,
somebody else might have faulted in this pte, so it suggests we must
check whether it's a swap pte first to guard against such a race, or
the swap type will be unexpected. But the swap entry isn't actually
used in this function, and we do enough checking when we really operate
on the PTE entries later. So the non_swap_entry() check is not really
needed here and should be removed to avoid confusion.
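
For reference, the fault path already filters out non-swap entries
well before the readahead code runs; roughly (a sketch of the call
order, details elided):

	do_swap_page()
	    entry = pte_to_swp_entry(vmf->orig_pte);
	    if (unlikely(non_swap_entry(entry)))
	        /* migration, device-private, hwpoison entries handled here */
	    ...
	    swapin_readahead()
	        swap_vma_readahead()
	            swap_ra_info()	/* only reached for genuine swap entries */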

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 mm/swap_state.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 272ea2108c9d..df5405384520 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long ra_val;
-	swp_entry_t entry;
 	unsigned long faddr, pfn, fpfn;
 	unsigned long start, end;
 	pte_t *pte, *orig_pte;
@@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 
 	faddr = vmf->address;
 	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-	entry = pte_to_swp_entry(*pte);
-	if ((unlikely(non_swap_entry(entry)))) {
-		pte_unmap(orig_pte);
-		return;
-	}
 
 	fpfn = PFN_DOWN(faddr);
 	ra_val = GET_SWAP_RA_VAL(vma);
-- 
2.19.1



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v3 4/4] mm/shmem: fix shmem_swapin() race with swapoff
  2021-04-20 13:30 [PATCH v3 0/4] close various race windows for swap Miaohe Lin
                   ` (2 preceding siblings ...)
  2021-04-20 13:30 ` [PATCH v3 3/4] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() Miaohe Lin
@ 2021-04-20 13:30 ` Miaohe Lin
  2021-04-21  1:02   ` Huang, Ying
  3 siblings, 1 reply; 10+ messages in thread
From: Miaohe Lin @ 2021-04-20 13:30 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, willy, minchan, richard.weiyang,
	shy828301, linux-kernel, linux-mm, linmiaohe

When I was investigating the swap code, I found the below possible race
window:

CPU 1                                         CPU 2
-----                                         -----
shmem_swapin
  swap_cluster_readahead
    if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
                                              swapoff
                                                percpu_ref_kill(&p->users)
                                                synchronize_rcu()
                                                wait_for_completion
                                                ..
                                                si->swap_file = NULL;
    struct inode *inode = si->swap_file->f_mapping->host;[oops!]

Close this race window by using get/put_swap_device() to guard against
concurrent swapoff.

Fixes: 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or not")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 mm/shmem.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 26c76b13ad23..936ba5595297 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 			struct shmem_inode_info *info, pgoff_t index)
 {
+	struct swap_info_struct *si;
 	struct vm_area_struct pvma;
 	struct page *page;
 	struct vm_fault vmf = {
 		.vma = &pvma,
 	};
 
+	/* Prevent swapoff from happening to us. */
+	si = get_swap_device(swap);
+	if (unlikely(!si))
+		return NULL;
 	shmem_pseudo_vma_init(&pvma, info, index);
 	page = swap_cluster_readahead(swap, gfp, &vmf);
 	shmem_pseudo_vma_destroy(&pvma);
+	put_swap_device(si);
 
 	return page;
 }
-- 
2.19.1



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 2/4] swap: fix do_swap_page() race with swapoff
  2021-04-20 13:30 ` [PATCH v3 2/4] swap: fix do_swap_page() race with swapoff Miaohe Lin
@ 2021-04-21  0:52   ` Huang, Ying
  0 siblings, 0 replies; 10+ messages in thread
From: Huang, Ying @ 2021-04-21  0:52 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, willy, minchan, richard.weiyang, shy828301, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1                                   	CPU 2
> -----                                   	-----
> do_swap_page
>   if (data_race(si->flags & SWP_SYNCHRONOUS_IO)
>   swap_readpage
>     if (data_race(sis->flags & SWP_FS_OPS)) {
>                                         	swapoff
> 					  	  p->flags &= ~SWP_VALID;
> 					  	  ..
> 					  	  synchronize_rcu();
> 					  	  ..

You have deleted SWP_VALID and the RCU solution in 1/4, so please revise this.

> 					  	  p->swap_file = NULL;
>     struct file *swap_file = sis->swap_file;
>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>
> Note that for the pages that are swapped in through swap cache, this isn't
> an issue. Because the page is locked, and the swap entry will be marked
> with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
> unlocked.
>
> Using current get/put_swap_device() to guard against concurrent swapoff for
> swap_readpage() looks terrible because swap_readpage() may take really long
> time. And this race may not be really pernicious because swapoff is usually
> done when system shutdown only. To reduce the performance overhead on the
> hot-path as much as possible, it appears we can use the percpu_ref to close
> this race window(as suggested by Huang, Ying).

This needs to be revised too, unless you squash 1/4 and 2/4.

> Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
> Reported-by: kernel test robot <lkp@intel.com> (auto build test ERROR)
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h | 9 +++++++++
>  mm/memory.c          | 9 +++++++++
>  2 files changed, 18 insertions(+)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index c9e7fea10b83..46d51d058d05 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -527,6 +527,15 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
>  	return NULL;
>  }
>  
> +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
> +{
> +	return NULL;
> +}
> +
> +static inline void put_swap_device(struct swap_info_struct *si)
> +{
> +}
> +
>  #define swap_address_space(entry)		(NULL)
>  #define get_nr_swap_pages()			0L
>  #define total_swap_pages			0L
> diff --git a/mm/memory.c b/mm/memory.c
> index 27014c3bde9f..7a2fe12cf641 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = NULL, *swapcache;
> +	struct swap_info_struct *si = NULL;
>  	swp_entry_t entry;
>  	pte_t pte;
>  	int locked;
> @@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  		goto out;
>  	}
>  
> +	/* Prevent swapoff from happening to us. */
> +	si = get_swap_device(entry);

There's

		struct swap_info_struct *si = swp_swap_info(entry);

in do_swap_page(); you can remove that.
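
Concretely, something like this on top of the diff above (a sketch
only; the context lines are from the current tree):

 	if (!page) {
-		struct swap_info_struct *si = swp_swap_info(entry);
-
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&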

Best Regards,
Huang, Ying

> +	if (unlikely(!si))
> +		goto out;
>  
>  	delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
>  	page = lookup_swap_cache(entry, vma, vmf->address);
> @@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  unlock:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>  out:
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  out_nomap:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  		unlock_page(swapcache);
>  		put_page(swapcache);
>  	}
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  }


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 3/4] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()
  2021-04-20 13:30 ` [PATCH v3 3/4] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() Miaohe Lin
@ 2021-04-21  0:58   ` Huang, Ying
  0 siblings, 0 replies; 10+ messages in thread
From: Huang, Ying @ 2021-04-21  0:58 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, willy, minchan, richard.weiyang, shy828301, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> The non_swap_entry() was used for working with VMA based swap readahead
> via commit ec560175c0b6 ("mm, swap: VMA based swap readahead").

At that time, the non_swap_entry() check was necessary because the
function was called before that check was done in do_swap_page().

> Then it's
> moved to swap_ra_info() since commit eaf649ebc3ac ("mm: swap: clean up swap
> readahead").

After that, the non_swap_entry() check became unnecessary, because
swap_ra_info() is only called after non_swap_entry() has already been
checked.  The resulting code is confusing.

> But this makes the code confusing. The non_swap_entry() check
> looks racy because while we released the pte lock, somebody else might have
> faulted in this pte. So we should check whether it's swap pte first to
> guard against such race or swap_type will be unexpected.

The race isn't important because it will not cause a problem.

Best Regards,
Huang, Ying

> But the swap_entry
> isn't used in this function and we will have enough checking when we really
> operate the PTE entries later. So checking for non_swap_entry() is not
> really needed here and should be removed to avoid confusion.
>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/swap_state.c | 6 ------
>  1 file changed, 6 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 272ea2108c9d..df5405384520 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	unsigned long ra_val;
> -	swp_entry_t entry;
>  	unsigned long faddr, pfn, fpfn;
>  	unsigned long start, end;
>  	pte_t *pte, *orig_pte;
> @@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>  
>  	faddr = vmf->address;
>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
> -	entry = pte_to_swp_entry(*pte);
> -	if ((unlikely(non_swap_entry(entry)))) {
> -		pte_unmap(orig_pte);
> -		return;
> -	}
>  
>  	fpfn = PFN_DOWN(faddr);
>  	ra_val = GET_SWAP_RA_VAL(vma);


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 4/4] mm/shmem: fix shmem_swapin() race with swapoff
  2021-04-20 13:30 ` [PATCH v3 4/4] mm/shmem: fix shmem_swapin() race with swapoff Miaohe Lin
@ 2021-04-21  1:02   ` Huang, Ying
  0 siblings, 0 replies; 10+ messages in thread
From: Huang, Ying @ 2021-04-21  1:02 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, willy, minchan, richard.weiyang, shy828301, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1                                         CPU 2
> -----                                         -----
> shmem_swapin
>   swap_cluster_readahead
>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>                                               swapoff
>                                                 percpu_ref_kill(&p->users)
>                                                 synchronize_rcu()
>                                                 wait_for_completion

I don't think the above 3 lines are relevant to the race.

>                                                 ..
>                                                 si->swap_file = NULL;
>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>
> Close this race window by using get/put_swap_device() to guard against
> concurrent swapoff.
>
> Fixes: 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or not")
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/shmem.c | 6 ++++++
>  1 file changed, 6 insertions(+)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 26c76b13ad23..936ba5595297 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>  			struct shmem_inode_info *info, pgoff_t index)
>  {
> +	struct swap_info_struct *si;
>  	struct vm_area_struct pvma;
>  	struct page *page;
>  	struct vm_fault vmf = {
>  		.vma = &pvma,
>  	};
>  
> +	/* Prevent swapoff from happening to us. */
> +	si = get_swap_device(swap);

Better to put get/put_swap_device() in shmem_swapin_page(); that makes
it possible for us to remove get/put_swap_device() from
lookup_swap_cache().
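
I.e., roughly (a sketch only; the exact error value and label are up
to the final patch):

	static int shmem_swapin_page(struct inode *inode, pgoff_t index, ...)
	{
		struct swap_info_struct *si;
		...
		/* Prevent swapoff from happening to us. */
		si = get_swap_device(swap);
		if (!si) {
			error = -EINVAL;
			goto failed;
		}
		...	/* lookup_swap_cache() and shmem_swapin() run under the ref */
		put_swap_device(si);
		...
	}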

Best Regards,
Huang, Ying

> +	if (unlikely(!si))
> +		return NULL;
>  	shmem_pseudo_vma_init(&pvma, info, index);
>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>  	shmem_pseudo_vma_destroy(&pvma);
> +	put_swap_device(si);
>  
>  	return page;
>  }


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  2021-04-20 13:30 ` [PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff Miaohe Lin
@ 2021-04-21  1:05   ` Huang, Ying
  2021-04-21  1:59     ` Miaohe Lin
  0 siblings, 1 reply; 10+ messages in thread
From: Huang, Ying @ 2021-04-21  1:05 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, willy, minchan, richard.weiyang, shy828301, linux-kernel,
	linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> Using current get/put_swap_device() to guard against concurrent swapoff
> for some swap ops, e.g. swap_readpage(), looks terrible because they
> might take really long time. This patch adds the percpu_ref support to
> serialize against concurrent swapoff. Also we remove the SWP_VALID flag
> because it's used together with RCU solution.
>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h |  5 +--
>  mm/swapfile.c        | 79 +++++++++++++++++++++++++++-----------------
>  2 files changed, 52 insertions(+), 32 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 144727041e78..c9e7fea10b83 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -177,7 +177,6 @@ enum {
>  	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
>  	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
>  	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
> -	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
>  					/* add others here before... */
>  	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
>  };
> @@ -240,6 +239,7 @@ struct swap_cluster_list {
>   * The in-memory structure used to track swap areas.
>   */
>  struct swap_info_struct {
> +	struct percpu_ref users;	/* indicate and keep swap device valid. */
>  	unsigned long	flags;		/* SWP_USED etc: see above */
>  	signed short	prio;		/* swap priority of this type */
>  	struct plist_node list;		/* entry in swap_active_head */
> @@ -260,6 +260,7 @@ struct swap_info_struct {
>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>  	struct file *swap_file;		/* seldom referenced */
>  	unsigned int old_block_size;	/* seldom referenced */
> +	struct completion comp;		/* seldom referenced */
>  #ifdef CONFIG_FRONTSWAP
>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> @@ -511,7 +512,7 @@ sector_t swap_page_sector(struct page *page);
>  
>  static inline void put_swap_device(struct swap_info_struct *si)
>  {
> -	rcu_read_unlock();
> +	percpu_ref_put(&si->users);
>  }
>  
>  #else /* CONFIG_SWAP */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 149e77454e3c..a640fc84be5b 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -39,6 +39,7 @@
>  #include <linux/export.h>
>  #include <linux/swap_slots.h>
>  #include <linux/sort.h>
> +#include <linux/completion.h>
>  
>  #include <asm/tlbflush.h>
>  #include <linux/swapops.h>
> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>  	spin_unlock(&si->lock);
>  }
>  
> +static void swap_users_ref_free(struct percpu_ref *ref)
> +{
> +	struct swap_info_struct *si;
> +
> +	si = container_of(ref, struct swap_info_struct, users);
> +	complete(&si->comp);
> +}
> +
>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>  {
>  	struct swap_cluster_info *ci = si->cluster_info;
> @@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
>   * via preventing the swap device from being swapoff, until
>   * put_swap_device() is called.  Otherwise return NULL.
>   *
> - * The entirety of the RCU read critical section must come before the
> - * return from or after the call to synchronize_rcu() in
> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
> - * true, the si->map, si->cluster_info, etc. must be valid in the
> - * critical section.
> - *
>   * Notice that swapoff or swapoff+swapon can still happen before the
> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
> - * in put_swap_device() if there isn't any other way to prevent
> - * swapoff, such as page lock, page table lock, etc.  The caller must
> - * be prepared for that.  For example, the following situation is
> - * possible.
> + * percpu_ref_tryget_live() in get_swap_device() or after the
> + * percpu_ref_put() in put_swap_device() if there isn't any other way
> + * to prevent swapoff, such as page lock, page table lock, etc.  The
> + * caller must be prepared for that.  For example, the following
> + * situation is possible.
>   *
>   *   CPU1				CPU2
>   *   do_swap_page()
> @@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>  	si = swp_swap_info(entry);
>  	if (!si)
>  		goto bad_nofile;
> -
> -	rcu_read_lock();
> -	if (data_race(!(si->flags & SWP_VALID)))
> -		goto unlock_out;
> +	if (!percpu_ref_tryget_live(&si->users))
> +		goto out;
> +	/*
> +	 * Guarantee the si->users are checked before accessing other
> +	 * fields of swap_info_struct.
> +	 *
> +	 * Paired with the spin_unlock() after setup_swap_info() in
> +	 * enable_swap_info().
> +	 */
> +	smp_rmb();
>  	offset = swp_offset(entry);
>  	if (offset >= si->max)
> -		goto unlock_out;
> +		goto put_out;
>  
>  	return si;
>  bad_nofile:
>  	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>  out:
>  	return NULL;
> -unlock_out:
> -	rcu_read_unlock();
> +put_out:
> +	percpu_ref_put(&si->users);
>  	return NULL;
>  }
>  
> @@ -2466,7 +2475,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
>  
>  static void _enable_swap_info(struct swap_info_struct *p)
>  {
> -	p->flags |= SWP_WRITEOK | SWP_VALID;
> +	p->flags |= SWP_WRITEOK;
>  	atomic_long_add(p->pages, &nr_swap_pages);
>  	total_swap_pages += p->pages;
>  
> @@ -2497,10 +2506,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>  	spin_unlock(&p->lock);
>  	spin_unlock(&swap_lock);
>  	/*
> -	 * Guarantee swap_map, cluster_info, etc. fields are valid
> -	 * between get/put_swap_device() if SWP_VALID bit is set
> +	 * Finished initialized swap device, now it's safe to reference it.

s/initialized/initializing/

Otherwise looks good to me!  Thanks!

Reviewed-by: "Huang, Ying" <ying.huang@intel.com>

>  	 */
> -	synchronize_rcu();
> +	percpu_ref_resurrect(&p->users);
>  	spin_lock(&swap_lock);
>  	spin_lock(&p->lock);
>  	_enable_swap_info(p);
> @@ -2616,16 +2624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  
>  	reenable_swap_slots_cache_unlock();
>  
> -	spin_lock(&swap_lock);
> -	spin_lock(&p->lock);
> -	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
> -	spin_unlock(&p->lock);
> -	spin_unlock(&swap_lock);
>  	/*
> -	 * wait for swap operations protected by get/put_swap_device()
> -	 * to complete
> +	 * Wait for swap operations protected by get/put_swap_device()
> +	 * to complete.
> +	 *
> +	 * We need synchronize_rcu() here to protect the access to
> +	 * the swap cache data structure.
>  	 */
> +	percpu_ref_kill(&p->users);
>  	synchronize_rcu();
> +	wait_for_completion(&p->comp);
>  
>  	flush_work(&p->discard_work);
>  
> @@ -2857,6 +2865,12 @@ static struct swap_info_struct *alloc_swap_info(void)
>  	if (!p)
>  		return ERR_PTR(-ENOMEM);
>  
> +	if (percpu_ref_init(&p->users, swap_users_ref_free,
> +			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
> +		kvfree(p);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
>  	spin_lock(&swap_lock);
>  	for (type = 0; type < nr_swapfiles; type++) {
>  		if (!(swap_info[type]->flags & SWP_USED))
> @@ -2864,6 +2878,7 @@ static struct swap_info_struct *alloc_swap_info(void)
>  	}
>  	if (type >= MAX_SWAPFILES) {
>  		spin_unlock(&swap_lock);
> +		percpu_ref_exit(&p->users);
>  		kvfree(p);
>  		return ERR_PTR(-EPERM);
>  	}
> @@ -2891,9 +2906,13 @@ static struct swap_info_struct *alloc_swap_info(void)
>  		plist_node_init(&p->avail_lists[i], 0);
>  	p->flags = SWP_USED;
>  	spin_unlock(&swap_lock);
> -	kvfree(defer);
> +	if (defer) {
> +		percpu_ref_exit(&defer->users);
> +		kvfree(defer);
> +	}
>  	spin_lock_init(&p->lock);
>  	spin_lock_init(&p->cont_lock);
> +	init_completion(&p->comp);
>  
>  	return p;
>  }


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  2021-04-21  1:05   ` Huang, Ying
@ 2021-04-21  1:59     ` Miaohe Lin
  0 siblings, 0 replies; 10+ messages in thread
From: Miaohe Lin @ 2021-04-21  1:59 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, willy, minchan, richard.weiyang, shy828301, linux-kernel,
	linux-mm

On 2021/4/21 9:05, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> Using current get/put_swap_device() to guard against concurrent swapoff
>> for some swap ops, e.g. swap_readpage(), looks terrible because they
>> might take really long time. This patch adds the percpu_ref support to
>> serialize against concurrent swapoff. Also we remove the SWP_VALID flag
>> because it's used together with RCU solution.
>>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  include/linux/swap.h |  5 +--
>>  mm/swapfile.c        | 79 +++++++++++++++++++++++++++-----------------
>>  2 files changed, 52 insertions(+), 32 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 144727041e78..c9e7fea10b83 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -177,7 +177,6 @@ enum {
>>  	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
>>  	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
>>  	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
>> -	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
>>  					/* add others here before... */
>>  	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
>>  };
>> @@ -240,6 +239,7 @@ struct swap_cluster_list {
>>   * The in-memory structure used to track swap areas.
>>   */
>>  struct swap_info_struct {
>> +	struct percpu_ref users;	/* indicate and keep swap device valid. */
>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>  	signed short	prio;		/* swap priority of this type */
>>  	struct plist_node list;		/* entry in swap_active_head */
>> @@ -260,6 +260,7 @@ struct swap_info_struct {
>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>  	struct file *swap_file;		/* seldom referenced */
>>  	unsigned int old_block_size;	/* seldom referenced */
>> +	struct completion comp;		/* seldom referenced */
>>  #ifdef CONFIG_FRONTSWAP
>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> @@ -511,7 +512,7 @@ sector_t swap_page_sector(struct page *page);
>>  
>>  static inline void put_swap_device(struct swap_info_struct *si)
>>  {
>> -	rcu_read_unlock();
>> +	percpu_ref_put(&si->users);
>>  }
>>  
>>  #else /* CONFIG_SWAP */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 149e77454e3c..a640fc84be5b 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -39,6 +39,7 @@
>>  #include <linux/export.h>
>>  #include <linux/swap_slots.h>
>>  #include <linux/sort.h>
>> +#include <linux/completion.h>
>>  
>>  #include <asm/tlbflush.h>
>>  #include <linux/swapops.h>
>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>>  	spin_unlock(&si->lock);
>>  }
>>  
>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> +{
>> +	struct swap_info_struct *si;
>> +
>> +	si = container_of(ref, struct swap_info_struct, users);
>> +	complete(&si->comp);
>> +}
>> +
>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>  {
>>  	struct swap_cluster_info *ci = si->cluster_info;
>> @@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
>>   * via preventing the swap device from being swapoff, until
>>   * put_swap_device() is called.  Otherwise return NULL.
>>   *
>> - * The entirety of the RCU read critical section must come before the
>> - * return from or after the call to synchronize_rcu() in
>> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
>> - * true, the si->map, si->cluster_info, etc. must be valid in the
>> - * critical section.
>> - *
>>   * Notice that swapoff or swapoff+swapon can still happen before the
>> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
>> - * in put_swap_device() if there isn't any other way to prevent
>> - * swapoff, such as page lock, page table lock, etc.  The caller must
>> - * be prepared for that.  For example, the following situation is
>> - * possible.
>> + * percpu_ref_tryget_live() in get_swap_device() or after the
>> + * percpu_ref_put() in put_swap_device() if there isn't any other way
>> + * to prevent swapoff, such as page lock, page table lock, etc.  The
>> + * caller must be prepared for that.  For example, the following
>> + * situation is possible.
>>   *
>>   *   CPU1				CPU2
>>   *   do_swap_page()
>> @@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>>  	si = swp_swap_info(entry);
>>  	if (!si)
>>  		goto bad_nofile;
>> -
>> -	rcu_read_lock();
>> -	if (data_race(!(si->flags & SWP_VALID)))
>> -		goto unlock_out;
>> +	if (!percpu_ref_tryget_live(&si->users))
>> +		goto out;
>> +	/*
>> +	 * Guarantee the si->users are checked before accessing other
>> +	 * fields of swap_info_struct.
>> +	 *
>> +	 * Paired with the spin_unlock() after setup_swap_info() in
>> +	 * enable_swap_info().
>> +	 */
>> +	smp_rmb();
>>  	offset = swp_offset(entry);
>>  	if (offset >= si->max)
>> -		goto unlock_out;
>> +		goto put_out;
>>  
>>  	return si;
>>  bad_nofile:
>>  	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>>  out:
>>  	return NULL;
>> -unlock_out:
>> -	rcu_read_unlock();
>> +put_out:
>> +	percpu_ref_put(&si->users);
>>  	return NULL;
>>  }
>>  
>> @@ -2466,7 +2475,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
>>  
>>  static void _enable_swap_info(struct swap_info_struct *p)
>>  {
>> -	p->flags |= SWP_WRITEOK | SWP_VALID;
>> +	p->flags |= SWP_WRITEOK;
>>  	atomic_long_add(p->pages, &nr_swap_pages);
>>  	total_swap_pages += p->pages;
>>  
>> @@ -2497,10 +2506,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>  	spin_unlock(&p->lock);
>>  	spin_unlock(&swap_lock);
>>  	/*
>> -	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> -	 * between get/put_swap_device() if SWP_VALID bit is set
>> +	 * Finished initialized swap device, now it's safe to reference it.
> 
> s/initialized/initializing/
> 
> Otherwise looks good to me!  Thanks!
> 
> Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
> 

Many thanks for your quick review and response again. All these comments
look good. Will fix them in V4. Thank you very much!

>>  	 */
>> -	synchronize_rcu();
>> +	percpu_ref_resurrect(&p->users);
>>  	spin_lock(&swap_lock);
>>  	spin_lock(&p->lock);
>>  	_enable_swap_info(p);
>> @@ -2616,16 +2624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>  
>>  	reenable_swap_slots_cache_unlock();
>>  
>> -	spin_lock(&swap_lock);
>> -	spin_lock(&p->lock);
>> -	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>> -	spin_unlock(&p->lock);
>> -	spin_unlock(&swap_lock);
>>  	/*
>> -	 * wait for swap operations protected by get/put_swap_device()
>> -	 * to complete
>> +	 * Wait for swap operations protected by get/put_swap_device()
>> +	 * to complete.
>> +	 *
>> +	 * We need synchronize_rcu() here to protect the access to
>> +	 * the swap cache data structure.
>>  	 */
>> +	percpu_ref_kill(&p->users);
>>  	synchronize_rcu();
>> +	wait_for_completion(&p->comp);
>>  
>>  	flush_work(&p->discard_work);
>>  
>> @@ -2857,6 +2865,12 @@ static struct swap_info_struct *alloc_swap_info(void)
>>  	if (!p)
>>  		return ERR_PTR(-ENOMEM);
>>  
>> +	if (percpu_ref_init(&p->users, swap_users_ref_free,
>> +			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
>> +		kvfree(p);
>> +		return ERR_PTR(-ENOMEM);
>> +	}
>> +
>>  	spin_lock(&swap_lock);
>>  	for (type = 0; type < nr_swapfiles; type++) {
>>  		if (!(swap_info[type]->flags & SWP_USED))
>> @@ -2864,6 +2878,7 @@ static struct swap_info_struct *alloc_swap_info(void)
>>  	}
>>  	if (type >= MAX_SWAPFILES) {
>>  		spin_unlock(&swap_lock);
>> +		percpu_ref_exit(&p->users);
>>  		kvfree(p);
>>  		return ERR_PTR(-EPERM);
>>  	}
>> @@ -2891,9 +2906,13 @@ static struct swap_info_struct *alloc_swap_info(void)
>>  		plist_node_init(&p->avail_lists[i], 0);
>>  	p->flags = SWP_USED;
>>  	spin_unlock(&swap_lock);
>> -	kvfree(defer);
>> +	if (defer) {
>> +		percpu_ref_exit(&defer->users);
>> +		kvfree(defer);
>> +	}
>>  	spin_lock_init(&p->lock);
>>  	spin_lock_init(&p->cont_lock);
>> +	init_completion(&p->comp);
>>  
>>  	return p;
>>  }
> .
> 



^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread

Thread overview: 10+ messages
2021-04-20 13:30 [PATCH v3 0/4] close various race windows for swap Miaohe Lin
2021-04-20 13:30 ` [PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff Miaohe Lin
2021-04-21  1:05   ` Huang, Ying
2021-04-21  1:59     ` Miaohe Lin
2021-04-20 13:30 ` [PATCH v3 2/4] swap: fix do_swap_page() race with swapoff Miaohe Lin
2021-04-21  0:52   ` Huang, Ying
2021-04-20 13:30 ` [PATCH v3 3/4] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() Miaohe Lin
2021-04-21  0:58   ` Huang, Ying
2021-04-20 13:30 ` [PATCH v3 4/4] mm/shmem: fix shmem_swapin() race with swapoff Miaohe Lin
2021-04-21  1:02   ` Huang, Ying
