All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/5] close various race windows for swap
@ 2021-04-17  9:40 Miaohe Lin
  2021-04-17  9:40 ` [PATCH v2 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
                   ` (4 more replies)
  0 siblings, 5 replies; 33+ messages in thread
From: Miaohe Lin @ 2021-04-17  9:40 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, david, minchan, richard.weiyang,
	linux-kernel, linux-mm, linmiaohe

Hi all,
When I was investigating the swap code, I found some possible race
windows. This series aims to fix all these races. But using current
get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible because swap_readpage() may take really
long time. And to reduce the performance overhead on the hot-path as
much as possible, it appears we can use the percpu_ref to close this
race window(as suggested by Huang, Ying). The patch 1 adds percpu_ref
support for swap and most of the remaining patches try to use this to
close various race windows. More details can be found in the respective
changelogs. Thanks!

v1->v2:
  reorganize the patch-2/5
  various enhance and fixup per Huang, Ying
  Many thanks for the comments of Huang, Ying, Dennis Zhou and Tim Chen.

Miaohe Lin (5):
  mm/swapfile: add percpu_ref support for swap
  mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  swap: fix do_swap_page() race with swapoff
  mm/swap: remove confusing checking for non_swap_entry() in
    swap_ra_info()
  mm/shmem: fix shmem_swapin() race with swapoff

 include/linux/swap.h | 15 +++++++--
 mm/memory.c          |  9 ++++++
 mm/shmem.c           |  6 ++++
 mm/swap_state.c      |  6 ----
 mm/swapfile.c        | 74 +++++++++++++++++++++++++++-----------------
 5 files changed, 73 insertions(+), 37 deletions(-)

-- 
2.19.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-17  9:40 [PATCH v2 0/5] close various race windows for swap Miaohe Lin
@ 2021-04-17  9:40 ` Miaohe Lin
  2021-04-19  2:48     ` Huang, Ying
  2021-04-17  9:40 ` [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff Miaohe Lin
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 33+ messages in thread
From: Miaohe Lin @ 2021-04-17  9:40 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, david, minchan, richard.weiyang,
	linux-kernel, linux-mm, linmiaohe

We will use percpu-refcount to serialize against concurrent swapoff. This
patch adds the percpu_ref support for swap.

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 include/linux/swap.h |  3 +++
 mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..8be36eb58b7a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -240,6 +240,7 @@ struct swap_cluster_list {
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
+	struct percpu_ref users;	/* serialization against concurrent swapoff */
 	unsigned long	flags;		/* SWP_USED etc: see above */
 	signed short	prio;		/* swap priority of this type */
 	struct plist_node list;		/* entry in swap_active_head */
@@ -260,6 +261,8 @@ struct swap_info_struct {
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	unsigned int old_block_size;	/* seldom referenced */
+	bool ref_initialized;		/* seldom referenced */
+	struct completion comp;		/* seldom referenced */
 #ifdef CONFIG_FRONTSWAP
 	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
 	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..66515a3a2824 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
 #include <linux/export.h>
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
+#include <linux/completion.h>
 
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
@@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
 	spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+	struct swap_info_struct *si;
+
+	si = container_of(ref, struct swap_info_struct, users);
+	complete(&si->comp);
+}
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
 	struct swap_cluster_info *ci = si->cluster_info;
@@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 	 * Guarantee swap_map, cluster_info, etc. fields are valid
 	 * between get/put_swap_device() if SWP_VALID bit is set
 	 */
-	synchronize_rcu();
+	percpu_ref_resurrect(&p->users);
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
 	_enable_swap_info(p);
@@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
+
+	percpu_ref_kill(&p->users);
 	/*
-	 * wait for swap operations protected by get/put_swap_device()
-	 * to complete
+	 * We need synchronize_rcu() here to protect the accessing
+	 * to the swap cache data structure.
 	 */
 	synchronize_rcu();
+	/*
+	 * Wait for swap operations protected by get/put_swap_device()
+	 * to complete.
+	 */
+	wait_for_completion(&p->comp);
 
 	flush_work(&p->discard_work);
 
@@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
 	struct swap_info_struct *p;
-	struct filename *name;
+	struct filename *name = NULL;
 	struct file *swap_file = NULL;
 	struct address_space *mapping;
 	int prio;
@@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	INIT_WORK(&p->discard_work, swap_discard_work);
 
+	if (!p->ref_initialized) {
+		error = percpu_ref_init(&p->users, swap_users_ref_free,
+					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
+		if (unlikely(error))
+			goto bad_swap;
+		init_completion(&p->comp);
+		p->ref_initialized = true;
+	}
+
 	name = getname(specialfile);
 	if (IS_ERR(name)) {
 		error = PTR_ERR(name);
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  2021-04-17  9:40 [PATCH v2 0/5] close various race windows for swap Miaohe Lin
  2021-04-17  9:40 ` [PATCH v2 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
@ 2021-04-17  9:40 ` Miaohe Lin
  2021-04-19  2:54     ` Huang, Ying
  2021-04-17  9:40 ` [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 33+ messages in thread
From: Miaohe Lin @ 2021-04-17  9:40 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, david, minchan, richard.weiyang,
	linux-kernel, linux-mm, linmiaohe

Use percpu_ref to serialize against concurrent swapoff. Also remove the
SWP_VALID flag because it was only used together with the RCU solution.

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 include/linux/swap.h |  3 +--
 mm/swapfile.c        | 43 +++++++++++++++++--------------------------
 2 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8be36eb58b7a..993693b38109 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -177,7 +177,6 @@ enum {
 	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
-	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
 					/* add others here before... */
 	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
 };
@@ -514,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-	rcu_read_unlock();
+	percpu_ref_put(&si->users);
 }
 
 #else /* CONFIG_SWAP */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 66515a3a2824..90e197bc2eeb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1279,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  *   CPU1				CPU2
  *   do_swap_page()
@@ -1318,21 +1312,24 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	si = swp_swap_info(entry);
 	if (!si)
 		goto bad_nofile;
-
-	rcu_read_lock();
-	if (data_race(!(si->flags & SWP_VALID)))
-		goto unlock_out;
+	if (!percpu_ref_tryget_live(&si->users))
+		goto out;
+	/*
+	 * Guarantee we will not reference uninitialized fields
+	 * of swap_info_struct.
+	 */
+	smp_rmb();
 	offset = swp_offset(entry);
 	if (offset >= si->max)
-		goto unlock_out;
+		goto put_out;
 
 	return si;
 bad_nofile:
 	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
 out:
 	return NULL;
-unlock_out:
-	rcu_read_unlock();
+put_out:
+	percpu_ref_put(&si->users);
 	return NULL;
 }
 
@@ -2475,7 +2472,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
 
 static void _enable_swap_info(struct swap_info_struct *p)
 {
-	p->flags |= SWP_WRITEOK | SWP_VALID;
+	p->flags |= SWP_WRITEOK;
 	atomic_long_add(p->pages, &nr_swap_pages);
 	total_swap_pages += p->pages;
 
@@ -2507,7 +2504,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 	spin_unlock(&swap_lock);
 	/*
 	 * Guarantee swap_map, cluster_info, etc. fields are valid
-	 * between get/put_swap_device() if SWP_VALID bit is set
+	 * between get/put_swap_device().
 	 */
 	percpu_ref_resurrect(&p->users);
 	spin_lock(&swap_lock);
@@ -2625,12 +2622,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	reenable_swap_slots_cache_unlock();
 
-	spin_lock(&swap_lock);
-	spin_lock(&p->lock);
-	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
-	spin_unlock(&p->lock);
-	spin_unlock(&swap_lock);
-
 	percpu_ref_kill(&p->users);
 	/*
 	 * We need synchronize_rcu() here to protect the accessing
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff
  2021-04-17  9:40 [PATCH v2 0/5] close various race windows for swap Miaohe Lin
  2021-04-17  9:40 ` [PATCH v2 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
  2021-04-17  9:40 ` [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff Miaohe Lin
@ 2021-04-17  9:40 ` Miaohe Lin
  2021-04-19  2:23     ` Huang, Ying
  2021-04-17  9:40 ` [PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() Miaohe Lin
  2021-04-17  9:40 ` [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff Miaohe Lin
  4 siblings, 1 reply; 33+ messages in thread
From: Miaohe Lin @ 2021-04-17  9:40 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, david, minchan, richard.weiyang,
	linux-kernel, linux-mm, linmiaohe

When I was investigating the swap code, I found the below possible race
window:

CPU 1                                   	CPU 2
-----                                   	-----
do_swap_page
  swap_readpage(skip swap cache case)
    if (data_race(sis->flags & SWP_FS_OPS)) {
                                        	swapoff
					  	  p->flags &= ~SWP_VALID;
					  	  ..
					  	  synchronize_rcu();
					  	  ..
					  	  p->swap_file = NULL;
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping;[oops!]

Note that for the pages that are swapped in through swap cache, this isn't
an issue. Because the page is locked, and the swap entry will be marked
with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
unlocked.

Using current get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible because swap_readpage() may take really long
time. And this race may not be really pernicious because swapoff is usually
done when system shutdown only. To reduce the performance overhead on the
hot-path as much as possible, it appears we can use the percpu_ref to close
this race window (as suggested by Huang, Ying).

Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
Reported-by: kernel test robot <lkp@intel.com> (auto build test ERROR)
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 include/linux/swap.h | 9 +++++++++
 mm/memory.c          | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 993693b38109..523c2411a135 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -528,6 +528,15 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
 	return NULL;
 }
 
+static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+	return NULL;
+}
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+}
+
 #define swap_address_space(entry)		(NULL)
 #define get_nr_swap_pages()			0L
 #define total_swap_pages			0L
diff --git a/mm/memory.c b/mm/memory.c
index 27014c3bde9f..7a2fe12cf641 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = NULL, *swapcache;
+	struct swap_info_struct *si = NULL;
 	swp_entry_t entry;
 	pte_t pte;
 	int locked;
@@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		goto out;
 	}
 
+	/* Prevent swapoff from happening to us. */
+	si = get_swap_device(entry);
+	if (unlikely(!si))
+		goto out;
 
 	delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry, vma, vmf->address);
@@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
+	if (si)
+		put_swap_device(si);
 	return ret;
 out_nomap:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		unlock_page(swapcache);
 		put_page(swapcache);
 	}
+	if (si)
+		put_swap_device(si);
 	return ret;
 }
 
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()
  2021-04-17  9:40 [PATCH v2 0/5] close various race windows for swap Miaohe Lin
                   ` (2 preceding siblings ...)
  2021-04-17  9:40 ` [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
@ 2021-04-17  9:40 ` Miaohe Lin
  2021-04-19  1:53     ` Huang, Ying
  2021-04-17  9:40 ` [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff Miaohe Lin
  4 siblings, 1 reply; 33+ messages in thread
From: Miaohe Lin @ 2021-04-17  9:40 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, david, minchan, richard.weiyang,
	linux-kernel, linux-mm, linmiaohe

While we released the pte lock, somebody else might have faulted in this
pte. So we should check whether it's a swap pte first to guard against
such a race, or swp_type would be unexpected. But the swap_entry isn't used in this
function and we will have enough checking when we really operate the PTE
entries later. So checking for non_swap_entry() is not really needed here
and should be removed to avoid confusion.

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 mm/swap_state.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 272ea2108c9d..df5405384520 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long ra_val;
-	swp_entry_t entry;
 	unsigned long faddr, pfn, fpfn;
 	unsigned long start, end;
 	pte_t *pte, *orig_pte;
@@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 
 	faddr = vmf->address;
 	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-	entry = pte_to_swp_entry(*pte);
-	if ((unlikely(non_swap_entry(entry)))) {
-		pte_unmap(orig_pte);
-		return;
-	}
 
 	fpfn = PFN_DOWN(faddr);
 	ra_val = GET_SWAP_RA_VAL(vma);
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
  2021-04-17  9:40 [PATCH v2 0/5] close various race windows for swap Miaohe Lin
                   ` (3 preceding siblings ...)
  2021-04-17  9:40 ` [PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() Miaohe Lin
@ 2021-04-17  9:40 ` Miaohe Lin
  2021-04-19  2:15     ` Huang, Ying
  4 siblings, 1 reply; 33+ messages in thread
From: Miaohe Lin @ 2021-04-17  9:40 UTC (permalink / raw)
  To: akpm
  Cc: ying.huang, dennis, tim.c.chen, hughd, hannes, mhocko,
	iamjoonsoo.kim, alexs, david, minchan, richard.weiyang,
	linux-kernel, linux-mm, linmiaohe

When I was investigating the swap code, I found the below possible race
window:

CPU 1                                           CPU 2
-----                                           -----
shmem_swapin
  swap_cluster_readahead
    if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
                                                swapoff
                                                  si->flags &= ~SWP_VALID;
                                                  ..
                                                  synchronize_rcu();
                                                  ..
                                                  si->swap_file = NULL;
    struct inode *inode = si->swap_file->f_mapping->host;[oops!]

Close this race window by using get/put_swap_device() to guard against
concurrent swapoff.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 mm/shmem.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 26c76b13ad23..936ba5595297 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 			struct shmem_inode_info *info, pgoff_t index)
 {
+	struct swap_info_struct *si;
 	struct vm_area_struct pvma;
 	struct page *page;
 	struct vm_fault vmf = {
 		.vma = &pvma,
 	};
 
+	/* Prevent swapoff from happening to us. */
+	si = get_swap_device(swap);
+	if (unlikely(!si))
+		return NULL;
 	shmem_pseudo_vma_init(&pvma, info, index);
 	page = swap_cluster_readahead(swap, gfp, &vmf);
 	shmem_pseudo_vma_destroy(&pvma);
+	put_swap_device(si);
 
 	return page;
 }
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()
  2021-04-17  9:40 ` [PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() Miaohe Lin
@ 2021-04-19  1:53     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  1:53 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> While we released the pte lock, somebody else might faulted in this pte.
> So we should check whether it's swap pte first to guard against such race
> or swp_type would be unexpected. But the swap_entry isn't used in this
> function and we will have enough checking when we really operate the PTE
> entries later. So checking for non_swap_entry() is not really needed here
> and should be removed to avoid confusion.

Please rephrase the change log to describe why we have the code and why
it's unnecessary now.  You can dig the git history via git-blame to find
out it.

The patch itself looks good to me.

Best Regards,
Huang, Ying

> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/swap_state.c | 6 ------
>  1 file changed, 6 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 272ea2108c9d..df5405384520 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	unsigned long ra_val;
> -	swp_entry_t entry;
>  	unsigned long faddr, pfn, fpfn;
>  	unsigned long start, end;
>  	pte_t *pte, *orig_pte;
> @@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>  
>  	faddr = vmf->address;
>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
> -	entry = pte_to_swp_entry(*pte);
> -	if ((unlikely(non_swap_entry(entry)))) {
> -		pte_unmap(orig_pte);
> -		return;
> -	}
>  
>  	fpfn = PFN_DOWN(faddr);
>  	ra_val = GET_SWAP_RA_VAL(vma);

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()
@ 2021-04-19  1:53     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  1:53 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> While we released the pte lock, somebody else might faulted in this pte.
> So we should check whether it's swap pte first to guard against such race
> or swp_type would be unexpected. But the swap_entry isn't used in this
> function and we will have enough checking when we really operate the PTE
> entries later. So checking for non_swap_entry() is not really needed here
> and should be removed to avoid confusion.

Please rephrase the change log to describe why we have the code and why
it's unnecessary now.  You can dig the git history via git-blame to find
out it.

The patch itself looks good to me.

Best Regards,
Huang, Ying

> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/swap_state.c | 6 ------
>  1 file changed, 6 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 272ea2108c9d..df5405384520 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	unsigned long ra_val;
> -	swp_entry_t entry;
>  	unsigned long faddr, pfn, fpfn;
>  	unsigned long start, end;
>  	pte_t *pte, *orig_pte;
> @@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>  
>  	faddr = vmf->address;
>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
> -	entry = pte_to_swp_entry(*pte);
> -	if ((unlikely(non_swap_entry(entry)))) {
> -		pte_unmap(orig_pte);
> -		return;
> -	}
>  
>  	fpfn = PFN_DOWN(faddr);
>  	ra_val = GET_SWAP_RA_VAL(vma);


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
  2021-04-17  9:40 ` [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff Miaohe Lin
@ 2021-04-19  2:15     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  2:15 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1                                           CPU 2
> -----                                           -----
> shmem_swapin
>   swap_cluster_readahead
>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>                                                 swapoff
>                                                   si->flags &= ~SWP_VALID;
>                                                   ..
>                                                   synchronize_rcu();
>                                                   ..

You have removed these code in the previous patches of the series.  And
they are not relevant in this patch.

>                                                   si->swap_file = NULL;
>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>
> Close this race window by using get/put_swap_device() to guard against
> concurrent swapoff.
>
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")

No.  This isn't the commit that introduces the race condition.  Please
recheck your git blame result.

Best Regards,
Huang, Ying

> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/shmem.c | 6 ++++++
>  1 file changed, 6 insertions(+)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 26c76b13ad23..936ba5595297 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>  			struct shmem_inode_info *info, pgoff_t index)
>  {
> +	struct swap_info_struct *si;
>  	struct vm_area_struct pvma;
>  	struct page *page;
>  	struct vm_fault vmf = {
>  		.vma = &pvma,
>  	};
>  
> +	/* Prevent swapoff from happening to us. */
> +	si = get_swap_device(swap);
> +	if (unlikely(!si))
> +		return NULL;
>  	shmem_pseudo_vma_init(&pvma, info, index);
>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>  	shmem_pseudo_vma_destroy(&pvma);
> +	put_swap_device(si);
>  
>  	return page;
>  }

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
@ 2021-04-19  2:15     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  2:15 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1                                           CPU 2
> -----                                           -----
> shmem_swapin
>   swap_cluster_readahead
>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>                                                 swapoff
>                                                   si->flags &= ~SWP_VALID;
>                                                   ..
>                                                   synchronize_rcu();
>                                                   ..

You have removed these code in the previous patches of the series.  And
they are not relevant in this patch.

>                                                   si->swap_file = NULL;
>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>
> Close this race window by using get/put_swap_device() to guard against
> concurrent swapoff.
>
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")

No.  This isn't the commit that introduces the race condition.  Please
recheck your git blame result.

Best Regards,
Huang, Ying

> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  mm/shmem.c | 6 ++++++
>  1 file changed, 6 insertions(+)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 26c76b13ad23..936ba5595297 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>  			struct shmem_inode_info *info, pgoff_t index)
>  {
> +	struct swap_info_struct *si;
>  	struct vm_area_struct pvma;
>  	struct page *page;
>  	struct vm_fault vmf = {
>  		.vma = &pvma,
>  	};
>  
> +	/* Prevent swapoff from happening to us. */
> +	si = get_swap_device(swap);
> +	if (unlikely(!si))
> +		return NULL;
>  	shmem_pseudo_vma_init(&pvma, info, index);
>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>  	shmem_pseudo_vma_destroy(&pvma);
> +	put_swap_device(si);
>  
>  	return page;
>  }


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff
  2021-04-17  9:40 ` [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
@ 2021-04-19  2:23     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  2:23 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1                                   	CPU 2
> -----                                   	-----
> do_swap_page

This is OK for swap cache cases.  So

  if (data_race(si->flags & SWP_SYNCHRONOUS_IO))

should be shown here.

>   swap_readpage(skip swap cache case)
>     if (data_race(sis->flags & SWP_FS_OPS)) {
>                                         	swapoff
> 					  	  p->flags = &= ~SWP_VALID;
> 					  	  ..
> 					  	  synchronize_rcu();
> 					  	  ..
> 					  	  p->swap_file = NULL;
>     struct file *swap_file = sis->swap_file;
>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>
> Note that for the pages that are swapped in through swap cache, this isn't
> an issue. Because the page is locked, and the swap entry will be marked
> with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
> unlocked.
>
> Using current get/put_swap_device() to guard against concurrent swapoff for
> swap_readpage() looks terrible because swap_readpage() may take really long
> time. And this race may not be really pernicious because swapoff is usually
> done when system shutdown only. To reduce the performance overhead on the
> hot-path as much as possible, it appears we can use the percpu_ref to close
> this race window(as suggested by Huang, Ying).

I still suggest to squash PATCH 1-3, at least PATCH 1-2.  That will
change the relevant code together and make it easier to review.

Best Regards,
Huang, Ying

> Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
> Reported-by: kernel test robot <lkp@intel.com> (auto build test ERROR)
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h | 9 +++++++++
>  mm/memory.c          | 9 +++++++++
>  2 files changed, 18 insertions(+)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 993693b38109..523c2411a135 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -528,6 +528,15 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
>  	return NULL;
>  }
>  
> +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
> +{
> +	return NULL;
> +}
> +
> +static inline void put_swap_device(struct swap_info_struct *si)
> +{
> +}
> +
>  #define swap_address_space(entry)		(NULL)
>  #define get_nr_swap_pages()			0L
>  #define total_swap_pages			0L
> diff --git a/mm/memory.c b/mm/memory.c
> index 27014c3bde9f..7a2fe12cf641 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = NULL, *swapcache;
> +	struct swap_info_struct *si = NULL;
>  	swp_entry_t entry;
>  	pte_t pte;
>  	int locked;
> @@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  		goto out;
>  	}
>  
> +	/* Prevent swapoff from happening to us. */
> +	si = get_swap_device(entry);
> +	if (unlikely(!si))
> +		goto out;
>  
>  	delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
>  	page = lookup_swap_cache(entry, vma, vmf->address);
> @@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  unlock:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>  out:
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  out_nomap:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  		unlock_page(swapcache);
>  		put_page(swapcache);
>  	}
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  }

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff
@ 2021-04-19  2:23     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  2:23 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1                                   	CPU 2
> -----                                   	-----
> do_swap_page

This is OK for swap cache cases.  So

  if (data_race(si->flags & SWP_SYNCHRONOUS_IO))

should be shown here.

>   swap_readpage(skip swap cache case)
>     if (data_race(sis->flags & SWP_FS_OPS)) {
>                                         	swapoff
> 					  	  p->flags &= ~SWP_VALID;
> 					  	  ..
> 					  	  synchronize_rcu();
> 					  	  ..
> 					  	  p->swap_file = NULL;
>     struct file *swap_file = sis->swap_file;
>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>
> Note that for the pages that are swapped in through swap cache, this isn't
> an issue. Because the page is locked, and the swap entry will be marked
> with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
> unlocked.
>
> Using current get/put_swap_device() to guard against concurrent swapoff for
> swap_readpage() looks terrible because swap_readpage() may take a really
> long time. And this race may not be really pernicious because swapoff is
> usually done only at system shutdown. To reduce the performance overhead on
> the hot-path as much as possible, it appears we can use the percpu_ref to
> close this race window (as suggested by Huang, Ying).

I still suggest to squash PATCH 1-3, at least PATCH 1-2.  That will
change the relevant code together and make it easier to review.

Best Regards,
Huang, Ying

> Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
> Reported-by: kernel test robot <lkp@intel.com> (auto build test ERROR)
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h | 9 +++++++++
>  mm/memory.c          | 9 +++++++++
>  2 files changed, 18 insertions(+)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 993693b38109..523c2411a135 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -528,6 +528,15 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
>  	return NULL;
>  }
>  
> +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
> +{
> +	return NULL;
> +}
> +
> +static inline void put_swap_device(struct swap_info_struct *si)
> +{
> +}
> +
>  #define swap_address_space(entry)		(NULL)
>  #define get_nr_swap_pages()			0L
>  #define total_swap_pages			0L
> diff --git a/mm/memory.c b/mm/memory.c
> index 27014c3bde9f..7a2fe12cf641 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = NULL, *swapcache;
> +	struct swap_info_struct *si = NULL;
>  	swp_entry_t entry;
>  	pte_t pte;
>  	int locked;
> @@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  		goto out;
>  	}
>  
> +	/* Prevent swapoff from happening to us. */
> +	si = get_swap_device(entry);
> +	if (unlikely(!si))
> +		goto out;
>  
>  	delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
>  	page = lookup_swap_cache(entry, vma, vmf->address);
> @@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  unlock:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>  out:
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  out_nomap:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  		unlock_page(swapcache);
>  		put_page(swapcache);
>  	}
> +	if (si)
> +		put_swap_device(si);
>  	return ret;
>  }


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-17  9:40 ` [PATCH v2 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
@ 2021-04-19  2:48     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  2:48 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> We will use percpu-refcount to serialize against concurrent swapoff. This
> patch adds the percpu_ref support for swap.
>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h |  3 +++
>  mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
>  2 files changed, 32 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 144727041e78..8be36eb58b7a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>   * The in-memory structure used to track swap areas.
>   */
>  struct swap_info_struct {
> +	struct percpu_ref users;	/* serialization against concurrent swapoff */

The comments aren't general enough.  We use this to check whether the
swap device has been fully initialized, etc. Maybe something like below?

/* indicate and keep swap device valid */

>  	unsigned long	flags;		/* SWP_USED etc: see above */
>  	signed short	prio;		/* swap priority of this type */
>  	struct plist_node list;		/* entry in swap_active_head */
> @@ -260,6 +261,8 @@ struct swap_info_struct {
>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>  	struct file *swap_file;		/* seldom referenced */
>  	unsigned int old_block_size;	/* seldom referenced */
> +	bool ref_initialized;		/* seldom referenced */
> +	struct completion comp;		/* seldom referenced */
>  #ifdef CONFIG_FRONTSWAP
>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 149e77454e3c..66515a3a2824 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -39,6 +39,7 @@
>  #include <linux/export.h>
>  #include <linux/swap_slots.h>
>  #include <linux/sort.h>
> +#include <linux/completion.h>
>  
>  #include <asm/tlbflush.h>
>  #include <linux/swapops.h>
> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>  	spin_unlock(&si->lock);
>  }
>  
> +static void swap_users_ref_free(struct percpu_ref *ref)
> +{
> +	struct swap_info_struct *si;
> +
> +	si = container_of(ref, struct swap_info_struct, users);
> +	complete(&si->comp);
> +}
> +
>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>  {
>  	struct swap_cluster_info *ci = si->cluster_info;
> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>  	 * between get/put_swap_device() if SWP_VALID bit is set
>  	 */
> -	synchronize_rcu();

You cannot remove this without changing get/put_swap_device().  It's
better to squash at least PATCH 1-2.

> +	percpu_ref_resurrect(&p->users);
>  	spin_lock(&swap_lock);
>  	spin_lock(&p->lock);
>  	_enable_swap_info(p);
> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>  	spin_unlock(&p->lock);
>  	spin_unlock(&swap_lock);
> +
> +	percpu_ref_kill(&p->users);
>  	/*
> -	 * wait for swap operations protected by get/put_swap_device()
> -	 * to complete
> +	 * We need synchronize_rcu() here to protect the accessing
> +	 * to the swap cache data structure.
>  	 */
>  	synchronize_rcu();
> +	/*
> +	 * Wait for swap operations protected by get/put_swap_device()
> +	 * to complete.
> +	 */

I think the comments (after some revision) can be moved before
percpu_ref_kill().  The synchronize_rcu() comments can be merged.

> +	wait_for_completion(&p->comp);
>  
>  	flush_work(&p->discard_work);
>  
> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  {
>  	struct swap_info_struct *p;
> -	struct filename *name;
> +	struct filename *name = NULL;
>  	struct file *swap_file = NULL;
>  	struct address_space *mapping;
>  	int prio;
> @@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  
>  	INIT_WORK(&p->discard_work, swap_discard_work);
>  
> +	if (!p->ref_initialized) {

I don't think it's necessary to add another flag p->ref_initialized.  We
can distinguish newly allocated and reused swap_info_struct in alloc_swap_info().

Best Regards,
Huang, Ying

> +		error = percpu_ref_init(&p->users, swap_users_ref_free,
> +					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
> +		if (unlikely(error))
> +			goto bad_swap;
> +		init_completion(&p->comp);
> +		p->ref_initialized = true;
> +	}
> +
>  	name = getname(specialfile);
>  	if (IS_ERR(name)) {
>  		error = PTR_ERR(name);

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-19  2:48     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  2:48 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> We will use percpu-refcount to serialize against concurrent swapoff. This
> patch adds the percpu_ref support for swap.
>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h |  3 +++
>  mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
>  2 files changed, 32 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 144727041e78..8be36eb58b7a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>   * The in-memory structure used to track swap areas.
>   */
>  struct swap_info_struct {
> +	struct percpu_ref users;	/* serialization against concurrent swapoff */

The comments aren't general enough.  We use this to check whether the
swap device has been fully initialized, etc. Maybe something like below?

/* indicate and keep swap device valid */

>  	unsigned long	flags;		/* SWP_USED etc: see above */
>  	signed short	prio;		/* swap priority of this type */
>  	struct plist_node list;		/* entry in swap_active_head */
> @@ -260,6 +261,8 @@ struct swap_info_struct {
>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>  	struct file *swap_file;		/* seldom referenced */
>  	unsigned int old_block_size;	/* seldom referenced */
> +	bool ref_initialized;		/* seldom referenced */
> +	struct completion comp;		/* seldom referenced */
>  #ifdef CONFIG_FRONTSWAP
>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 149e77454e3c..66515a3a2824 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -39,6 +39,7 @@
>  #include <linux/export.h>
>  #include <linux/swap_slots.h>
>  #include <linux/sort.h>
> +#include <linux/completion.h>
>  
>  #include <asm/tlbflush.h>
>  #include <linux/swapops.h>
> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>  	spin_unlock(&si->lock);
>  }
>  
> +static void swap_users_ref_free(struct percpu_ref *ref)
> +{
> +	struct swap_info_struct *si;
> +
> +	si = container_of(ref, struct swap_info_struct, users);
> +	complete(&si->comp);
> +}
> +
>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>  {
>  	struct swap_cluster_info *ci = si->cluster_info;
> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>  	 * between get/put_swap_device() if SWP_VALID bit is set
>  	 */
> -	synchronize_rcu();

You cannot remove this without changing get/put_swap_device().  It's
better to squash at least PATCH 1-2.

> +	percpu_ref_resurrect(&p->users);
>  	spin_lock(&swap_lock);
>  	spin_lock(&p->lock);
>  	_enable_swap_info(p);
> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>  	spin_unlock(&p->lock);
>  	spin_unlock(&swap_lock);
> +
> +	percpu_ref_kill(&p->users);
>  	/*
> -	 * wait for swap operations protected by get/put_swap_device()
> -	 * to complete
> +	 * We need synchronize_rcu() here to protect the accessing
> +	 * to the swap cache data structure.
>  	 */
>  	synchronize_rcu();
> +	/*
> +	 * Wait for swap operations protected by get/put_swap_device()
> +	 * to complete.
> +	 */

I think the comments (after some revision) can be moved before
percpu_ref_kill().  The synchronize_rcu() comments can be merged.

> +	wait_for_completion(&p->comp);
>  
>  	flush_work(&p->discard_work);
>  
> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  {
>  	struct swap_info_struct *p;
> -	struct filename *name;
> +	struct filename *name = NULL;
>  	struct file *swap_file = NULL;
>  	struct address_space *mapping;
>  	int prio;
> @@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  
>  	INIT_WORK(&p->discard_work, swap_discard_work);
>  
> +	if (!p->ref_initialized) {

I don't think it's necessary to add another flag p->ref_initialized.  We
can distinguish newly allocated and reused swap_info_struct in alloc_swap_info().

Best Regards,
Huang, Ying

> +		error = percpu_ref_init(&p->users, swap_users_ref_free,
> +					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
> +		if (unlikely(error))
> +			goto bad_swap;
> +		init_completion(&p->comp);
> +		p->ref_initialized = true;
> +	}
> +
>  	name = getname(specialfile);
>  	if (IS_ERR(name)) {
>  		error = PTR_ERR(name);


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  2021-04-17  9:40 ` [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff Miaohe Lin
@ 2021-04-19  2:54     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  2:54 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> Use percpu_ref to serialize against concurrent swapoff. Also remove the
> SWP_VALID flag because it was only used together with the RCU solution.
>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h |  3 +--
>  mm/swapfile.c        | 43 +++++++++++++++++--------------------------
>  2 files changed, 18 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 8be36eb58b7a..993693b38109 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -177,7 +177,6 @@ enum {
>  	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
>  	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
>  	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
> -	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
>  					/* add others here before... */
>  	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
>  };
> @@ -514,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>  
>  static inline void put_swap_device(struct swap_info_struct *si)
>  {
> -	rcu_read_unlock();
> +	percpu_ref_put(&si->users);
>  }
>  
>  #else /* CONFIG_SWAP */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 66515a3a2824..90e197bc2eeb 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1279,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
>   * via preventing the swap device from being swapoff, until
>   * put_swap_device() is called.  Otherwise return NULL.
>   *
> - * The entirety of the RCU read critical section must come before the
> - * return from or after the call to synchronize_rcu() in
> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
> - * true, the si->map, si->cluster_info, etc. must be valid in the
> - * critical section.
> - *
>   * Notice that swapoff or swapoff+swapon can still happen before the
> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
> - * in put_swap_device() if there isn't any other way to prevent
> - * swapoff, such as page lock, page table lock, etc.  The caller must
> - * be prepared for that.  For example, the following situation is
> - * possible.
> + * percpu_ref_tryget_live() in get_swap_device() or after the
> + * percpu_ref_put() in put_swap_device() if there isn't any other way
> + * to prevent swapoff, such as page lock, page table lock, etc.  The
> + * caller must be prepared for that.  For example, the following
> + * situation is possible.
>   *
>   *   CPU1				CPU2
>   *   do_swap_page()
> @@ -1318,21 +1312,24 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>  	si = swp_swap_info(entry);
>  	if (!si)
>  		goto bad_nofile;
> -
> -	rcu_read_lock();
> -	if (data_race(!(si->flags & SWP_VALID)))
> -		goto unlock_out;
> +	if (!percpu_ref_tryget_live(&si->users))
> +		goto out;
> +	/*
> +	 * Guarantee we will not reference uninitialized fields
> +	 * of swap_info_struct.
> +	 */

/*
 * Guarantee that si->users is checked before accessing other fields of
 * swap_info_struct.
 */

> +	smp_rmb();

Usually, smp_rmb() needs to be paired with smp_wmb().  Some comments are
needed for that.  Here smp_rmb() is paired with the spin_unlock() after
setup_swap_info() in enable_swap_info().

>  	offset = swp_offset(entry);
>  	if (offset >= si->max)
> -		goto unlock_out;
> +		goto put_out;
>  
>  	return si;
>  bad_nofile:
>  	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>  out:
>  	return NULL;
> -unlock_out:
> -	rcu_read_unlock();
> +put_out:
> +	percpu_ref_put(&si->users);
>  	return NULL;
>  }
>  
> @@ -2475,7 +2472,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
>  
>  static void _enable_swap_info(struct swap_info_struct *p)
>  {
> -	p->flags |= SWP_WRITEOK | SWP_VALID;
> +	p->flags |= SWP_WRITEOK;
>  	atomic_long_add(p->pages, &nr_swap_pages);
>  	total_swap_pages += p->pages;
>  
> @@ -2507,7 +2504,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>  	spin_unlock(&swap_lock);
>  	/*
>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
> -	 * between get/put_swap_device() if SWP_VALID bit is set
> +	 * between get/put_swap_device().
>  	 */

The comments need to be revised.  Something like below?

/* Finished initializing swap device; now it's safe to reference it */

Best Regards,
Huang, Ying

>  	percpu_ref_resurrect(&p->users);
>  	spin_lock(&swap_lock);
> @@ -2625,12 +2622,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  
>  	reenable_swap_slots_cache_unlock();
>  
> -	spin_lock(&swap_lock);
> -	spin_lock(&p->lock);
> -	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
> -	spin_unlock(&p->lock);
> -	spin_unlock(&swap_lock);
> -
>  	percpu_ref_kill(&p->users);
>  	/*
>  	 * We need synchronize_rcu() here to protect the accessing

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff
@ 2021-04-19  2:54     ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  2:54 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> Use percpu_ref to serialize against concurrent swapoff. Also remove the
> SWP_VALID flag because it was only used together with the RCU solution.
>
> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
> ---
>  include/linux/swap.h |  3 +--
>  mm/swapfile.c        | 43 +++++++++++++++++--------------------------
>  2 files changed, 18 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 8be36eb58b7a..993693b38109 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -177,7 +177,6 @@ enum {
>  	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
>  	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
>  	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
> -	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
>  					/* add others here before... */
>  	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
>  };
> @@ -514,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>  
>  static inline void put_swap_device(struct swap_info_struct *si)
>  {
> -	rcu_read_unlock();
> +	percpu_ref_put(&si->users);
>  }
>  
>  #else /* CONFIG_SWAP */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 66515a3a2824..90e197bc2eeb 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1279,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
>   * via preventing the swap device from being swapoff, until
>   * put_swap_device() is called.  Otherwise return NULL.
>   *
> - * The entirety of the RCU read critical section must come before the
> - * return from or after the call to synchronize_rcu() in
> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
> - * true, the si->map, si->cluster_info, etc. must be valid in the
> - * critical section.
> - *
>   * Notice that swapoff or swapoff+swapon can still happen before the
> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
> - * in put_swap_device() if there isn't any other way to prevent
> - * swapoff, such as page lock, page table lock, etc.  The caller must
> - * be prepared for that.  For example, the following situation is
> - * possible.
> + * percpu_ref_tryget_live() in get_swap_device() or after the
> + * percpu_ref_put() in put_swap_device() if there isn't any other way
> + * to prevent swapoff, such as page lock, page table lock, etc.  The
> + * caller must be prepared for that.  For example, the following
> + * situation is possible.
>   *
>   *   CPU1				CPU2
>   *   do_swap_page()
> @@ -1318,21 +1312,24 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>  	si = swp_swap_info(entry);
>  	if (!si)
>  		goto bad_nofile;
> -
> -	rcu_read_lock();
> -	if (data_race(!(si->flags & SWP_VALID)))
> -		goto unlock_out;
> +	if (!percpu_ref_tryget_live(&si->users))
> +		goto out;
> +	/*
> +	 * Guarantee we will not reference uninitialized fields
> +	 * of swap_info_struct.
> +	 */

/*
 * Guarantee that si->users is checked before accessing other fields of
 * swap_info_struct.
 */

> +	smp_rmb();

Usually, smp_rmb() needs to be paired with smp_wmb().  Some comments are
needed for that.  Here smp_rmb() is paired with the spin_unlock() after
setup_swap_info() in enable_swap_info().

>  	offset = swp_offset(entry);
>  	if (offset >= si->max)
> -		goto unlock_out;
> +		goto put_out;
>  
>  	return si;
>  bad_nofile:
>  	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>  out:
>  	return NULL;
> -unlock_out:
> -	rcu_read_unlock();
> +put_out:
> +	percpu_ref_put(&si->users);
>  	return NULL;
>  }
>  
> @@ -2475,7 +2472,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
>  
>  static void _enable_swap_info(struct swap_info_struct *p)
>  {
> -	p->flags |= SWP_WRITEOK | SWP_VALID;
> +	p->flags |= SWP_WRITEOK;
>  	atomic_long_add(p->pages, &nr_swap_pages);
>  	total_swap_pages += p->pages;
>  
> @@ -2507,7 +2504,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>  	spin_unlock(&swap_lock);
>  	/*
>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
> -	 * between get/put_swap_device() if SWP_VALID bit is set
> +	 * between get/put_swap_device().
>  	 */

The comments need to be revised.  Something like below?

/* Finished initializing swap device; now it's safe to reference it */

Best Regards,
Huang, Ying

>  	percpu_ref_resurrect(&p->users);
>  	spin_lock(&swap_lock);
> @@ -2625,12 +2622,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  
>  	reenable_swap_slots_cache_unlock();
>  
> -	spin_lock(&swap_lock);
> -	spin_lock(&p->lock);
> -	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
> -	spin_unlock(&p->lock);
> -	spin_unlock(&swap_lock);
> -
>  	percpu_ref_kill(&p->users);
>  	/*
>  	 * We need synchronize_rcu() here to protect the accessing


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-19  2:48     ` Huang, Ying
  (?)
@ 2021-04-19  6:46     ` Miaohe Lin
  2021-04-19  7:09         ` Huang, Ying
  -1 siblings, 1 reply; 33+ messages in thread
From: Miaohe Lin @ 2021-04-19  6:46 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

On 2021/4/19 10:48, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> patch adds the percpu_ref support for swap.
>>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  include/linux/swap.h |  3 +++
>>  mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 144727041e78..8be36eb58b7a 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>   * The in-memory structure used to track swap areas.
>>   */
>>  struct swap_info_struct {
>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
> 
> The comments aren't general enough.  We use this to check whether the
> swap device has been fully initialized, etc. Maybe something like below?
> 
> /* indicate and keep swap device valid */

Looks good.

> 
>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>  	signed short	prio;		/* swap priority of this type */
>>  	struct plist_node list;		/* entry in swap_active_head */
>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>  	struct file *swap_file;		/* seldom referenced */
>>  	unsigned int old_block_size;	/* seldom referenced */
>> +	bool ref_initialized;		/* seldom referenced */
>> +	struct completion comp;		/* seldom referenced */
>>  #ifdef CONFIG_FRONTSWAP
>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 149e77454e3c..66515a3a2824 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -39,6 +39,7 @@
>>  #include <linux/export.h>
>>  #include <linux/swap_slots.h>
>>  #include <linux/sort.h>
>> +#include <linux/completion.h>
>>  
>>  #include <asm/tlbflush.h>
>>  #include <linux/swapops.h>
>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>>  	spin_unlock(&si->lock);
>>  }
>>  
>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> +{
>> +	struct swap_info_struct *si;
>> +
>> +	si = container_of(ref, struct swap_info_struct, users);
>> +	complete(&si->comp);
>> +}
>> +
>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>  {
>>  	struct swap_cluster_info *ci = si->cluster_info;
>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>  	 */
>> -	synchronize_rcu();
> 
> You cannot remove this without changing get/put_swap_device().  It's
> better to squash at least PATCH 1-2.

Will squash PATCH 1-2. Thanks.

> 
>> +	percpu_ref_resurrect(&p->users);
>>  	spin_lock(&swap_lock);
>>  	spin_lock(&p->lock);
>>  	_enable_swap_info(p);
>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>  	spin_unlock(&p->lock);
>>  	spin_unlock(&swap_lock);
>> +
>> +	percpu_ref_kill(&p->users);
>>  	/*
>> -	 * wait for swap operations protected by get/put_swap_device()
>> -	 * to complete
>> +	 * We need synchronize_rcu() here to protect the accessing
>> +	 * to the swap cache data structure.
>>  	 */
>>  	synchronize_rcu();
>> +	/*
>> +	 * Wait for swap operations protected by get/put_swap_device()
>> +	 * to complete.
>> +	 */
> 
> I think the comments (after some revision) can be moved before
> percpu_ref_kill().  The synchronize_rcu() comments can be merged.
> 

Ok.

>> +	wait_for_completion(&p->comp);
>>  
>>  	flush_work(&p->discard_work);
>>  
>> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>  {
>>  	struct swap_info_struct *p;
>> -	struct filename *name;
>> +	struct filename *name = NULL;
>>  	struct file *swap_file = NULL;
>>  	struct address_space *mapping;
>>  	int prio;
>> @@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>  
>>  	INIT_WORK(&p->discard_work, swap_discard_work);
>>  
>> +	if (!p->ref_initialized) {
> 
> I don't think it's necessary to add another flag p->ref_initialized.  We
> can distinguish newly allocated and reused swap_info_struct in alloc_swap_info().
> 

If a newly allocated swap_info_struct fails to init its percpu_ref, it will be
considered a reused one in alloc_swap_info(), _but_ the users field of
swap_info_struct is actually uninitialized. Does this make sense to you?

Many Thanks for quick review.

> Best Regards,
> Huang, Ying
> 
>> +		error = percpu_ref_init(&p->users, swap_users_ref_free,
>> +					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
>> +		if (unlikely(error))
>> +			goto bad_swap;
>> +		init_completion(&p->comp);
>> +		p->ref_initialized = true;
>> +	}
>> +
>>  	name = getname(specialfile);
>>  	if (IS_ERR(name)) {
>>  		error = PTR_ERR(name);
> .
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()
  2021-04-19  1:53     ` Huang, Ying
  (?)
@ 2021-04-19  6:46     ` Miaohe Lin
  -1 siblings, 0 replies; 33+ messages in thread
From: Miaohe Lin @ 2021-04-19  6:46 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

On 2021/4/19 9:53, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> While we released the pte lock, somebody else might faulted in this pte.
>> So we should check whether it's swap pte first to guard against such race
>> or swp_type would be unexpected. But the swap_entry isn't used in this
>> function and we will have enough checking when we really operate the PTE
>> entries later. So checking for non_swap_entry() is not really needed here
>> and should be removed to avoid confusion.
> 
> Please rephrase the change log to describe why we have the code and why
> it's unnecessary now.  You can dig the git history via git-blame to find
> out it.
> 

Will try to do it. Thanks.

> The patch itself looks good to me.
> 
> Best Regards,
> Huang, Ying
> 
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  mm/swap_state.c | 6 ------
>>  1 file changed, 6 deletions(-)
>>
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index 272ea2108c9d..df5405384520 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>>  {
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	unsigned long ra_val;
>> -	swp_entry_t entry;
>>  	unsigned long faddr, pfn, fpfn;
>>  	unsigned long start, end;
>>  	pte_t *pte, *orig_pte;
>> @@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>>  
>>  	faddr = vmf->address;
>>  	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
>> -	entry = pte_to_swp_entry(*pte);
>> -	if ((unlikely(non_swap_entry(entry)))) {
>> -		pte_unmap(orig_pte);
>> -		return;
>> -	}
>>  
>>  	fpfn = PFN_DOWN(faddr);
>>  	ra_val = GET_SWAP_RA_VAL(vma);
> .
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
  2021-04-19  2:15     ` Huang, Ying
  (?)
@ 2021-04-19  6:49     ` Miaohe Lin
  2021-04-19  7:04         ` Huang, Ying
  -1 siblings, 1 reply; 33+ messages in thread
From: Miaohe Lin @ 2021-04-19  6:49 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

On 2021/4/19 10:15, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1                                           CPU 2
>> -----                                           -----
>> shmem_swapin
>>   swap_cluster_readahead
>>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>                                                 swapoff
>>                                                   si->flags &= ~SWP_VALID;
>>                                                   ..
>>                                                   synchronize_rcu();
>>                                                   ..
> 
> You have removed these code in the previous patches of the series.  And
> they are not relevant in this patch.

Yes, I should change these. Thanks.

> 
>>                                                   si->swap_file = NULL;
>>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>
>> Close this race window by using get/put_swap_device() to guard against
>> concurrent swapoff.
>>
>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> 
> No.  This isn't the commit that introduces the race condition.  Please
> recheck your git blame result.
> 

I think it is really hard to find the exact commit. I used git blame and found that
this race should have existed since this code was introduced. Any suggestions?
Thanks.

> Best Regards,
> Huang, Ying
> 
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  mm/shmem.c | 6 ++++++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index 26c76b13ad23..936ba5595297 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>  			struct shmem_inode_info *info, pgoff_t index)
>>  {
>> +	struct swap_info_struct *si;
>>  	struct vm_area_struct pvma;
>>  	struct page *page;
>>  	struct vm_fault vmf = {
>>  		.vma = &pvma,
>>  	};
>>  
>> +	/* Prevent swapoff from happening to us. */
>> +	si = get_swap_device(swap);
>> +	if (unlikely(!si))
>> +		return NULL;
>>  	shmem_pseudo_vma_init(&pvma, info, index);
>>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>>  	shmem_pseudo_vma_destroy(&pvma);
>> +	put_swap_device(si);
>>  
>>  	return page;
>>  }
> .
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff
  2021-04-19  2:23     ` Huang, Ying
  (?)
@ 2021-04-19  6:54     ` Miaohe Lin
  -1 siblings, 0 replies; 33+ messages in thread
From: Miaohe Lin @ 2021-04-19  6:54 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

On 2021/4/19 10:23, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1                                   	CPU 2
>> -----                                   	-----
>> do_swap_page
> 
> This is OK for swap cache cases.  So
> 
>   if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
> 
> should be shown here.

Ok.

> 
>>   swap_readpage(skip swap cache case)
>>     if (data_race(sis->flags & SWP_FS_OPS)) {
>>                                         	swapoff
>> 					  	  p->flags = &= ~SWP_VALID;
>> 					  	  ..
>> 					  	  synchronize_rcu();
>> 					  	  ..
>> 					  	  p->swap_file = NULL;
>>     struct file *swap_file = sis->swap_file;
>>     struct address_space *mapping = swap_file->f_mapping;[oops!]
>>
>> Note that for the pages that are swapped in through swap cache, this isn't
>> an issue. Because the page is locked, and the swap entry will be marked
>> with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
>> unlocked.
>>
>> Using current get/put_swap_device() to guard against concurrent swapoff for
>> swap_readpage() looks terrible because swap_readpage() may take really long
>> time. And this race may not be really pernicious because swapoff is usually
>> done when system shutdown only. To reduce the performance overhead on the
>> hot-path as much as possible, it appears we can use the percpu_ref to close
>> this race window(as suggested by Huang, Ying).
> 
> I still suggest to squash PATCH 1-3, at least PATCH 1-2.  That will
> change the relevant code together and make it easier to review.
> 

Will squash PATCH 1-2. Thanks.

> Best Regards,
> Huang, Ying
> 
>> Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
>> Reported-by: kernel test robot <lkp@intel.com> (auto build test ERROR)
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  include/linux/swap.h | 9 +++++++++
>>  mm/memory.c          | 9 +++++++++
>>  2 files changed, 18 insertions(+)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 993693b38109..523c2411a135 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -528,6 +528,15 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
>>  	return NULL;
>>  }
>>  
>> +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
>> +{
>> +	return NULL;
>> +}
>> +
>> +static inline void put_swap_device(struct swap_info_struct *si)
>> +{
>> +}
>> +
>>  #define swap_address_space(entry)		(NULL)
>>  #define get_nr_swap_pages()			0L
>>  #define total_swap_pages			0L
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 27014c3bde9f..7a2fe12cf641 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  {
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	struct page *page = NULL, *swapcache;
>> +	struct swap_info_struct *si = NULL;
>>  	swp_entry_t entry;
>>  	pte_t pte;
>>  	int locked;
>> @@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  		goto out;
>>  	}
>>  
>> +	/* Prevent swapoff from happening to us. */
>> +	si = get_swap_device(entry);
>> +	if (unlikely(!si))
>> +		goto out;
>>  
>>  	delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
>>  	page = lookup_swap_cache(entry, vma, vmf->address);
>> @@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  unlock:
>>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>>  out:
>> +	if (si)
>> +		put_swap_device(si);
>>  	return ret;
>>  out_nomap:
>>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>> @@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  		unlock_page(swapcache);
>>  		put_page(swapcache);
>>  	}
>> +	if (si)
>> +		put_swap_device(si);
>>  	return ret;
>>  }
> .
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  2021-04-19  2:54     ` Huang, Ying
  (?)
@ 2021-04-19  6:57     ` Miaohe Lin
  -1 siblings, 0 replies; 33+ messages in thread
From: Miaohe Lin @ 2021-04-19  6:57 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

On 2021/4/19 10:54, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> Use percpu_ref to serialize against concurrent swapoff. Also remove the
>> SWP_VALID flag because it's used together with RCU solution.
>>
>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>> ---
>>  include/linux/swap.h |  3 +--
>>  mm/swapfile.c        | 43 +++++++++++++++++--------------------------
>>  2 files changed, 18 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 8be36eb58b7a..993693b38109 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -177,7 +177,6 @@ enum {
>>  	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
>>  	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
>>  	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
>> -	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
>>  					/* add others here before... */
>>  	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
>>  };
>> @@ -514,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>>  
>>  static inline void put_swap_device(struct swap_info_struct *si)
>>  {
>> -	rcu_read_unlock();
>> +	percpu_ref_put(&si->users);
>>  }
>>  
>>  #else /* CONFIG_SWAP */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 66515a3a2824..90e197bc2eeb 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -1279,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
>>   * via preventing the swap device from being swapoff, until
>>   * put_swap_device() is called.  Otherwise return NULL.
>>   *
>> - * The entirety of the RCU read critical section must come before the
>> - * return from or after the call to synchronize_rcu() in
>> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
>> - * true, the si->map, si->cluster_info, etc. must be valid in the
>> - * critical section.
>> - *
>>   * Notice that swapoff or swapoff+swapon can still happen before the
>> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
>> - * in put_swap_device() if there isn't any other way to prevent
>> - * swapoff, such as page lock, page table lock, etc.  The caller must
>> - * be prepared for that.  For example, the following situation is
>> - * possible.
>> + * percpu_ref_tryget_live() in get_swap_device() or after the
>> + * percpu_ref_put() in put_swap_device() if there isn't any other way
>> + * to prevent swapoff, such as page lock, page table lock, etc.  The
>> + * caller must be prepared for that.  For example, the following
>> + * situation is possible.
>>   *
>>   *   CPU1				CPU2
>>   *   do_swap_page()
>> @@ -1318,21 +1312,24 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>>  	si = swp_swap_info(entry);
>>  	if (!si)
>>  		goto bad_nofile;
>> -
>> -	rcu_read_lock();
>> -	if (data_race(!(si->flags & SWP_VALID)))
>> -		goto unlock_out;
>> +	if (!percpu_ref_tryget_live(&si->users))
>> +		goto out;
>> +	/*
>> +	 * Guarantee we will not reference uninitialized fields
>> +	 * of swap_info_struct.
>> +	 */
> 
> /*
>  * Guarantee the si->users are checked before accessing other fields of
>  * swap_info_struct.
> */
> 
>> +	smp_rmb();
> 
> Usually, smp_rmb() need to be paired with smp_wmb().  Some comments are
> needed for that.  Here smb_rmb() is paired with the spin_unlock() after
> setup_swap_info() in enable_swap_info().
> 
>>  	offset = swp_offset(entry);
>>  	if (offset >= si->max)
>> -		goto unlock_out;
>> +		goto put_out;
>>  
>>  	return si;
>>  bad_nofile:
>>  	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>>  out:
>>  	return NULL;
>> -unlock_out:
>> -	rcu_read_unlock();
>> +put_out:
>> +	percpu_ref_put(&si->users);
>>  	return NULL;
>>  }
>>  
>> @@ -2475,7 +2472,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
>>  
>>  static void _enable_swap_info(struct swap_info_struct *p)
>>  {
>> -	p->flags |= SWP_WRITEOK | SWP_VALID;
>> +	p->flags |= SWP_WRITEOK;
>>  	atomic_long_add(p->pages, &nr_swap_pages);
>>  	total_swap_pages += p->pages;
>>  
>> @@ -2507,7 +2504,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>  	spin_unlock(&swap_lock);
>>  	/*
>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>> -	 * between get/put_swap_device() if SWP_VALID bit is set
>> +	 * between get/put_swap_device().
>>  	 */
> 
> The comments need to be revised.  Something likes below?
> 
> /* Finished initialized swap device, now it's safe to reference it */
> 

All look good for me. Will do. Many thanks!

> Best Regards,
> Huang, Ying
> 
>>  	percpu_ref_resurrect(&p->users);
>>  	spin_lock(&swap_lock);
>> @@ -2625,12 +2622,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>  
>>  	reenable_swap_slots_cache_unlock();
>>  
>> -	spin_lock(&swap_lock);
>> -	spin_lock(&p->lock);
>> -	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>> -	spin_unlock(&p->lock);
>> -	spin_unlock(&swap_lock);
>> -
>>  	percpu_ref_kill(&p->users);
>>  	/*
>>  	 * We need synchronize_rcu() here to protect the accessing
> 
> .
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
  2021-04-19  6:49     ` Miaohe Lin
@ 2021-04-19  7:04         ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  7:04 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/19 10:15, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> When I was investigating the swap code, I found the below possible race
>>> window:
>>>
>>> CPU 1                                           CPU 2
>>> -----                                           -----
>>> shmem_swapin
>>>   swap_cluster_readahead
>>>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>                                                 swapoff
>>>                                                   si->flags &= ~SWP_VALID;
>>>                                                   ..
>>>                                                   synchronize_rcu();
>>>                                                   ..
>> 
>> You have removed these code in the previous patches of the series.  And
>> they are not relevant in this patch.
>
> Yes, I should change these. Thanks.
>
>> 
>>>                                                   si->swap_file = NULL;
>>>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>
>>> Close this race window by using get/put_swap_device() to guard against
>>> concurrent swapoff.
>>>
>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>> 
>> No.  This isn't the commit that introduces the race condition.  Please
>> recheck your git blame result.
>> 
>
> I think this is really hard to find exact commit. I used git blame and found
> this race should be existed when this is introduced. Any suggestion ?
> Thanks.

I think the commit that introduces the race condition is commit
8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
not")

Best Regards,
Huang, Ying

>> Best Regards,
>> Huang, Ying
>> 
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>>  mm/shmem.c | 6 ++++++
>>>  1 file changed, 6 insertions(+)
>>>
>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>> index 26c76b13ad23..936ba5595297 100644
>>> --- a/mm/shmem.c
>>> +++ b/mm/shmem.c
>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>  			struct shmem_inode_info *info, pgoff_t index)
>>>  {
>>> +	struct swap_info_struct *si;
>>>  	struct vm_area_struct pvma;
>>>  	struct page *page;
>>>  	struct vm_fault vmf = {
>>>  		.vma = &pvma,
>>>  	};
>>>  
>>> +	/* Prevent swapoff from happening to us. */
>>> +	si = get_swap_device(swap);
>>> +	if (unlikely(!si))
>>> +		return NULL;
>>>  	shmem_pseudo_vma_init(&pvma, info, index);
>>>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>>>  	shmem_pseudo_vma_destroy(&pvma);
>>> +	put_swap_device(si);
>>>  
>>>  	return page;
>>>  }
>> .
>> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
@ 2021-04-19  7:04         ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  7:04 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/19 10:15, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> When I was investigating the swap code, I found the below possible race
>>> window:
>>>
>>> CPU 1                                           CPU 2
>>> -----                                           -----
>>> shmem_swapin
>>>   swap_cluster_readahead
>>>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>                                                 swapoff
>>>                                                   si->flags &= ~SWP_VALID;
>>>                                                   ..
>>>                                                   synchronize_rcu();
>>>                                                   ..
>> 
>> You have removed these code in the previous patches of the series.  And
>> they are not relevant in this patch.
>
> Yes, I should change these. Thanks.
>
>> 
>>>                                                   si->swap_file = NULL;
>>>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>
>>> Close this race window by using get/put_swap_device() to guard against
>>> concurrent swapoff.
>>>
>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>> 
>> No.  This isn't the commit that introduces the race condition.  Please
>> recheck your git blame result.
>> 
>
> I think this is really hard to find exact commit. I used git blame and found
> this race should be existed when this is introduced. Any suggestion ?
> Thanks.

I think the commit that introduces the race condition is commit
8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
not")

Best Regards,
Huang, Ying

>> Best Regards,
>> Huang, Ying
>> 
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>>  mm/shmem.c | 6 ++++++
>>>  1 file changed, 6 insertions(+)
>>>
>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>> index 26c76b13ad23..936ba5595297 100644
>>> --- a/mm/shmem.c
>>> +++ b/mm/shmem.c
>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>  			struct shmem_inode_info *info, pgoff_t index)
>>>  {
>>> +	struct swap_info_struct *si;
>>>  	struct vm_area_struct pvma;
>>>  	struct page *page;
>>>  	struct vm_fault vmf = {
>>>  		.vma = &pvma,
>>>  	};
>>>  
>>> +	/* Prevent swapoff from happening to us. */
>>> +	si = get_swap_device(swap);
>>> +	if (unlikely(!si))
>>> +		return NULL;
>>>  	shmem_pseudo_vma_init(&pvma, info, index);
>>>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>>>  	shmem_pseudo_vma_destroy(&pvma);
>>> +	put_swap_device(si);
>>>  
>>>  	return page;
>>>  }
>> .
>> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-19  6:46     ` Miaohe Lin
@ 2021-04-19  7:09         ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  7:09 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/19 10:48, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>> patch adds the percpu_ref support for swap.
>>>
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>>  include/linux/swap.h |  3 +++
>>>  mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index 144727041e78..8be36eb58b7a 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>   * The in-memory structure used to track swap areas.
>>>   */
>>>  struct swap_info_struct {
>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> 
>> The comments aren't general enough.  We use this to check whether the
>> swap device has been fully initialized, etc. May be something as below?
>> 
>> /* indicate and keep swap device valid */
>
> Looks good.
>
>> 
>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>  	signed short	prio;		/* swap priority of this type */
>>>  	struct plist_node list;		/* entry in swap_active_head */
>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>  	struct file *swap_file;		/* seldom referenced */
>>>  	unsigned int old_block_size;	/* seldom referenced */
>>> +	bool ref_initialized;		/* seldom referenced */
>>> +	struct completion comp;		/* seldom referenced */
>>>  #ifdef CONFIG_FRONTSWAP
>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>> index 149e77454e3c..66515a3a2824 100644
>>> --- a/mm/swapfile.c
>>> +++ b/mm/swapfile.c
>>> @@ -39,6 +39,7 @@
>>>  #include <linux/export.h>
>>>  #include <linux/swap_slots.h>
>>>  #include <linux/sort.h>
>>> +#include <linux/completion.h>
>>>  
>>>  #include <asm/tlbflush.h>
>>>  #include <linux/swapops.h>
>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>>>  	spin_unlock(&si->lock);
>>>  }
>>>  
>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>> +{
>>> +	struct swap_info_struct *si;
>>> +
>>> +	si = container_of(ref, struct swap_info_struct, users);
>>> +	complete(&si->comp);
>>> +}
>>> +
>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>  {
>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>  	 */
>>> -	synchronize_rcu();
>> 
>> You cannot remove this without changing get/put_swap_device().  It's
>> better to squash at least PATCH 1-2.
>
> Will squash PATCH 1-2. Thanks.
>
>> 
>>> +	percpu_ref_resurrect(&p->users);
>>>  	spin_lock(&swap_lock);
>>>  	spin_lock(&p->lock);
>>>  	_enable_swap_info(p);
>>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>  	spin_unlock(&p->lock);
>>>  	spin_unlock(&swap_lock);
>>> +
>>> +	percpu_ref_kill(&p->users);
>>>  	/*
>>> -	 * wait for swap operations protected by get/put_swap_device()
>>> -	 * to complete
>>> +	 * We need synchronize_rcu() here to protect the accessing
>>> +	 * to the swap cache data structure.
>>>  	 */
>>>  	synchronize_rcu();
>>> +	/*
>>> +	 * Wait for swap operations protected by get/put_swap_device()
>>> +	 * to complete.
>>> +	 */
>> 
>> I think the comments (after some revision) can be moved before
>> percpu_ref_kill().  The synchronize_rcu() comments can be merged.
>> 
>
> Ok.
>
>>> +	wait_for_completion(&p->comp);
>>>  
>>>  	flush_work(&p->discard_work);
>>>  
>>> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>  {
>>>  	struct swap_info_struct *p;
>>> -	struct filename *name;
>>> +	struct filename *name = NULL;
>>>  	struct file *swap_file = NULL;
>>>  	struct address_space *mapping;
>>>  	int prio;
>>> @@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>  
>>>  	INIT_WORK(&p->discard_work, swap_discard_work);
>>>  
>>> +	if (!p->ref_initialized) {
>> 
>> I don't think it's necessary to add another flag p->ref_initialized.  We
>> can distinguish newly allocated and reused swap_info_struct in alloc_swap_info().
>> 
>
> If newly allocated swap_info_struct failed to init percpu_ref, it will be considered as
> a reused one in alloc_swap_info() _but_ the field users of swap_info_struct is actually
> uninitialized. Does this make sense for you?

We can call percpu_ref_init() just after kvzalloc() in alloc_swap_info().

Best Regards,
Huang, Ying

> Many Thanks for quick review.
>
>> Best Regards,
>> Huang, Ying
>> 
>>> +		error = percpu_ref_init(&p->users, swap_users_ref_free,
>>> +					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
>>> +		if (unlikely(error))
>>> +			goto bad_swap;
>>> +		init_completion(&p->comp);
>>> +		p->ref_initialized = true;
>>> +	}
>>> +
>>>  	name = getname(specialfile);
>>>  	if (IS_ERR(name)) {
>>>  		error = PTR_ERR(name);
>> .
>> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-19  7:09         ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  7:09 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/19 10:48, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>> patch adds the percpu_ref support for swap.
>>>
>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>> ---
>>>  include/linux/swap.h |  3 +++
>>>  mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index 144727041e78..8be36eb58b7a 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>   * The in-memory structure used to track swap areas.
>>>   */
>>>  struct swap_info_struct {
>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>> 
>> The comments aren't general enough.  We use this to check whether the
>> swap device has been fully initialized, etc. May be something as below?
>> 
>> /* indicate and keep swap device valid */
>
> Looks good.
>
>> 
>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>  	signed short	prio;		/* swap priority of this type */
>>>  	struct plist_node list;		/* entry in swap_active_head */
>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>  	struct file *swap_file;		/* seldom referenced */
>>>  	unsigned int old_block_size;	/* seldom referenced */
>>> +	bool ref_initialized;		/* seldom referenced */
>>> +	struct completion comp;		/* seldom referenced */
>>>  #ifdef CONFIG_FRONTSWAP
>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>> index 149e77454e3c..66515a3a2824 100644
>>> --- a/mm/swapfile.c
>>> +++ b/mm/swapfile.c
>>> @@ -39,6 +39,7 @@
>>>  #include <linux/export.h>
>>>  #include <linux/swap_slots.h>
>>>  #include <linux/sort.h>
>>> +#include <linux/completion.h>
>>>  
>>>  #include <asm/tlbflush.h>
>>>  #include <linux/swapops.h>
>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>>>  	spin_unlock(&si->lock);
>>>  }
>>>  
>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>> +{
>>> +	struct swap_info_struct *si;
>>> +
>>> +	si = container_of(ref, struct swap_info_struct, users);
>>> +	complete(&si->comp);
>>> +}
>>> +
>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>  {
>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>  	 */
>>> -	synchronize_rcu();
>> 
>> You cannot remove this without changing get/put_swap_device().  It's
>> better to squash at least PATCH 1-2.
>
> Will squash PATCH 1-2. Thanks.
>
>> 
>>> +	percpu_ref_resurrect(&p->users);
>>>  	spin_lock(&swap_lock);
>>>  	spin_lock(&p->lock);
>>>  	_enable_swap_info(p);
>>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>  	spin_unlock(&p->lock);
>>>  	spin_unlock(&swap_lock);
>>> +
>>> +	percpu_ref_kill(&p->users);
>>>  	/*
>>> -	 * wait for swap operations protected by get/put_swap_device()
>>> -	 * to complete
>>> +	 * We need synchronize_rcu() here to protect the accessing
>>> +	 * to the swap cache data structure.
>>>  	 */
>>>  	synchronize_rcu();
>>> +	/*
>>> +	 * Wait for swap operations protected by get/put_swap_device()
>>> +	 * to complete.
>>> +	 */
>> 
>> I think the comments (after some revision) can be moved before
>> percpu_ref_kill().  The synchronize_rcu() comments can be merged.
>> 
>
> Ok.
>
>>> +	wait_for_completion(&p->comp);
>>>  
>>>  	flush_work(&p->discard_work);
>>>  
>>> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>  {
>>>  	struct swap_info_struct *p;
>>> -	struct filename *name;
>>> +	struct filename *name = NULL;
>>>  	struct file *swap_file = NULL;
>>>  	struct address_space *mapping;
>>>  	int prio;
>>> @@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>  
>>>  	INIT_WORK(&p->discard_work, swap_discard_work);
>>>  
>>> +	if (!p->ref_initialized) {
>> 
>> I don't think it's necessary to add another flag p->ref_initialized.  We
>> can distinguish newly allocated and reused swap_info_struct in alloc_swap_info().
>> 
>
> If newly allocated swap_info_struct failed to init percpu_ref, it will be considered as
> a reused one in alloc_swap_info() _but_ the field users of swap_info_struct is actually
> uninitialized. Does this make sense for you?

We can call percpu_ref_init() just after kvzalloc() in alloc_swap_info().

Best Regards,
Huang, Ying

> Many Thanks for quick review.
>
>> Best Regards,
>> Huang, Ying
>> 
>>> +		error = percpu_ref_init(&p->users, swap_users_ref_free,
>>> +					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
>>> +		if (unlikely(error))
>>> +			goto bad_swap;
>>> +		init_completion(&p->comp);
>>> +		p->ref_initialized = true;
>>> +	}
>>> +
>>>  	name = getname(specialfile);
>>>  	if (IS_ERR(name)) {
>>>  		error = PTR_ERR(name);
>> .
>> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
  2021-04-19  7:04         ` Huang, Ying
  (?)
@ 2021-04-19  7:14         ` Miaohe Lin
  2021-04-19  7:41             ` Huang, Ying
  -1 siblings, 1 reply; 33+ messages in thread
From: Miaohe Lin @ 2021-04-19  7:14 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

On 2021/4/19 15:04, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2021/4/19 10:15, Huang, Ying wrote:
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> When I was investigating the swap code, I found the below possible race
>>>> window:
>>>>
>>>> CPU 1                                           CPU 2
>>>> -----                                           -----
>>>> shmem_swapin
>>>>   swap_cluster_readahead
>>>>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>>                                                 swapoff
>>>>                                                   si->flags &= ~SWP_VALID;
>>>>                                                   ..
>>>>                                                   synchronize_rcu();
>>>>                                                   ..
>>>
>>> You have removed these code in the previous patches of the series.  And
>>> they are not relevant in this patch.
>>
>> Yes, I should change these. Thanks.
>>
>>>
>>>>                                                   si->swap_file = NULL;
>>>>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>>
>>>> Close this race window by using get/put_swap_device() to guard against
>>>> concurrent swapoff.
>>>>
>>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>>
>>> No.  This isn't the commit that introduces the race condition.  Please
>>> recheck your git blame result.
>>>
>>
>> I think this is really hard to find exact commit. I used git blame and found
>> this race should be existed when this is introduced. Any suggestion ?
>> Thanks.
> 
> I think the commit that introduces the race condition is commit
> 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
> not")
> 

Thanks.
The commit log only describes one race condition. And for that one, this should be the correct
Fixes tag. But there are still many other race conditions inside swap_cluster_readahead,
such as swap_readpage() called from swap_cluster_readahead. This tag could not cover all
the race windows.

> Best Regards,
> Huang, Ying
> 
>>> Best Regards,
>>> Huang, Ying
>>>
>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>> ---
>>>>  mm/shmem.c | 6 ++++++
>>>>  1 file changed, 6 insertions(+)
>>>>
>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>> index 26c76b13ad23..936ba5595297 100644
>>>> --- a/mm/shmem.c
>>>> +++ b/mm/shmem.c
>>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>>  			struct shmem_inode_info *info, pgoff_t index)
>>>>  {
>>>> +	struct swap_info_struct *si;
>>>>  	struct vm_area_struct pvma;
>>>>  	struct page *page;
>>>>  	struct vm_fault vmf = {
>>>>  		.vma = &pvma,
>>>>  	};
>>>>  
>>>> +	/* Prevent swapoff from happening to us. */
>>>> +	si = get_swap_device(swap);
>>>> +	if (unlikely(!si))
>>>> +		return NULL;
>>>>  	shmem_pseudo_vma_init(&pvma, info, index);
>>>>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>>>>  	shmem_pseudo_vma_destroy(&pvma);
>>>> +	put_swap_device(si);
>>>>  
>>>>  	return page;
>>>>  }
>>> .
>>>
> .
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-19  7:09         ` Huang, Ying
  (?)
@ 2021-04-19  7:35         ` Miaohe Lin
  2021-04-19  7:52             ` Huang, Ying
  -1 siblings, 1 reply; 33+ messages in thread
From: Miaohe Lin @ 2021-04-19  7:35 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

On 2021/4/19 15:09, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2021/4/19 10:48, Huang, Ying wrote:
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>> patch adds the percpu_ref support for swap.
>>>>
>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>> ---
>>>>  include/linux/swap.h |  3 +++
>>>>  mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
>>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>> index 144727041e78..8be36eb58b7a 100644
>>>> --- a/include/linux/swap.h
>>>> +++ b/include/linux/swap.h
>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>   * The in-memory structure used to track swap areas.
>>>>   */
>>>>  struct swap_info_struct {
>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>
>>> The comments aren't general enough.  We use this to check whether the
>>> swap device has been fully initialized, etc. May be something as below?
>>>
>>> /* indicate and keep swap device valid */
>>
>> Looks good.
>>
>>>
>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>  	signed short	prio;		/* swap priority of this type */
>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>> +	bool ref_initialized;		/* seldom referenced */
>>>> +	struct completion comp;		/* seldom referenced */
>>>>  #ifdef CONFIG_FRONTSWAP
>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>> index 149e77454e3c..66515a3a2824 100644
>>>> --- a/mm/swapfile.c
>>>> +++ b/mm/swapfile.c
>>>> @@ -39,6 +39,7 @@
>>>>  #include <linux/export.h>
>>>>  #include <linux/swap_slots.h>
>>>>  #include <linux/sort.h>
>>>> +#include <linux/completion.h>
>>>>  
>>>>  #include <asm/tlbflush.h>
>>>>  #include <linux/swapops.h>
>>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>>>>  	spin_unlock(&si->lock);
>>>>  }
>>>>  
>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>> +{
>>>> +	struct swap_info_struct *si;
>>>> +
>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>> +	complete(&si->comp);
>>>> +}
>>>> +
>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>  {
>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>  	 */
>>>> -	synchronize_rcu();
>>>
>>> You cannot remove this without changing get/put_swap_device().  It's
>>> better to squash at least PATCH 1-2.
>>
>> Will squash PATCH 1-2. Thanks.
>>
>>>
>>>> +	percpu_ref_resurrect(&p->users);
>>>>  	spin_lock(&swap_lock);
>>>>  	spin_lock(&p->lock);
>>>>  	_enable_swap_info(p);
>>>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>>  	spin_unlock(&p->lock);
>>>>  	spin_unlock(&swap_lock);
>>>> +
>>>> +	percpu_ref_kill(&p->users);
>>>>  	/*
>>>> -	 * wait for swap operations protected by get/put_swap_device()
>>>> -	 * to complete
>>>> +	 * We need synchronize_rcu() here to protect the accessing
>>>> +	 * to the swap cache data structure.
>>>>  	 */
>>>>  	synchronize_rcu();
>>>> +	/*
>>>> +	 * Wait for swap operations protected by get/put_swap_device()
>>>> +	 * to complete.
>>>> +	 */
>>>
>>> I think the comments (after some revision) can be moved before
>>> percpu_ref_kill().  The synchronize_rcu() comments can be merged.
>>>
>>
>> Ok.
>>
>>>> +	wait_for_completion(&p->comp);
>>>>  
>>>>  	flush_work(&p->discard_work);
>>>>  
>>>> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>>>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>>  {
>>>>  	struct swap_info_struct *p;
>>>> -	struct filename *name;
>>>> +	struct filename *name = NULL;
>>>>  	struct file *swap_file = NULL;
>>>>  	struct address_space *mapping;
>>>>  	int prio;
>>>> @@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>>  
>>>>  	INIT_WORK(&p->discard_work, swap_discard_work);
>>>>  
>>>> +	if (!p->ref_initialized) {
>>>
>>> I don't think it's necessary to add another flag p->ref_initialized.  We
>>> can distinguish newly allocated and reused swap_info_struct in alloc_swap_info().
>>>
>>
>> If newly allocated swap_info_struct failed to init percpu_ref, it will be considered as
>> a reused one in alloc_swap_info() _but_ the field users of swap_info_struct is actually
>> uninitialized. Does this make sense for you?
> 
> We can call percpu_ref_init() just after kvzalloc() in alloc_swap_info().
> 

Yes, we can do it this way. But using ref_initialized might make the code more straightforward
and simpler?

> Best Regards,
> Huang, Ying
> 
>> Many Thanks for quick review.
>>
>>> Best Regards,
>>> Huang, Ying
>>>
>>>> +		error = percpu_ref_init(&p->users, swap_users_ref_free,
>>>> +					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
>>>> +		if (unlikely(error))
>>>> +			goto bad_swap;
>>>> +		init_completion(&p->comp);
>>>> +		p->ref_initialized = true;
>>>> +	}
>>>> +
>>>>  	name = getname(specialfile);
>>>>  	if (IS_ERR(name)) {
>>>>  		error = PTR_ERR(name);
>>> .
>>>
> .
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
  2021-04-19  7:14         ` Miaohe Lin
@ 2021-04-19  7:41             ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  7:41 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/19 15:04, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> On 2021/4/19 10:15, Huang, Ying wrote:
>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>
>>>>> When I was investigating the swap code, I found the below possible race
>>>>> window:
>>>>>
>>>>> CPU 1                                           CPU 2
>>>>> -----                                           -----
>>>>> shmem_swapin
>>>>>   swap_cluster_readahead
>>>>>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>>>                                                 swapoff
>>>>>                                                   si->flags &= ~SWP_VALID;
>>>>>                                                   ..
>>>>>                                                   synchronize_rcu();
>>>>>                                                   ..
>>>>
>>>> You have removed these code in the previous patches of the series.  And
>>>> they are not relevant in this patch.
>>>
>>> Yes, I should change these. Thanks.
>>>
>>>>
>>>>>                                                   si->swap_file = NULL;
>>>>>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>>>
>>>>> Close this race window by using get/put_swap_device() to guard against
>>>>> concurrent swapoff.
>>>>>
>>>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>>>
>>>> No.  This isn't the commit that introduces the race condition.  Please
>>>> recheck your git blame result.
>>>>
>>>
>>> I think this is really hard to find exact commit. I used git blame and found
>>> this race should be existed when this is introduced. Any suggestion ?
>>> Thanks.
>> 
>> I think the commit that introduces the race condition is commit
>> 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
>> not")
>> 
>
> Thanks.
> The commit log only describes one race condition. And for that one, this should be correct
> Fixes tag. But there are still many other race conditions inside swap_cluster_readahead,
> such as swap_readpage() called from swap_cluster_readahead. This tag could not cover the
> all race windows.

No. swap_readpage() in swap_cluster_readahead() is OK.  Because
__read_swap_cache_async() is called before that, the swap entry will
be marked with SWAP_HAS_CACHE, and the page will be locked.

Best Regards,
Huang, Ying

>> Best Regards,
>> Huang, Ying
>> 
>>>> Best Regards,
>>>> Huang, Ying
>>>>
>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>> ---
>>>>>  mm/shmem.c | 6 ++++++
>>>>>  1 file changed, 6 insertions(+)
>>>>>
>>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>>> index 26c76b13ad23..936ba5595297 100644
>>>>> --- a/mm/shmem.c
>>>>> +++ b/mm/shmem.c
>>>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>>>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>>>  			struct shmem_inode_info *info, pgoff_t index)
>>>>>  {
>>>>> +	struct swap_info_struct *si;
>>>>>  	struct vm_area_struct pvma;
>>>>>  	struct page *page;
>>>>>  	struct vm_fault vmf = {
>>>>>  		.vma = &pvma,
>>>>>  	};
>>>>>  
>>>>> +	/* Prevent swapoff from happening to us. */
>>>>> +	si = get_swap_device(swap);
>>>>> +	if (unlikely(!si))
>>>>> +		return NULL;
>>>>>  	shmem_pseudo_vma_init(&pvma, info, index);
>>>>>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>>>>>  	shmem_pseudo_vma_destroy(&pvma);
>>>>> +	put_swap_device(si);
>>>>>  
>>>>>  	return page;
>>>>>  }
>>>> .
>>>>
>> .
>> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
@ 2021-04-19  7:41             ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  7:41 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/19 15:04, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> On 2021/4/19 10:15, Huang, Ying wrote:
>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>
>>>>> When I was investigating the swap code, I found the below possible race
>>>>> window:
>>>>>
>>>>> CPU 1                                           CPU 2
>>>>> -----                                           -----
>>>>> shmem_swapin
>>>>>   swap_cluster_readahead
>>>>>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>>>                                                 swapoff
>>>>>                                                   si->flags &= ~SWP_VALID;
>>>>>                                                   ..
>>>>>                                                   synchronize_rcu();
>>>>>                                                   ..
>>>>
>>>> You have removed these code in the previous patches of the series.  And
>>>> they are not relevant in this patch.
>>>
>>> Yes, I should change these. Thanks.
>>>
>>>>
>>>>>                                                   si->swap_file = NULL;
>>>>>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>>>
>>>>> Close this race window by using get/put_swap_device() to guard against
>>>>> concurrent swapoff.
>>>>>
>>>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>>>
>>>> No.  This isn't the commit that introduces the race condition.  Please
>>>> recheck your git blame result.
>>>>
>>>
>>> I think this is really hard to find exact commit. I used git blame and found
>>> this race should be existed when this is introduced. Any suggestion ?
>>> Thanks.
>> 
>> I think the commit that introduces the race condition is commit
>> 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
>> not")
>> 
>
> Thanks.
> The commit log only describes one race condition. And for that one, this should be correct
> Fixes tag. But there are still many other race conditions inside swap_cluster_readahead,
> such as swap_readpage() called from swap_cluster_readahead. This tag could not cover the
> all race windows.

No. swap_readpage() in swap_cluster_readahead() is OK.  Because
__read_swap_cache_async() is called before that, the swap entry will
be marked with SWAP_HAS_CACHE, and the page will be locked.

Best Regards,
Huang, Ying

>> Best Regards,
>> Huang, Ying
>> 
>>>> Best Regards,
>>>> Huang, Ying
>>>>
>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>> ---
>>>>>  mm/shmem.c | 6 ++++++
>>>>>  1 file changed, 6 insertions(+)
>>>>>
>>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>>> index 26c76b13ad23..936ba5595297 100644
>>>>> --- a/mm/shmem.c
>>>>> +++ b/mm/shmem.c
>>>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>>>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>>>  			struct shmem_inode_info *info, pgoff_t index)
>>>>>  {
>>>>> +	struct swap_info_struct *si;
>>>>>  	struct vm_area_struct pvma;
>>>>>  	struct page *page;
>>>>>  	struct vm_fault vmf = {
>>>>>  		.vma = &pvma,
>>>>>  	};
>>>>>  
>>>>> +	/* Prevent swapoff from happening to us. */
>>>>> +	si = get_swap_device(swap);
>>>>> +	if (unlikely(!si))
>>>>> +		return NULL;
>>>>>  	shmem_pseudo_vma_init(&pvma, info, index);
>>>>>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>>>>>  	shmem_pseudo_vma_destroy(&pvma);
>>>>> +	put_swap_device(si);
>>>>>  
>>>>>  	return page;
>>>>>  }
>>>> .
>>>>
>> .
>> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-19  7:35         ` Miaohe Lin
@ 2021-04-19  7:52             ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  7:52 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/19 15:09, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> On 2021/4/19 10:48, Huang, Ying wrote:
>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>
>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>> patch adds the percpu_ref support for swap.
>>>>>
>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>> ---
>>>>>  include/linux/swap.h |  3 +++
>>>>>  mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
>>>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>> index 144727041e78..8be36eb58b7a 100644
>>>>> --- a/include/linux/swap.h
>>>>> +++ b/include/linux/swap.h
>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>   * The in-memory structure used to track swap areas.
>>>>>   */
>>>>>  struct swap_info_struct {
>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>
>>>> The comments aren't general enough.  We use this to check whether the
>>>> swap device has been fully initialized, etc. May be something as below?
>>>>
>>>> /* indicate and keep swap device valid */
>>>
>>> Looks good.
>>>
>>>>
>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>> +	bool ref_initialized;		/* seldom referenced */
>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>> index 149e77454e3c..66515a3a2824 100644
>>>>> --- a/mm/swapfile.c
>>>>> +++ b/mm/swapfile.c
>>>>> @@ -39,6 +39,7 @@
>>>>>  #include <linux/export.h>
>>>>>  #include <linux/swap_slots.h>
>>>>>  #include <linux/sort.h>
>>>>> +#include <linux/completion.h>
>>>>>  
>>>>>  #include <asm/tlbflush.h>
>>>>>  #include <linux/swapops.h>
>>>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>>>>>  	spin_unlock(&si->lock);
>>>>>  }
>>>>>  
>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>> +{
>>>>> +	struct swap_info_struct *si;
>>>>> +
>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>> +	complete(&si->comp);
>>>>> +}
>>>>> +
>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>  {
>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>  	 */
>>>>> -	synchronize_rcu();
>>>>
>>>> You cannot remove this without changing get/put_swap_device().  It's
>>>> better to squash at least PATCH 1-2.
>>>
>>> Will squash PATCH 1-2. Thanks.
>>>
>>>>
>>>>> +	percpu_ref_resurrect(&p->users);
>>>>>  	spin_lock(&swap_lock);
>>>>>  	spin_lock(&p->lock);
>>>>>  	_enable_swap_info(p);
>>>>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>>>  	spin_unlock(&p->lock);
>>>>>  	spin_unlock(&swap_lock);
>>>>> +
>>>>> +	percpu_ref_kill(&p->users);
>>>>>  	/*
>>>>> -	 * wait for swap operations protected by get/put_swap_device()
>>>>> -	 * to complete
>>>>> +	 * We need synchronize_rcu() here to protect the accessing
>>>>> +	 * to the swap cache data structure.
>>>>>  	 */
>>>>>  	synchronize_rcu();
>>>>> +	/*
>>>>> +	 * Wait for swap operations protected by get/put_swap_device()
>>>>> +	 * to complete.
>>>>> +	 */
>>>>
>>>> I think the comments (after some revision) can be moved before
>>>> percpu_ref_kill().  The synchronize_rcu() comments can be merged.
>>>>
>>>
>>> Ok.
>>>
>>>>> +	wait_for_completion(&p->comp);
>>>>>  
>>>>>  	flush_work(&p->discard_work);
>>>>>  
>>>>> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>>>>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>>>  {
>>>>>  	struct swap_info_struct *p;
>>>>> -	struct filename *name;
>>>>> +	struct filename *name = NULL;
>>>>>  	struct file *swap_file = NULL;
>>>>>  	struct address_space *mapping;
>>>>>  	int prio;
>>>>> @@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>>>  
>>>>>  	INIT_WORK(&p->discard_work, swap_discard_work);
>>>>>  
>>>>> +	if (!p->ref_initialized) {
>>>>
>>>> I don't think it's necessary to add another flag p->ref_initialized.  We
>>>> can distinguish newly allocated and reused swap_info_struct in alloc_swap_info().
>>>>
>>>
>>> If newly allocated swap_info_struct failed to init percpu_ref, it will be considered as
>>> a reused one in alloc_swap_info() _but_ the field users of swap_info_struct is actually
>>> uninitialized. Does this make sense for you?
>> 
>> We can call percpu_ref_init() just after kvzalloc() in alloc_swap_info().
>> 
>
> Yes, we can do it this way. But using ref_initialized might make the code more straightforward
> and simple?

I think that it's simpler to call percpu_ref_init() in
alloc_swap_info().  We can just call percpu_ref_init() unconditionally for a
newly allocated swap_info_struct, and call percpu_ref_exit() if we reuse one.

Best Regards,
Huang, Ying

>> Best Regards,
>> Huang, Ying
>> 
>>> Many Thanks for quick review.
>>>
>>>> Best Regards,
>>>> Huang, Ying
>>>>
>>>>> +		error = percpu_ref_init(&p->users, swap_users_ref_free,
>>>>> +					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
>>>>> +		if (unlikely(error))
>>>>> +			goto bad_swap;
>>>>> +		init_completion(&p->comp);
>>>>> +		p->ref_initialized = true;
>>>>> +	}
>>>>> +
>>>>>  	name = getname(specialfile);
>>>>>  	if (IS_ERR(name)) {
>>>>>  		error = PTR_ERR(name);
>>>> .
>>>>
>> .
>> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
@ 2021-04-19  7:52             ` Huang, Ying
  0 siblings, 0 replies; 33+ messages in thread
From: Huang, Ying @ 2021-04-19  7:52 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

Miaohe Lin <linmiaohe@huawei.com> writes:

> On 2021/4/19 15:09, Huang, Ying wrote:
>> Miaohe Lin <linmiaohe@huawei.com> writes:
>> 
>>> On 2021/4/19 10:48, Huang, Ying wrote:
>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>
>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>> patch adds the percpu_ref support for swap.
>>>>>
>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>> ---
>>>>>  include/linux/swap.h |  3 +++
>>>>>  mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
>>>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>> index 144727041e78..8be36eb58b7a 100644
>>>>> --- a/include/linux/swap.h
>>>>> +++ b/include/linux/swap.h
>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>   * The in-memory structure used to track swap areas.
>>>>>   */
>>>>>  struct swap_info_struct {
>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>
>>>> The comments aren't general enough.  We use this to check whether the
>>>> swap device has been fully initialized, etc. May be something as below?
>>>>
>>>> /* indicate and keep swap device valid */
>>>
>>> Looks good.
>>>
>>>>
>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>> +	bool ref_initialized;		/* seldom referenced */
>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>> index 149e77454e3c..66515a3a2824 100644
>>>>> --- a/mm/swapfile.c
>>>>> +++ b/mm/swapfile.c
>>>>> @@ -39,6 +39,7 @@
>>>>>  #include <linux/export.h>
>>>>>  #include <linux/swap_slots.h>
>>>>>  #include <linux/sort.h>
>>>>> +#include <linux/completion.h>
>>>>>  
>>>>>  #include <asm/tlbflush.h>
>>>>>  #include <linux/swapops.h>
>>>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>>>>>  	spin_unlock(&si->lock);
>>>>>  }
>>>>>  
>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>> +{
>>>>> +	struct swap_info_struct *si;
>>>>> +
>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>> +	complete(&si->comp);
>>>>> +}
>>>>> +
>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>  {
>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>  	 */
>>>>> -	synchronize_rcu();
>>>>
>>>> You cannot remove this without changing get/put_swap_device().  It's
>>>> better to squash at least PATCH 1-2.
>>>
>>> Will squash PATCH 1-2. Thanks.
>>>
>>>>
>>>>> +	percpu_ref_resurrect(&p->users);
>>>>>  	spin_lock(&swap_lock);
>>>>>  	spin_lock(&p->lock);
>>>>>  	_enable_swap_info(p);
>>>>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>>>  	spin_unlock(&p->lock);
>>>>>  	spin_unlock(&swap_lock);
>>>>> +
>>>>> +	percpu_ref_kill(&p->users);
>>>>>  	/*
>>>>> -	 * wait for swap operations protected by get/put_swap_device()
>>>>> -	 * to complete
>>>>> +	 * We need synchronize_rcu() here to protect the accessing
>>>>> +	 * to the swap cache data structure.
>>>>>  	 */
>>>>>  	synchronize_rcu();
>>>>> +	/*
>>>>> +	 * Wait for swap operations protected by get/put_swap_device()
>>>>> +	 * to complete.
>>>>> +	 */
>>>>
>>>> I think the comments (after some revision) can be moved before
>>>> percpu_ref_kill().  The synchronize_rcu() comments can be merged.
>>>>
>>>
>>> Ok.
>>>
>>>>> +	wait_for_completion(&p->comp);
>>>>>  
>>>>>  	flush_work(&p->discard_work);
>>>>>  
>>>>> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>>>>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>>>  {
>>>>>  	struct swap_info_struct *p;
>>>>> -	struct filename *name;
>>>>> +	struct filename *name = NULL;
>>>>>  	struct file *swap_file = NULL;
>>>>>  	struct address_space *mapping;
>>>>>  	int prio;
>>>>> @@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>>>  
>>>>>  	INIT_WORK(&p->discard_work, swap_discard_work);
>>>>>  
>>>>> +	if (!p->ref_initialized) {
>>>>
>>>> I don't think it's necessary to add another flag p->ref_initialized.  We
>>>> can distinguish newly allocated and reused swap_info_struct in alloc_swap_info().
>>>>
>>>
>>> If newly allocated swap_info_struct failed to init percpu_ref, it will be considered as
>>> a reused one in alloc_swap_info() _but_ the field users of swap_info_struct is actually
>>> uninitialized. Does this make sense for you?
>> 
>> We can call percpu_ref_init() just after kvzalloc() in alloc_swap_info().
>> 
>
> Yes, we can do it this way. But using ref_initialized might make the code more straightforward
> and simple?

I think that it's simpler to call percpu_ref_init() in
alloc_swap_info().  We can just call percpu_ref_init() unconditionally for a
newly allocated swap_info_struct, and call percpu_ref_exit() if we reuse one.

Best Regards,
Huang, Ying

>> Best Regards,
>> Huang, Ying
>> 
>>> Many Thanks for quick review.
>>>
>>>> Best Regards,
>>>> Huang, Ying
>>>>
>>>>> +		error = percpu_ref_init(&p->users, swap_users_ref_free,
>>>>> +					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
>>>>> +		if (unlikely(error))
>>>>> +			goto bad_swap;
>>>>> +		init_completion(&p->comp);
>>>>> +		p->ref_initialized = true;
>>>>> +	}
>>>>> +
>>>>>  	name = getname(specialfile);
>>>>>  	if (IS_ERR(name)) {
>>>>>  		error = PTR_ERR(name);
>>>> .
>>>>
>> .
>> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff
  2021-04-19  7:41             ` Huang, Ying
  (?)
@ 2021-04-19  8:18             ` Miaohe Lin
  -1 siblings, 0 replies; 33+ messages in thread
From: Miaohe Lin @ 2021-04-19  8:18 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

On 2021/4/19 15:41, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2021/4/19 15:04, Huang, Ying wrote:
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> On 2021/4/19 10:15, Huang, Ying wrote:
>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>
>>>>>> When I was investigating the swap code, I found the below possible race
>>>>>> window:
>>>>>>
>>>>>> CPU 1                                           CPU 2
>>>>>> -----                                           -----
>>>>>> shmem_swapin
>>>>>>   swap_cluster_readahead
>>>>>>     if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>>>>                                                 swapoff
>>>>>>                                                   si->flags &= ~SWP_VALID;
>>>>>>                                                   ..
>>>>>>                                                   synchronize_rcu();
>>>>>>                                                   ..
>>>>>
>>>>> You have removed these code in the previous patches of the series.  And
>>>>> they are not relevant in this patch.
>>>>
>>>> Yes, I should change these. Thanks.
>>>>
>>>>>
>>>>>>                                                   si->swap_file = NULL;
>>>>>>     struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>>>>
>>>>>> Close this race window by using get/put_swap_device() to guard against
>>>>>> concurrent swapoff.
>>>>>>
>>>>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>>>>
>>>>> No.  This isn't the commit that introduces the race condition.  Please
>>>>> recheck your git blame result.
>>>>>
>>>>
>>>> I think it is really hard to find the exact commit. I used git blame and found
>>>> that this race has existed since this code was introduced. Any suggestion?
>>>> Thanks.
>>>
>>> I think the commit that introduces the race condition is commit
>>> 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
>>> not")
>>>
>>
>> Thanks.
>> The commit log only describes one race condition. And for that one, this should be the
>> correct Fixes tag. But there are still many other race conditions inside swap_cluster_readahead,
>> such as swap_readpage() called from swap_cluster_readahead. This tag could not cover
>> all the race windows.
> 
> No. swap_readpage() in swap_cluster_readahead() is OK.  Because
> __read_swap_cache_async() is called before that, so the swap entry will
> be marked with SWAP_HAS_CACHE, and page will be locked.
> 

Oh... I missed this. Many thanks for your reminder.

> Best Regards,
> Huang, Ying
> 
>>> Best Regards,
>>> Huang, Ying
>>>
>>>>> Best Regards,
>>>>> Huang, Ying
>>>>>
>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>> ---
>>>>>>  mm/shmem.c | 6 ++++++
>>>>>>  1 file changed, 6 insertions(+)
>>>>>>
>>>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>>>> index 26c76b13ad23..936ba5595297 100644
>>>>>> --- a/mm/shmem.c
>>>>>> +++ b/mm/shmem.c
>>>>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>>>>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>>>>  			struct shmem_inode_info *info, pgoff_t index)
>>>>>>  {
>>>>>> +	struct swap_info_struct *si;
>>>>>>  	struct vm_area_struct pvma;
>>>>>>  	struct page *page;
>>>>>>  	struct vm_fault vmf = {
>>>>>>  		.vma = &pvma,
>>>>>>  	};
>>>>>>  
>>>>>> +	/* Prevent swapoff from happening to us. */
>>>>>> +	si = get_swap_device(swap);
>>>>>> +	if (unlikely(!si))
>>>>>> +		return NULL;
>>>>>>  	shmem_pseudo_vma_init(&pvma, info, index);
>>>>>>  	page = swap_cluster_readahead(swap, gfp, &vmf);
>>>>>>  	shmem_pseudo_vma_destroy(&pvma);
>>>>>> +	put_swap_device(si);
>>>>>>  
>>>>>>  	return page;
>>>>>>  }
>>>>> .
>>>>>
>>> .
>>>
> .
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap
  2021-04-19  7:52             ` Huang, Ying
  (?)
@ 2021-04-19  8:20             ` Miaohe Lin
  -1 siblings, 0 replies; 33+ messages in thread
From: Miaohe Lin @ 2021-04-19  8:20 UTC (permalink / raw)
  To: Huang, Ying
  Cc: akpm, dennis, tim.c.chen, hughd, hannes, mhocko, iamjoonsoo.kim,
	alexs, david, minchan, richard.weiyang, linux-kernel, linux-mm

On 2021/4/19 15:52, Huang, Ying wrote:
> Miaohe Lin <linmiaohe@huawei.com> writes:
> 
>> On 2021/4/19 15:09, Huang, Ying wrote:
>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>
>>>> On 2021/4/19 10:48, Huang, Ying wrote:
>>>>> Miaohe Lin <linmiaohe@huawei.com> writes:
>>>>>
>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>> patch adds the percpu_ref support for swap.
>>>>>>
>>>>>> Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
>>>>>> ---
>>>>>>  include/linux/swap.h |  3 +++
>>>>>>  mm/swapfile.c        | 33 +++++++++++++++++++++++++++++----
>>>>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>> index 144727041e78..8be36eb58b7a 100644
>>>>>> --- a/include/linux/swap.h
>>>>>> +++ b/include/linux/swap.h
>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>   */
>>>>>>  struct swap_info_struct {
>>>>>> +	struct percpu_ref users;	/* serialization against concurrent swapoff */
>>>>>
>>>>> The comments aren't general enough.  We use this to check whether the
>>>>> swap device has been fully initialized, etc. May be something as below?
>>>>>
>>>>> /* indicate and keep swap device valid */
>>>>
>>>> Looks good.
>>>>
>>>>>
>>>>>>  	unsigned long	flags;		/* SWP_USED etc: see above */
>>>>>>  	signed short	prio;		/* swap priority of this type */
>>>>>>  	struct plist_node list;		/* entry in swap_active_head */
>>>>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>>>>  	struct block_device *bdev;	/* swap device or bdev of swap file */
>>>>>>  	struct file *swap_file;		/* seldom referenced */
>>>>>>  	unsigned int old_block_size;	/* seldom referenced */
>>>>>> +	bool ref_initialized;		/* seldom referenced */
>>>>>> +	struct completion comp;		/* seldom referenced */
>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>  	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
>>>>>>  	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>> index 149e77454e3c..66515a3a2824 100644
>>>>>> --- a/mm/swapfile.c
>>>>>> +++ b/mm/swapfile.c
>>>>>> @@ -39,6 +39,7 @@
>>>>>>  #include <linux/export.h>
>>>>>>  #include <linux/swap_slots.h>
>>>>>>  #include <linux/sort.h>
>>>>>> +#include <linux/completion.h>
>>>>>>  
>>>>>>  #include <asm/tlbflush.h>
>>>>>>  #include <linux/swapops.h>
>>>>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>>>>>>  	spin_unlock(&si->lock);
>>>>>>  }
>>>>>>  
>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>> +{
>>>>>> +	struct swap_info_struct *si;
>>>>>> +
>>>>>> +	si = container_of(ref, struct swap_info_struct, users);
>>>>>> +	complete(&si->comp);
>>>>>> +}
>>>>>> +
>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>>>  {
>>>>>>  	struct swap_cluster_info *ci = si->cluster_info;
>>>>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
>>>>>>  	 * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>  	 * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>  	 */
>>>>>> -	synchronize_rcu();
>>>>>
>>>>> You cannot remove this without changing get/put_swap_device().  It's
>>>>> better to squash at least PATCH 1-2.
>>>>
>>>> Will squash PATCH 1-2. Thanks.
>>>>
>>>>>
>>>>>> +	percpu_ref_resurrect(&p->users);
>>>>>>  	spin_lock(&swap_lock);
>>>>>>  	spin_lock(&p->lock);
>>>>>>  	_enable_swap_info(p);
>>>>>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>>>>>>  	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
>>>>>>  	spin_unlock(&p->lock);
>>>>>>  	spin_unlock(&swap_lock);
>>>>>> +
>>>>>> +	percpu_ref_kill(&p->users);
>>>>>>  	/*
>>>>>> -	 * wait for swap operations protected by get/put_swap_device()
>>>>>> -	 * to complete
>>>>>> +	 * We need synchronize_rcu() here to protect the accessing
>>>>>> +	 * to the swap cache data structure.
>>>>>>  	 */
>>>>>>  	synchronize_rcu();
>>>>>> +	/*
>>>>>> +	 * Wait for swap operations protected by get/put_swap_device()
>>>>>> +	 * to complete.
>>>>>> +	 */
>>>>>
>>>>> I think the comments (after some revision) can be moved before
>>>>> percpu_ref_kill().  The synchronize_rcu() comments can be merged.
>>>>>
>>>>
>>>> Ok.
>>>>
>>>>>> +	wait_for_completion(&p->comp);
>>>>>>  
>>>>>>  	flush_work(&p->discard_work);
>>>>>>  
>>>>>> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
>>>>>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>>>>  {
>>>>>>  	struct swap_info_struct *p;
>>>>>> -	struct filename *name;
>>>>>> +	struct filename *name = NULL;
>>>>>>  	struct file *swap_file = NULL;
>>>>>>  	struct address_space *mapping;
>>>>>>  	int prio;
>>>>>> @@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>>>>>  
>>>>>>  	INIT_WORK(&p->discard_work, swap_discard_work);
>>>>>>  
>>>>>> +	if (!p->ref_initialized) {
>>>>>
>>>>> I don't think it's necessary to add another flag p->ref_initialized.  We
>>>>> can distinguish newly allocated and reused swap_info_struct in alloc_swap_info().
>>>>>
>>>>
>>>> If newly allocated swap_info_struct failed to init percpu_ref, it will be considered as
>>>> a reused one in alloc_swap_info() _but_ the field users of swap_info_struct is actually
>>>> uninitialized. Does this make sense to you?
>>>
>>> We can call percpu_ref_init() just after kvzalloc() in alloc_swap_info().
>>>
>>
>> Yes, we can do it this way. But using ref_initialized might make the code more straightforward
>> and simpler?
> 
> I think that it's simpler to call percpu_ref_init() in
> alloc_swap_info().  We can just call percpu_ref_init() for allocated
> swap_info_struct blindly, and call percpu_ref_exit() if we reuse.
> 

Looks good. Will do. Many thanks again.

> Best Regards,
> Huang, Ying
> 
>>> Best Regards,
>>> Huang, Ying
>>>
>>>> Many Thanks for quick review.
>>>>
>>>>> Best Regards,
>>>>> Huang, Ying
>>>>>
>>>>>> +		error = percpu_ref_init(&p->users, swap_users_ref_free,
>>>>>> +					PERCPU_REF_INIT_DEAD, GFP_KERNEL);
>>>>>> +		if (unlikely(error))
>>>>>> +			goto bad_swap;
>>>>>> +		init_completion(&p->comp);
>>>>>> +		p->ref_initialized = true;
>>>>>> +	}
>>>>>> +
>>>>>>  	name = getname(specialfile);
>>>>>>  	if (IS_ERR(name)) {
>>>>>>  		error = PTR_ERR(name);
>>>>> .
>>>>>
>>> .
>>>
> .
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2021-04-19  8:20 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-17  9:40 [PATCH v2 0/5] close various race windows for swap Miaohe Lin
2021-04-17  9:40 ` [PATCH v2 1/5] mm/swapfile: add percpu_ref support " Miaohe Lin
2021-04-19  2:48   ` Huang, Ying
2021-04-19  2:48     ` Huang, Ying
2021-04-19  6:46     ` Miaohe Lin
2021-04-19  7:09       ` Huang, Ying
2021-04-19  7:09         ` Huang, Ying
2021-04-19  7:35         ` Miaohe Lin
2021-04-19  7:52           ` Huang, Ying
2021-04-19  7:52             ` Huang, Ying
2021-04-19  8:20             ` Miaohe Lin
2021-04-17  9:40 ` [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff Miaohe Lin
2021-04-19  2:54   ` Huang, Ying
2021-04-19  2:54     ` Huang, Ying
2021-04-19  6:57     ` Miaohe Lin
2021-04-17  9:40 ` [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff Miaohe Lin
2021-04-19  2:23   ` Huang, Ying
2021-04-19  2:23     ` Huang, Ying
2021-04-19  6:54     ` Miaohe Lin
2021-04-17  9:40 ` [PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() Miaohe Lin
2021-04-19  1:53   ` Huang, Ying
2021-04-19  1:53     ` Huang, Ying
2021-04-19  6:46     ` Miaohe Lin
2021-04-17  9:40 ` [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff Miaohe Lin
2021-04-19  2:15   ` Huang, Ying
2021-04-19  2:15     ` Huang, Ying
2021-04-19  6:49     ` Miaohe Lin
2021-04-19  7:04       ` Huang, Ying
2021-04-19  7:04         ` Huang, Ying
2021-04-19  7:14         ` Miaohe Lin
2021-04-19  7:41           ` Huang, Ying
2021-04-19  7:41             ` Huang, Ying
2021-04-19  8:18             ` Miaohe Lin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.