Linux-mm Archive on lore.kernel.org
 help / color / Atom feed
From: Hugh Dickins <hughd@google.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>,
	 "Alex Xu (Hello71)" <alex_y_xu@yahoo.ca>,
	 Vineeth Pillai <vpillai@digitalocean.com>,
	 Kelley Nielsen <kelleynnn@gmail.com>,
	Rik van Riel <riel@surriel.com>,
	 Huang Ying <ying.huang@intel.com>,
	Hugh Dickins <hughd@google.com>,
	 linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: [PATCH 4/4] mm: swapoff: shmem_unuse() stop eviction without igrab()
Date: Mon, 8 Apr 2019 13:01:00 -0700 (PDT)
Message-ID: <alpine.LSU.2.11.1904081259400.1523@eggly.anvils> (raw)
In-Reply-To: <alpine.LSU.2.11.1904081249370.1523@eggly.anvils>

The igrab() in shmem_unuse() looks good, but we forgot that it gives no
protection against concurrent unmounting: a point made by Konstantin
Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893ae78
("tmpfs: fix race between umount and swapoff"). The current 5.1-rc
swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs.
Self-destruct in 5 seconds.  Have a nice day..." followed by GPF.

Once again, give up on using igrab(); but don't go back to making such
heavy-handed use of shmem_swaplist_mutex as last time: that would spoil
the new design, and I expect could deadlock inside shmem_swapin_page().

Instead, shmem_unuse() just raise a "stop_eviction" count in the shmem-
specific inode, and shmem_evict_inode() wait for that to go down to 0.
Call it "stop_eviction" rather than "swapoff_busy" because it can be
put to use for others later (huge tmpfs patches expect to use it).

That simplifies shmem_unuse(), protecting it from both unlink and unmount;
and in practice lets it locate all the swap in its first try.  But do not
rely on that: there's still a theoretical case, when shmem_writepage()
might have been preempted after its get_swap_page(), before making the
swap entry visible to swapoff.

Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity")
Signed-off-by: Hugh Dickins <hughd@google.com>
---

 include/linux/shmem_fs.h |    1 +
 mm/shmem.c               |   39 ++++++++++++++++++---------------------
 mm/swapfile.c            |   11 +++++------
 3 files changed, 24 insertions(+), 27 deletions(-)

--- 5.1-rc4/include/linux/shmem_fs.h	2019-03-17 16:18:15.181820820 -0700
+++ linux/include/linux/shmem_fs.h	2019-04-07 19:18:43.248639711 -0700
@@ -21,6 +21,7 @@ struct shmem_inode_info {
 	struct list_head	swaplist;	/* chain of maybes on swap */
 	struct shared_policy	policy;		/* NUMA memory alloc policy */
 	struct simple_xattrs	xattrs;		/* list of xattrs */
+	atomic_t		stop_eviction;	/* hold when working on inode */
 	struct inode		vfs_inode;
 };
 
--- 5.1-rc4/mm/shmem.c	2019-04-07 19:12:23.603858531 -0700
+++ linux/mm/shmem.c	2019-04-07 19:18:43.248639711 -0700
@@ -1081,9 +1081,15 @@ static void shmem_evict_inode(struct ino
 			}
 			spin_unlock(&sbinfo->shrinklist_lock);
 		}
-		if (!list_empty(&info->swaplist)) {
+		while (!list_empty(&info->swaplist)) {
+			/* Wait while shmem_unuse() is scanning this inode... */
+			wait_var_event(&info->stop_eviction,
+				       !atomic_read(&info->stop_eviction));
 			mutex_lock(&shmem_swaplist_mutex);
 			list_del_init(&info->swaplist);
+			/* ...but beware of the race if we peeked too early */
+			if (!atomic_read(&info->stop_eviction))
+				list_del_init(&info->swaplist);
 			mutex_unlock(&shmem_swaplist_mutex);
 		}
 	}
@@ -1227,36 +1233,27 @@ int shmem_unuse(unsigned int type, bool
 		unsigned long *fs_pages_to_unuse)
 {
 	struct shmem_inode_info *info, *next;
-	struct inode *inode;
-	struct inode *prev_inode = NULL;
 	int error = 0;
 
 	if (list_empty(&shmem_swaplist))
 		return 0;
 
 	mutex_lock(&shmem_swaplist_mutex);
-
-	/*
-	 * The extra refcount on the inode is necessary to safely dereference
-	 * p->next after re-acquiring the lock. New shmem inodes with swap
-	 * get added to the end of the list and we will scan them all.
-	 */
 	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
 		if (!info->swapped) {
 			list_del_init(&info->swaplist);
 			continue;
 		}
-
-		inode = igrab(&info->vfs_inode);
-		if (!inode)
-			continue;
-
+		/*
+		 * Drop the swaplist mutex while searching the inode for swap;
+		 * but before doing so, make sure shmem_evict_inode() will not
+		 * remove placeholder inode from swaplist, nor let it be freed
+		 * (igrab() would protect from unlink, but not from unmount).
+		 */
+		atomic_inc(&info->stop_eviction);
 		mutex_unlock(&shmem_swaplist_mutex);
-		if (prev_inode)
-			iput(prev_inode);
-		prev_inode = inode;
 
-		error = shmem_unuse_inode(inode, type, frontswap,
+		error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
 					  fs_pages_to_unuse);
 		cond_resched();
 
@@ -1264,14 +1261,13 @@ int shmem_unuse(unsigned int type, bool
 		next = list_next_entry(info, swaplist);
 		if (!info->swapped)
 			list_del_init(&info->swaplist);
+		if (atomic_dec_and_test(&info->stop_eviction))
+			wake_up_var(&info->stop_eviction);
 		if (error)
 			break;
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (prev_inode)
-		iput(prev_inode);
-
 	return error;
 }
 
@@ -2238,6 +2234,7 @@ static struct inode *shmem_get_inode(str
 		info = SHMEM_I(inode);
 		memset(info, 0, (char *)inode - (char *)info);
 		spin_lock_init(&info->lock);
+		atomic_set(&info->stop_eviction, 0);
 		info->seals = F_SEAL_SEAL;
 		info->flags = flags & VM_NORESERVE;
 		INIT_LIST_HEAD(&info->shrinklist);
--- 5.1-rc4/mm/swapfile.c	2019-04-07 19:17:13.291957539 -0700
+++ linux/mm/swapfile.c	2019-04-07 19:18:43.248639711 -0700
@@ -2116,12 +2116,11 @@ retry:
 	 * Under global memory pressure, swap entries can be reinserted back
 	 * into process space after the mmlist loop above passes over them.
 	 *
-	 * Limit the number of retries? No: when shmem_unuse()'s igrab() fails,
-	 * a shmem inode using swap is being evicted; and when mmget_not_zero()
-	 * above fails, that mm is likely to be freeing swap from exit_mmap().
-	 * Both proceed at their own independent pace: we could move them to
-	 * separate lists, and wait for those lists to be emptied; but it's
-	 * easier and more robust (though cpu-intensive) just to keep retrying.
+	 * Limit the number of retries? No: when mmget_not_zero() above fails,
+	 * that mm is likely to be freeing swap from exit_mmap(), which proceeds
+	 * at its own independent pace; and even shmem_writepage() could have
+	 * been preempted after get_swap_page(), temporarily hiding that swap.
+	 * It's easy and robust (though cpu-intensive) just to keep retrying.
 	 */
 	if (si->inuse_pages) {
 		if (!signal_pending(current))


  parent reply index

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-08 19:53 [PATCH 0/4] mm: swapoff: fixes for 5.1-rc Hugh Dickins
2019-04-08 19:56 ` [PATCH 1/4] mm: swapoff: shmem_find_swap_entries() filter out other types Hugh Dickins
2019-04-08 19:58 ` [PATCH 2/4] mm: swapoff: remove too limiting SWAP_UNUSE_MAX_TRIES Hugh Dickins
2019-04-08 19:59 ` [PATCH 3/4] mm: swapoff: take notice of completion sooner Hugh Dickins
2019-04-08 20:01 ` Hugh Dickins [this message]
2019-04-09  7:50   ` [PATCH 4/4] mm: swapoff: shmem_unuse() stop eviction without igrab() Konstantin Khlebnikov
2019-04-09 18:43     ` Hugh Dickins

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=alpine.LSU.2.11.1904081259400.1523@eggly.anvils \
    --to=hughd@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=alex_y_xu@yahoo.ca \
    --cc=kelleynnn@gmail.com \
    --cc=khlebnikov@yandex-team.ru \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=riel@surriel.com \
    --cc=vpillai@digitalocean.com \
    --cc=ying.huang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-mm Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-mm/0 linux-mm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-mm linux-mm/ https://lore.kernel.org/linux-mm \
		linux-mm@kvack.org linux-mm@archiver.kernel.org
	public-inbox-index linux-mm


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kvack.linux-mm


AGPL code for this site: git clone https://public-inbox.org/ public-inbox