lustre-devel-lustre.org archive mirror
 help / color / mirror / Atom feed
From: James Simmons <jsimmons@infradead.org>
To: Andreas Dilger <adilger@whamcloud.com>,
	Oleg Drokin <green@whamcloud.com>, NeilBrown <neilb@suse.de>
Cc: Lustre Development List <lustre-devel@lists.lustre.org>
Subject: [lustre-devel] [PATCH 01/22] lustre: llite: clear stale page's uptodate bit
Date: Sun, 20 Nov 2022 09:16:47 -0500	[thread overview]
Message-ID: <1668953828-10909-2-git-send-email-jsimmons@infradead.org> (raw)
In-Reply-To: <1668953828-10909-1-git-send-email-jsimmons@infradead.org>

From: Bobi Jam <bobijam@whamcloud.com>

In the truncate_inode_page()->do_invalidatepage()->ll_invalidatepage()
call path, before the vmpage is deleted from the page cache, the page
could be picked up by ll_read_ahead_page()->grab_cache_page_nowait().

If ll_invalidatepage()->cl_page_delete() does not clear the vmpage's
uptodate bit, readahead could pick the page up and wrongly treat it as
already uptodate.

In ll_fault()->vvp_io_fault_start()->vvp_io_kernel_fault(),
filemap_fault() calls ll_readpage() to read the vmpage and waits for
the vmpage to be unlocked. Once ll_readpage() has successfully read
and unlocked the vmpage, memory pressure or truncate can get in and
delete the cl_page; filemap_fault() then finds that the vmpage is not
uptodate and returns VM_FAULT_SIGBUS. To fix this situation, this
patch makes vvp_io_kernel_fault() restart filemap_fault() to get an
uptodate vmpage again.

WC-bug-id: https://jira.whamcloud.com/browse/LU-16160
Lustre-commit: 5b911e03261c3de6b ("LU-16160 llite: clear stale page's uptodate bit")
Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48607
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 fs/lustre/include/cl_object.h |  15 ++++-
 fs/lustre/llite/rw.c          |  10 +++-
 fs/lustre/llite/vvp_io.c      | 124 +++++++++++++++++++++++++++++++++++++++---
 fs/lustre/llite/vvp_page.c    |   5 ++
 fs/lustre/obdclass/cl_page.c  |  37 ++++++++++---
 5 files changed, 172 insertions(+), 19 deletions(-)

diff --git a/fs/lustre/include/cl_object.h b/fs/lustre/include/cl_object.h
index 41ce0b0..8be58ff 100644
--- a/fs/lustre/include/cl_object.h
+++ b/fs/lustre/include/cl_object.h
@@ -768,7 +768,15 @@ struct cl_page {
 	enum cl_page_type		 cp_type:CP_TYPE_BITS;
 	unsigned int			 cp_defer_uptodate:1,
 					 cp_ra_updated:1,
-					 cp_ra_used:1;
+					 cp_ra_used:1,
+					 /* fault page read grabs an extra reference */
+					 cp_fault_ref:1,
+					 /**
+					  * if fault page got delete before returned to
+					  * filemap_fault(), defer the vmpage detach/put
+					  * until filemap_fault() has been handled.
+					  */
+					 cp_defer_detach:1;
 
 	/* which slab kmem index this memory allocated from */
 	short int			 cp_kmem_index;
@@ -2393,6 +2401,11 @@ int cl_io_lru_reserve(const struct lu_env *env, struct cl_io *io,
 int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io,
 		     pgoff_t start, struct cl_read_ahead *ra);
 
+static inline int cl_io_is_pagefault(const struct cl_io *io)
+{
+	return io->ci_type == CIT_FAULT && !io->u.ci_fault.ft_mkwrite;
+}
+
 /**
  * True, if @io is an O_APPEND write(2).
  */
diff --git a/fs/lustre/llite/rw.c b/fs/lustre/llite/rw.c
index 2290b31..0283af4 100644
--- a/fs/lustre/llite/rw.c
+++ b/fs/lustre/llite/rw.c
@@ -1947,7 +1947,15 @@ int ll_readpage(struct file *file, struct page *vmpage)
 			unlock_page(vmpage);
 			result = 0;
 		}
-		cl_page_put(env, page);
+		if (cl_io_is_pagefault(io) && result == 0) {
+			/**
+			 * page fault, retain the cl_page reference until
+			 * vvp_io_kernel_fault() release it.
+			 */
+			page->cp_fault_ref = 1;
+		} else {
+			cl_page_put(env, page);
+		}
 	} else {
 		unlock_page(vmpage);
 		result = PTR_ERR(page);
diff --git a/fs/lustre/llite/vvp_io.c b/fs/lustre/llite/vvp_io.c
index ef7a3d92..be6f17f 100644
--- a/fs/lustre/llite/vvp_io.c
+++ b/fs/lustre/llite/vvp_io.c
@@ -1292,14 +1292,41 @@ static void vvp_io_rw_end(const struct lu_env *env,
 	trunc_sem_up_read(&lli->lli_trunc_sem);
 }
 
-static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
+static void detach_and_deref_page(struct cl_page *clp, struct page *vmpage)
+{
+	if (!clp->cp_defer_detach)
+		return;
+
+	/**
+	 * __cl_page_delete() took a vmpage reference but did not unlink the
+	 * vmpage from its cl_page.
+	 */
+	clp->cp_defer_detach = 0;
+	ClearPagePrivate(vmpage);
+	vmpage->private = 0;
+
+	put_page(vmpage);
+	refcount_dec(&clp->cp_ref);
+}
+
+static int vvp_io_kernel_fault(const struct lu_env *env,
+			       struct vvp_fault_io *cfio)
 {
 	struct vm_fault *vmf = cfio->ft_vmf;
+	struct file *vmff = cfio->ft_vma->vm_file;
+	struct address_space *mapping = vmff->f_mapping;
+	struct inode *inode = mapping->host;
+	struct page *vmpage = NULL;
+	struct cl_page *clp = NULL;
+	int rc = 0;
 
+	ll_inode_size_lock(inode);
+retry:
 	cfio->ft_flags = filemap_fault(vmf);
 	cfio->ft_flags_valid = 1;
 
 	if (vmf->page) {
+		/* success, vmpage is locked */
 		CDEBUG(D_PAGE,
 		       "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n",
 		       vmf->page, vmf->page->mapping, vmf->page->index,
@@ -1311,24 +1338,105 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
 		}
 
 		cfio->ft_vmpage = vmf->page;
-		return 0;
+
+		/**
+		 * ll_filemap_fault()->ll_readpage() could get an extra cl_page
+		 * reference. So we have to get the cl_page's to check its
+		 * cp_fault_ref and drop the reference later.
+		 */
+		clp = cl_vmpage_page(vmf->page, NULL);
+
+		goto unlock;
+	}
+
+	/* filemap_fault() fails, vmpage is not locked */
+	if (!clp) {
+		vmpage = find_get_page(mapping, vmf->pgoff);
+		if (vmpage) {
+			lock_page(vmpage);
+			clp = cl_vmpage_page(vmpage, NULL);
+			unlock_page(vmpage);
+		}
 	}
 
 	if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) {
+		pgoff_t max_idx;
+
+		/**
+		 * ll_filemap_fault()->ll_readpage() could fill vmpage
+		 * correctly, and unlock the vmpage, while memory pressure or
+		 * truncate could detach cl_page from vmpage, and kernel
+		 * filemap_fault() will wait_on_page_locked(vmpage) and find
+		 * out that the vmpage has been cleared its uptodate bit,
+		 * so it returns VM_FAULT_SIGBUS.
+		 *
+		 * In this case, we'd retry the filemap_fault()->ll_readpage()
+		 * to rebuild the cl_page and fill vmpage with uptodated data.
+		 */
+		if (likely(vmpage)) {
+			bool need_retry = false;
+
+			if (clp) {
+				if (clp->cp_defer_detach) {
+					detach_and_deref_page(clp, vmpage);
+					/**
+					 * check i_size to make sure it's not
+					 * over EOF, we don't want to call
+					 * filemap_fault() repeatedly since it
+					 * returns VM_FAULT_SIGBUS without even
+					 * trying if vmf->pgoff is over EOF.
+					 */
+					max_idx = DIV_ROUND_UP(i_size_read(inode),
+							       PAGE_SIZE);
+					if (vmf->pgoff < max_idx)
+						need_retry = true;
+				}
+				if (clp->cp_fault_ref) {
+					clp->cp_fault_ref = 0;
+					/* ref not released in ll_readpage() */
+					cl_page_put(env, clp);
+				}
+				if (need_retry)
+					goto retry;
+			}
+		}
+
 		CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address);
-		return -EFAULT;
+		rc = -EFAULT;
+		goto unlock;
 	}
 
 	if (cfio->ft_flags & VM_FAULT_OOM) {
 		CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto unlock;
 	}
 
-	if (cfio->ft_flags & VM_FAULT_RETRY)
-		return -EAGAIN;
+	if (cfio->ft_flags & VM_FAULT_RETRY) {
+		rc = -EAGAIN;
+		goto unlock;
+	}
 
 	CERROR("Unknown error in page fault %d!\n", cfio->ft_flags);
-	return -EINVAL;
+	rc = -EINVAL;
+unlock:
+	ll_inode_size_unlock(inode);
+	if (clp) {
+		if (clp->cp_defer_detach && vmpage)
+			detach_and_deref_page(clp, vmpage);
+
+		/* additional cl_page ref has been taken in ll_readpage() */
+		if (clp->cp_fault_ref) {
+			clp->cp_fault_ref = 0;
+			/* ref not released in ll_readpage() */
+			cl_page_put(env, clp);
+		}
+		/* ref taken in this function */
+		cl_page_put(env, clp);
+	}
+	if (vmpage)
+		put_page(vmpage);
+	return rc;
 }
 
 static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io,
@@ -1368,7 +1476,7 @@ static int vvp_io_fault_start(const struct lu_env *env,
 		LASSERT(cfio->ft_vmpage);
 		lock_page(cfio->ft_vmpage);
 	} else {
-		result = vvp_io_kernel_fault(cfio);
+		result = vvp_io_kernel_fault(env, cfio);
 		if (result != 0)
 			return result;
 	}
diff --git a/fs/lustre/llite/vvp_page.c b/fs/lustre/llite/vvp_page.c
index f359596..9e8c158 100644
--- a/fs/lustre/llite/vvp_page.c
+++ b/fs/lustre/llite/vvp_page.c
@@ -104,6 +104,11 @@ static void vvp_page_completion_read(const struct lu_env *env,
 		ll_ra_count_put(ll_i2sbi(inode), 1);
 
 	if (ioret == 0)  {
+		/**
+		 * cp_defer_uptodate is used for readahead page, and the
+		 * vmpage Uptodate bit is deferred to set in ll_readpage/
+		 * ll_io_read_page.
+		 */
 		if (!cp->cp_defer_uptodate)
 			SetPageUptodate(vmpage);
 	} else if (cp->cp_defer_uptodate) {
diff --git a/fs/lustre/obdclass/cl_page.c b/fs/lustre/obdclass/cl_page.c
index 7011235..3bc1a9b 100644
--- a/fs/lustre/obdclass/cl_page.c
+++ b/fs/lustre/obdclass/cl_page.c
@@ -725,16 +725,35 @@ static void __cl_page_delete(const struct lu_env *env, struct cl_page *cp)
 		LASSERT(PageLocked(vmpage));
 		LASSERT((struct cl_page *)vmpage->private == cp);
 
-		/* Drop the reference count held in vvp_page_init */
-		refcount_dec(&cp->cp_ref);
-		ClearPagePrivate(vmpage);
-		vmpage->private = 0;
-
-		/*
-		 * The reference from vmpage to cl_page is removed,
-		 * but the reference back is still here. It is removed
-		 * later in cl_page_free().
+		/**
+		 * clear vmpage uptodate bit, since ll_read_ahead_pages()->
+		 * ll_read_ahead_page() could pick up this stale vmpage and
+		 * take it as uptodated.
 		 */
+		ClearPageUptodate(vmpage);
+		/**
+		 * vvp_io_kernel_fault()->ll_readpage() set cp_fault_ref
+		 * and need it to check cl_page to retry the page fault read.
+		 */
+		if (cp->cp_fault_ref) {
+			cp->cp_defer_detach = 1;
+			/**
+			 * get a vmpage reference, so that filemap_fault()
+			 * won't free it from pagecache.
+			 */
+			get_page(vmpage);
+		} else {
+			/* Drop the reference count held in vvp_page_init */
+			refcount_dec(&cp->cp_ref);
+			ClearPagePrivate(vmpage);
+			vmpage->private = 0;
+
+			/*
+			 * The reference from vmpage to cl_page is removed,
+			 * but the reference back is still here. It is removed
+			 * later in cl_page_free().
+			 */
+		}
 	}
 }
 
-- 
1.8.3.1

_______________________________________________
lustre-devel mailing list
lustre-devel@lists.lustre.org
http://lists.lustre.org/listinfo.cgi/lustre-devel-lustre.org

  reply	other threads:[~2022-11-20 14:17 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-20 14:16 [lustre-devel] [PATCH 00/22] lustre: backport OpenSFS work as of Nov 20, 2022 James Simmons
2022-11-20 14:16 ` James Simmons [this message]
2022-11-20 14:16 ` [lustre-devel] [PATCH 02/22] lustre: osc: Remove oap lock James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 03/22] lnet: Don't modify uptodate peer with temp NI James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 04/22] lustre: llite: Explicitly support .splice_write James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 05/22] lnet: o2iblnd: add verbose debug prints for rx/tx events James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 06/22] lnet: use Netlink to support old and new NI APIs James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 07/22] lustre: obdclass: improve precision of wakeups for mod_rpcs James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 08/22] lnet: allow ping packet to contain large nids James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 09/22] lustre: llog: skip bad records in llog James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 10/22] lnet: fix build issue when IPv6 is disabled James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 11/22] lustre: obdclass: fill jobid in a safe way James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 12/22] lustre: llite: remove linefeed from LDLM_DEBUG James Simmons
2022-11-20 14:16 ` [lustre-devel] [PATCH 13/22] lnet: selftest: migrate LNet selftest session handling to Netlink James Simmons
2022-11-20 14:17 ` [lustre-devel] [PATCH 14/22] lustre: clio: append to non-existent component James Simmons
2022-11-20 14:17 ` [lustre-devel] [PATCH 15/22] lnet: fix debug message in lnet_discovery_event_reply James Simmons
2022-11-20 14:17 ` [lustre-devel] [PATCH 16/22] lustre: ldlm: group lock unlock fix James Simmons
2022-11-20 14:17 ` [lustre-devel] [PATCH 17/22] lnet: Signal completion on ping send failure James Simmons
2022-11-20 14:17 ` [lustre-devel] [PATCH 18/22] lnet: extend lnet_is_nid_in_ping_info() James Simmons
2022-11-20 14:17 ` [lustre-devel] [PATCH 19/22] lnet: find correct primary for peer James Simmons
2022-11-20 14:17 ` [lustre-devel] [PATCH 20/22] lnet: change lnet_notify() to take struct lnet_nid James Simmons
2022-11-20 14:17 ` [lustre-devel] [PATCH 21/22] lnet: discard lnet_nid2ni_*() James Simmons
2022-11-20 14:17 ` [lustre-devel] [PATCH 22/22] lnet: change lnet_debug_peer() to struct lnet_nid James Simmons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1668953828-10909-2-git-send-email-jsimmons@infradead.org \
    --to=jsimmons@infradead.org \
    --cc=adilger@whamcloud.com \
    --cc=green@whamcloud.com \
    --cc=lustre-devel@lists.lustre.org \
    --cc=neilb@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).