All of lore.kernel.org
 help / color / mirror / Atom feed
From: Dan Williams <dan.j.williams@intel.com>
To: linux-mm@kvack.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
	"Darrick J. Wong" <djwong@kernel.org>,
	Jason Gunthorpe <jgg@nvidia.com>, Christoph Hellwig <hch@lst.de>,
	John Hubbard <jhubbard@nvidia.com>,
	david@fromorbit.com, nvdimm@lists.linux.dev,
	akpm@linux-foundation.org, linux-fsdevel@vger.kernel.org
Subject: [PATCH v3 04/25] fsdax: Introduce dax_zap_mappings()
Date: Fri, 14 Oct 2022 16:57:19 -0700	[thread overview]
Message-ID: <166579183976.2236710.17370760087488536715.stgit@dwillia2-xfh.jf.intel.com> (raw)
In-Reply-To: <166579181584.2236710.17813547487183983273.stgit@dwillia2-xfh.jf.intel.com>

Typical pages take a reference at pte insertion and drop it at
zap_pte_range() time. That reference management is missing for DAX
leading to a situation where DAX pages are mapped in user page tables,
but are not referenced.

Once fsdax decides it wants to unmap the page it can drop its reference,
but unlike typical pages it needs to maintain the association of the
page to the inode that arbitrated the access in the first instance. It
maintains that association until explicit truncate(), or the implicit
truncate() that occurs at inode death, truncate_inode_pages_final().

The zapped state tracks whether the fsdax has dropped its interest in a
page, but still allows the associated i_pages entry to live until
truncate. This facilitates inode lookup while awaiting any page pin
users to drop their pins. For example, if memory_failure() is triggered
on the page after it has been unmapped, but before it has been truncated
from the inode, memory_failure() can still associate the event with the
inode.

Once truncate begins fsdax unmaps the page to prevent any new references
from being taken without calling back into fsdax core to reestablish
the mapping.

This approach relies on all paths that call truncate_inode_pages() to
first call dax_zap_mappings(). For that another bandaid is needed to add
this 'zap' step to the truncate_inode_pages_final() path, but that is
saved for a follow-on patch.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c            |   72 +++++++++++++++++++++++++++++++++++----------------
 fs/ext4/inode.c     |    2 +
 fs/fuse/dax.c       |    4 +--
 fs/xfs/xfs_file.c   |    2 +
 fs/xfs/xfs_inode.c  |    4 +--
 include/linux/dax.h |   11 +++++---
 6 files changed, 63 insertions(+), 32 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 76bad1c095c0..a75d4bf541b4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -74,11 +74,12 @@ fs_initcall(init_dax_wait_table);
  * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
  * block allocation.
  */
-#define DAX_SHIFT	(4)
+#define DAX_SHIFT	(5)
 #define DAX_LOCKED	(1UL << 0)
 #define DAX_PMD		(1UL << 1)
 #define DAX_ZERO_PAGE	(1UL << 2)
 #define DAX_EMPTY	(1UL << 3)
+#define DAX_ZAP		(1UL << 4)
 
 static unsigned long dax_to_pfn(void *entry)
 {
@@ -95,6 +96,11 @@ static bool dax_is_locked(void *entry)
 	return xa_to_value(entry) & DAX_LOCKED;
 }
 
+static bool dax_is_zapped(void *entry)
+{
+	return xa_to_value(entry) & DAX_ZAP;
+}
+
 static unsigned int dax_entry_order(void *entry)
 {
 	if (xa_to_value(entry) & DAX_PMD)
@@ -407,19 +413,6 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 	}
 }
 
-static struct page *dax_busy_page(void *entry)
-{
-	unsigned long pfn;
-
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
-
-		if (!dax_page_idle(page))
-			return page;
-	}
-	return NULL;
-}
-
 /*
  * dax_lock_page - Lock the DAX entry corresponding to a page
  * @page: The page whose entry we want to lock
@@ -664,8 +657,43 @@ static void *grab_mapping_entry(struct xa_state *xas,
 	return xa_mk_internal(VM_FAULT_FALLBACK);
 }
 
+static void *dax_zap_entry(struct xa_state *xas, void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return xas_store(xas, xa_mk_value(v | DAX_ZAP));
+}
+
+/**
+ * Return NULL if the entry is zapped and all pages in the entry are
+ * idle, otherwise return the non-idle page in the entry
+ */
+static struct page *dax_zap_pages(struct xa_state *xas, void *entry)
+{
+	struct page *ret = NULL;
+	unsigned long pfn;
+	bool zap;
+
+	if (!dax_entry_size(entry))
+		return NULL;
+
+	zap = !dax_is_zapped(entry);
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		if (!ret && !dax_page_idle(page))
+			ret = page;
+	}
+
+	if (zap)
+		dax_zap_entry(xas, entry);
+
+	return ret;
+}
+
 /**
- * dax_layout_busy_page_range - find first pinned page in @mapping
+ * dax_zap_mappings_range - find first pinned page in @mapping
  * @mapping: address space to scan for a page with ref count > 1
  * @start: Starting offset. Page containing 'start' is included.
  * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
@@ -682,8 +710,8 @@ static void *grab_mapping_entry(struct xa_state *xas,
  * to be able to run unmap_mapping_range() and subsequently not race
  * mapping_mapped() becoming true.
  */
-struct page *dax_layout_busy_page_range(struct address_space *mapping,
-					loff_t start, loff_t end)
+struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
+				    loff_t end)
 {
 	void *entry;
 	unsigned int scanned = 0;
@@ -727,7 +755,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
 		if (unlikely(dax_is_locked(entry)))
 			entry = get_unlocked_entry(&xas, 0);
 		if (entry)
-			page = dax_busy_page(entry);
+			page = dax_zap_pages(&xas, entry);
 		put_unlocked_entry(&xas, entry, WAKE_NEXT);
 		if (page)
 			break;
@@ -742,13 +770,13 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
 	xas_unlock_irq(&xas);
 	return page;
 }
-EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
+EXPORT_SYMBOL_GPL(dax_zap_mappings_range);
 
-struct page *dax_layout_busy_page(struct address_space *mapping)
+struct page *dax_zap_mappings(struct address_space *mapping)
 {
-	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
+	return dax_zap_mappings_range(mapping, 0, LLONG_MAX);
 }
-EXPORT_SYMBOL_GPL(dax_layout_busy_page);
+EXPORT_SYMBOL_GPL(dax_zap_mappings);
 
 static int __dax_invalidate_entry(struct address_space *mapping,
 					  pgoff_t index, bool trunc)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 478ec6bc0935..3935af49df8b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3957,7 +3957,7 @@ int ext4_break_layouts(struct inode *inode)
 		return -EINVAL;
 
 	do {
-		page = dax_layout_busy_page(inode->i_mapping);
+		page = dax_zap_mappings(inode->i_mapping);
 		if (!page)
 			return 0;
 
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index ae52ef7dbabe..8cdc9402e8f7 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -443,7 +443,7 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
 
 	/*
 	 * Can't do inline reclaim in fault path. We call
-	 * dax_layout_busy_page() before we free a range. And
+	 * dax_zap_mappings() before we free a range. And
 	 * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
 	 * In fault path we enter with mapping->invalidate_lock held and can't
 	 * drop it. Also in fault path we hold mapping->invalidate_lock shared
@@ -671,7 +671,7 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
 {
 	struct page *page;
 
-	page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+	page = dax_zap_mappings_range(inode->i_mapping, start, end);
 	if (!page)
 		return 0;
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 556e28d06788..ca0afcdd98c0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -822,7 +822,7 @@ xfs_break_dax_layouts(
 
 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
 
-	page = dax_layout_busy_page(inode->i_mapping);
+	page = dax_zap_mappings(inode->i_mapping);
 	if (!page)
 		return 0;
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 28493c8e9bb2..d48dfee01008 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3481,8 +3481,8 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
 	 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
 	 * for this nested lock case.
 	 */
-	page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
-	if (page && page_ref_count(page) != 1) {
+	page = dax_zap_mappings(VFS_I(ip2)->i_mapping);
+	if (page) {
 		xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
 		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
 		goto again;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 04987d14d7e0..f6acb4ed73cb 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -157,8 +157,9 @@ static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
 int dax_writeback_mapping_range(struct address_space *mapping,
 		struct dax_device *dax_dev, struct writeback_control *wbc);
 
-struct page *dax_layout_busy_page(struct address_space *mapping);
-struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
+struct page *dax_zap_mappings(struct address_space *mapping);
+struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
+				    loff_t end);
 dax_entry_t dax_lock_page(struct page *page);
 void dax_unlock_page(struct page *page, dax_entry_t cookie);
 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
@@ -166,12 +167,14 @@ dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
 void dax_unlock_mapping_entry(struct address_space *mapping,
 		unsigned long index, dax_entry_t cookie);
 #else
-static inline struct page *dax_layout_busy_page(struct address_space *mapping)
+static inline struct page *dax_zap_mappings(struct address_space *mapping)
 {
 	return NULL;
 }
 
-static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages)
+static inline struct page *dax_zap_mappings_range(struct address_space *mapping,
+						  pgoff_t start,
+						  pgoff_t nr_pages)
 {
 	return NULL;
 }


  parent reply	other threads:[~2022-10-14 23:57 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-10-14 23:56 [PATCH v3 00/25] Fix the DAX-gup mistake Dan Williams
2022-10-14 23:57 ` [PATCH v3 01/25] fsdax: Wait on @page not @page->_refcount Dan Williams
2022-10-14 23:57 ` [PATCH v3 02/25] fsdax: Use dax_page_idle() to document DAX busy page checking Dan Williams
2022-10-14 23:57 ` [PATCH v3 03/25] fsdax: Include unmapped inodes for page-idle detection Dan Williams
2022-10-14 23:57 ` Dan Williams [this message]
2022-11-02 13:04   ` [PATCH v3 04/25] fsdax: Introduce dax_zap_mappings() Aneesh Kumar K.V
2022-10-14 23:57 ` [PATCH v3 05/25] fsdax: Wait for pinned pages during truncate_inode_pages_final() Dan Williams
2022-10-14 23:57 ` [PATCH v3 06/25] fsdax: Validate DAX layouts broken before truncate Dan Williams
2022-10-14 23:57 ` [PATCH v3 07/25] fsdax: Hold dax lock over mapping insertion Dan Williams
2022-10-17 19:31   ` Jason Gunthorpe
2022-10-17 20:17     ` Dan Williams
2022-10-18  5:26       ` Christoph Hellwig
2022-10-18 17:30         ` Dan Williams
2022-10-14 23:57 ` [PATCH v3 08/25] fsdax: Update dax_insert_entry() calling convention to return an error Dan Williams
2022-10-14 23:57 ` [PATCH v3 09/25] fsdax: Rework for_each_mapped_pfn() to dax_for_each_folio() Dan Williams
2022-10-14 23:57 ` [PATCH v3 10/25] fsdax: Introduce pgmap_request_folios() Dan Williams
2022-10-17  6:31   ` Alistair Popple
2022-10-17 20:06     ` Dan Williams
2022-10-17 20:11       ` Jason Gunthorpe
2022-10-17 20:51         ` Dan Williams
2022-10-17 23:57           ` Jason Gunthorpe
2022-10-18  0:19             ` Dan Williams
2022-10-17 19:41   ` Jason Gunthorpe
2022-10-14 23:58 ` [PATCH v3 11/25] fsdax: Rework dax_insert_entry() calling convention Dan Williams
2022-10-14 23:58 ` [PATCH v3 12/25] fsdax: Cleanup dax_associate_entry() Dan Williams
2022-10-14 23:58 ` [PATCH v3 13/25] devdax: Minor warning fixups Dan Williams
2022-10-14 23:58 ` [PATCH v3 14/25] devdax: Fix sparse lock imbalance warning Dan Williams
2022-10-14 23:58 ` [PATCH v3 15/25] libnvdimm/pmem: Support pmem block devices without dax Dan Williams
2022-10-14 23:58 ` [PATCH v3 16/25] devdax: Move address_space helpers to the DAX core Dan Williams
2022-10-14 23:58 ` [PATCH v3 17/25] devdax: Sparse fixes for xarray locking Dan Williams
2022-10-14 23:58 ` [PATCH v3 18/25] devdax: Sparse fixes for vmfault_t / dax-entry conversions Dan Williams
2022-10-14 23:58 ` [PATCH v3 19/25] devdax: Sparse fixes for vm_fault_t in tracepoints Dan Williams
2022-10-14 23:58 ` [PATCH v3 20/25] devdax: add PUD support to the DAX mapping infrastructure Dan Williams
2022-10-14 23:59 ` [PATCH v3 21/25] devdax: Use dax_insert_entry() + dax_delete_mapping_entry() Dan Williams
2022-10-14 23:59 ` [PATCH v3 22/25] mm/memremap_pages: Replace zone_device_page_init() with pgmap_request_folios() Dan Williams
2022-10-17 19:17   ` Lyude Paul
2022-10-14 23:59 ` [PATCH v3 23/25] mm/memremap_pages: Initialize all ZONE_DEVICE pages to start at refcount 0 Dan Williams
2022-10-17  7:04   ` Alistair Popple
2022-10-17 19:48   ` Jason Gunthorpe
2022-10-14 23:59 ` [PATCH v3 24/25] mm/meremap_pages: Delete put_devmap_managed_page_refs() Dan Williams
2022-10-17  7:08   ` Alistair Popple
2022-10-14 23:59 ` [PATCH v3 25/25] mm/gup: Drop DAX pgmap accounting Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=166579183976.2236710.17370760087488536715.stgit@dwillia2-xfh.jf.intel.com \
    --to=dan.j.williams@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=david@fromorbit.com \
    --cc=djwong@kernel.org \
    --cc=hch@lst.de \
    --cc=jack@suse.cz \
    --cc=jgg@nvidia.com \
    --cc=jhubbard@nvidia.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=nvdimm@lists.linux.dev \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.