All of lore.kernel.org
 help / color / mirror / Atom feed
From: Dan Williams <dan.j.williams@intel.com>
To: linux-mm@kvack.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
	"Darrick J. Wong" <djwong@kernel.org>,
	Christoph Hellwig <hch@lst.de>,
	John Hubbard <jhubbard@nvidia.com>,
	Alistair Popple <apopple@nvidia.com>,
	Jason Gunthorpe <jgg@nvidia.com>,
	david@fromorbit.com, nvdimm@lists.linux.dev,
	akpm@linux-foundation.org, linux-fsdevel@vger.kernel.org
Subject: [PATCH v3 10/25] fsdax: Introduce pgmap_request_folios()
Date: Fri, 14 Oct 2022 16:57:55 -0700	[thread overview]
Message-ID: <166579187573.2236710.10151157417629496558.stgit@dwillia2-xfh.jf.intel.com> (raw)
In-Reply-To: <166579181584.2236710.17813547487183983273.stgit@dwillia2-xfh.jf.intel.com>

The next step in sanitizing DAX page and pgmap lifetime is to take page
references when a pgmap user maps a page or otherwise puts it into use.
Unlike the page allocator where the it picks the page/folio, ZONE_DEVICE
users know in advance which folio they want to access.  Additionally,
ZONE_DEVICE implementations know when the pgmap is alive. Introduce
pgmap_request_folios() that pins @nr_folios folios at a time provided
they are contiguous and of the same folio_order().

Some WARN assertions are added to document expectations and catch bugs
in future kernel work, like a potential conversion of fsdax to use
multi-page folios, but they otherwise are not expected to fire.

Note that the paired pgmap_release_folios() implementation temporarily,
in this path, takes an @pgmap argument to drop pgmap references. A
follow-on patch arranges for free_zone_device_page() to drop pgmap
references in all cases. In other words, the intent is that only
put_folio() (on each folio requested pgmap_request_folio()) is needed to
to undo pgmap_request_folios().

The intent is that this also replaces zone_device_page_init(), but that
too requires some more preparatory reworks to unify the various
MEMORY_DEVICE_* types.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Alistair Popple <apopple@nvidia.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c                 |   32 ++++++++++++++++-----
 include/linux/memremap.h |   17 +++++++++++
 mm/memremap.c            |   70 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 111 insertions(+), 8 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index d03c7a952d02..095c9d7b4a1d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -385,20 +385,27 @@ static inline void dax_mapping_set_cow(struct folio *folio)
 	folio->index++;
 }
 
+static struct dev_pagemap *folio_pgmap(struct folio *folio)
+{
+	return folio_page(folio, 0)->pgmap;
+}
+
 /*
  * When it is called in dax_insert_entry(), the cow flag will indicate that
  * whether this entry is shared by multiple files.  If so, set the page->mapping
  * FS_DAX_MAPPING_COW, and use page->index as refcount.
  */
-static void dax_associate_entry(void *entry, struct address_space *mapping,
-		struct vm_area_struct *vma, unsigned long address, bool cow)
+static vm_fault_t dax_associate_entry(void *entry,
+				      struct address_space *mapping,
+				      struct vm_area_struct *vma,
+				      unsigned long address, bool cow)
 {
 	unsigned long size = dax_entry_size(entry), index;
 	struct folio *folio;
 	int i;
 
 	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return;
+		return 0;
 
 	index = linear_page_index(vma, address & ~(size - 1));
 	dax_for_each_folio(entry, folio, i)
@@ -406,9 +413,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 			dax_mapping_set_cow(folio);
 		} else {
 			WARN_ON_ONCE(folio->mapping);
+			if (!pgmap_request_folios(folio_pgmap(folio), folio, 1))
+				return VM_FAULT_SIGBUS;
 			folio->mapping = mapping;
 			folio->index = index + i;
 		}
+
+	return 0;
 }
 
 static void dax_disassociate_entry(void *entry, struct address_space *mapping,
@@ -702,9 +713,12 @@ static struct page *dax_zap_pages(struct xa_state *xas, void *entry)
 
 	zap = !dax_is_zapped(entry);
 
-	dax_for_each_folio(entry, folio, i)
+	dax_for_each_folio(entry, folio, i) {
+		if (zap)
+			pgmap_release_folios(folio_pgmap(folio), folio, 1);
 		if (!ret && !dax_folio_idle(folio))
 			ret = folio_page(folio, 0);
+	}
 
 	if (zap)
 		dax_zap_entry(xas, entry);
@@ -934,6 +948,7 @@ static vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
 	bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
 	bool cow = dax_fault_is_cow(iter);
 	void *entry = *pentry;
+	vm_fault_t ret = 0;
 
 	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -954,8 +969,10 @@ static vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
 		void *old;
 
 		dax_disassociate_entry(entry, mapping, false);
-		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
+		ret = dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
 				cow);
+		if (ret)
+			goto out;
 		/*
 		 * Only swap our new entry into the page cache if the current
 		 * entry is a zero page or an empty entry.  If a normal PTE or
@@ -978,10 +995,11 @@ static vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
 	if (cow)
 		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
 
+	*pentry = entry;
+out:
 	xas_unlock_irq(xas);
 
-	*pentry = entry;
-	return 0;
+	return ret;
 }
 
 static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 7fcaf3180a5b..b87c16577af1 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -193,7 +193,11 @@ void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
 struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
-		struct dev_pagemap *pgmap);
+				    struct dev_pagemap *pgmap);
+bool pgmap_request_folios(struct dev_pagemap *pgmap, struct folio *folio,
+			  int nr_folios);
+void pgmap_release_folios(struct dev_pagemap *pgmap, struct folio *folio,
+			  int nr_folios);
 bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
 
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
@@ -223,6 +227,17 @@ static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 	return NULL;
 }
 
+static inline bool pgmap_request_folios(struct dev_pagemap *pgmap,
+					struct folio *folio, int nr_folios)
+{
+	return false;
+}
+
+static inline void pgmap_release_folios(struct dev_pagemap *pgmap,
+					struct folio *folio, int nr_folios)
+{
+}
+
 static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
 {
 	return false;
diff --git a/mm/memremap.c b/mm/memremap.c
index f9287babb3ce..87a649ecdc54 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -530,6 +530,76 @@ void zone_device_page_init(struct page *page)
 }
 EXPORT_SYMBOL_GPL(zone_device_page_init);
 
+static bool folio_span_valid(struct dev_pagemap *pgmap, struct folio *folio,
+			     int nr_folios)
+{
+	unsigned long pfn_start, pfn_end;
+
+	pfn_start = page_to_pfn(folio_page(folio, 0));
+	pfn_end = pfn_start + (1 << folio_order(folio)) * nr_folios - 1;
+
+	if (pgmap != xa_load(&pgmap_array, pfn_start))
+		return false;
+
+	if (pfn_end > pfn_start && pgmap != xa_load(&pgmap_array, pfn_end))
+		return false;
+
+	return true;
+}
+
+/**
+ * pgmap_request_folios - activate an contiguous span of folios in @pgmap
+ * @pgmap: host page map for the folio array
+ * @folio: start of the folio list, all subsequent folios have same folio_size()
+ *
+ * Caller is responsible for @pgmap remaining live for the duration of
+ * this call. Caller is also responsible for not racing requests for the
+ * same folios.
+ */
+bool pgmap_request_folios(struct dev_pagemap *pgmap, struct folio *folio,
+			  int nr_folios)
+{
+	struct folio *iter;
+	int i;
+
+	/*
+	 * All of the WARNs below are for catching bugs in future
+	 * development that changes the assumptions of:
+	 * 1/ uniform folios in @pgmap
+	 * 2/ @pgmap death does not race this routine.
+	 */
+	VM_WARN_ON_ONCE(!folio_span_valid(pgmap, folio, nr_folios));
+
+	if (WARN_ON_ONCE(percpu_ref_is_dying(&pgmap->ref)))
+		return false;
+
+	for (iter = folio_next(folio), i = 1; i < nr_folios;
+	     iter = folio_next(folio), i++)
+		if (WARN_ON_ONCE(folio_order(iter) != folio_order(folio)))
+			return false;
+
+	for (iter = folio, i = 0; i < nr_folios; iter = folio_next(iter), i++) {
+		folio_ref_inc(iter);
+		if (folio_ref_count(iter) == 1)
+			percpu_ref_tryget(&pgmap->ref);
+	}
+
+	return true;
+}
+
+void pgmap_release_folios(struct dev_pagemap *pgmap, struct folio *folio, int nr_folios)
+{
+	struct folio *iter;
+	int i;
+
+	for (iter = folio, i = 0; i < nr_folios; iter = folio_next(iter), i++) {
+		if (!put_devmap_managed_page(&iter->page))
+			folio_put(iter);
+		if (!folio_ref_count(iter))
+			put_dev_pagemap(pgmap);
+	}
+}
+
 #ifdef CONFIG_FS_DAX
 bool __put_devmap_managed_page_refs(struct page *page, int refs)
 {


  parent reply	other threads:[~2022-10-14 23:57 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-10-14 23:56 [PATCH v3 00/25] Fix the DAX-gup mistake Dan Williams
2022-10-14 23:57 ` [PATCH v3 01/25] fsdax: Wait on @page not @page->_refcount Dan Williams
2022-10-14 23:57 ` [PATCH v3 02/25] fsdax: Use dax_page_idle() to document DAX busy page checking Dan Williams
2022-10-14 23:57 ` [PATCH v3 03/25] fsdax: Include unmapped inodes for page-idle detection Dan Williams
2022-10-14 23:57 ` [PATCH v3 04/25] fsdax: Introduce dax_zap_mappings() Dan Williams
2022-11-02 13:04   ` Aneesh Kumar K.V
2022-10-14 23:57 ` [PATCH v3 05/25] fsdax: Wait for pinned pages during truncate_inode_pages_final() Dan Williams
2022-10-14 23:57 ` [PATCH v3 06/25] fsdax: Validate DAX layouts broken before truncate Dan Williams
2022-10-14 23:57 ` [PATCH v3 07/25] fsdax: Hold dax lock over mapping insertion Dan Williams
2022-10-17 19:31   ` Jason Gunthorpe
2022-10-17 20:17     ` Dan Williams
2022-10-18  5:26       ` Christoph Hellwig
2022-10-18 17:30         ` Dan Williams
2022-10-14 23:57 ` [PATCH v3 08/25] fsdax: Update dax_insert_entry() calling convention to return an error Dan Williams
2022-10-14 23:57 ` [PATCH v3 09/25] fsdax: Rework for_each_mapped_pfn() to dax_for_each_folio() Dan Williams
2022-10-14 23:57 ` Dan Williams [this message]
2022-10-17  6:31   ` [PATCH v3 10/25] fsdax: Introduce pgmap_request_folios() Alistair Popple
2022-10-17 20:06     ` Dan Williams
2022-10-17 20:11       ` Jason Gunthorpe
2022-10-17 20:51         ` Dan Williams
2022-10-17 23:57           ` Jason Gunthorpe
2022-10-18  0:19             ` Dan Williams
2022-10-17 19:41   ` Jason Gunthorpe
2022-10-14 23:58 ` [PATCH v3 11/25] fsdax: Rework dax_insert_entry() calling convention Dan Williams
2022-10-14 23:58 ` [PATCH v3 12/25] fsdax: Cleanup dax_associate_entry() Dan Williams
2022-10-14 23:58 ` [PATCH v3 13/25] devdax: Minor warning fixups Dan Williams
2022-10-14 23:58 ` [PATCH v3 14/25] devdax: Fix sparse lock imbalance warning Dan Williams
2022-10-14 23:58 ` [PATCH v3 15/25] libnvdimm/pmem: Support pmem block devices without dax Dan Williams
2022-10-14 23:58 ` [PATCH v3 16/25] devdax: Move address_space helpers to the DAX core Dan Williams
2022-10-14 23:58 ` [PATCH v3 17/25] devdax: Sparse fixes for xarray locking Dan Williams
2022-10-14 23:58 ` [PATCH v3 18/25] devdax: Sparse fixes for vmfault_t / dax-entry conversions Dan Williams
2022-10-14 23:58 ` [PATCH v3 19/25] devdax: Sparse fixes for vm_fault_t in tracepoints Dan Williams
2022-10-14 23:58 ` [PATCH v3 20/25] devdax: add PUD support to the DAX mapping infrastructure Dan Williams
2022-10-14 23:59 ` [PATCH v3 21/25] devdax: Use dax_insert_entry() + dax_delete_mapping_entry() Dan Williams
2022-10-14 23:59 ` [PATCH v3 22/25] mm/memremap_pages: Replace zone_device_page_init() with pgmap_request_folios() Dan Williams
2022-10-17 19:17   ` Lyude Paul
2022-10-14 23:59 ` [PATCH v3 23/25] mm/memremap_pages: Initialize all ZONE_DEVICE pages to start at refcount 0 Dan Williams
2022-10-17  7:04   ` Alistair Popple
2022-10-17 19:48   ` Jason Gunthorpe
2022-10-14 23:59 ` [PATCH v3 24/25] mm/meremap_pages: Delete put_devmap_managed_page_refs() Dan Williams
2022-10-17  7:08   ` Alistair Popple
2022-10-14 23:59 ` [PATCH v3 25/25] mm/gup: Drop DAX pgmap accounting Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=166579187573.2236710.10151157417629496558.stgit@dwillia2-xfh.jf.intel.com \
    --to=dan.j.williams@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=apopple@nvidia.com \
    --cc=david@fromorbit.com \
    --cc=djwong@kernel.org \
    --cc=hch@lst.de \
    --cc=jack@suse.cz \
    --cc=jgg@nvidia.com \
    --cc=jhubbard@nvidia.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=nvdimm@lists.linux.dev \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.