From: Dan Williams <dan.j.williams@intel.com>
To: linux-nvdimm@lists.01.org
Cc: Jan Kara <jack@suse.cz>, Christoph Hellwig <hch@lst.de>,
	Matthew Wilcox <mawilcox@microsoft.com>,
	Ross Zwisler <ross.zwisler@linux.intel.com>,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	tony.luck@intel.com
Subject: [PATCH 06/11] filesystem-dax: perform __dax_invalidate_mapping_entry() under the page lock
Date: Tue, 22 May 2018 07:40:03 -0700
Message-ID: <152700000355.24093.14726378287214432782.stgit@dwillia2-desk3.amr.corp.intel.com>
In-Reply-To: <152699997165.24093.12194490924829406111.stgit@dwillia2-desk3.amr.corp.intel.com>

Hold the page lock while invalidating mapping entries to prevent races
between rmap using the address_space and the filesystem freeing the
address_space.

This is more complicated than the simple description implies because the
dev_pagemap pages that fsdax uses have no concept of page size. Size
information is stored in the radix tree entry and can only be safely
read while holding the xa_lock. Since lock_page() cannot be taken while
holding xa_lock, drop xa_lock and speculatively lock all the associated
pages. Once all the pages are locked, re-take the xa_lock and revalidate
that the radix entry did not change.

Cc: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c |   91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 85 insertions(+), 6 deletions(-)
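
As a note for reviewers, the lock ordering protocol that
dax_lock_pages() implements below can be summarized with the following
simplified sketch (the initial NULL / exceptional-entry checks, the
WARN_ONs, and the CONFIG_FS_DAX_LIMITED shortcut are omitted, and the
_sketch suffix is only for illustration):

static bool dax_lock_pages_sketch(struct address_space *mapping,
		pgoff_t index, void **entry)
{
	struct radix_tree_root *pages = &mapping->i_pages;
	unsigned long pfn;
	void *entry2;

	/* Read the entry (and thereby its size) under xa_lock... */
	xa_lock_irq(pages);
	*entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/* ...but drop xa_lock before lock_page(), which may sleep. */
	xa_unlock_irq(pages);

	for (;;) {
		/* Speculatively lock every page backing the entry. */
		for_each_mapped_pfn(*entry, pfn)
			lock_page(pfn_to_page(pfn));

		/* Revalidate: did the entry change while unlocked? */
		xa_lock_irq(pages);
		entry2 = get_unlocked_mapping_entry(mapping, index, NULL);
		if (entry2 == *entry)
			return true;	/* pages locked, xa_lock held */

		/* Lost the race: drop everything, then retry or fail. */
		put_unlocked_mapping_entry(mapping, index, entry2);
		xa_unlock_irq(pages);
		for_each_mapped_pfn_reverse(*entry, pfn)
			unlock_page(pfn_to_page(pfn));
		if (!entry2)
			return false;	/* entry was invalidated */
		*entry = entry2;
	}
}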

diff --git a/fs/dax.c b/fs/dax.c
index 2e4682cd7c69..e6d44d336283 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -319,6 +319,13 @@ static unsigned long dax_radix_end_pfn(void *entry)
 	for (pfn = dax_radix_pfn(entry); \
 			pfn < dax_radix_end_pfn(entry); pfn++)
 
+#define for_each_mapped_pfn_reverse(entry, pfn) \
+	for (pfn = dax_radix_end_pfn(entry) - 1; \
+			dax_entry_size(entry) \
+			&& pfn >= dax_radix_pfn(entry); \
+			pfn--)
+
+
 static void dax_associate_entry(void *entry, struct address_space *mapping,
 		struct vm_area_struct *vma, unsigned long address)
 {
@@ -497,6 +504,80 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
 	return entry;
 }
 
+static bool dax_lock_pages(struct address_space *mapping, pgoff_t index,
+		void **entry)
+{
+	struct radix_tree_root *pages = &mapping->i_pages;
+	unsigned long pfn;
+	void *entry2;
+
+	xa_lock_irq(pages);
+	*entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!*entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(*entry))) {
+		put_unlocked_mapping_entry(mapping, index, entry);
+		xa_unlock_irq(pages);
+		return false;
+	}
+
+	/*
+	 * In the limited case there are no races to prevent with rmap,
+	 * because rmap cannot perform pfn_to_page().
+	 */
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return true;
+
+	/*
+	 * Now, drop the xa_lock, grab all the page locks, then validate
+	 * that the entry has not changed and return with the xa_lock
+	 * held.
+	 */
+	xa_unlock_irq(pages);
+
+	/*
+	 * Retry until the entry stabilizes or someone else invalidates
+	 * the entry.
+	 */
+	for (;;) {
+		for_each_mapped_pfn(*entry, pfn)
+			lock_page(pfn_to_page(pfn));
+
+		xa_lock_irq(pages);
+		entry2 = get_unlocked_mapping_entry(mapping, index, NULL);
+		if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))
+				|| entry2 != *entry) {
+			put_unlocked_mapping_entry(mapping, index, entry2);
+			xa_unlock_irq(pages);
+
+			for_each_mapped_pfn_reverse(*entry, pfn)
+				unlock_page(pfn_to_page(pfn));
+
+			if (!entry2 || !radix_tree_exceptional_entry(entry2))
+				return false;
+			*entry = entry2;
+			continue;
+		}
+		break;
+	}
+
+	return true;
+}
+
+static void dax_unlock_pages(struct address_space *mapping, pgoff_t index,
+		void *entry)
+{
+	struct radix_tree_root *pages = &mapping->i_pages;
+	unsigned long pfn;
+
+	put_unlocked_mapping_entry(mapping, index, entry);
+	xa_unlock_irq(pages);
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_mapped_pfn_reverse(entry, pfn)
+		unlock_page(pfn_to_page(pfn));
+}
+
 static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 					  pgoff_t index, bool trunc)
 {
@@ -504,10 +585,8 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 	void *entry;
 	struct radix_tree_root *pages = &mapping->i_pages;
 
-	xa_lock_irq(pages);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
-		goto out;
+	if (!dax_lock_pages(mapping, index, &entry))
+		return ret;
 	if (!trunc &&
 	    (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
 	     radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
@@ -517,8 +596,8 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 	mapping->nrexceptional--;
 	ret = 1;
 out:
-	put_unlocked_mapping_entry(mapping, index, entry);
-	xa_unlock_irq(pages);
+	dax_unlock_pages(mapping, index, entry);
+
 	return ret;
 }
 /*

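For context, the two existing callers of __dax_invalidate_mapping_entry()
in fs/dax.c are untouched by this patch and look roughly like the
following; with this change both now invalidate the radix entry with its
backing pages locked:

int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	/* Truncate / punch-hole path: the entry must still be present. */
	int ret = __dax_invalidate_mapping_entry(mapping, index, true);

	WARN_ON_ONCE(!ret);
	return ret;
}

int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	/* Only clean, not-to-be-written-back entries may be dropped. */
	return __dax_invalidate_mapping_entry(mapping, index, false);
}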