All of lore.kernel.org
 help / color / mirror / Atom feed
From: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
To: <linux-kernel@vger.kernel.org>, <linux-xfs@vger.kernel.org>,
	<linux-nvdimm@lists.01.org>, <linux-mm@kvack.org>
Cc: linux-fsdevel@vger.kernel.org, darrick.wong@oracle.com,
	david@fromorbit.com, hch@lst.de, rgoldwyn@suse.de,
	qi.fuli@fujitsu.com, y-goto@fujitsu.com
Subject: [RFC PATCH 3/4] mm, fsdax: refactor dax handler in memory-failure
Date: Tue, 15 Sep 2020 18:13:10 +0800	[thread overview]
Message-ID: <20200915101311.144269-4-ruansy.fnst@cn.fujitsu.com> (raw)
In-Reply-To: <20200915101311.144269-1-ruansy.fnst@cn.fujitsu.com>

With the ->memory_failure() implemented in pmem device and
->storage_lost() in XFS, we are able to track files or metadata
and process them further.

We don't track files by page->mapping, page->index any more, so
some of functions who obtain ->mapping, ->index from struct page
parameter need to be changed by directly passing mapping and index.

Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
---
 fs/dax.c            |  18 +++----
 include/linux/dax.h |   5 +-
 include/linux/mm.h  |   8 +++
 mm/memory-failure.c | 127 +++++++++++++++++++++++++++-----------------
 4 files changed, 94 insertions(+), 64 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 95341af1a966..1ec592f0aadd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -379,14 +379,14 @@ static struct page *dax_busy_page(void *entry)
 }
 
 /*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * dax_lock - Lock the DAX entry corresponding to a page
  * @page: The page whose entry we want to lock
  *
  * Context: Process context.
  * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
  * not be locked.
  */
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index)
 {
 	XA_STATE(xas, NULL, 0);
 	void *entry;
@@ -394,8 +394,6 @@ dax_entry_t dax_lock_page(struct page *page)
 	/* Ensure page->mapping isn't freed while we look at it */
 	rcu_read_lock();
 	for (;;) {
-		struct address_space *mapping = READ_ONCE(page->mapping);
-
 		entry = NULL;
 		if (!mapping || !dax_mapping(mapping))
 			break;
@@ -413,11 +411,7 @@ dax_entry_t dax_lock_page(struct page *page)
 
 		xas.xa = &mapping->i_pages;
 		xas_lock_irq(&xas);
-		if (mapping != page->mapping) {
-			xas_unlock_irq(&xas);
-			continue;
-		}
-		xas_set(&xas, page->index);
+		xas_set(&xas, index);
 		entry = xas_load(&xas);
 		if (dax_is_locked(entry)) {
 			rcu_read_unlock();
@@ -433,10 +427,10 @@ dax_entry_t dax_lock_page(struct page *page)
 	return (dax_entry_t)entry;
 }
 
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+		dax_entry_t cookie)
 {
-	struct address_space *mapping = page->mapping;
-	XA_STATE(xas, &mapping->i_pages, page->index);
+	XA_STATE(xas, &mapping->i_pages, index);
 
 	if (S_ISCHR(mapping->host->i_mode))
 		return;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 6904d4e0b2e0..669ba768b89e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -141,8 +141,9 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 		struct dax_device *dax_dev, struct writeback_control *wbc);
 
 struct page *dax_layout_busy_page(struct address_space *mapping);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index);
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+		dax_entry_t cookie);
 #else
 static inline bool bdev_dax_supported(struct block_device *bdev,
 		int blocksize)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3f0c36e1bf3d..d170b3f74d83 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1126,6 +1126,14 @@ static inline bool is_device_private_page(const struct page *page)
 		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
+static inline bool is_device_fsdax_page(const struct page *page)
+{
+	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+		IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+		is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
 	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f1aa6433f404..0c4a25bf276f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -93,6 +93,9 @@ static int hwpoison_filter_dev(struct page *p)
 	if (PageSlab(p))
 		return -EINVAL;
 
+	if (is_device_fsdax_page(p))
+		return 0;
+
 	mapping = page_mapping(p);
 	if (mapping == NULL || mapping->host == NULL)
 		return -EINVAL;
@@ -263,9 +266,8 @@ void shake_page(struct page *p, int access)
 EXPORT_SYMBOL_GPL(shake_page);
 
 static unsigned long dev_pagemap_mapping_shift(struct page *page,
-		struct vm_area_struct *vma)
+		struct vm_area_struct *vma, unsigned long address)
 {
-	unsigned long address = vma_address(page, vma);
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
@@ -306,8 +308,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
-		       struct vm_area_struct *vma,
-		       struct list_head *to_kill)
+		       struct address_space *mapping, pgoff_t pgoff,
+		       struct vm_area_struct *vma, struct list_head *to_kill)
 {
 	struct to_kill *tk;
 
@@ -317,12 +319,18 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 		return;
 	}
 
-	tk->addr = page_address_in_vma(p, vma);
-	if (is_zone_device_page(p))
-		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-	else
-		tk->size_shift = page_shift(compound_head(p));
-
+	if (is_device_fsdax_page(p)) {
+		tk->addr = vma->vm_start +
+				((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+		tk->size_shift = dev_pagemap_mapping_shift(p, vma, tk->addr);
+	} else {
+		tk->addr = page_address_in_vma(p, vma);
+		if (is_zone_device_page(p)) {
+			tk->size_shift = dev_pagemap_mapping_shift(p, vma,
+							vma_address(p, vma));
+		} else
+			tk->size_shift = page_shift(compound_head(p));
+	}
 	/*
 	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
 	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
@@ -468,7 +476,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			if (!page_mapped_in_vma(page, vma))
 				continue;
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+				add_to_kill(t, page, NULL, 0, vma, to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -478,23 +486,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 /*
  * Collect processes when the error hit a file mapped page.
  */
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
-				int force_early)
+static void collect_procs_file(struct page *page, struct address_space *mapping,
+		pgoff_t pgoff, struct list_head *to_kill, int force_early)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
-	struct address_space *mapping = page->mapping;
 
 	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page_to_pgoff(page);
 		struct task_struct *t = task_early_kill(tsk, force_early);
-
 		if (!t)
 			continue;
-		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
-				      pgoff) {
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 			/*
 			 * Send early kill signal to tasks where a vma covers
 			 * the page but the corrupted page is not necessarily
@@ -502,8 +506,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 			 * Assume applications who requested early kill want
 			 * to be informed of all such data corruptions.
 			 */
-			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+			if (vma->vm_mm == t->mm) {
+				add_to_kill(t, page, mapping, pgoff, vma,
+					    to_kill);
+			}
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -522,7 +528,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
 	if (PageAnon(page))
 		collect_procs_anon(page, tokill, force_early);
 	else
-		collect_procs_file(page, tokill, force_early);
+		collect_procs_file(page, page->mapping, page_to_pgoff(page),
+				   tokill, force_early);
 }
 
 static const char *action_name[] = {
@@ -1176,14 +1183,14 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 	return res;
 }
 
-static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
-		struct dev_pagemap *pgmap)
+static int memory_failure_dev_pagemap_kill_procs(unsigned long pfn, int flags,
+		struct address_space *mapping, pgoff_t index)
 {
 	struct page *page = pfn_to_page(pfn);
 	const bool unmap_success = true;
 	unsigned long size = 0;
 	struct to_kill *tk;
-	LIST_HEAD(tokill);
+	LIST_HEAD(to_kill);
 	int rc = -EBUSY;
 	loff_t start;
 	dax_entry_t cookie;
@@ -1195,28 +1202,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	 * also prevents changes to the mapping of this pfn until
 	 * poison signaling is complete.
 	 */
-	cookie = dax_lock_page(page);
+	cookie = dax_lock(mapping, index);
 	if (!cookie)
-		goto out;
-
-	if (hwpoison_filter(page)) {
-		rc = 0;
 		goto unlock;
-	}
-
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
-		/*
-		 * TODO: Handle HMM pages which may need coordination
-		 * with device-side memory.
-		 */
-		goto unlock;
-	}
-
-	/*
-	 * Use this flag as an indication that the dax page has been
-	 * remapped UC to prevent speculative consumption of poison.
-	 */
-	SetPageHWPoison(page);
 
 	/*
 	 * Unlike System-RAM there is no possibility to swap in a
@@ -1225,9 +1213,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	 * SIGBUS (i.e. MF_MUST_KILL)
 	 */
 	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-	collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+	collect_procs_file(page, mapping, index, &to_kill,
+			   flags & MF_ACTION_REQUIRED);
 
-	list_for_each_entry(tk, &tokill, nd)
+	list_for_each_entry(tk, &to_kill, nd)
 		if (tk->size_shift)
 			size = max(size, 1UL << tk->size_shift);
 	if (size) {
@@ -1237,13 +1226,51 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		 * actual size of the mapping being torn down is
 		 * communicated in siginfo, see kill_proc()
 		 */
-		start = (page->index << PAGE_SHIFT) & ~(size - 1);
-		unmap_mapping_range(page->mapping, start, start + size, 0);
+		start = (index << PAGE_SHIFT) & ~(size - 1);
+		unmap_mapping_range(mapping, start, start + size, 0);
 	}
-	kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+
+	kill_procs(&to_kill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
 	rc = 0;
 unlock:
-	dax_unlock_page(page, cookie);
+	dax_unlock(mapping, index, cookie);
+	return rc;
+}
+
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+		struct dev_pagemap *pgmap)
+{
+	struct page *page = pfn_to_page(pfn);
+	struct mf_recover_controller mfrc = {
+		.recover_fn = memory_failure_dev_pagemap_kill_procs,
+		.pfn = pfn,
+		.flags = flags,
+	};
+	int rc;
+
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+		/*
+		 * TODO: Handle HMM pages which may need coordination
+		 * with device-side memory.
+		 */
+		goto out;
+	}
+
+	if (hwpoison_filter(page)) {
+		rc = 0;
+		goto out;
+	}
+
+	/*
+	 * Use this flag as an indication that the dax page has been
+	 * remapped UC to prevent speculative consumption of poison.
+	 */
+	SetPageHWPoison(page);
+
+	/* call driver to handle the memory failure */
+	if (pgmap->ops->memory_failure)
+		rc = pgmap->ops->memory_failure(pgmap, &mfrc);
+
 out:
 	/* drop pgmap ref acquired in caller */
 	put_dev_pagemap(pgmap);
-- 
2.28.0


_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-leave@lists.01.org

WARNING: multiple messages have this Message-ID (diff)
From: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
To: <linux-kernel@vger.kernel.org>, <linux-xfs@vger.kernel.org>,
	<linux-nvdimm@lists.01.org>, <linux-mm@kvack.org>
Cc: <linux-fsdevel@vger.kernel.org>, <darrick.wong@oracle.com>,
	<dan.j.williams@intel.com>, <david@fromorbit.com>, <hch@lst.de>,
	<rgoldwyn@suse.de>, <qi.fuli@fujitsu.com>, <y-goto@fujitsu.com>
Subject: [RFC PATCH 3/4] mm, fsdax: refactor dax handler in memory-failure
Date: Tue, 15 Sep 2020 18:13:10 +0800	[thread overview]
Message-ID: <20200915101311.144269-4-ruansy.fnst@cn.fujitsu.com> (raw)
In-Reply-To: <20200915101311.144269-1-ruansy.fnst@cn.fujitsu.com>

With the ->memory_failure() implemented in pmem device and
->storage_lost() in XFS, we are able to track files or metadata
and process them further.

We don't track files by page->mapping, page->index any more, so
some of functions who obtain ->mapping, ->index from struct page
parameter need to be changed by directly passing mapping and index.

Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
---
 fs/dax.c            |  18 +++----
 include/linux/dax.h |   5 +-
 include/linux/mm.h  |   8 +++
 mm/memory-failure.c | 127 +++++++++++++++++++++++++++-----------------
 4 files changed, 94 insertions(+), 64 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 95341af1a966..1ec592f0aadd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -379,14 +379,14 @@ static struct page *dax_busy_page(void *entry)
 }
 
 /*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * dax_lock - Lock the DAX entry corresponding to a page
  * @page: The page whose entry we want to lock
  *
  * Context: Process context.
  * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
  * not be locked.
  */
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index)
 {
 	XA_STATE(xas, NULL, 0);
 	void *entry;
@@ -394,8 +394,6 @@ dax_entry_t dax_lock_page(struct page *page)
 	/* Ensure page->mapping isn't freed while we look at it */
 	rcu_read_lock();
 	for (;;) {
-		struct address_space *mapping = READ_ONCE(page->mapping);
-
 		entry = NULL;
 		if (!mapping || !dax_mapping(mapping))
 			break;
@@ -413,11 +411,7 @@ dax_entry_t dax_lock_page(struct page *page)
 
 		xas.xa = &mapping->i_pages;
 		xas_lock_irq(&xas);
-		if (mapping != page->mapping) {
-			xas_unlock_irq(&xas);
-			continue;
-		}
-		xas_set(&xas, page->index);
+		xas_set(&xas, index);
 		entry = xas_load(&xas);
 		if (dax_is_locked(entry)) {
 			rcu_read_unlock();
@@ -433,10 +427,10 @@ dax_entry_t dax_lock_page(struct page *page)
 	return (dax_entry_t)entry;
 }
 
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+		dax_entry_t cookie)
 {
-	struct address_space *mapping = page->mapping;
-	XA_STATE(xas, &mapping->i_pages, page->index);
+	XA_STATE(xas, &mapping->i_pages, index);
 
 	if (S_ISCHR(mapping->host->i_mode))
 		return;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 6904d4e0b2e0..669ba768b89e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -141,8 +141,9 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 		struct dax_device *dax_dev, struct writeback_control *wbc);
 
 struct page *dax_layout_busy_page(struct address_space *mapping);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index);
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+		dax_entry_t cookie);
 #else
 static inline bool bdev_dax_supported(struct block_device *bdev,
 		int blocksize)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3f0c36e1bf3d..d170b3f74d83 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1126,6 +1126,14 @@ static inline bool is_device_private_page(const struct page *page)
 		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
+static inline bool is_device_fsdax_page(const struct page *page)
+{
+	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+		IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+		is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
 	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f1aa6433f404..0c4a25bf276f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -93,6 +93,9 @@ static int hwpoison_filter_dev(struct page *p)
 	if (PageSlab(p))
 		return -EINVAL;
 
+	if (is_device_fsdax_page(p))
+		return 0;
+
 	mapping = page_mapping(p);
 	if (mapping == NULL || mapping->host == NULL)
 		return -EINVAL;
@@ -263,9 +266,8 @@ void shake_page(struct page *p, int access)
 EXPORT_SYMBOL_GPL(shake_page);
 
 static unsigned long dev_pagemap_mapping_shift(struct page *page,
-		struct vm_area_struct *vma)
+		struct vm_area_struct *vma, unsigned long address)
 {
-	unsigned long address = vma_address(page, vma);
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
@@ -306,8 +308,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
-		       struct vm_area_struct *vma,
-		       struct list_head *to_kill)
+		       struct address_space *mapping, pgoff_t pgoff,
+		       struct vm_area_struct *vma, struct list_head *to_kill)
 {
 	struct to_kill *tk;
 
@@ -317,12 +319,18 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 		return;
 	}
 
-	tk->addr = page_address_in_vma(p, vma);
-	if (is_zone_device_page(p))
-		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-	else
-		tk->size_shift = page_shift(compound_head(p));
-
+	if (is_device_fsdax_page(p)) {
+		tk->addr = vma->vm_start +
+				((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+		tk->size_shift = dev_pagemap_mapping_shift(p, vma, tk->addr);
+	} else {
+		tk->addr = page_address_in_vma(p, vma);
+		if (is_zone_device_page(p)) {
+			tk->size_shift = dev_pagemap_mapping_shift(p, vma,
+							vma_address(p, vma));
+		} else
+			tk->size_shift = page_shift(compound_head(p));
+	}
 	/*
 	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
 	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
@@ -468,7 +476,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			if (!page_mapped_in_vma(page, vma))
 				continue;
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+				add_to_kill(t, page, NULL, 0, vma, to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -478,23 +486,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 /*
  * Collect processes when the error hit a file mapped page.
  */
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
-				int force_early)
+static void collect_procs_file(struct page *page, struct address_space *mapping,
+		pgoff_t pgoff, struct list_head *to_kill, int force_early)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
-	struct address_space *mapping = page->mapping;
 
 	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page_to_pgoff(page);
 		struct task_struct *t = task_early_kill(tsk, force_early);
-
 		if (!t)
 			continue;
-		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
-				      pgoff) {
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 			/*
 			 * Send early kill signal to tasks where a vma covers
 			 * the page but the corrupted page is not necessarily
@@ -502,8 +506,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 			 * Assume applications who requested early kill want
 			 * to be informed of all such data corruptions.
 			 */
-			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+			if (vma->vm_mm == t->mm) {
+				add_to_kill(t, page, mapping, pgoff, vma,
+					    to_kill);
+			}
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -522,7 +528,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
 	if (PageAnon(page))
 		collect_procs_anon(page, tokill, force_early);
 	else
-		collect_procs_file(page, tokill, force_early);
+		collect_procs_file(page, page->mapping, page_to_pgoff(page),
+				   tokill, force_early);
 }
 
 static const char *action_name[] = {
@@ -1176,14 +1183,14 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 	return res;
 }
 
-static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
-		struct dev_pagemap *pgmap)
+static int memory_failure_dev_pagemap_kill_procs(unsigned long pfn, int flags,
+		struct address_space *mapping, pgoff_t index)
 {
 	struct page *page = pfn_to_page(pfn);
 	const bool unmap_success = true;
 	unsigned long size = 0;
 	struct to_kill *tk;
-	LIST_HEAD(tokill);
+	LIST_HEAD(to_kill);
 	int rc = -EBUSY;
 	loff_t start;
 	dax_entry_t cookie;
@@ -1195,28 +1202,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	 * also prevents changes to the mapping of this pfn until
 	 * poison signaling is complete.
 	 */
-	cookie = dax_lock_page(page);
+	cookie = dax_lock(mapping, index);
 	if (!cookie)
-		goto out;
-
-	if (hwpoison_filter(page)) {
-		rc = 0;
 		goto unlock;
-	}
-
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
-		/*
-		 * TODO: Handle HMM pages which may need coordination
-		 * with device-side memory.
-		 */
-		goto unlock;
-	}
-
-	/*
-	 * Use this flag as an indication that the dax page has been
-	 * remapped UC to prevent speculative consumption of poison.
-	 */
-	SetPageHWPoison(page);
 
 	/*
 	 * Unlike System-RAM there is no possibility to swap in a
@@ -1225,9 +1213,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	 * SIGBUS (i.e. MF_MUST_KILL)
 	 */
 	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-	collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+	collect_procs_file(page, mapping, index, &to_kill,
+			   flags & MF_ACTION_REQUIRED);
 
-	list_for_each_entry(tk, &tokill, nd)
+	list_for_each_entry(tk, &to_kill, nd)
 		if (tk->size_shift)
 			size = max(size, 1UL << tk->size_shift);
 	if (size) {
@@ -1237,13 +1226,51 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		 * actual size of the mapping being torn down is
 		 * communicated in siginfo, see kill_proc()
 		 */
-		start = (page->index << PAGE_SHIFT) & ~(size - 1);
-		unmap_mapping_range(page->mapping, start, start + size, 0);
+		start = (index << PAGE_SHIFT) & ~(size - 1);
+		unmap_mapping_range(mapping, start, start + size, 0);
 	}
-	kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+
+	kill_procs(&to_kill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
 	rc = 0;
 unlock:
-	dax_unlock_page(page, cookie);
+	dax_unlock(mapping, index, cookie);
+	return rc;
+}
+
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+		struct dev_pagemap *pgmap)
+{
+	struct page *page = pfn_to_page(pfn);
+	struct mf_recover_controller mfrc = {
+		.recover_fn = memory_failure_dev_pagemap_kill_procs,
+		.pfn = pfn,
+		.flags = flags,
+	};
+	int rc;
+
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+		/*
+		 * TODO: Handle HMM pages which may need coordination
+		 * with device-side memory.
+		 */
+		goto out;
+	}
+
+	if (hwpoison_filter(page)) {
+		rc = 0;
+		goto out;
+	}
+
+	/*
+	 * Use this flag as an indication that the dax page has been
+	 * remapped UC to prevent speculative consumption of poison.
+	 */
+	SetPageHWPoison(page);
+
+	/* call driver to handle the memory failure */
+	if (pgmap->ops->memory_failure)
+		rc = pgmap->ops->memory_failure(pgmap, &mfrc);
+
 out:
 	/* drop pgmap ref acquired in caller */
 	put_dev_pagemap(pgmap);
-- 
2.28.0




  parent reply	other threads:[~2020-09-15 10:13 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-15 10:13 [RFC PATCH 0/4] fsdax: introduce fs query to support reflink Shiyang Ruan
2020-09-15 10:13 ` Shiyang Ruan
2020-09-15 10:13 ` [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure Shiyang Ruan
2020-09-15 10:13   ` Shiyang Ruan
2020-09-15 16:16   ` Darrick J. Wong
2020-09-15 16:16     ` Darrick J. Wong
2020-09-16  2:15     ` Ruan Shiyang
2020-09-16  2:15       ` Ruan Shiyang
2020-09-15 10:13 ` [RFC PATCH 2/4] pagemap: introduce ->memory_failure() Shiyang Ruan
2020-09-15 10:13   ` Shiyang Ruan
2020-09-15 16:31   ` Darrick J. Wong
2020-09-15 16:31     ` Darrick J. Wong
2020-09-16  2:15     ` Ruan Shiyang
2020-09-16  2:15       ` Ruan Shiyang
2020-09-15 10:13 ` Shiyang Ruan [this message]
2020-09-15 10:13   ` [RFC PATCH 3/4] mm, fsdax: refactor dax handler in memory-failure Shiyang Ruan
2020-09-15 10:13 ` [RFC PATCH 4/4] fsdax: remove useless (dis)associate functions Shiyang Ruan
2020-09-15 10:13   ` Shiyang Ruan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200915101311.144269-4-ruansy.fnst@cn.fujitsu.com \
    --to=ruansy.fnst@cn.fujitsu.com \
    --cc=darrick.wong@oracle.com \
    --cc=david@fromorbit.com \
    --cc=hch@lst.de \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=qi.fuli@fujitsu.com \
    --cc=rgoldwyn@suse.de \
    --cc=y-goto@fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.