All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] mm, hwpoison: Recover from copy-on-write machine checks
@ 2022-10-17 23:42 Tony Luck
  2022-10-18  8:43 ` HORIGUCHI NAOYA(堀口 直也)
  0 siblings, 1 reply; 69+ messages in thread
From: Tony Luck @ 2022-10-17 23:42 UTC (permalink / raw)
  To: Naoya Horiguchi, Andrew Morton
  Cc: Miaohe Lin, Matthew Wilcox, Shuai Xue, Dan Williams, linux-mm,
	linux-kernel, Tony Luck

If the kernel is copying a page as the result of a copy-on-write
fault and runs into an uncorrectable error, Linux will crash because
it does not have recovery code for this case where poison is consumed
by the kernel.

It is easy to set up a test case. Just inject an error into a private
page, fork(2), and have the child process write to the page.

I wrapped that neatly into a test at:

  git://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git

just enable ACPI error injection and run:

  # ./einj_mem-uc -f copy-on-write

[Note: this test needs better reporting for the case where this
patch has been applied and the system does NOT crash.]

Patch below works ... but there are probably many places where it could
fit better into the general "mm" way of doing things. E.g. using the
copy_mc_to_kernel() function does what I need here, but the name doesn't
seem like it is quite right.

The basic idea is very simple: if the kernel gets a machine check while
copying the page, just free the new page that was going to be the target
of the copy and return VM_FAULT_HWPOISON to the calling stack.

Slightly-signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/linux/highmem.h | 19 +++++++++++++++++++
 mm/memory.c             | 28 ++++++++++++++++++++--------
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index e9912da5441b..5967541fbf0e 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -319,6 +319,25 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
 
 #endif
 
+static inline int copy_user_highpage_mc(struct page *to, struct page *from,
+					unsigned long vaddr, struct vm_area_struct *vma)
+{
+	unsigned long ret = 0;
+#ifdef copy_mc_to_kernel
+	char *vfrom, *vto;
+
+	vfrom = kmap_local_page(from);
+	vto = kmap_local_page(to);
+	ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
+	kunmap_local(vto);
+	kunmap_local(vfrom);
+#else
+	copy_user_highpage(to, from, vaddr, vma);
+#endif
+
+	return ret;
+}
+
 #ifndef __HAVE_ARCH_COPY_HIGHPAGE
 
 static inline void copy_highpage(struct page *to, struct page *from)
diff --git a/mm/memory.c b/mm/memory.c
index f88c351aecd4..b5e22bf4c10a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2848,8 +2848,14 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
 	return same;
 }
 
-static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
-				       struct vm_fault *vmf)
+/*
+ * Return:
+ *	-1 = copy failed due to poison in source page
+ *	0 = copy failed (some other reason)
+ *	1 = copy succeeded
+ */
+static inline int __wp_page_copy_user(struct page *dst, struct page *src,
+				      struct vm_fault *vmf)
 {
 	bool ret;
 	void *kaddr;
@@ -2860,8 +2866,9 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
 	unsigned long addr = vmf->address;
 
 	if (likely(src)) {
-		copy_user_highpage(dst, src, addr, vma);
-		return true;
+		if (copy_user_highpage_mc(dst, src, addr, vma))
+			return -1;
+		return 1;
 	}
 
 	/*
@@ -2888,7 +2895,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
 			 * and update local tlb only
 			 */
 			update_mmu_tlb(vma, addr, vmf->pte);
-			ret = false;
+			ret = 0;
 			goto pte_unlock;
 		}
 
@@ -2913,7 +2920,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
 		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
 			/* The PTE changed under us, update local tlb */
 			update_mmu_tlb(vma, addr, vmf->pte);
-			ret = false;
+			ret = 0;
 			goto pte_unlock;
 		}
 
@@ -2932,7 +2939,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
 		}
 	}
 
-	ret = true;
+	ret = 1;
 
 pte_unlock:
 	if (locked)
@@ -3104,6 +3111,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	pte_t entry;
 	int page_copied = 0;
 	struct mmu_notifier_range range;
+	int ret;
 
 	delayacct_wpcopy_start();
 
@@ -3121,7 +3129,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		if (!new_page)
 			goto oom;
 
-		if (!__wp_page_copy_user(new_page, old_page, vmf)) {
+		ret = __wp_page_copy_user(new_page, old_page, vmf);
+		if (ret == -1) {
+			put_page(new_page);
+			return VM_FAULT_HWPOISON;
+		} else if (ret == 0) {
 			/*
 			 * COW failed, if the fault was solved by other,
 			 * it's fine. If not, userspace would re-fault on
-- 
2.37.3


^ permalink raw reply related	[flat|nested] 69+ messages in thread
* Re: [PATCH v4 0/2] Copy-on-write poison recovery
@ 2023-05-08  3:18 Albert E. Davies
  0 siblings, 0 replies; 69+ messages in thread
From: Albert E. Davies @ 2023-05-08  3:18 UTC (permalink / raw)
  To: tony.luck
  Cc: akpm, christophe.leroy, dan.j.williams, glider, linmiaohe,
	linux-kernel, linux-mm, linuxppc-dev, mpe, naoya.horiguchi,
	npiggin, willy, xueshuai

[-- Attachment #1: Type: text/plain, Size: 55 bytes --]




Get Outlook for Android<https://aka.ms/AAb9ysg>

[-- Attachment #2: Type: text/html, Size: 452 bytes --]

^ permalink raw reply	[flat|nested] 69+ messages in thread

end of thread, other threads:[~2023-05-19  7:29 UTC | newest]

Thread overview: 69+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-17 23:42 [RFC PATCH] mm, hwpoison: Recover from copy-on-write machine checks Tony Luck
2022-10-18  8:43 ` HORIGUCHI NAOYA(堀口 直也)
2022-10-18 17:52   ` Luck, Tony
2022-10-19 17:08     ` [PATCH v2] mm, hwpoison: Try to recover from copy-on write faults Tony Luck
2022-10-19 17:08       ` Tony Luck
2022-10-19 17:45       ` Dan Williams
2022-10-19 17:45         ` Dan Williams
2022-10-19 20:30         ` Luck, Tony
2022-10-19 20:30           ` Luck, Tony
2022-10-20  1:57       ` Shuai Xue
2022-10-20  1:57         ` Shuai Xue
2022-10-20 20:05         ` Tony Luck
2022-10-20 20:05           ` Tony Luck
2022-10-21  1:38           ` Miaohe Lin
2022-10-21  1:38             ` Miaohe Lin
2022-10-21  3:57             ` Luck, Tony
2022-10-21  3:57               ` Luck, Tony
2022-10-21  1:52           ` Shuai Xue
2022-10-21  1:52             ` Shuai Xue
2022-10-21  4:08             ` Tony Luck
2022-10-21  4:08               ` Tony Luck
2022-10-21  4:11               ` David Laight
2022-10-21  4:11                 ` David Laight
2022-10-21  4:41                 ` Luck, Tony
2022-10-21  4:41                   ` Luck, Tony
2022-10-21  9:29                   ` Shuai Xue
2022-10-21  9:29                     ` Shuai Xue
2022-10-21 16:30                     ` Luck, Tony
2022-10-21 16:30                       ` Luck, Tony
2022-10-23 15:04                       ` Shuai Xue
2022-10-23 15:04                         ` Shuai Xue
2022-10-21  6:57               ` Shuai Xue
2022-10-21  6:57                 ` Shuai Xue
2022-10-21 20:01       ` [PATCH v3 0/2] Copy-on-write poison recovery Tony Luck
2022-10-21 20:01         ` Tony Luck
2022-10-21 20:01         ` [PATCH v3 1/2] mm, hwpoison: Try to recover from copy-on write faults Tony Luck
2022-10-21 20:01           ` Tony Luck
2022-10-25  5:46           ` HORIGUCHI NAOYA(堀口 直也)
2022-10-25  5:46             ` HORIGUCHI NAOYA(堀口 直也)
2022-10-28  2:11           ` Miaohe Lin
2022-10-28  2:11             ` Miaohe Lin
2022-10-28 16:09             ` Luck, Tony
2022-10-28 16:09               ` Luck, Tony
2022-11-02 14:27               ` Alexander Potapenko
2022-11-02 14:27                 ` Alexander Potapenko
2022-11-02 14:30                 ` Alexander Potapenko
2022-11-02 14:30                   ` Alexander Potapenko
2022-10-21 20:01         ` [PATCH v3 2/2] mm, hwpoison: When copy-on-write hits poison, take page offline Tony Luck
2022-10-21 20:01           ` Tony Luck
2022-10-28  2:28           ` Miaohe Lin
2022-10-28  2:28             ` Miaohe Lin
2022-10-28 16:13             ` Luck, Tony
2022-10-28 16:13               ` Luck, Tony
2022-10-29  1:55               ` Miaohe Lin
2022-10-29  1:55                 ` Miaohe Lin
2022-10-23 15:52         ` [PATCH v3 0/2] Copy-on-write poison recovery Shuai Xue
2022-10-23 15:52           ` Shuai Xue
2022-10-26  5:19           ` Shuai Xue
2022-10-26  5:19             ` Shuai Xue
2022-10-31 20:10         ` [PATCH v4 " Tony Luck
2022-10-31 20:10           ` Tony Luck
2022-10-31 20:10           ` [PATCH v4 1/2] mm, hwpoison: Try to recover from copy-on write faults Tony Luck
2022-10-31 20:10             ` Tony Luck
2022-10-31 20:10           ` [PATCH v4 2/2] mm, hwpoison: When copy-on-write hits poison, take page offline Tony Luck
2022-10-31 20:10             ` Tony Luck
2023-05-18 21:49             ` Jane Chu
2023-05-18 22:10               ` Luck, Tony
2023-05-19  7:28               ` Greg Kroah-Hartman
2023-05-08  3:18 [PATCH v4 0/2] Copy-on-write poison recovery Albert E. Davies

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.