linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
@ 2023-04-11  9:27 Liu Shixin
  2023-04-12  0:26 ` Andrew Morton
  2023-04-12 18:13 ` Mike Kravetz
  0 siblings, 2 replies; 12+ messages in thread
From: Liu Shixin @ 2023-04-11  9:27 UTC (permalink / raw)
  To: Andrew Morton, Naoya Horiguchi, Tony Luck
  Cc: Miaohe Lin, Mike Kravetz, Muchun Song, linux-mm, linux-kernel,
	Liu Shixin

Patch a873dfe1032a ("mm, hwpoison: try to recover from copy-on write faults")
introduced a new copy_user_highpage_mc() function, and fix the kernel crash
when the kernel is copying a normal page as the result of a copy-on-write
fault and runs into an uncorrectable error. But it doesn't work for HugeTLB.

This is to support HugeTLB by using copy_mc_user_highpage() in copy_subpage()
and copy_user_gigantic_page() too.

Moreover, this is also used by userfaultfd, it will return -EHWPOISON if
running into an uncorrectable error.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 include/linux/mm.h |  6 ++---
 mm/hugetlb.c       | 19 +++++++++++----
 mm/memory.c        | 59 +++++++++++++++++++++++++++++-----------------
 3 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cb56b52fc5a2..64827c4d3818 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3714,9 +3714,9 @@ extern const struct attribute_group memory_failure_attr_group;
 extern void clear_huge_page(struct page *page,
 			    unsigned long addr_hint,
 			    unsigned int pages_per_huge_page);
-void copy_user_folio(struct folio *dst, struct folio *src,
-		      unsigned long addr_hint,
-		      struct vm_area_struct *vma);
+int copy_user_folio(struct folio *dst, struct folio *src,
+		    unsigned long addr_hint,
+		    struct vm_area_struct *vma);
 long copy_folio_from_user(struct folio *dst_folio,
 			   const void __user *usr_src,
 			   bool allow_pagefault);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index efc443a906fa..b92377aae9ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5142,9 +5142,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 					ret = PTR_ERR(new_folio);
 					break;
 				}
-				copy_user_folio(new_folio, page_folio(ptepage),
-						addr, dst_vma);
+				ret = copy_user_folio(new_folio, page_folio(ptepage),
+						      addr, dst_vma);
 				put_page(ptepage);
+				if (ret) {
+					folio_put(new_folio);
+					break;
+				}
 
 				/* Install the new hugetlb folio if src pte stable */
 				dst_ptl = huge_pte_lock(h, dst, dst_pte);
@@ -5661,7 +5665,10 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_release_all;
 	}
 
-	copy_user_folio(new_folio, page_folio(old_page), address, vma);
+	if (copy_user_folio(new_folio, page_folio(old_page), address, vma)) {
+		ret = VM_FAULT_HWPOISON_LARGE;
+		goto out_release_all;
+	}
 	__folio_mark_uptodate(new_folio);
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
@@ -6304,9 +6311,13 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			*foliop = NULL;
 			goto out;
 		}
-		copy_user_folio(folio, *foliop, dst_addr, dst_vma);
+		ret = copy_user_folio(folio, *foliop, dst_addr, dst_vma);
 		folio_put(*foliop);
 		*foliop = NULL;
+		if (ret) {
+			folio_put(folio);
+			goto out;
+		}
 	}
 
 	/*
diff --git a/mm/memory.c b/mm/memory.c
index 42dd1ab5e4e6..a9ca2cd80638 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5734,12 +5734,12 @@ EXPORT_SYMBOL(__might_fault);
  * operation.  The target subpage will be processed last to keep its
  * cache lines hot.
  */
-static inline void process_huge_page(
+static inline int process_huge_page(
 	unsigned long addr_hint, unsigned int pages_per_huge_page,
-	void (*process_subpage)(unsigned long addr, int idx, void *arg),
+	int (*process_subpage)(unsigned long addr, int idx, void *arg),
 	void *arg)
 {
-	int i, n, base, l;
+	int i, n, base, l, ret;
 	unsigned long addr = addr_hint &
 		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
 
@@ -5753,7 +5753,9 @@ static inline void process_huge_page(
 		/* Process subpages at the end of huge page */
 		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
 			cond_resched();
-			process_subpage(addr + i * PAGE_SIZE, i, arg);
+			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+			if (ret)
+				return ret;
 		}
 	} else {
 		/* If target subpage in second half of huge page */
@@ -5762,7 +5764,9 @@ static inline void process_huge_page(
 		/* Process subpages at the begin of huge page */
 		for (i = 0; i < base; i++) {
 			cond_resched();
-			process_subpage(addr + i * PAGE_SIZE, i, arg);
+			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+			if (ret)
+				return ret;
 		}
 	}
 	/*
@@ -5774,10 +5778,15 @@ static inline void process_huge_page(
 		int right_idx = base + 2 * l - 1 - i;
 
 		cond_resched();
-		process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+		ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+		if (ret)
+			return ret;
 		cond_resched();
-		process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+		ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+		if (ret)
+			return ret;
 	}
+	return 0;
 }
 
 static void clear_gigantic_page(struct page *page,
@@ -5795,11 +5804,12 @@ static void clear_gigantic_page(struct page *page,
 	}
 }
 
-static void clear_subpage(unsigned long addr, int idx, void *arg)
+static int clear_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct page *page = arg;
 
 	clear_user_highpage(page + idx, addr);
+	return 0;
 }
 
 void clear_huge_page(struct page *page,
@@ -5816,7 +5826,7 @@ void clear_huge_page(struct page *page,
 	process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
 }
 
-static void copy_user_gigantic_page(struct folio *dst, struct folio *src,
+static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
 				     unsigned long addr,
 				     struct vm_area_struct *vma,
 				     unsigned int pages_per_huge_page)
@@ -5830,8 +5840,13 @@ static void copy_user_gigantic_page(struct folio *dst, struct folio *src,
 		src_page = folio_page(src, i);
 
 		cond_resched();
-		copy_user_highpage(dst_page, src_page, addr + i*PAGE_SIZE, vma);
+		if (copy_mc_user_highpage(dst_page, src_page,
+					  addr + i*PAGE_SIZE, vma)) {
+			memory_failure_queue(page_to_pfn(src_page), 0);
+			return -EHWPOISON;
+		}
 	}
+	return 0;
 }
 
 struct copy_subpage_arg {
@@ -5840,16 +5855,20 @@ struct copy_subpage_arg {
 	struct vm_area_struct *vma;
 };
 
-static void copy_subpage(unsigned long addr, int idx, void *arg)
+static int copy_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct copy_subpage_arg *copy_arg = arg;
 
-	copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
-			   addr, copy_arg->vma);
+	if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+				  addr, copy_arg->vma)) {
+		memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+		return -EHWPOISON;
+	}
+	return 0;
 }
 
-void copy_user_folio(struct folio *dst, struct folio *src,
-		      unsigned long addr_hint, struct vm_area_struct *vma)
+int copy_user_folio(struct folio *dst, struct folio *src,
+		    unsigned long addr_hint, struct vm_area_struct *vma)
 {
 	unsigned int pages_per_huge_page = folio_nr_pages(dst);
 	unsigned long addr = addr_hint &
@@ -5860,13 +5879,11 @@ void copy_user_folio(struct folio *dst, struct folio *src,
 		.vma = vma,
 	};
 
-	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
-		copy_user_gigantic_page(dst, src, addr, vma,
-					pages_per_huge_page);
-		return;
-	}
+	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES))
+		return copy_user_gigantic_page(dst, src, addr, vma,
+					       pages_per_huge_page);
 
-	process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
+	return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
 }
 
 long copy_folio_from_user(struct folio *dst_folio,
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-11  9:27 [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults Liu Shixin
@ 2023-04-12  0:26 ` Andrew Morton
  2023-04-12 18:13 ` Mike Kravetz
  1 sibling, 0 replies; 12+ messages in thread
From: Andrew Morton @ 2023-04-12  0:26 UTC (permalink / raw)
  To: Liu Shixin
  Cc: Naoya Horiguchi, Tony Luck, Miaohe Lin, Mike Kravetz,
	Muchun Song, linux-mm, linux-kernel

On Tue, 11 Apr 2023 17:27:41 +0800 Liu Shixin <liushixin2@huawei.com> wrote:

> Patch a873dfe1032a ("mm, hwpoison: try to recover from copy-on write faults")
> introduced a new copy_user_highpage_mc() function, and fix the kernel crash
> when the kernel is copying a normal page as the result of a copy-on-write
> fault and runs into an uncorrectable error. But it doesn't work for HugeTLB.

What does "doesn't work" mean?  Please fully describe the user-visible
effects of the issue which this patch is addressing.

> This is to support HugeTLB by using copy_mc_user_highpage() in copy_subpage()
> and copy_user_gigantic_page() too.
> 
> Moreover, this is also used by userfaultfd, it will return -EHWPOISON if
> running into an uncorrectable error.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-11  9:27 [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults Liu Shixin
  2023-04-12  0:26 ` Andrew Morton
@ 2023-04-12 18:13 ` Mike Kravetz
  2023-04-12 21:57   ` Andrew Morton
  2023-04-13 12:57   ` Jiaqi Yan
  1 sibling, 2 replies; 12+ messages in thread
From: Mike Kravetz @ 2023-04-12 18:13 UTC (permalink / raw)
  To: Liu Shixin
  Cc: Andrew Morton, Naoya Horiguchi, Tony Luck, Miaohe Lin,
	Muchun Song, linux-mm, linux-kernel

On 04/11/23 17:27, Liu Shixin wrote:
> Patch a873dfe1032a ("mm, hwpoison: try to recover from copy-on write faults")
> introduced a new copy_user_highpage_mc() function, and fix the kernel crash
> when the kernel is copying a normal page as the result of a copy-on-write
> fault and runs into an uncorrectable error. But it doesn't work for HugeTLB.

Andrew asked about user-visible effects.  Perhaps, a better way of
stating this in the commit message might be:

Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
faults") introduced the routine copy_user_highpage_mc() to gracefully
handle copying of user pages with uncorrectable errors.  Previously,
such copies would result in a kernel crash.  hugetlb has separate code
paths for copy-on-write and does not benefit from the changes made in
commit a873dfe1032a.

Modify hugetlb copy-on-write code paths to use copy_mc_user_highpage()
so that they can also gracefully handle uncorrectable errors in user
pages.  This involves changing the hugetlb specific routine
?copy_user_folio()? from type void to int so that it can return an error.
Modify the hugetlb userfaultfd code in the same way so that it can return
-EHWPOISON if it encounters an uncorrectable error.

NOTE - There is still some churn in the series that introduces
copy_user_folio() and the name may change.

> This is to support HugeTLB by using copy_mc_user_highpage() in copy_subpage()
> and copy_user_gigantic_page() too.
> 
> Moreover, this is also used by userfaultfd, it will return -EHWPOISON if
> running into an uncorrectable error.
> 
> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
> ---
>  include/linux/mm.h |  6 ++---
>  mm/hugetlb.c       | 19 +++++++++++----
>  mm/memory.c        | 59 +++++++++++++++++++++++++++++-----------------
>  3 files changed, 56 insertions(+), 28 deletions(-)

Code changes look good to me.

Acked-by: Mike Kravetz <mike.kravetz@oracle.com>

Related question perhaps for Tony not directly impacting this patch.
This patch touches the hugetlb clear page paths withour consequence.

Just wondering if we can/should create something like clear_mc_user_highpage
to address clearing pages as well?  Apologies if this was previously
discussed.
-- 
Mike Kravetz


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-12 18:13 ` Mike Kravetz
@ 2023-04-12 21:57   ` Andrew Morton
  2023-04-12 22:21     ` Mike Kravetz
  2023-04-13  1:49     ` Liu Shixin
  2023-04-13 12:57   ` Jiaqi Yan
  1 sibling, 2 replies; 12+ messages in thread
From: Andrew Morton @ 2023-04-12 21:57 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Liu Shixin, Naoya Horiguchi, Tony Luck, Miaohe Lin, Muchun Song,
	linux-mm, linux-kernel

On Wed, 12 Apr 2023 11:13:50 -0700 Mike Kravetz <mike.kravetz@oracle.com> wrote:

> On 04/11/23 17:27, Liu Shixin wrote:
> > Patch a873dfe1032a ("mm, hwpoison: try to recover from copy-on write faults")
> > introduced a new copy_user_highpage_mc() function, and fix the kernel crash
> > when the kernel is copying a normal page as the result of a copy-on-write
> > fault and runs into an uncorrectable error. But it doesn't work for HugeTLB.
> 
> Andrew asked about user-visible effects.  Perhaps, a better way of
> stating this in the commit message might be:
> 
> Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
> faults") introduced the routine copy_user_highpage_mc() to gracefully
> handle copying of user pages with uncorrectable errors.  Previously,
> such copies would result in a kernel crash.  hugetlb has separate code
> paths for copy-on-write and does not benefit from the changes made in
> commit a873dfe1032a.
> 
> Modify hugetlb copy-on-write code paths to use copy_mc_user_highpage()
> so that they can also gracefully handle uncorrectable errors in user
> pages.  This involves changing the hugetlb specific routine
> ?copy_user_folio()? from type void to int so that it can return an error.
> Modify the hugetlb userfaultfd code in the same way so that it can return
> -EHWPOISON if it encounters an uncorrectable error.

Thanks, but...  what are the runtime effects?  What does hugetlb
presently do when encountering these uncorrectable error?




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-12 21:57   ` Andrew Morton
@ 2023-04-12 22:21     ` Mike Kravetz
  2023-04-12 22:56       ` Andrew Morton
  2023-04-13  1:51       ` Liu Shixin
  2023-04-13  1:49     ` Liu Shixin
  1 sibling, 2 replies; 12+ messages in thread
From: Mike Kravetz @ 2023-04-12 22:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Liu Shixin, Naoya Horiguchi, Tony Luck, Miaohe Lin, Muchun Song,
	linux-mm, linux-kernel

On 04/12/23 14:57, Andrew Morton wrote:
> On Wed, 12 Apr 2023 11:13:50 -0700 Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
> > On 04/11/23 17:27, Liu Shixin wrote:
> > > Patch a873dfe1032a ("mm, hwpoison: try to recover from copy-on write faults")
> > > introduced a new copy_user_highpage_mc() function, and fix the kernel crash
> > > when the kernel is copying a normal page as the result of a copy-on-write
> > > fault and runs into an uncorrectable error. But it doesn't work for HugeTLB.
> > 
> > Andrew asked about user-visible effects.  Perhaps, a better way of
> > stating this in the commit message might be:
> > 
> > Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
> > faults") introduced the routine copy_user_highpage_mc() to gracefully
> > handle copying of user pages with uncorrectable errors.  Previously,
> > such copies would result in a kernel crash.  hugetlb has separate code
> > paths for copy-on-write and does not benefit from the changes made in
> > commit a873dfe1032a.

I was just going to suggest adding the line,

Hence, copy-on-write of hugetlb user pages with uncorrectable errors            
will result in a kernel crash as was the case with 'normal' pages before        
commit a873dfe1032a.

However, I'm guessing it might be more clear if we start with the
runtime effects.  Something like:

copy-on-write of hugetlb user pages with uncorrectable errors will result
in a kernel crash.  This is because the copy is performed in kernel mode
and in general we can not handle accessing memory with such errors while
in kernel mode.  Commit a873dfe1032a ("mm, hwpoison: try to recover from
copy-on write faults") introduced the routine copy_user_highpage_mc() to
gracefully handle copying of user pages with uncorrectable errors.  However,
the separate hugetlb copy-on-write code paths were not modified as part
of commit a873dfe1032a.

> > 
> > Modify hugetlb copy-on-write code paths to use copy_mc_user_highpage()
> > so that they can also gracefully handle uncorrectable errors in user
> > pages.  This involves changing the hugetlb specific routine
> > ?copy_user_folio()? from type void to int so that it can return an error.
> > Modify the hugetlb userfaultfd code in the same way so that it can return
> > -EHWPOISON if it encounters an uncorrectable error.
> 
> Thanks, but...  what are the runtime effects?  What does hugetlb
> presently do when encountering these uncorrectable error?

-- 
Mike Kravetz


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-12 22:21     ` Mike Kravetz
@ 2023-04-12 22:56       ` Andrew Morton
  2023-04-12 23:37         ` Mike Kravetz
  2023-04-13  1:55         ` Liu Shixin
  2023-04-13  1:51       ` Liu Shixin
  1 sibling, 2 replies; 12+ messages in thread
From: Andrew Morton @ 2023-04-12 22:56 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Liu Shixin, Naoya Horiguchi, Tony Luck, Miaohe Lin, Muchun Song,
	linux-mm, linux-kernel

On Wed, 12 Apr 2023 15:21:38 -0700 Mike Kravetz <mike.kravetz@oracle.com> wrote:

> > > Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
> > > faults") introduced the routine copy_user_highpage_mc() to gracefully
> > > handle copying of user pages with uncorrectable errors.  Previously,
> > > such copies would result in a kernel crash.  hugetlb has separate code
> > > paths for copy-on-write and does not benefit from the changes made in
> > > commit a873dfe1032a.
> 
> I was just going to suggest adding the line,
> 
> Hence, copy-on-write of hugetlb user pages with uncorrectable errors            
> will result in a kernel crash as was the case with 'normal' pages before        
> commit a873dfe1032a.
> 
> However, I'm guessing it might be more clear if we start with the
> runtime effects.  Something like:
> 
> copy-on-write of hugetlb user pages with uncorrectable errors will result
> in a kernel crash.  This is because the copy is performed in kernel mode
> and in general we can not handle accessing memory with such errors while
> in kernel mode.  Commit a873dfe1032a ("mm, hwpoison: try to recover from
> copy-on write faults") introduced the routine copy_user_highpage_mc() to
> gracefully handle copying of user pages with uncorrectable errors.  However,
> the separate hugetlb copy-on-write code paths were not modified as part
> of commit a873dfe1032a.

Sounds good.  So I assume cc:stable is desirable.

I can't actually get the patch to apply to anything.  Can we please
have a redo against current -linus?



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-12 22:56       ` Andrew Morton
@ 2023-04-12 23:37         ` Mike Kravetz
  2023-04-13  0:47           ` Luck, Tony
  2023-04-13  1:55         ` Liu Shixin
  1 sibling, 1 reply; 12+ messages in thread
From: Mike Kravetz @ 2023-04-12 23:37 UTC (permalink / raw)
  To: Andrew Morton, Tony Luck
  Cc: Liu Shixin, Naoya Horiguchi, Miaohe Lin, Muchun Song, linux-mm,
	linux-kernel

On 04/12/23 15:56, Andrew Morton wrote:
> On Wed, 12 Apr 2023 15:21:38 -0700 Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
> > > > Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
> > > > faults") introduced the routine copy_user_highpage_mc() to gracefully
> > > > handle copying of user pages with uncorrectable errors.  Previously,
> > > > such copies would result in a kernel crash.  hugetlb has separate code
> > > > paths for copy-on-write and does not benefit from the changes made in
> > > > commit a873dfe1032a.
> > 
> > I was just going to suggest adding the line,
> > 
> > Hence, copy-on-write of hugetlb user pages with uncorrectable errors            
> > will result in a kernel crash as was the case with 'normal' pages before        
> > commit a873dfe1032a.
> > 
> > However, I'm guessing it might be more clear if we start with the
> > runtime effects.  Something like:
> > 
> > copy-on-write of hugetlb user pages with uncorrectable errors will result
> > in a kernel crash.  This is because the copy is performed in kernel mode
> > and in general we can not handle accessing memory with such errors while
> > in kernel mode.  Commit a873dfe1032a ("mm, hwpoison: try to recover from
> > copy-on write faults") introduced the routine copy_user_highpage_mc() to
> > gracefully handle copying of user pages with uncorrectable errors.  However,
> > the separate hugetlb copy-on-write code paths were not modified as part
> > of commit a873dfe1032a.
> 
> Sounds good.  So I assume cc:stable is desirable.

I do not think cc:stable is necessary/desirable.  Why?

a873dfe1032a was an enhancement to better handle copying pages with memory
errors in the kernel.  IIUC, we never handled that situation in the past.
I would not call the fact that it did not take hugetlb into account a bug.
Although, some might argue that it should have addressed all callers of
copy_user_highpage which would have included hugetlb.  IMO, There would be
little to gain by backporing to 6.1 as the issue of copying pages with
errors has existed forever.  Perhaps Tony will comment as I was not involved
in a873dfe1032a.

> I can't actually get the patch to apply to anything.  Can we please
> have a redo against current -linus?
-- 
Mike Kravetz


^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-12 23:37         ` Mike Kravetz
@ 2023-04-13  0:47           ` Luck, Tony
  0 siblings, 0 replies; 12+ messages in thread
From: Luck, Tony @ 2023-04-13  0:47 UTC (permalink / raw)
  To: Mike Kravetz, Andrew Morton
  Cc: Liu Shixin, Naoya Horiguchi, Miaohe Lin, Muchun Song, linux-mm,
	linux-kernel

> I do not think cc:stable is necessary/desirable.  Why?
>
> a873dfe1032a was an enhancement to better handle copying pages with memory
> errors in the kernel.  IIUC, we never handled that situation in the past.
> I would not call the fact that it did not take hugetlb into account a bug.
> Although, some might argue that it should have addressed all callers of
> copy_user_highpage which would have included hugetlb.  IMO, There would be
> little to gain by backporing to 6.1 as the issue of copying pages with
> errors has existed forever.  Perhaps Tony will comment as I was not involved
> in a873dfe1032a.

I concur. We are gradually fixing more cases where Linux consumes
poison data in kernel context. But this is an incremental process to
improve overall reliability. Backporting to stable and long term releases
isn't a priority.

-Tony


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-12 21:57   ` Andrew Morton
  2023-04-12 22:21     ` Mike Kravetz
@ 2023-04-13  1:49     ` Liu Shixin
  1 sibling, 0 replies; 12+ messages in thread
From: Liu Shixin @ 2023-04-13  1:49 UTC (permalink / raw)
  To: Andrew Morton, Mike Kravetz
  Cc: Naoya Horiguchi, Tony Luck, Miaohe Lin, Muchun Song, linux-mm,
	linux-kernel



On 2023/4/13 5:57, Andrew Morton wrote:
> On Wed, 12 Apr 2023 11:13:50 -0700 Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
>> On 04/11/23 17:27, Liu Shixin wrote:
>>> Patch a873dfe1032a ("mm, hwpoison: try to recover from copy-on write faults")
>>> introduced a new copy_user_highpage_mc() function, and fix the kernel crash
>>> when the kernel is copying a normal page as the result of a copy-on-write
>>> fault and runs into an uncorrectable error. But it doesn't work for HugeTLB.
>> Andrew asked about user-visible effects.  Perhaps, a better way of
>> stating this in the commit message might be:
>>
>> Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
>> faults") introduced the routine copy_user_highpage_mc() to gracefully
>> handle copying of user pages with uncorrectable errors.  Previously,
>> such copies would result in a kernel crash.  hugetlb has separate code
>> paths for copy-on-write and does not benefit from the changes made in
>> commit a873dfe1032a.
>>
>> Modify hugetlb copy-on-write code paths to use copy_mc_user_highpage()
>> so that they can also gracefully handle uncorrectable errors in user
>> pages.  This involves changing the hugetlb specific routine
>> ?copy_user_folio()? from type void to int so that it can return an error.
>> Modify the hugetlb userfaultfd code in the same way so that it can return
>> -EHWPOISON if it encounters an uncorrectable error.
> Thanks, but...  what are the runtime effects?  What does hugetlb
> presently do when encountering these uncorrectable error?
I have tested the HugeTLB case by using tony's testcase[1](need add a MAP_HUGETLB).
Before this patch, the kernel will crash due to the uncorrectable errors. After this patch,
if the error occurs in copy-on-write, the process will be killed, if the errors occurs in
userfaultfd, it will return -EHWPOISON.

Link: https://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git [1]
>
>
> .
>



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-12 22:21     ` Mike Kravetz
  2023-04-12 22:56       ` Andrew Morton
@ 2023-04-13  1:51       ` Liu Shixin
  1 sibling, 0 replies; 12+ messages in thread
From: Liu Shixin @ 2023-04-13  1:51 UTC (permalink / raw)
  To: Mike Kravetz, Andrew Morton
  Cc: Naoya Horiguchi, Tony Luck, Miaohe Lin, Muchun Song, linux-mm,
	linux-kernel



On 2023/4/13 6:21, Mike Kravetz wrote:
> On 04/12/23 14:57, Andrew Morton wrote:
>> On Wed, 12 Apr 2023 11:13:50 -0700 Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>
>>> On 04/11/23 17:27, Liu Shixin wrote:
>>>> Patch a873dfe1032a ("mm, hwpoison: try to recover from copy-on write faults")
>>>> introduced a new copy_user_highpage_mc() function, and fix the kernel crash
>>>> when the kernel is copying a normal page as the result of a copy-on-write
>>>> fault and runs into an uncorrectable error. But it doesn't work for HugeTLB.
>>> Andrew asked about user-visible effects.  Perhaps, a better way of
>>> stating this in the commit message might be:
>>>
>>> Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
>>> faults") introduced the routine copy_user_highpage_mc() to gracefully
>>> handle copying of user pages with uncorrectable errors.  Previously,
>>> such copies would result in a kernel crash.  hugetlb has separate code
>>> paths for copy-on-write and does not benefit from the changes made in
>>> commit a873dfe1032a.
> I was just going to suggest adding the line,
>
> Hence, copy-on-write of hugetlb user pages with uncorrectable errors            
> will result in a kernel crash as was the case with 'normal' pages before        
> commit a873dfe1032a.
>
> However, I'm guessing it might be more clear if we start with the
> runtime effects.  Something like:
>
> copy-on-write of hugetlb user pages with uncorrectable errors will result
> in a kernel crash.  This is because the copy is performed in kernel mode
> and in general we can not handle accessing memory with such errors while
> in kernel mode.  Commit a873dfe1032a ("mm, hwpoison: try to recover from
> copy-on write faults") introduced the routine copy_user_highpage_mc() to
> gracefully handle copying of user pages with uncorrectable errors.  However,
> the separate hugetlb copy-on-write code paths were not modified as part
> of commit a873dfe1032a.
Thanks for your advice, I will add these explaination.
>
>>> Modify hugetlb copy-on-write code paths to use copy_mc_user_highpage()
>>> so that they can also gracefully handle uncorrectable errors in user
>>> pages.  This involves changing the hugetlb specific routine
>>> ?copy_user_folio()? from type void to int so that it can return an error.
>>> Modify the hugetlb userfaultfd code in the same way so that it can return
>>> -EHWPOISON if it encounters an uncorrectable error.
>> Thanks, but...  what are the runtime effects?  What does hugetlb
>> presently do when encountering these uncorrectable error?



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-12 22:56       ` Andrew Morton
  2023-04-12 23:37         ` Mike Kravetz
@ 2023-04-13  1:55         ` Liu Shixin
  1 sibling, 0 replies; 12+ messages in thread
From: Liu Shixin @ 2023-04-13  1:55 UTC (permalink / raw)
  To: Andrew Morton, Mike Kravetz
  Cc: Naoya Horiguchi, Tony Luck, Miaohe Lin, Muchun Song, linux-mm,
	linux-kernel



On 2023/4/13 6:56, Andrew Morton wrote:
> On Wed, 12 Apr 2023 15:21:38 -0700 Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
>>>> Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
>>>> faults") introduced the routine copy_user_highpage_mc() to gracefully
>>>> handle copying of user pages with uncorrectable errors.  Previously,
>>>> such copies would result in a kernel crash.  hugetlb has separate code
>>>> paths for copy-on-write and does not benefit from the changes made in
>>>> commit a873dfe1032a.
>> I was just going to suggest adding the line,
>>
>> Hence, copy-on-write of hugetlb user pages with uncorrectable errors            
>> will result in a kernel crash as was the case with 'normal' pages before        
>> commit a873dfe1032a.
>>
>> However, I'm guessing it might be more clear if we start with the
>> runtime effects.  Something like:
>>
>> copy-on-write of hugetlb user pages with uncorrectable errors will result
>> in a kernel crash.  This is because the copy is performed in kernel mode
>> and in general we can not handle accessing memory with such errors while
>> in kernel mode.  Commit a873dfe1032a ("mm, hwpoison: try to recover from
>> copy-on write faults") introduced the routine copy_user_highpage_mc() to
>> gracefully handle copying of user pages with uncorrectable errors.  However,
>> the separate hugetlb copy-on-write code paths were not modified as part
>> of commit a873dfe1032a.
> Sounds good.  So I assume cc:stable is desirable.
>
> I can't actually get the patch to apply to anything.  Can we please
> have a redo against current -linus?
OK, I will apply this patch to mainline again.
>
> .
>



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults
  2023-04-12 18:13 ` Mike Kravetz
  2023-04-12 21:57   ` Andrew Morton
@ 2023-04-13 12:57   ` Jiaqi Yan
  1 sibling, 0 replies; 12+ messages in thread
From: Jiaqi Yan @ 2023-04-13 12:57 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Andrew Morton, Liu Shixin, Miaohe Lin, Muchun Song,
	Naoya Horiguchi, Tony Luck, linux-kernel, linux-mm

[-- Attachment #1: Type: text/plain, Size: 2838 bytes --]

On Wed, Apr 12, 2023 at 11:14 AM Mike Kravetz <mike.kravetz@oracle.com>
wrote:

> On 04/11/23 17:27, Liu Shixin wrote:
> > Patch a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
> faults")
> > introduced a new copy_user_highpage_mc() function, and fix the kernel
> crash
> > when the kernel is copying a normal page as the result of a copy-on-write
> > fault and runs into an uncorrectable error. But it doesn't work for
> HugeTLB.
>
> Andrew asked about user-visible effects.  Perhaps, a better way of
> stating this in the commit message might be:
>
> Commit a873dfe1032a ("mm, hwpoison: try to recover from copy-on write
> faults") introduced the routine copy_user_highpage_mc() to gracefully
> handle copying of user pages with uncorrectable errors.  Previously,
> such copies would result in a kernel crash.  hugetlb has separate code
> paths for copy-on-write and does not benefit from the changes made in
> commit a873dfe1032a.
>
> Modify hugetlb copy-on-write code paths to use copy_mc_user_highpage()
> so that they can also gracefully handle uncorrectable errors in user
> pages.  This involves changing the hugetlb specific routine
> ?copy_user_folio()? from type void to int so that it can return an error.
> Modify the hugetlb userfaultfd code in the same way so that it can return
> -EHWPOISON if it encounters an uncorrectable error.
>
> NOTE - There is still some churn in the series that introduces
> copy_user_folio() and the name may change.
>
> > This is to support HugeTLB by using copy_mc_user_highpage() in
> copy_subpage()
> > and copy_user_gigantic_page() too.
> >
> > Moreover, this is also used by userfaultfd, it will return -EHWPOISON if
> > running into an uncorrectable error.
> >
> > Signed-off-by: Liu Shixin <liushixin2@huawei.com>
> > ---
> >  include/linux/mm.h |  6 ++---
> >  mm/hugetlb.c       | 19 +++++++++++----
> >  mm/memory.c        | 59 +++++++++++++++++++++++++++++-----------------
> >  3 files changed, 56 insertions(+), 28 deletions(-)
>
> Code changes look good to me.
>
> Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
>
> Related question perhaps for Tony not directly impacting this patch.
> This patch touches the hugetlb clear page paths withour consequence.
>
> Just wondering if we can/should create something like
> clear_mc_user_highpage
> to address clearing pages as well?  Apologies if this was previously
> discussed.


Tony may have better answers but allow me to chime in for this question:
Memory related #MC only happens when kernel reads encounter hw
uncorrectbale memory errors. Writes(clearing memory page) are “safe” to
kernel, at least generating no #MC. So I don’t think clear_user_highpage
needs a #MC handled version (or even possible at all).


> --
> Mike Kravetz
>
>

[-- Attachment #2: Type: text/html, Size: 3735 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2023-04-13 12:57 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-11  9:27 [PATCH -next] mm: hwpoison: support recovery from HugePage copy-on-write faults Liu Shixin
2023-04-12  0:26 ` Andrew Morton
2023-04-12 18:13 ` Mike Kravetz
2023-04-12 21:57   ` Andrew Morton
2023-04-12 22:21     ` Mike Kravetz
2023-04-12 22:56       ` Andrew Morton
2023-04-12 23:37         ` Mike Kravetz
2023-04-13  0:47           ` Luck, Tony
2023-04-13  1:55         ` Liu Shixin
2023-04-13  1:51       ` Liu Shixin
2023-04-13  1:49     ` Liu Shixin
2023-04-13 12:57   ` Jiaqi Yan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).