linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "HORIGUCHI NAOYA(堀口 直也)" <naoya.horiguchi@nec.com>
To: Aili Yao <yaoaili@kingsoft.com>
Cc: "Luck, Tony" <tony.luck@intel.com>,
	Oscar Salvador <osalvador@suse.de>,
	"david@redhat.com" <david@redhat.com>,
	"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
	"bp@alien8.de" <bp@alien8.de>,
	"tglx@linutronix.de" <tglx@linutronix.de>,
	"mingo@redhat.com" <mingo@redhat.com>,
	"hpa@zytor.com" <hpa@zytor.com>,
	"x86@kernel.org" <x86@kernel.org>,
	"linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	"yangfeng1@kingsoft.com" <yangfeng1@kingsoft.com>
Subject: Re: [PATCH] mm,hwpoison: return -EBUSY when page already poisoned
Date: Thu, 11 Mar 2021 08:55:30 +0000	[thread overview]
Message-ID: <20210311085529.GA22268@hori.linux.bs1.fc.nec.co.jp> (raw)
In-Reply-To: <20210310141042.4db9ea29@alex-virtual-machine>

On Wed, Mar 10, 2021 at 02:10:42PM +0800, Aili Yao wrote:
> On Fri, 5 Mar 2021 15:55:25 +0000
> "Luck, Tony" <tony.luck@intel.com> wrote:
> 
> > > From the walk, it seems we have got the virtual address, can we just send a SIGBUS with it?  
> > 
> > If the walk wins the race and the pte for the poisoned page is still valid, then yes.
> > 
> > But we could have:
> > 
> > CPU1                            CPU2
> > memory_failure sets poison
> > bit for struct page
> > 
> > 
> > rmap finds page in task
> > on CPU2 and sets PTE
> > to not-valid-poison
> > 
> >                                 memory_failure returns
> >                                 early because struct page
> >                                 already marked as poison
> > 
> >                                 walk page tables looking
> >                                 for mapping - don't find it
> > 
> > -Tony
> 
> While I don't think there is a race condition, and if you really think the pfn with SIGBUS is not
> proper, I think following patch maybe one way.
> I copy your abandon code, and make a little modification, and just now it pass
> my simple test.
> 
> And also this is a RFC version, only valid if you think the pfn with SIGBUS is not right.
> 
> Thanks!
> 
> From a522ab8856e3a332a2318d57bb19f3c59594d462 Mon Sep 17 00:00:00 2001
> From: Aili Yao <yaoaili@kingsoft.com>
> Date: Wed, 10 Mar 2021 13:59:18 +0800
> Subject: [PATCH] x86/mce: fix invalid SIGBUS address
> 
> walk the current process pte and compare with the pfn;
> 1. only test for normal page and 2M hugetlb page;
> 2. 1G hugetlb and transparentHuge is not support currently;
> 3. May other fails is not recognized, This is a RFC version.
> 
> ---
>  arch/x86/kernel/cpu/mce/core.c | 83 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 80 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index db4afc5..65d7ef7 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -28,8 +28,12 @@
>  #include <linux/sysfs.h>
>  #include <linux/types.h>
>  #include <linux/slab.h>
> +#include <linux/hugetlb.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
>  #include <linux/init.h>
>  #include <linux/kmod.h>
> +#include <linux/pagewalk.h>
>  #include <linux/poll.h>
>  #include <linux/nmi.h>
>  #include <linux/cpu.h>

Needing this many new includes suggests that the code below would be better
placed in mm/memory-failure.c than in this file.

> @@ -1235,6 +1239,81 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
>  	/* mce_clear_state will clear *final, save locally for use later */
>  	*m = *final;
>  }
> +static int mc_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk)
> +{
> +	u64 *buff = (u64 *)walk->private;
> +	u64 pfn = buff[0];
> +
> +	if (!pte_present(*pte) && is_hwpoison_entry(pte_to_swp_entry(*pte)))
> +		goto find;
> +	else if (pte_pfn(*pte) == pfn)
> +		goto find;
> +
> +	return 0;
> +find:
> +	buff[0] = addr;
> +	buff[1] = PAGE_SHIFT;
> +	return true;

Returning true means you stop walking when you find the first entry pointing
to a given pfn. But there could be multiple such entries, so if the MCE SRAR is
triggered by a memory access to a higher address among the hwpoisoned entries,
the returned virtual address might be wrong.

> +}
> +
> +extern bool is_hugetlb_entry_hwpoisoned(pte_t pte);
> +
> +static int mc_hugetlb_range(pte_t *ptep, unsigned long hmask,
> +				 unsigned long addr, unsigned long end,
> +				 struct mm_walk *walk)
> +{
> +	u64 *buff = (u64 *)walk->private;
> +	u64 pfn = buff[0];
> +	int shift = PMD_SHIFT;
> +	pte_t pte =  huge_ptep_get(ptep);
> +
> +	if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
> +		goto find;
> +
> +	if (pte_pfn(*ptep) == pfn)
> +		goto find;
> +
> +	return 0;
> +find:
> +	buff[0] = addr;
> +	buff[1] = shift;
> +	return true;
> +}
> +
> +static struct mm_walk_ops walk = {
> +	.pte_entry = mc_pte_entry,
> +	.hugetlb_entry	= mc_hugetlb_range
> +};
> +
> +void mc_memory_failure_error(struct task_struct *p, unsigned long pfn)
> +{
> +	u64 buff[2] = {pfn, 0};
> +	struct page *page;
> +	int ret = -1;
> +
> +	page = pfn_to_page(pfn);
> +	if (!page)
> +		goto force_sigbus;
> +
> +	if (is_zone_device_page(page))
> +		goto force_sigbus;
> +
> +	mmap_read_lock(p->mm);
> +	ret = walk_page_range(p->mm, 0, TASK_SIZE_MAX, &walk, (void *)buff);
> +	mmap_read_unlock(p->mm);
> +
> +	if (ret && buff[0]) {
> +		pr_err("Memory error may not recovered: %#llx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
> +		buff[0], p->comm, p->pid);
> +		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)buff[0], buff[1]);
> +	} else {
> +force_sigbus:
> +		pr_err("Memory error may not recovered, pfn: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
> +		pfn, p->comm, p->pid);
> +		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)pfn, PAGE_SHIFT);
> +	}
> +
> +}
>  
>  static void kill_me_now(struct callback_head *ch)
>  {
> @@ -1259,9 +1338,7 @@ static void kill_me_maybe(struct callback_head *cb)
>  	}
>  
>  	if (p->mce_vaddr != (void __user *)-1l) {
> -		pr_err("Memory error may not recovered: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
> -			p->mce_addr >> PAGE_SHIFT, p->comm, p->pid);
> -		force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
> +		mc_memory_failure_error(current, p->mce_addr >> PAGE_SHIFT);

I guess that p->mce_vaddr stores the virtual address of the error here.
If so, sending SIGBUS with that address, as we do now, looks sufficient, so why
do you walk the page table to find the error virtual address?

Thanks,
Naoya Horiguchi

  reply	other threads:[~2021-03-11  8:55 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20210224151619.67c29731@alex-virtual-machine>
     [not found] ` <20210224103105.GA16368@linux>
     [not found]   ` <20210225114329.4e1a41c6@alex-virtual-machine>
     [not found]     ` <20210225112818.GA10141@hori.linux.bs1.fc.nec.co.jp>
     [not found]       ` <20210225113930.GA7227@localhost.localdomain>
     [not found]         ` <20210225123806.GA15006@hori.linux.bs1.fc.nec.co.jp>
     [not found]           ` <20210225181542.GA178925@agluck-desk2.amr.corp.intel.com>
     [not found]             ` <20210226021907.GA27861@hori.linux.bs1.fc.nec.co.jp>
2021-02-26  2:59               ` [PATCH] mm,hwpoison: return -EBUSY when page already poisoned Aili Yao
2021-03-03  3:39                 ` Luck, Tony
2021-03-03  3:57                   ` Aili Yao
2021-03-03  8:39                     ` Aili Yao
2021-03-03 15:41                       ` Luck, Tony
2021-03-04  2:16                         ` Aili Yao
2021-03-04  4:19                           ` Aili Yao
2021-03-04  6:45                             ` Aili Yao
2021-03-04 23:57                               ` Luck, Tony
2021-03-05  1:30                                 ` Aili Yao
2021-03-05  1:36                                   ` Aili Yao
2021-03-05 22:11                                     ` Luck, Tony
2021-03-08  6:45                                       ` HORIGUCHI NAOYA(堀口 直也)
2021-03-08 18:54                                         ` Luck, Tony
2021-03-08 22:38                                           ` HORIGUCHI NAOYA(堀口 直也)
2021-03-08 22:55                                             ` [PATCH] mm/memory-failure: Use a mutex to avoid memory_failure() races Luck, Tony
2021-03-08 23:42                                               ` HORIGUCHI NAOYA(堀口 直也)
2021-03-09  2:04                                               ` Aili Yao
2021-03-09  6:04                                                 ` HORIGUCHI NAOYA(堀口 直也)
2021-03-09  6:35                                                   ` [PATCH v2] mm,hwpoison: return -EBUSY when page already poisoned Aili Yao
2021-03-09  8:28                                                     ` HORIGUCHI NAOYA(堀口 直也)
2021-03-09 20:01                                                       ` Luck, Tony
2021-03-10  8:05                                                         ` HORIGUCHI NAOYA(堀口 直也)
2021-03-13  1:55                                                         ` Jue Wang
2021-03-10  8:01                                                       ` Aili Yao
2021-03-09  6:38                                                   ` [PATCH] mm/memory-failure: Use a mutex to avoid memory_failure() races Aili Yao
2021-03-05 15:55                                   ` [PATCH] mm,hwpoison: return -EBUSY when page already poisoned Luck, Tony
2021-03-10  6:10                                     ` Aili Yao
2021-03-11  8:55                                       ` HORIGUCHI NAOYA(堀口 直也) [this message]
2021-03-11 11:23                                         ` Aili Yao
2021-03-11 17:05                                         ` Luck, Tony
2021-03-12  5:55                                           ` Aili Yao
2021-03-12 16:29                                             ` Luck, Tony
2021-03-12 23:48                                               ` Luck, Tony
2021-03-16  6:42                                                 ` HORIGUCHI NAOYA(堀口 直也)
2021-03-16  7:54                                                   ` Aili Yao
2021-03-17  0:29                                                 ` Luck, Tony
2021-03-17  9:07                                                   ` Aili Yao
2021-03-17  7:48                                         ` Aili Yao
2021-03-17  8:23                                           ` Aili Yao
     [not found]         ` <20210226105250.3a15e35c@alex-virtual-machine>
2021-02-26 17:58           ` Luck, Tony
2021-03-02  4:32             ` Aili Yao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210311085529.GA22268@hori.linux.bs1.fc.nec.co.jp \
    --to=naoya.horiguchi@nec.com \
    --cc=akpm@linux-foundation.org \
    --cc=bp@alien8.de \
    --cc=david@redhat.com \
    --cc=hpa@zytor.com \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mingo@redhat.com \
    --cc=osalvador@suse.de \
    --cc=tglx@linutronix.de \
    --cc=tony.luck@intel.com \
    --cc=x86@kernel.org \
    --cc=yangfeng1@kingsoft.com \
    --cc=yaoaili@kingsoft.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).