LKML Archive on lore.kernel.org
 help / color / Atom feed
From: Naoya Horiguchi <nao.horiguchi@gmail.com>
To: Aili Yao <yaoaili@kingsoft.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	"Tony Luck" <tony.luck@intel.com>,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"Oscar Salvador" <osalvador@suse.de>,
	"David Hildenbrand" <david@redhat.com>,
	"Borislav Petkov" <bp@alien8.de>,
	"Andy Lutomirski" <luto@kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"HORIGUCHI NAOYA(堀口 直也)" <naoya.horiguchi@nec.com>
Subject: [PATCH v2 3/3] mm,hwpoison: add kill_accessing_process() to find error virtual address
Date: Mon, 19 Apr 2021 11:36:58 +0900
Message-ID: <20210419023658.GA1962954@u2004> (raw)
In-Reply-To: <20210419010955.GA17180@hori.linux.bs1.fc.nec.co.jp>

> > 2. In the function hwpoison_pte_range():
> > if (pfn <= hwp->pfn && hwp->pfn < pfn + PMD_SIZE) this check seem we should use PMD_SIZE/PAGE_SIZE or some macro like this?
> 
> Thanks, that's right.  HPAGE_PMD_NR seems to fit here.
> We also need "#ifdef CONFIG_TRANSPARENT_HUGEPAGE" to use it.

I found that the #ifdef is not necessary because the whole
"if (ptl)" is compiled out.  So I don't add #ifdef.

Here's the v2 of 3/3.

Aili, could you test with it?

Thanks,
Naoya Horiguchi

-----
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Tue, 13 Apr 2021 07:26:25 +0900
Subject: [PATCH v2 3/3] mm,hwpoison: add kill_accessing_process() to find error
 virtual address

The previous patch solves the infinite MCE loop issue when multiple
MCE events races.  The remaining issue is to make sure that all threads
processing Action Required MCEs send to the current processes the
SIGBUS with the proper virtual address and the error size.

This patch suggests to do page table walk to find the error virtual
address.  If we find multiple virtual addresses in walking, we now can't
determine which one is correct, so we fall back to sending SIGBUS in
kill_me_maybe() without error info as we do now.  This corner case needs
to be solved in the future.

Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
---
change log v1 -> v2:
- initialize local variables in check_hwpoisoned_entry() and
  hwpoison_pte_range()
- fix and improve logic to calculate error address offset.
---
 arch/x86/kernel/cpu/mce/core.c |  13 ++-
 include/linux/swapops.h        |   5 ++
 mm/memory-failure.c            | 147 ++++++++++++++++++++++++++++++++-
 3 files changed, 161 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 7962355436da..3ce23445a48c 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1257,19 +1257,28 @@ static void kill_me_maybe(struct callback_head *cb)
 {
 	struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
 	int flags = MF_ACTION_REQUIRED;
+	int ret;
 
 	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
 
 	if (!p->mce_ripv)
 		flags |= MF_MUST_KILL;
 
-	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
-	    !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+	ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
+	if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
 		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
 		sync_core();
 		return;
 	}
 
+	/*
+	 * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+	 * to the current process with the proper error info, so no need to
+	 * send it here again.
+	 */
+	if (ret == -EHWPOISON)
+		return;
+
 	if (p->mce_vaddr != (void __user *)-1l) {
 		force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
 	} else {
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index d9b7c9132c2f..98ea67fcf360 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -323,6 +323,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
 	return swp_type(entry) == SWP_HWPOISON;
 }
 
+static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
+{
+	return swp_offset(entry);
+}
+
 static inline void num_poisoned_pages_inc(void)
 {
 	atomic_long_inc(&num_poisoned_pages);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 368ef77e01f9..99dd4caf43cb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -56,6 +56,7 @@
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include <linux/page-isolation.h>
+#include <linux/pagewalk.h>
 #include "internal.h"
 #include "ras/ras_event.h"
 
@@ -554,6 +555,142 @@ static void collect_procs(struct page *page, struct list_head *tokill,
 		collect_procs_file(page, tokill, force_early);
 }
 
+struct hwp_walk {
+	struct to_kill tk;
+	unsigned long pfn;
+	int flags;
+};
+
+static int set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
+{
+	/* Abort pagewalk when finding multiple mappings to the error page. */
+	if (tk->addr)
+		return 1;
+	tk->addr = addr;
+	tk->size_shift = shift;
+	return 0;
+}
+
+static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
+				unsigned long poisoned_pfn, struct to_kill *tk)
+{
+	unsigned long pfn = 0;
+
+	if (pte_present(pte)) {
+		pfn = pte_pfn(pte);
+	} else {
+		swp_entry_t swp = pte_to_swp_entry(pte);
+
+		if (is_hwpoison_entry(swp))
+			pfn = hwpoison_entry_to_pfn(swp);
+	}
+
+	if (!pfn || pfn != poisoned_pfn)
+		return 0;
+
+	return set_to_kill(tk, addr, shift);
+}
+
+static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
+			      unsigned long end, struct mm_walk *walk)
+{
+	struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+	int ret = 0;
+	pte_t *ptep;
+	spinlock_t *ptl;
+
+	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
+	if (ptl) {
+		pmd_t pmd = *pmdp;
+
+		if (pmd_present(pmd)) {
+			unsigned long pfn = pmd_pfn(pmd);
+
+			if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
+				unsigned long hwpoison_vaddr = addr +
+					((hwp->pfn - pfn) << PAGE_SHIFT);
+
+				ret = set_to_kill(&hwp->tk, hwpoison_vaddr,
+						  PAGE_SHIFT);
+			}
+		}
+		spin_unlock(ptl);
+		goto out;
+	}
+
+	if (pmd_trans_unstable(pmdp))
+		goto out;
+
+	ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl);
+	for (; addr != end; ptep++, addr += PAGE_SIZE) {
+		ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
+					     hwp->pfn, &hwp->tk);
+		if (ret == 1)
+			break;
+	}
+	pte_unmap_unlock(ptep - 1, ptl);
+out:
+	cond_resched();
+	return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
+			    unsigned long addr, unsigned long end,
+			    struct mm_walk *walk)
+{
+	struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+	pte_t pte = huge_ptep_get(ptep);
+	struct hstate *h = hstate_vma(walk->vma);
+
+	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+				      hwp->pfn, &hwp->tk);
+}
+#else
+#define hwpoison_hugetlb_range	NULL
+#endif
+
+static struct mm_walk_ops hwp_walk_ops = {
+	.pmd_entry = hwpoison_pte_range,
+	.hugetlb_entry = hwpoison_hugetlb_range,
+};
+
+/*
+ * Sends SIGBUS to the current process with the error info.
+ *
+ * This function is intended to handle "Action Required" MCEs on already
+ * hardware poisoned pages. They could happen, for example, when
+ * memory_failure() failed to unmap the error page at the first call, or
+ * when multiple Action Optional MCE events races on different CPUs with
+ * Local MCE enabled.
+ *
+ * MCE handler currently has no easy access to the error virtual address,
+ * so this function walks page table to find it. One challenge on this is
+ * to reliably get the proper virual address of the error to report to
+ * applications via SIGBUS. A process could map a page multiple times to
+ * different virtual addresses, then we now have no way to tell which virtual
+ * address was accessed when the Action Required MCE was generated.
+ * So in such a corner case, we now give up and fall back to sending SIGBUS
+ * with no error info.
+ */
+static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
+				  int flags)
+{
+	int ret;
+	struct hwp_walk priv = {
+		.pfn = pfn,
+	};
+	priv.tk.tsk = p;
+
+	mmap_read_lock(p->mm);
+	ret = walk_page_range(p->mm, 0, TASK_SIZE_MAX, &hwp_walk_ops,
+			      (void *)&priv);
+	if (!ret && priv.tk.addr)
+		kill_proc(&priv.tk, pfn, flags);
+	mmap_read_unlock(p->mm);
+	return ret ? -EFAULT : -EHWPOISON;
+}
+
 static const char *action_name[] = {
 	[MF_IGNORED] = "Ignored",
 	[MF_FAILED] = "Failed",
@@ -1228,7 +1365,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 	if (TestSetPageHWPoison(head)) {
 		pr_err("Memory failure: %#lx: already hardware poisoned\n",
 		       pfn);
-		return -EHWPOISON;
+		res = -EHWPOISON;
+		if (flags & MF_ACTION_REQUIRED)
+			res = kill_accessing_process(current, page_to_pfn(head), flags);
+		return res;
 	}
 
 	num_poisoned_pages_inc();
@@ -1437,8 +1577,11 @@ int memory_failure(unsigned long pfn, int flags)
 	if (TestSetPageHWPoison(p)) {
 		pr_err("Memory failure: %#lx: already hardware poisoned\n",
 			pfn);
+		res = -EHWPOISON;
+		if (flags & MF_ACTION_REQUIRED)
+			res = kill_accessing_process(current, pfn, flags);
 		mutex_unlock(&mf_mutex);
-		return -EHWPOISON;
+		return res;
 	}
 
 	orig_head = hpage = compound_head(p);
-- 
2.25.1


  reply index

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-12 22:43 [PATCH v1 0/3] mm,hwpoison: fix sending SIGBUS for Action Required MCE Naoya Horiguchi
2021-04-12 22:43 ` [PATCH v1 1/3] mm/memory-failure: Use a mutex to avoid memory_failure() races Naoya Horiguchi
2021-04-19 17:05   ` Borislav Petkov
2021-04-20  7:46     ` HORIGUCHI NAOYA(堀口 直也)
2021-04-20 10:16       ` Borislav Petkov
2021-04-21  0:57         ` HORIGUCHI NAOYA(堀口 直也)
2021-04-12 22:43 ` [PATCH v1 2/3] mm,hwpoison: return -EHWPOISON when page already Naoya Horiguchi
2021-04-12 22:43 ` [PATCH v1 3/3] mm,hwpoison: add kill_accessing_process() to find error virtual address Naoya Horiguchi
2021-04-17  5:47 ` [PATCH v1 0/3] mm,hwpoison: fix sending SIGBUS for Action Required MCE Aili Yao
2021-04-19  1:09   ` HORIGUCHI NAOYA(堀口 直也)
2021-04-19  2:36     ` Naoya Horiguchi [this message]
2021-04-19  3:43       ` [PATCH v2 3/3] mm,hwpoison: add kill_accessing_process() to find error virtual address Aili Yao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210419023658.GA1962954@u2004 \
    --to=nao.horiguchi@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=bp@alien8.de \
    --cc=david@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=naoya.horiguchi@nec.com \
    --cc=osalvador@suse.de \
    --cc=tony.luck@intel.com \
    --cc=yaoaili@kingsoft.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git
	git clone --mirror https://lore.kernel.org/lkml/9 lkml/git/9.git
	git clone --mirror https://lore.kernel.org/lkml/10 lkml/git/10.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git