From: Dan Williams <dan.j.williams@intel.com>
To: linux-kernel@vger.kernel.org
Cc: Jane Chu <jane.chu@oracle.com>, Christoph Hellwig <hch@lst.de>,
	nvdimm@lists.linux.dev
Subject: [PATCH v10 7/7] pmem: implement pmem_recovery_write()
Date: Fri, 13 May 2022 15:13:20 -0700
Message-ID: <165247997655.53156.8381418704988035976.stgit@dwillia2-desk3.amr.corp.intel.com>
In-Reply-To: <20220422224508.440670-8-jane.chu@oracle.com>

From: Jane Chu <jane.chu@oracle.com>

A recovery write starts out as a normal pwrite: when the filesystem is
told about a potential media error in the range, it turns the normal
pwrite into a dax_recovery_write.

The recovery write consists of clearing the media poison, clearing the
page's HWPoison bit, re-enabling page-wide read-write permission,
flushing the caches, and finally writing. A competing pread thread is
held off during the recovery process since the data read back might not
be valid; this is achieved by clearing the badblock records only after
the recovery write completes. Competing recovery write threads are
already serialized by the writer lock held by dax_iomap_rw().
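
As a rough illustration of the caller side (loosely modeled on the
dax_iomap_rw() path in fs/dax.c; dax_write_one_range() is a
hypothetical helper, and the real iteration and error handling are
simplified away):

	/*
	 * Hedged sketch: how a pwrite over poison becomes a recovery
	 * write. Not the actual fs/dax.c code.
	 */
	static ssize_t dax_write_one_range(struct dax_device *dax_dev,
			pgoff_t pgoff, size_t bytes, struct iov_iter *iter)
	{
		void *kaddr;
		long map_len;

		/* Normal path: map the range, then copy with cache flush */
		map_len = dax_direct_access(dax_dev, pgoff,
				PHYS_PFN(PAGE_ALIGN(bytes)), DAX_ACCESS,
				&kaddr, NULL);
		if (map_len > 0)
			return dax_copy_from_iter(dax_dev, pgoff, kaddr,
					bytes, iter);
		if (map_len != -EIO)
			return map_len;

		/*
		 * Poison in the range: remap in recovery mode and let
		 * the driver clear the poison before writing.
		 */
		map_len = dax_direct_access(dax_dev, pgoff,
				PHYS_PFN(PAGE_ALIGN(bytes)),
				DAX_RECOVERY_WRITE, &kaddr, NULL);
		if (map_len < 0)
			return map_len;
		return dax_recovery_write(dax_dev, pgoff, kaddr, bytes,
				iter);
	}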

Signed-off-by: Jane Chu <jane.chu@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
Changes since v9:
- Fixup compile warnings in debug messages

 drivers/nvdimm/pmem.c |   87 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 79 insertions(+), 8 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 0961625dfa05..6b24ecada695 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -266,21 +266,43 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		pfn_t *pfn)
 {
 	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
-
-	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
-					PFN_PHYS(nr_pages))))
-		return -EIO;
+	sector_t sector = PFN_PHYS(pgoff) >> SECTOR_SHIFT;
+	unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT;
+	struct badblocks *bb = &pmem->bb;
+	sector_t first_bad;
+	int num_bad;
 
 	if (kaddr)
 		*kaddr = pmem->virt_addr + offset;
 	if (pfn)
 		*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
+	if (bb->count &&
+	    badblocks_check(bb, sector, num, &first_bad, &num_bad)) {
+		long actual_nr;
+
+		if (mode != DAX_RECOVERY_WRITE)
+			return -EIO;
+
+		/*
+		 * The recovery stride is set to the kernel page size because
+		 * the underlying driver and firmware clear-poison functions
+		 * don't appear to handle large chunks (such as 2MiB) reliably.
+		 */
+		actual_nr = PHYS_PFN(
+			PAGE_ALIGN((first_bad - sector) << SECTOR_SHIFT));
+		dev_dbg(pmem->bb.dev, "start sector(%llu), nr_pages(%ld), first_bad(%llu), actual_nr(%ld)\n",
+				sector, nr_pages, first_bad, actual_nr);
+		if (actual_nr)
+			return actual_nr;
+		return 1;
+	}
+
 	/*
-	 * If badblocks are present, limit known good range to the
-	 * requested range.
+	 * If badblocks are present but not in the range, limit known good range
+	 * to the requested range.
 	 */
-	if (unlikely(pmem->bb.count))
+	if (bb->count)
 		return nr_pages;
 	return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
 }
@@ -310,10 +332,59 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
 	return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn);
 }
 
+/*
+ * A recovery write starts out as a normal pwrite: when the filesystem
+ * is told about a potential media error in the range, it turns the
+ * normal pwrite into a dax_recovery_write.
+ *
+ * The recovery write consists of clearing the media poison, clearing
+ * the page's HWPoison bit, re-enabling page-wide read-write permission,
+ * flushing the caches, and finally writing. A competing pread thread is
+ * held off during the recovery process since the data read back might
+ * not be valid; this is achieved by clearing the badblock records only
+ * after the recovery write completes. Competing recovery write threads
+ * are already serialized by the writer lock held by dax_iomap_rw().
+ */
 static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i)
 {
-	return 0;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	size_t olen, len, off;
+	phys_addr_t pmem_off;
+	struct device *dev = pmem->bb.dev;
+	long cleared;
+
+	off = offset_in_page(addr);
+	len = PFN_PHYS(PFN_UP(off + bytes));
+	if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
+		return _copy_from_iter_flushcache(addr, bytes, i);
+
+	/*
+	 * A non-page-aligned range cannot be recovered. This should not
+	 * happen unless something else went wrong.
+	 */
+	if (off || !PAGE_ALIGNED(bytes)) {
+		dev_dbg(dev, "Found poison, but addr(%p) or bytes(%#zx) not page aligned\n",
+			addr, bytes);
+		return 0;
+	}
+
+	pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+	cleared = __pmem_clear_poison(pmem, pmem_off, len);
+	if (cleared > 0 && cleared < len) {
+		dev_dbg(dev, "poison cleared only %ld out of %zu bytes\n",
+			cleared, len);
+		return 0;
+	}
+	if (cleared < 0) {
+		dev_dbg(dev, "poison clear failed: %ld\n", cleared);
+		return 0;
+	}
+
+	olen = _copy_from_iter_flushcache(addr, bytes, i);
+	pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT);
+
+	return olen;
 }
 
 static const struct dax_operations pmem_dax_ops = {
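
For reference, a worked example of the actual_nr computation in
__pmem_direct_access() above, assuming 4KiB kernel pages and 512-byte
sectors (the numbers are illustrative, not from the patch):

	/*
	 * sector = 0, first_bad = 3:
	 *   (first_bad - sector) << SECTOR_SHIFT = 3 * 512 = 1536
	 *   PAGE_ALIGN(1536) = 4096, PHYS_PFN(4096) = 1
	 * One page is reported, and because PAGE_ALIGN() rounds up, the
	 * page containing the bad sector is included; in
	 * DAX_RECOVERY_WRITE mode the caller is expected to clear the
	 * poison before writing.
	 *
	 * sector = 0, first_bad = 0:
	 *   actual_nr computes to 0, so the function returns 1 and the
	 *   recovery write proceeds one page at a time.
	 */

Similarly, in pmem_recovery_write() the poison check is rounded up to
whole pages: with a page-aligned addr (off = 0) and bytes = 4096,
len = PFN_PHYS(PFN_UP(4096)) = 4096, i.e. exactly one page is checked
and, if poisoned, cleared before the flushing write.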

