From: Jane Chu <jane.chu@oracle.com> To: david@fromorbit.com, djwong@kernel.org, dan.j.williams@intel.com, hch@infradead.org, vishal.l.verma@intel.com, dave.jiang@intel.com, agk@redhat.com, snitzer@redhat.com, dm-devel@redhat.com, ira.weiny@intel.com, willy@infradead.org, vgoyal@redhat.com, linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev, linux-kernel@vger.kernel.org, linux-xfs@vger.kernel.org Subject: [PATCH v3 5/7] pmem: add pmem_recovery_write() dax op Date: Tue, 11 Jan 2022 11:59:28 -0700 [thread overview] Message-ID: <20220111185930.2601421-6-jane.chu@oracle.com> (raw) In-Reply-To: <20220111185930.2601421-1-jane.chu@oracle.com> pmem_recovery_write() consists of clearing poison via DSM, clearing page HWPoison bit, re-state _PAGE_PRESENT bit, cacheflush, write, and finally clearing bad-block record. A competing pread thread is held off during recovery write by the presence of bad-block record. A competing recovery_write thread is serialized by a lock. Signed-off-by: Jane Chu <jane.chu@oracle.com> --- drivers/nvdimm/pmem.c | 84 +++++++++++++++++++++++++++++++++++++++---- drivers/nvdimm/pmem.h | 1 + 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index a68e7d3ed27e..dd2db4905c85 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -69,6 +69,14 @@ static void hwpoison_clear(struct pmem_device *pmem, } } +static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector, + long cleared_blks) +{ + badblocks_clear(&pmem->bb, sector, cleared_blks); + if (pmem->bb_state) + sysfs_notify_dirent(pmem->bb_state); +} + static blk_status_t pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, unsigned int len) { @@ -88,9 +96,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, dev_dbg(dev, "%#llx clear %ld sector%s\n", (unsigned long long) sector, cleared, cleared > 1 ? 
"s" : ""); - badblocks_clear(&pmem->bb, sector, cleared); - if (pmem->bb_state) - sysfs_notify_dirent(pmem->bb_state); + pmem_clear_badblocks(pmem, sector, cleared); } arch_invalidate_pmem(pmem->virt_addr + offset, len); @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { + bool bad_pmem; + bool do_recovery = false; resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; - if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, - PFN_PHYS(nr_pages)))) + bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, + PFN_PHYS(nr_pages)); + if (bad_pmem && kaddr) + do_recovery = dax_recovery_started(pmem->dax_dev, kaddr); + if (bad_pmem && !do_recovery) return -EIO; if (kaddr) @@ -319,10 +330,70 @@ static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, return _copy_mc_to_iter(addr, bytes, i); } +/* + * The recovery write thread started out as a normal pwrite thread and + * when the filesystem was told about potential media error in the + * range, filesystem turns the normal pwrite to a dax_recovery_write. + * + * The recovery write consists of clearing poison via DSM, clearing page + * HWPoison bit, reenable page-wide read-write permission, flush the + * caches and finally write. A competing pread thread needs to be held + * off during the recovery process since data read back might not be valid. + * And that's achieved by placing the badblock records clearing after + * the completion of the recovery write. + * + * Any competing recovery write thread needs to be serialized, and this is + * done via pmem device level lock .recovery_lock. 
+ */ static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { - return 0; + size_t rc, len, off; + phys_addr_t pmem_off; + struct pmem_device *pmem = dax_get_private(dax_dev); + struct device *dev = pmem->bb.dev; + sector_t sector; + long cleared, cleared_blk; + + mutex_lock(&pmem->recovery_lock); + + /* If no poison found in the range, go ahead with write */ + off = (unsigned long)addr & ~PAGE_MASK; + len = PFN_PHYS(PFN_UP(off + bytes)); + if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) { + rc = _copy_from_iter_flushcache(addr, bytes, i); + goto write_done; + } + + /* Not page-aligned range cannot be recovered */ + if (off || !(PAGE_ALIGNED(bytes))) { + dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n", + addr, bytes); + rc = (size_t) -EIO; + goto write_done; + } + + pmem_off = PFN_PHYS(pgoff) + pmem->data_offset; + sector = (pmem_off - pmem->data_offset) / 512; + cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len); + cleared_blk = cleared / 512; + if (cleared_blk > 0) { + hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared); + } else { + dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n", + cleared_blk); + rc = (size_t) -EIO; + goto write_done; + } + arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes); + + rc = _copy_from_iter_flushcache(addr, bytes, i); + + pmem_clear_badblocks(pmem, sector, cleared_blk); + +write_done: + mutex_unlock(&pmem->recovery_lock); + return rc; } static const struct dax_operations pmem_dax_ops = { @@ -514,6 +585,7 @@ static int pmem_attach_disk(struct device *dev, goto out_cleanup_dax; dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); set_dax_recovery(dax_dev); + mutex_init(&pmem->recovery_lock); pmem->dax_dev = dax_dev; rc = device_add_disk(dev, disk, pmem_attribute_groups); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 59cfe13ea8a8..971bff7552d6 100644 --- a/drivers/nvdimm/pmem.h 
+++ b/drivers/nvdimm/pmem.h @@ -24,6 +24,7 @@ struct pmem_device { struct dax_device *dax_dev; struct gendisk *disk; struct dev_pagemap pgmap; + struct mutex recovery_lock; }; long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, -- 2.18.4
WARNING: multiple messages have this Message-ID (diff)
From: Jane Chu <jane.chu@oracle.com> To: david@fromorbit.com, djwong@kernel.org, dan.j.williams@intel.com, hch@infradead.org, vishal.l.verma@intel.com, dave.jiang@intel.com, agk@redhat.com, snitzer@redhat.com, dm-devel@redhat.com, ira.weiny@intel.com, willy@infradead.org, vgoyal@redhat.com, linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev, linux-kernel@vger.kernel.org, linux-xfs@vger.kernel.org Subject: [dm-devel] [PATCH v3 5/7] pmem: add pmem_recovery_write() dax op Date: Tue, 11 Jan 2022 11:59:28 -0700 [thread overview] Message-ID: <20220111185930.2601421-6-jane.chu@oracle.com> (raw) In-Reply-To: <20220111185930.2601421-1-jane.chu@oracle.com> pmem_recovery_write() consists of clearing poison via DSM, clearing page HWPoison bit, re-state _PAGE_PRESENT bit, cacheflush, write, and finally clearing bad-block record. A competing pread thread is held off during recovery write by the presence of bad-block record. A competing recovery_write thread is serialized by a lock. Signed-off-by: Jane Chu <jane.chu@oracle.com> --- drivers/nvdimm/pmem.c | 84 +++++++++++++++++++++++++++++++++++++++---- drivers/nvdimm/pmem.h | 1 + 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index a68e7d3ed27e..dd2db4905c85 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -69,6 +69,14 @@ static void hwpoison_clear(struct pmem_device *pmem, } } +static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector, + long cleared_blks) +{ + badblocks_clear(&pmem->bb, sector, cleared_blks); + if (pmem->bb_state) + sysfs_notify_dirent(pmem->bb_state); +} + static blk_status_t pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, unsigned int len) { @@ -88,9 +96,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, dev_dbg(dev, "%#llx clear %ld sector%s\n", (unsigned long long) sector, cleared, cleared > 1 ? 
"s" : ""); - badblocks_clear(&pmem->bb, sector, cleared); - if (pmem->bb_state) - sysfs_notify_dirent(pmem->bb_state); + pmem_clear_badblocks(pmem, sector, cleared); } arch_invalidate_pmem(pmem->virt_addr + offset, len); @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { + bool bad_pmem; + bool do_recovery = false; resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; - if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, - PFN_PHYS(nr_pages)))) + bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, + PFN_PHYS(nr_pages)); + if (bad_pmem && kaddr) + do_recovery = dax_recovery_started(pmem->dax_dev, kaddr); + if (bad_pmem && !do_recovery) return -EIO; if (kaddr) @@ -319,10 +330,70 @@ static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, return _copy_mc_to_iter(addr, bytes, i); } +/* + * The recovery write thread started out as a normal pwrite thread and + * when the filesystem was told about potential media error in the + * range, filesystem turns the normal pwrite to a dax_recovery_write. + * + * The recovery write consists of clearing poison via DSM, clearing page + * HWPoison bit, reenable page-wide read-write permission, flush the + * caches and finally write. A competing pread thread needs to be held + * off during the recovery process since data read back might not be valid. + * And that's achieved by placing the badblock records clearing after + * the completion of the recovery write. + * + * Any competing recovery write thread needs to be serialized, and this is + * done via pmem device level lock .recovery_lock. 
+ */ static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { - return 0; + size_t rc, len, off; + phys_addr_t pmem_off; + struct pmem_device *pmem = dax_get_private(dax_dev); + struct device *dev = pmem->bb.dev; + sector_t sector; + long cleared, cleared_blk; + + mutex_lock(&pmem->recovery_lock); + + /* If no poison found in the range, go ahead with write */ + off = (unsigned long)addr & ~PAGE_MASK; + len = PFN_PHYS(PFN_UP(off + bytes)); + if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) { + rc = _copy_from_iter_flushcache(addr, bytes, i); + goto write_done; + } + + /* Not page-aligned range cannot be recovered */ + if (off || !(PAGE_ALIGNED(bytes))) { + dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n", + addr, bytes); + rc = (size_t) -EIO; + goto write_done; + } + + pmem_off = PFN_PHYS(pgoff) + pmem->data_offset; + sector = (pmem_off - pmem->data_offset) / 512; + cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len); + cleared_blk = cleared / 512; + if (cleared_blk > 0) { + hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared); + } else { + dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n", + cleared_blk); + rc = (size_t) -EIO; + goto write_done; + } + arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes); + + rc = _copy_from_iter_flushcache(addr, bytes, i); + + pmem_clear_badblocks(pmem, sector, cleared_blk); + +write_done: + mutex_unlock(&pmem->recovery_lock); + return rc; } static const struct dax_operations pmem_dax_ops = { @@ -514,6 +585,7 @@ static int pmem_attach_disk(struct device *dev, goto out_cleanup_dax; dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); set_dax_recovery(dax_dev); + mutex_init(&pmem->recovery_lock); pmem->dax_dev = dax_dev; rc = device_add_disk(dev, disk, pmem_attribute_groups); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 59cfe13ea8a8..971bff7552d6 100644 --- a/drivers/nvdimm/pmem.h 
+++ b/drivers/nvdimm/pmem.h @@ -24,6 +24,7 @@ struct pmem_device { struct dax_device *dax_dev; struct gendisk *disk; struct dev_pagemap pgmap; + struct mutex recovery_lock; }; long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, -- 2.18.4 -- dm-devel mailing list dm-devel@redhat.com https://listman.redhat.com/mailman/listinfo/dm-devel
next prev parent reply other threads:[~2022-01-11 19:00 UTC|newest] Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top 2022-01-11 18:59 [PATCH v3 0/7] DAX poison recovery Jane Chu 2022-01-11 18:59 ` [dm-devel] " Jane Chu 2022-01-11 18:59 ` [PATCH v3 1/7] mce: fix set_mce_nospec to always unmap the whole page Jane Chu 2022-01-11 18:59 ` [dm-devel] " Jane Chu 2022-01-11 18:59 ` [PATCH v3 2/7] dax: introduce dax device flag DAXDEV_RECOVERY Jane Chu 2022-01-11 18:59 ` [dm-devel] " Jane Chu 2022-01-11 18:59 ` [PATCH v3 3/7] dm: make dm aware of target's DAXDEV_RECOVERY capability Jane Chu 2022-01-11 18:59 ` [dm-devel] " Jane Chu 2022-01-11 18:59 ` [PATCH v3 4/7] dax: add dax_recovery_write to dax_op and dm target type Jane Chu 2022-01-11 18:59 ` [dm-devel] " Jane Chu 2022-01-11 18:59 ` Jane Chu [this message] 2022-01-11 18:59 ` [dm-devel] [PATCH v3 5/7] pmem: add pmem_recovery_write() dax op Jane Chu 2022-01-11 18:59 ` [PATCH v3 6/7] dax: add recovery_write to dax_iomap_iter in failure path Jane Chu 2022-01-11 18:59 ` [dm-devel] " Jane Chu 2022-01-11 18:59 ` [PATCH v3 7/7] pmem: fix pmem_do_write() avoid writing to 'np' page Jane Chu 2022-01-11 18:59 ` [dm-devel] " Jane Chu 2022-01-20 9:55 ` [PATCH v3 0/7] DAX poison recovery Christoph Hellwig 2022-01-20 9:55 ` [dm-devel] " Christoph Hellwig 2022-01-21 1:33 ` Jane Chu 2022-01-21 1:33 ` [dm-devel] " Jane Chu 2022-01-24 9:01 ` Christoph Hellwig 2022-01-24 9:01 ` [dm-devel] " Christoph Hellwig 2022-01-27 0:25 ` Jane Chu 2022-01-27 0:25 ` [dm-devel] " Jane Chu
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20220111185930.2601421-6-jane.chu@oracle.com \ --to=jane.chu@oracle.com \ --cc=agk@redhat.com \ --cc=dan.j.williams@intel.com \ --cc=dave.jiang@intel.com \ --cc=david@fromorbit.com \ --cc=djwong@kernel.org \ --cc=dm-devel@redhat.com \ --cc=hch@infradead.org \ --cc=ira.weiny@intel.com \ --cc=linux-fsdevel@vger.kernel.org \ --cc=linux-kernel@vger.kernel.org \ --cc=linux-xfs@vger.kernel.org \ --cc=nvdimm@lists.linux.dev \ --cc=snitzer@redhat.com \ --cc=vgoyal@redhat.com \ --cc=vishal.l.verma@intel.com \ --cc=willy@infradead.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.