linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Vivek Goyal <vgoyal@redhat.com>
To: Christoph Hellwig <hch@infradead.org>,
	dan.j.williams@intel.com, linux-nvdimm@lists.01.org
Cc: linux-fsdevel@vger.kernel.org, vishal.l.verma@intel.com,
	Jeff Moyer <jmoyer@redhat.com>
Subject: [RFC] dax,pmem: Provide a dax operation to zero range of memory
Date: Thu, 23 Jan 2020 11:52:49 -0500	[thread overview]
Message-ID: <20200123165249.GA7664@redhat.com> (raw)

Hi,

This is an RFC patch to provide a dax operation to zero a range of memory.
It will also clear poison in the process. This is primarily a compile-tested
patch; I don't have real hardware to test the poison logic. I am posting
this to figure out whether this is the right direction or not.

Motivation for this patch comes from Christoph's feedback that he would
prefer a dax way to zero a range instead of relying on having to
call blkdev_issue_zeroout() in __dax_zero_page_range().

https://lkml.org/lkml/2019/8/26/361

My motivation for this change is virtiofs DAX support. There we use DAX
but we don't have a block device, so any dax code which assumes that
there is always an associated block device is a problem. This patch
is a cleanup of one of the places where dax has this dependency on a
block device: if we add a dax operation for zeroing a range, the dax
path no longer has to call blkdev_issue_zeroout().

I have yet to take care of stacked block drivers (dm/md).

The current poison clearing logic is primarily written with the assumption
that I/O is sector aligned. With this new method, this assumption is broken
and one can pass any range of memory to zero. I have fixed a few places
in the existing logic to be able to handle an arbitrary start/end. I am
not sure whether there are other dependencies which might need fixing or
which prohibit us from providing this method.

Any feedback or comment is welcome.

Thanks
Vivek

---
 drivers/dax/super.c   |   13 +++++++++
 drivers/nvdimm/pmem.c |   67 ++++++++++++++++++++++++++++++++++++++++++--------
 fs/dax.c              |   39 ++++++++---------------------
 include/linux/dax.h   |    3 ++
 4 files changed, 85 insertions(+), 37 deletions(-)

Index: rhvgoyal-linux/drivers/nvdimm/pmem.c
===================================================================
--- rhvgoyal-linux.orig/drivers/nvdimm/pmem.c	2020-01-23 11:32:11.075139183 -0500
+++ rhvgoyal-linux/drivers/nvdimm/pmem.c	2020-01-23 11:32:28.660139183 -0500
@@ -52,8 +52,8 @@ static void hwpoison_clear(struct pmem_d
 	if (is_vmalloc_addr(pmem->virt_addr))
 		return;
 
-	pfn_start = PHYS_PFN(phys);
-	pfn_end = pfn_start + PHYS_PFN(len);
+	pfn_start = PFN_UP(phys);
+	pfn_end = PFN_DOWN(phys + len);
 	for (pfn = pfn_start; pfn < pfn_end; pfn++) {
 		struct page *page = pfn_to_page(pfn);
 
@@ -71,22 +71,24 @@ static blk_status_t pmem_clear_poison(st
 		phys_addr_t offset, unsigned int len)
 {
 	struct device *dev = to_dev(pmem);
-	sector_t sector;
+	sector_t sector_start, sector_end;
 	long cleared;
 	blk_status_t rc = BLK_STS_OK;
+	int nr_sectors;
 
-	sector = (offset - pmem->data_offset) / 512;
+	sector_start = ALIGN((offset - pmem->data_offset), 512) / 512;
+	sector_end = ALIGN_DOWN((offset - pmem->data_offset + len), 512)/512;
+	nr_sectors =  sector_end - sector_start;
 
 	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
 	if (cleared < len)
 		rc = BLK_STS_IOERR;
-	if (cleared > 0 && cleared / 512) {
+	if (cleared > 0 && nr_sectors > 0) {
 		hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
-		cleared /= 512;
-		dev_dbg(dev, "%#llx clear %ld sector%s\n",
-				(unsigned long long) sector, cleared,
-				cleared > 1 ? "s" : "");
-		badblocks_clear(&pmem->bb, sector, cleared);
+		dev_dbg(dev, "%#llx clear %d sector%s\n",
+				(unsigned long long) sector_start, nr_sectors,
+				nr_sectors > 1 ? "s" : "");
+		badblocks_clear(&pmem->bb, sector_start, nr_sectors);
 		if (pmem->bb_state)
 			sysfs_notify_dirent(pmem->bb_state);
 	}
@@ -268,6 +270,50 @@ static const struct block_device_operati
 	.revalidate_disk =	nvdimm_revalidate_disk,
 };
 
+/* Zero @len bytes starting @offset bytes into page @pgoff of the device. */
+static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+				    unsigned int offset, loff_t len)
+{
+	blk_status_t rc = BLK_STS_OK;
+	phys_addr_t phys_pos = PFN_PHYS(pgoff) + offset;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct page *page = ZERO_PAGE(0);
+
+	do {
+		unsigned bytes, nr_sectors = 0;
+		sector_t sector_start, sector_end;
+		bool bad_pmem = false;
+		phys_addr_t pmem_off = phys_pos + pmem->data_offset;
+		void *pmem_addr = pmem->virt_addr + pmem_off;
+		unsigned int page_offset = offset_in_page(phys_pos);
+
+		/* Zero at most to the end of the current page per pass */
+		bytes = min_t(loff_t, PAGE_SIZE - page_offset, len);
+
+		/* Check badblocks only for fully covered sectors */
+		sector_start = ALIGN(phys_pos, 512)/512;
+		sector_end = ALIGN_DOWN(phys_pos + bytes, 512)/512;
+		if (sector_end > sector_start)
+			nr_sectors = sector_end - sector_start;
+		if (nr_sectors &&
+		    unlikely(is_bad_pmem(&pmem->bb, sector_start,
+					 nr_sectors * 512)))
+			bad_pmem = true;
+
+		write_pmem(pmem_addr, page, 0, bytes);
+		if (unlikely(bad_pmem)) {
+			rc = pmem_clear_poison(pmem, pmem_off, bytes);
+			write_pmem(pmem_addr, page, 0, bytes);
+		}
+		if (rc != BLK_STS_OK)
+			return -EIO;
+		phys_pos += bytes;	/* step past the chunk just zeroed */
+		len -= bytes;
+	} while (len > 0);
+
+	return 0;
+}
+
 static long pmem_dax_direct_access(struct dax_device *dax_dev,
 		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -299,6 +345,7 @@ static const struct dax_operations pmem_
 	.dax_supported = generic_fsdax_supported,
 	.copy_from_iter = pmem_copy_from_iter,
 	.copy_to_iter = pmem_copy_to_iter,
+	.zero_page_range = pmem_dax_zero_page_range,
 };
 
 static const struct attribute_group *pmem_attribute_groups[] = {
Index: rhvgoyal-linux/include/linux/dax.h
===================================================================
--- rhvgoyal-linux.orig/include/linux/dax.h	2020-01-23 11:25:23.814139183 -0500
+++ rhvgoyal-linux/include/linux/dax.h	2020-01-23 11:32:17.799139183 -0500
@@ -34,6 +34,8 @@ struct dax_operations {
 	/* copy_to_iter: required operation for fs-dax direct-i/o */
 	size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
 			struct iov_iter *);
+	/* zero_page_range: optional operation to zero a range of dax memory */
+	int (*zero_page_range)(struct dax_device *, pgoff_t, unsigned, loff_t);
 };
 
 extern struct attribute_group dax_attribute_group;
@@ -209,6 +211,7 @@ size_t dax_copy_from_iter(struct dax_dev
 		size_t bytes, struct iov_iter *i);
 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, unsigned offset, loff_t len);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
Index: rhvgoyal-linux/fs/dax.c
===================================================================
--- rhvgoyal-linux.orig/fs/dax.c	2020-01-23 11:25:23.814139183 -0500
+++ rhvgoyal-linux/fs/dax.c	2020-01-23 11:32:17.801139183 -0500
@@ -1044,38 +1044,23 @@ static vm_fault_t dax_load_hole(struct x
 	return ret;
 }
 
-static bool dax_range_is_aligned(struct block_device *bdev,
-				 unsigned int offset, unsigned int length)
-{
-	unsigned short sector_size = bdev_logical_block_size(bdev);
-
-	if (!IS_ALIGNED(offset, sector_size))
-		return false;
-	if (!IS_ALIGNED(length, sector_size))
-		return false;
-
-	return true;
-}
-
 int __dax_zero_page_range(struct block_device *bdev,
 		struct dax_device *dax_dev, sector_t sector,
 		unsigned int offset, unsigned int size)
 {
-	if (dax_range_is_aligned(bdev, offset, size)) {
-		sector_t start_sector = sector + (offset >> 9);
+	pgoff_t pgoff;
+	long rc, id;
 
-		return blkdev_issue_zeroout(bdev, start_sector,
-				size >> 9, GFP_NOFS, 0);
-	} else {
-		pgoff_t pgoff;
-		long rc, id;
+	rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
+	if (rc)
+		return rc;
+
+	id = dax_read_lock();
+	rc = dax_zero_page_range(dax_dev, pgoff, offset, size);
+	if (rc == -EOPNOTSUPP) {
 		void *kaddr;
 
-		rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
-		if (rc)
-			return rc;
-
-		id = dax_read_lock();
+		/* If driver does not implement zero page range, fallback */
 		rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
 		if (rc < 0) {
 			dax_read_unlock(id);
@@ -1083,9 +1068,9 @@ int __dax_zero_page_range(struct block_d
 		}
 		memset(kaddr + offset, 0, size);
 		dax_flush(dax_dev, kaddr + offset, size);
-		dax_read_unlock(id);
 	}
-	return 0;
+	dax_read_unlock(id);
+	return rc;
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
Index: rhvgoyal-linux/drivers/dax/super.c
===================================================================
--- rhvgoyal-linux.orig/drivers/dax/super.c	2020-01-23 11:25:23.814139183 -0500
+++ rhvgoyal-linux/drivers/dax/super.c	2020-01-23 11:32:17.802139183 -0500
@@ -344,6 +344,19 @@ size_t dax_copy_to_iter(struct dax_devic
 }
 EXPORT_SYMBOL_GPL(dax_copy_to_iter);
 
+/* Zero @len bytes at @offset in page @pgoff; -EOPNOTSUPP if no driver op. */
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+			unsigned offset, loff_t len)
+{
+	if (!dax_alive(dax_dev))
+		return -ENXIO;	/* dead device: nothing was zeroed */
+	if (!dax_dev->ops->zero_page_range)
+		return -EOPNOTSUPP;
+
+	return dax_dev->ops->zero_page_range(dax_dev, pgoff, offset, len);
+}
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)


             reply	other threads:[~2020-01-23 16:54 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-23 16:52 Vivek Goyal [this message]
2020-01-23 19:01 ` Darrick J. Wong
2020-01-24 13:52   ` Vivek Goyal
2020-01-31 23:31   ` Dan Williams
2020-02-03  8:20     ` Christoph Hellwig
2020-02-04 23:23     ` Darrick J. Wong
2020-01-31  5:36 ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200123165249.GA7664@redhat.com \
    --to=vgoyal@redhat.com \
    --cc=dan.j.williams@intel.com \
    --cc=hch@infradead.org \
    --cc=jmoyer@redhat.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=vishal.l.verma@intel.com \
    --subject='Re: [RFC] dax,pmem: Provide a dax operation to zero range of memory' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).