[PATCH v1 4/4] vfio/nvgpu: register device memory for poison handling

From: <ankita@nvidia.com>
To: <ankita@nvidia.com>, <jgg@nvidia.com>,
	<alex.williamson@redhat.com>, <akpm@linux-foundation.org>,
	<tony.luck@intel.com>, <bp@alien8.de>, <naoya.horiguchi@nec.com>,
	<linmiaohe@huawei.com>
Cc: <aniketa@nvidia.com>, <cjia@nvidia.com>, <kwankhede@nvidia.com>,
	<targupta@nvidia.com>, <vsethi@nvidia.com>, <acurrid@nvidia.com>,
	<anuaggarwal@nvidia.com>, <linux-kernel@vger.kernel.org>,
	<linux-mm@kvack.org>, <linux-edac@vger.kernel.org>,
	<kvm@vger.kernel.org>
Subject: [PATCH v1 4/4] vfio/nvgpu: register device memory for poison handling
Date: Wed, 20 Sep 2023 19:32:10 +0530	[thread overview]
Message-ID: <20230920140210.12663-5-ankita@nvidia.com> (raw)
In-Reply-To: <20230920140210.12663-1-ankita@nvidia.com>

From: Ankit Agrawal <ankita@nvidia.com>

The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
(Qemu) using remap_pfn_range() without adding the memory to the kernel.
The device memory pages are not backed by struct page. Patches 1-3
implements the mechanism to handle ECC/poison on memory page without
struct page and expose a registration function. This new mechanism is
leveraged here.
 
The module registers its memory region with the kernel MM for ECC handling
using the register_pfn_address_space() registration API exposed by the
kernel. It also defines a failure callback function pfn_memory_failure()
to get the poisoned PFN from the MM.
 
The module track poisoned PFN as a bitmap with a bit per PFN. The PFN is
communicated by the kernel MM to the module through the failure function,
which sets the appropriate bit in the bitmap.
 
The module also defines a VMA fault ops for the module. It returns
VM_FAULT_HWPOISON in case the bit for the PFN is set in the bitmap.

[1] https://lore.kernel.org/all/20230915025415.6762-1-ankita@nvidia.com/

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 107 +++++++++++++++++++++++++++-
 drivers/vfio/vfio.h                 |  11 ---
 drivers/vfio/vfio_main.c            |   3 +-
 include/linux/vfio.h                |  15 ++++
 4 files changed, 123 insertions(+), 13 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index ba323f2d8ea1..1c89ce0cc1cc 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -6,6 +6,10 @@
 #include <linux/pci.h>
 #include <linux/vfio_pci_core.h>
 #include <linux/vfio.h>
+#ifdef CONFIG_MEMORY_FAILURE
+#include <linux/bitmap.h>
+#include <linux/memory-failure.h>
+#endif
 
 struct nvgrace_gpu_vfio_pci_core_device {
 	struct vfio_pci_core_device core_device;
@@ -13,8 +17,85 @@ struct nvgrace_gpu_vfio_pci_core_device {
 	size_t memlength;
 	void *memmap;
 	struct mutex memmap_lock;
+#ifdef CONFIG_MEMORY_FAILURE
+	struct pfn_address_space pfn_address_space;
+	unsigned long *pfn_bitmap;
+#endif
 };
 
+#ifdef CONFIG_MEMORY_FAILURE
+void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
+		unsigned long pfn)
+{
+	struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of(
+		pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space);
+	unsigned long mem_offset = pfn - pfn_space->node.start;
+
+	if (mem_offset >= nvdev->memlength)
+		return;
+
+	/*
+	 * MM has called to notify a poisoned page. Track that in the bitmap.
+	 */
+	__set_bit(mem_offset, nvdev->pfn_bitmap);
+}
+
+struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = {
+	.failure = nvgrace_gpu_vfio_pci_pfn_memory_failure,
+};
+
+static int
+nvgrace_gpu_vfio_pci_register_pfn_range(struct nvgrace_gpu_vfio_pci_core_device *nvdev,
+					struct vm_area_struct *vma)
+{
+	unsigned long nr_pages;
+	int ret = 0;
+
+	nr_pages = nvdev->memlength >> PAGE_SHIFT;
+
+	nvdev->pfn_address_space.node.start = vma->vm_pgoff;
+	nvdev->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1;
+	nvdev->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops;
+	nvdev->pfn_address_space.mapping = vma->vm_file->f_mapping;
+
+	ret = register_pfn_address_space(&(nvdev->pfn_address_space));
+
+	return ret;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+	unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff;
+	struct vfio_device *core_vdev;
+	struct nvgrace_gpu_vfio_pci_core_device *nvdev;
+
+	if (!(vmf->vma->vm_file))
+		goto error_exit;
+
+	core_vdev = vfio_device_from_file(vmf->vma->vm_file);
+
+	if (!core_vdev)
+		goto error_exit;
+
+	nvdev = container_of(core_vdev,
+			struct nvgrace_gpu_vfio_pci_core_device, core_device.vdev);
+
+	/*
+	 * Check if the page is poisoned.
+	 */
+	if (mem_offset < (nvdev->memlength >> PAGE_SHIFT) &&
+		test_bit(mem_offset, nvdev->pfn_bitmap))
+		return VM_FAULT_HWPOISON;
+
+error_exit:
+	return VM_FAULT_ERROR;
+}
+
+static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
+	.fault = nvgrace_gpu_vfio_pci_fault,
+};
+#endif
+
 static int nvgrace_gpu_vfio_pci_open_device(struct vfio_device *core_vdev)
 {
 	struct vfio_pci_core_device *vdev =
@@ -46,6 +127,9 @@ static void nvgrace_gpu_vfio_pci_close_device(struct vfio_device *core_vdev)
 
 	mutex_destroy(&nvdev->memmap_lock);
 
+#ifdef CONFIG_MEMORY_FAILURE
+	unregister_pfn_address_space(&(nvdev->pfn_address_space));
+#endif
 	vfio_pci_core_close_device(core_vdev);
 }
 
@@ -104,8 +188,12 @@ static int nvgrace_gpu_vfio_pci_mmap(struct vfio_device *core_vdev,
 		return ret;
 
 	vma->vm_pgoff = start_pfn;
+#ifdef CONFIG_MEMORY_FAILURE
+	vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
 
-	return 0;
+	ret = nvgrace_gpu_vfio_pci_register_pfn_range(nvdev, vma);
+#endif
+	return ret;
 }
 
 static long
@@ -406,6 +494,19 @@ nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,
 
 	nvdev->memlength = memlength;
 
+#ifdef CONFIG_MEMORY_FAILURE
+	/*
+	 * A bitmap is maintained to track the pages that are poisoned. Each
+	 * page is represented by a bit. Allocation size in bytes is
+	 * determined by shifting the device memory size by PAGE_SHIFT to
+	 * determine the number of pages; and further shifted by 3 as each
+	 * byte could track 8 pages.
+	 */
+	nvdev->pfn_bitmap
+		= vzalloc((nvdev->memlength >> PAGE_SHIFT)/BITS_PER_TYPE(char));
+	if (!nvdev->pfn_bitmap)
+		ret = -ENOMEM;
+#endif
 	return ret;
 }
 
@@ -442,6 +543,10 @@ static void nvgrace_gpu_vfio_pci_remove(struct pci_dev *pdev)
 	struct nvgrace_gpu_vfio_pci_core_device *nvdev = nvgrace_gpu_drvdata(pdev);
 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
 
+#ifdef CONFIG_MEMORY_FAILURE
+	vfree(nvdev->pfn_bitmap);
+#endif
+
 	vfio_pci_core_unregister_device(vdev);
 	vfio_put_device(&vdev->vdev);
 }
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 307e3f29b527..747094503909 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -16,17 +16,6 @@ struct iommufd_ctx;
 struct iommu_group;
 struct vfio_container;
 
-struct vfio_device_file {
-	struct vfio_device *device;
-	struct vfio_group *group;
-
-	u8 access_granted;
-	u32 devid; /* only valid when iommufd is valid */
-	spinlock_t kvm_ref_lock; /* protect kvm field */
-	struct kvm *kvm;
-	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
-};
-
 void vfio_device_put_registration(struct vfio_device *device);
 bool vfio_device_try_get_registration(struct vfio_device *device);
 int vfio_df_open(struct vfio_device_file *df);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 40732e8ed4c6..a7dafd7c64a6 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1309,7 +1309,7 @@ const struct file_operations vfio_device_fops = {
 	.mmap		= vfio_device_fops_mmap,
 };
 
-static struct vfio_device *vfio_device_from_file(struct file *file)
+struct vfio_device *vfio_device_from_file(struct file *file)
 {
 	struct vfio_device_file *df = file->private_data;
 
@@ -1317,6 +1317,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file)
 		return NULL;
 	return df->device;
 }
+EXPORT_SYMBOL_GPL(vfio_device_from_file);
 
 /**
  * vfio_file_is_valid - True if the file is valid vfio file
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 454e9295970c..d88af251e931 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -361,4 +361,19 @@ int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *),
 		       struct virqfd **pvirqfd, int fd);
 void vfio_virqfd_disable(struct virqfd **pvirqfd);
 
+/*
+ * VFIO device file.
+ */
+struct vfio_device_file {
+	struct vfio_device *device;
+	struct vfio_group *group;
+	u8 access_granted;
+	u32 devid; /* only valid when iommufd is valid */
+	spinlock_t kvm_ref_lock; /* protect kvm field */
+	struct kvm *kvm;
+	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
+};
+
+struct vfio_device *vfio_device_from_file(struct file *file);
+
 #endif /* VFIO_H */
-- 
2.17.1