All of lore.kernel.org
 help / color / mirror / Atom feed
From: Felix Kuehling <Felix.Kuehling@amd.com>
To: amd-gfx@lists.freedesktop.org, dri-devel@lists.freedesktop.org
Cc: David Yat Sin <david.yatsin@amd.com>,
	Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Subject: [RFC PATCH 05/17] drm/amdkfd: CRIU Implement KFD dumper ioctl
Date: Fri, 30 Apr 2021 21:57:40 -0400	[thread overview]
Message-ID: <20210501015752.888-6-Felix.Kuehling@amd.com> (raw)
In-Reply-To: <20210501015752.888-1-Felix.Kuehling@amd.com>

From: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>

This adds support to discover the buffer objects that belong to a
process being checkpointed. The data corresponding to these buffer
objects is returned to user space plugin running under criu master
context which then stores this info to recreate these buffer objects
during a restore operation.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
(cherry picked from commit 1f114a541bd21873de905db64bb9efa673274d4b)
(cherry picked from commit 20c435fad57d3201e5402e38ae778f1f0f84a09d)
Change-Id: Ifdbeee3606578db61bdc9aca7528272e20a38256
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 96 +++++++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 6b347ce5992f..1454f5613e60 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -42,6 +42,7 @@
 #include "kfd_svm.h"
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
+#include "amdgpu_object.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1806,9 +1807,102 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
 static int kfd_ioctl_criu_dumper(struct file *filep,
 				struct kfd_process *p, void *data)
 {
+	struct kfd_ioctl_criu_dumper_args *args = data;
+	struct kfd_criu_bo_buckets *bo_bucket;
+	struct amdgpu_bo *dumper_bo;
+	int ret, id, index, i = 0;
+	struct kgd_mem *kgd_mem;
+	void *mem;
+
 	pr_info("Inside %s\n",__func__);
 
-	return 0;
+	if (args->num_of_bos == 0) {
+		pr_err("No BOs to be dumped\n");
+		return -EINVAL;
+	}
+
+	if (p->n_pdds != args->num_of_devices) {
+		pr_err("Invalid number of devices %d (expected = %d)\n",
+								args->num_of_devices, p->n_pdds);
+		return -EINVAL;
+	}
+
+	pr_debug("num of bos = %llu\n", args->num_of_bos);
+
+	bo_bucket = kvzalloc((sizeof(struct kfd_criu_bo_buckets) *
+			     args->num_of_bos), GFP_KERNEL);
+	if (!bo_bucket)
+		return -ENOMEM;
+
+	mutex_lock(&p->mutex);
+
+	if (!kfd_has_process_device_data(p)) {
+		pr_err("No pdd for given process\n");
+		ret = -ENODEV;
+		goto err_unlock;
+	}
+
+	/* Run over all PDDs of the process */
+	for (index = 0; index < p->n_pdds; index++) {
+		struct kfd_process_device *pdd = p->pdds[index];
+
+		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
+			if (!mem) {
+				ret = -ENOMEM;
+				goto err_unlock;
+			}
+
+			kgd_mem = (struct kgd_mem *)mem;
+			dumper_bo = kgd_mem->bo;
+
+			if ((uint64_t)kgd_mem->va > pdd->gpuvm_base) {
+				if (i >= args->num_of_bos) {
+					pr_err("Num of BOs changed since last helper ioctl call\n");
+					ret = -EINVAL;
+					goto err_unlock;
+				}
+
+				bo_bucket[i].bo_addr = (uint64_t)kgd_mem->va;
+				bo_bucket[i].bo_size = amdgpu_bo_size(dumper_bo);
+				bo_bucket[i].gpu_id = pdd->dev->id;
+				bo_bucket[i].bo_alloc_flags = (uint32_t)kgd_mem->alloc_flags;
+				bo_bucket[i].idr_handle = id;
+
+				if (bo_bucket[i].bo_alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
+					bo_bucket[i].bo_offset = KFD_MMAP_TYPE_DOORBELL |
+						KFD_MMAP_GPU_ID(pdd->dev->id);
+				else if (bo_bucket[i].bo_alloc_flags &
+				    KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
+					bo_bucket[i].bo_offset = KFD_MMAP_TYPE_MMIO |
+						KFD_MMAP_GPU_ID(pdd->dev->id);
+				else
+					bo_bucket[i].bo_offset = amdgpu_bo_mmap_offset(dumper_bo);
+
+				pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
+					 "gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x",
+					 bo_bucket[i].bo_size,
+					 bo_bucket[i].bo_addr,
+					 bo_bucket[i].bo_offset,
+					 bo_bucket[i].gpu_id,
+					 bo_bucket[i].bo_alloc_flags,
+					 bo_bucket[i].idr_handle);
+				i++;
+			}
+		}
+	}
+
+	ret = copy_to_user((void __user *)args->kfd_criu_bo_buckets_ptr,
+			bo_bucket,
+			(args->num_of_bos *
+			 sizeof(struct kfd_criu_bo_buckets)));
+	kvfree(bo_bucket);
+	mutex_unlock(&p->mutex);
+	return ret ? -EFAULT : 0;
+
+err_unlock:
+	mutex_unlock(&p->mutex);
+	pr_err("Dumper ioctl failed err:%d\n", ret);
+	return ret;
 }
 
 static int kfd_ioctl_criu_restorer(struct file *filep,
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

WARNING: multiple messages have this Message-ID (diff)
From: Felix Kuehling <Felix.Kuehling@amd.com>
To: amd-gfx@lists.freedesktop.org, dri-devel@lists.freedesktop.org
Cc: David Yat Sin <david.yatsin@amd.com>,
	Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Subject: [RFC PATCH 05/17] drm/amdkfd: CRIU Implement KFD dumper ioctl
Date: Fri, 30 Apr 2021 21:57:40 -0400	[thread overview]
Message-ID: <20210501015752.888-6-Felix.Kuehling@amd.com> (raw)
In-Reply-To: <20210501015752.888-1-Felix.Kuehling@amd.com>

From: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>

This adds support to discover the buffer objects that belong to a
process being checkpointed. The data corresponding to these buffer
objects is returned to user space plugin running under criu master
context which then stores this info to recreate these buffer objects
during a restore operation.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
(cherry picked from commit 1f114a541bd21873de905db64bb9efa673274d4b)
(cherry picked from commit 20c435fad57d3201e5402e38ae778f1f0f84a09d)
Change-Id: Ifdbeee3606578db61bdc9aca7528272e20a38256
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 96 +++++++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 6b347ce5992f..1454f5613e60 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -42,6 +42,7 @@
 #include "kfd_svm.h"
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
+#include "amdgpu_object.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1806,9 +1807,102 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
 static int kfd_ioctl_criu_dumper(struct file *filep,
 				struct kfd_process *p, void *data)
 {
+	struct kfd_ioctl_criu_dumper_args *args = data;
+	struct kfd_criu_bo_buckets *bo_bucket;
+	struct amdgpu_bo *dumper_bo;
+	int ret, id, index, i = 0;
+	struct kgd_mem *kgd_mem;
+	void *mem;
+
 	pr_info("Inside %s\n",__func__);
 
-	return 0;
+	if (args->num_of_bos == 0) {
+		pr_err("No BOs to be dumped\n");
+		return -EINVAL;
+	}
+
+	if (p->n_pdds != args->num_of_devices) {
+		pr_err("Invalid number of devices %d (expected = %d)\n",
+								args->num_of_devices, p->n_pdds);
+		return -EINVAL;
+	}
+
+	pr_debug("num of bos = %llu\n", args->num_of_bos);
+
+	bo_bucket = kvzalloc((sizeof(struct kfd_criu_bo_buckets) *
+			     args->num_of_bos), GFP_KERNEL);
+	if (!bo_bucket)
+		return -ENOMEM;
+
+	mutex_lock(&p->mutex);
+
+	if (!kfd_has_process_device_data(p)) {
+		pr_err("No pdd for given process\n");
+		ret = -ENODEV;
+		goto err_unlock;
+	}
+
+	/* Run over all PDDs of the process */
+	for (index = 0; index < p->n_pdds; index++) {
+		struct kfd_process_device *pdd = p->pdds[index];
+
+		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
+			if (!mem) {
+				ret = -ENOMEM;
+				goto err_unlock;
+			}
+
+			kgd_mem = (struct kgd_mem *)mem;
+			dumper_bo = kgd_mem->bo;
+
+			if ((uint64_t)kgd_mem->va > pdd->gpuvm_base) {
+				if (i >= args->num_of_bos) {
+					pr_err("Num of BOs changed since last helper ioctl call\n");
+					ret = -EINVAL;
+					goto err_unlock;
+				}
+
+				bo_bucket[i].bo_addr = (uint64_t)kgd_mem->va;
+				bo_bucket[i].bo_size = amdgpu_bo_size(dumper_bo);
+				bo_bucket[i].gpu_id = pdd->dev->id;
+				bo_bucket[i].bo_alloc_flags = (uint32_t)kgd_mem->alloc_flags;
+				bo_bucket[i].idr_handle = id;
+
+				if (bo_bucket[i].bo_alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
+					bo_bucket[i].bo_offset = KFD_MMAP_TYPE_DOORBELL |
+						KFD_MMAP_GPU_ID(pdd->dev->id);
+				else if (bo_bucket[i].bo_alloc_flags &
+				    KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
+					bo_bucket[i].bo_offset = KFD_MMAP_TYPE_MMIO |
+						KFD_MMAP_GPU_ID(pdd->dev->id);
+				else
+					bo_bucket[i].bo_offset = amdgpu_bo_mmap_offset(dumper_bo);
+
+				pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
+					 "gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x",
+					 bo_bucket[i].bo_size,
+					 bo_bucket[i].bo_addr,
+					 bo_bucket[i].bo_offset,
+					 bo_bucket[i].gpu_id,
+					 bo_bucket[i].bo_alloc_flags,
+					 bo_bucket[i].idr_handle);
+				i++;
+			}
+		}
+	}
+
+	ret = copy_to_user((void __user *)args->kfd_criu_bo_buckets_ptr,
+			bo_bucket,
+			(args->num_of_bos *
+			 sizeof(struct kfd_criu_bo_buckets)));
+	kvfree(bo_bucket);
+	mutex_unlock(&p->mutex);
+	return ret ? -EFAULT : 0;
+
+err_unlock:
+	mutex_unlock(&p->mutex);
+	pr_err("Dumper ioctl failed err:%d\n", ret);
+	return ret;
 }
 
 static int kfd_ioctl_criu_restorer(struct file *filep,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  parent reply	other threads:[~2021-05-01  1:58 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-01  1:57 [RFC PATCH 00/17] CRIU support for ROCm Felix Kuehling
2021-05-01  1:57 ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 01/17] x86/configs: CRIU update release defconfig Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 02/17] x86/configs: CRIU update debug rock defconfig Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 03/17] drm/amdkfd: CRIU Introduce Checkpoint-Restore APIs Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 04/17] drm/amdkfd: CRIU Implement KFD helper ioctl Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` Felix Kuehling [this message]
2021-05-01  1:57   ` [RFC PATCH 05/17] drm/amdkfd: CRIU Implement KFD dumper ioctl Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 06/17] drm/amdkfd: CRIU Implement KFD restore ioctl Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 07/17] drm/amdkfd: CRIU Implement KFD resume ioctl Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 08/17] drm/amdkfd: CRIU add queues support Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 09/17] drm/amdkfd: CRIU restore queue ids Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 10/17] drm/amdkfd: CRIU restore sdma id for queues Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 11/17] drm/amdkfd: CRIU restore queue doorbell id Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 12/17] drm/amdkfd: CRIU restore CU mask for queues Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 13/17] drm/amdkfd: CRIU dump and restore queue mqds Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 14/17] drm/amdkfd: CRIU dump/restore queue control stack Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 15/17] drm/amdkfd: CRIU dump and restore events Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 16/17] drm/amdkfd: CRIU implement gpu_id remapping Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 17/17] Revert "drm/amdgpu: Remove verify_access shortcut for KFD BOs" Felix Kuehling
2021-05-01  1:57   ` Felix Kuehling

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210501015752.888-6-Felix.Kuehling@amd.com \
    --to=felix.kuehling@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=david.yatsin@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=rajneesh.bhardwaj@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.