amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
From: Felix Kuehling <Felix.Kuehling@amd.com>
To: amd-gfx@lists.freedesktop.org, dri-devel@lists.freedesktop.org
Cc: David Yat Sin <david.yatsin@amd.com>
Subject: [RFC PATCH 12/17] drm/amdkfd: CRIU restore CU mask for queues
Date: Fri, 30 Apr 2021 21:57:47 -0400	[thread overview]
Message-ID: <20210501015752.888-13-Felix.Kuehling@amd.com> (raw)
In-Reply-To: <20210501015752.888-1-Felix.Kuehling@amd.com>

From: David Yat Sin <david.yatsin@amd.com>

When re-creating queues during CRIU restore, restore the queue
with the same CU mask value used during CRIU dump.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Change-Id: I1822fa2488f90365dfe7a3c5971a2ed6c0046b4a
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 160 +++++++++++++++++++++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   3 +
 2 files changed, 154 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 130ab100abb2..1d2c2d986a44 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1890,9 +1890,21 @@ static int kfd_devinfo_restore(struct kfd_process *p, struct kfd_criu_devinfo_bu
 	}
 	return 0;
 }
-static void criu_dump_queue(struct kfd_process_device *pdd,
+
+static int get_queue_data_sizes(struct kfd_process_device *pdd,
+				struct queue *q,
+				uint32_t *cu_mask_size)
+{
+	int ret = 0;
+
+	*cu_mask_size = sizeof(uint32_t) * (q->properties.cu_mask_count / 32);
+	return ret;
+}
+
+static int criu_dump_queue(struct kfd_process_device *pdd,
                            struct queue *q,
-                           struct kfd_criu_q_bucket *q_bucket)
+                           struct kfd_criu_q_bucket *q_bucket,
+			   struct queue_restore_data *qrd)
 {
 	q_bucket->gpu_id = pdd->dev->id;
 	q_bucket->type = q->properties.type;
@@ -1920,18 +1932,30 @@ static void criu_dump_queue(struct kfd_process_device *pdd,
 		q->properties.ctx_save_restore_area_size;
 
 	q_bucket->ctl_stack_size = q->properties.ctl_stack_size;
+	if (qrd->cu_mask_size)
+		memcpy(qrd->cu_mask, q->properties.cu_mask, qrd->cu_mask_size);
+
+	q_bucket->cu_mask_size = qrd->cu_mask_size;
+	return 0;
 }
 
 static int criu_dump_queues_device(struct kfd_process_device *pdd,
 				unsigned *q_index,
 				unsigned int max_num_queues,
-				struct kfd_criu_q_bucket *user_buckets)
+				struct kfd_criu_q_bucket *user_buckets,
+				uint8_t *queues_data,
+				uint32_t *queues_data_offset,
+				uint32_t queues_data_size)
 {
 	struct queue *q;
+	struct queue_restore_data qrd;
 	struct kfd_criu_q_bucket q_bucket;
+	uint8_t *data_ptr = NULL;
+	unsigned int data_ptr_size = 0;
 	int ret = 0;
 
 	list_for_each_entry(q, &pdd->qpd.queues_list, list) {
+		unsigned int q_data_size;
 		if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE &&
 			q->properties.type != KFD_QUEUE_TYPE_SDMA &&
 			q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI) {
@@ -1949,7 +1973,49 @@ static int criu_dump_queues_device(struct kfd_process_device *pdd,
 		}
 
 		memset(&q_bucket, 0, sizeof(q_bucket));
-		criu_dump_queue(pdd, q, &q_bucket);
+		memset(&qrd, 0, sizeof(qrd));
+
+		ret = get_queue_data_sizes(pdd, q, &qrd.cu_mask_size);
+		if (ret) {
+			pr_err("Failed to get queue dump info (%d)\n", ret);
+			ret = -EFAULT;
+			break;
+		}
+
+		q_data_size = qrd.cu_mask_size;
+
+		/* Increase local buffer space if needed */
+		if (data_ptr_size < q_data_size) {
+			if (data_ptr)
+				kfree(data_ptr);
+
+			data_ptr = (uint8_t*)kzalloc(q_data_size, GFP_KERNEL);
+			if (!data_ptr) {
+				ret = -ENOMEM;
+				break;
+			}
+			data_ptr_size = q_data_size;
+		}
+
+		qrd.cu_mask = data_ptr;
+		ret = criu_dump_queue(pdd, q, &q_bucket, &qrd);
+		if (ret)
+			break;
+
+		if (*queues_data_offset + q_data_size > queues_data_size) {
+			pr_err("Required memory exceeds user provided\n");
+			ret = -ENOSPC;
+			break;
+		}
+		ret = copy_to_user((void __user *) (queues_data + *queues_data_offset),
+				data_ptr, q_data_size);
+		if (ret) {
+			ret = -EFAULT;
+			break;
+		}
+		q_bucket.queues_data_offset = *queues_data_offset;
+		*queues_data_offset += q_data_size;
+
 		ret = copy_to_user((void __user *)&user_buckets[*q_index],
 					&q_bucket, sizeof(q_bucket));
 		if (ret) {
@@ -1959,6 +2025,10 @@ static int criu_dump_queues_device(struct kfd_process_device *pdd,
 		}
 		*q_index = *q_index + 1;
 	}
+
+	if (data_ptr)
+		kfree(data_ptr);
+
 	return ret;
 }
 
@@ -1976,6 +2046,8 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
 	struct kfd_criu_q_bucket *user_buckets =
 		(struct kfd_criu_q_bucket*) args->kfd_criu_q_buckets_ptr;
 
+	uint8_t *queues_data = (uint8_t*)args->queues_data_ptr;
+	uint32_t queues_data_offset = 0;
 
 	pr_info("Inside %s\n",__func__);
 
@@ -2076,7 +2148,10 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
 		}
 
 		ret = criu_dump_queues_device(pdd, &q_index,
-					args->num_of_queues, user_buckets);
+					args->num_of_queues, user_buckets,
+					queues_data, &queues_data_offset,
+					args->queues_data_size);
+
 		if (ret)
 			goto err_unlock;
 	}
@@ -2098,8 +2173,9 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
 	return ret;
 }
 
-static void set_queue_properties_from_criu(struct queue_properties *qp,
-                                       struct kfd_criu_q_bucket *q_bucket)
+static int set_queue_properties_from_criu(struct queue_properties *qp,
+                                       struct kfd_criu_q_bucket *q_bucket,
+				       struct queue_restore_data *qrd)
 {
 	qp->is_interop = false;
 	qp->is_gws = q_bucket->is_gws;
@@ -2116,6 +2192,19 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
 	qp->ctl_stack_size = q_bucket->ctl_stack_size;
 	qp->type = q_bucket->type;
 	qp->format = q_bucket->format;
+
+	if (qrd->cu_mask_size) {
+		qp->cu_mask = kzalloc(qrd->cu_mask_size, GFP_KERNEL);
+		if (!qp->cu_mask) {
+			pr_err("Failed to allocate memory for CU mask\n");
+			return -ENOMEM;
+		}
+
+		memcpy(qp->cu_mask, qrd->cu_mask, qrd->cu_mask_size);
+		qp->cu_mask_count = (qrd->cu_mask_size / sizeof(uint32_t)) * 32;
+	}
+
+	return 0;
 }
 
 /* criu_restore_queue runs with the process mutex locked */
@@ -2148,7 +2237,10 @@ int criu_restore_queue(struct kfd_process *p,
 			q_bucket->q_address);
 
 	memset(&qp, 0, sizeof(qp));
-	set_queue_properties_from_criu(&qp, q_bucket);
+	ret = set_queue_properties_from_criu(&qp, q_bucket, qrd);
+	if (ret)
+		goto err_create_queue;
+
 	print_queue_properties(&qp);
 
 	qrd->qid = q_bucket->q_id;
@@ -2158,12 +2250,18 @@ int criu_restore_queue(struct kfd_process *p,
 	ret = pqm_create_queue(&p->pqm, dev, NULL, &qp, &queue_id, qrd, NULL);
 	if (ret) {
 		pr_err("Failed to create new queue err:%d\n", ret);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto err_create_queue;
 	}
 
 	pr_debug("Queue id %d was restored successfully\n", queue_id);
 
 	return 0;
+err_create_queue:
+	if (qp.cu_mask)
+		kfree(qp.cu_mask);
+
+	return ret;
 }
 
 /* criu_restore_queues runs with the process mutex locked */
@@ -2176,6 +2274,11 @@ static int criu_restore_queues(struct kfd_process *p,
 	int ret;
 	struct kfd_criu_q_bucket *user_buckets =
 		(struct kfd_criu_q_bucket*) args->kfd_criu_q_buckets_ptr;
+
+	uint8_t *queues_data = (uint8_t*)args->queues_data_ptr;
+	uint8_t *data_ptr = NULL;
+	uint32_t data_ptr_size = 0;
+
 	/*
          * This process will not have any queues at this point, but we are
          * setting all the dqm's for this process to evicted state.
@@ -2185,6 +2288,7 @@ static int criu_restore_queues(struct kfd_process *p,
 	for (i = 0; i < args->num_of_queues; i++) {
 		struct kfd_criu_q_bucket q_bucket;
 		struct queue_restore_data qrd;
+		uint32_t q_data_size;
 
 		memset(&qrd, 0, sizeof(qrd));
 
@@ -2212,12 +2316,42 @@ static int criu_restore_queues(struct kfd_process *p,
 			ret = -EFAULT;
 			return ret;
 		}
+
+		q_data_size = q_bucket.cu_mask_size;
+
+		/* Increase local buffer space if needed */
+		if (q_data_size > data_ptr_size) {
+			if (data_ptr)
+				kfree(data_ptr);
+
+			data_ptr = (uint8_t*)kzalloc(q_data_size, GFP_KERNEL);
+			if (!data_ptr) {
+				ret = -ENOMEM;
+				break;
+			}
+			data_ptr_size = q_data_size;
+		}
+
+		ret = copy_from_user(data_ptr,
+				(void __user *) queues_data + q_bucket.queues_data_offset,
+				q_data_size);
+		if (ret) {
+			ret = -EFAULT;
+			break;
+		}
+
+		qrd.cu_mask_size = q_bucket.cu_mask_size;
+		qrd.cu_mask = data_ptr;
+
 		ret = criu_restore_queue(p, dev, pdd, &q_bucket, &qrd);
 		if (ret) {
 			pr_err("Failed to restore queue (%d)\n", ret);
 			break;
 		}
 	}
+
+	if (data_ptr)
+		kfree(data_ptr);
 	return ret;
 }
 
@@ -2507,6 +2641,7 @@ static int kfd_ioctl_criu_helper(struct file *filep,
 				struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_criu_helper_args *args = data;
+	u32 queues_data_size = 0;
 	struct kgd_mem *kgd_mem;
 	struct queue *q;
 	u64 num_of_bos = 0;
@@ -2544,6 +2679,12 @@ static int kfd_ioctl_criu_helper(struct file *filep,
 				q->properties.type == KFD_QUEUE_TYPE_SDMA ||
 				q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 
+				u32 cu_mask_size = 0;
+				ret = get_queue_data_sizes(pdd, q, &cu_mask_size);
+				if (ret)
+					goto err_unlock;
+
+				queues_data_size += cu_mask_size;
 				q_index++;
 			} else {
 				pr_err("Unsupported queue type (%d)\n", q->properties.type);
@@ -2558,6 +2699,7 @@ static int kfd_ioctl_criu_helper(struct file *filep,
 	args->num_of_devices = p->n_pdds;
 	args->num_of_bos = num_of_bos;
 	args->num_of_queues = q_index;
+	args->queues_data_size = queues_data_size;
 	dev_dbg(kfd_device, "Num of bos = %llu\n", num_of_bos);
 
 err_unlock:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a4a0713ca9a3..55180a6fe102 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -473,6 +473,9 @@ struct queue_restore_data {
 	uint32_t qid;
 	uint32_t sdma_id;
 	uint32_t doorbell_id;
+
+	void *cu_mask;
+	uint32_t cu_mask_size;
 };
 
 struct queue_properties {
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  parent reply	other threads:[~2021-05-01  1:58 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-01  1:57 [RFC PATCH 00/17] CRIU support for ROCm Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 01/17] x86/configs: CRIU update release defconfig Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 02/17] x86/configs: CRIU update debug rock defconfig Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 03/17] drm/amdkfd: CRIU Introduce Checkpoint-Restore APIs Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 04/17] drm/amdkfd: CRIU Implement KFD helper ioctl Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 05/17] drm/amdkfd: CRIU Implement KFD dumper ioctl Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 06/17] drm/amdkfd: CRIU Implement KFD restore ioctl Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 07/17] drm/amdkfd: CRIU Implement KFD resume ioctl Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 08/17] drm/amdkfd: CRIU add queues support Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 09/17] drm/amdkfd: CRIU restore queue ids Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 10/17] drm/amdkfd: CRIU restore sdma id for queues Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 11/17] drm/amdkfd: CRIU restore queue doorbell id Felix Kuehling
2021-05-01  1:57 ` Felix Kuehling [this message]
2021-05-01  1:57 ` [RFC PATCH 13/17] drm/amdkfd: CRIU dump and restore queue mqds Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 14/17] drm/amdkfd: CRIU dump/restore queue control stack Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 15/17] drm/amdkfd: CRIU dump and restore events Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 16/17] drm/amdkfd: CRIU implement gpu_id remapping Felix Kuehling
2021-05-01  1:57 ` [RFC PATCH 17/17] Revert "drm/amdgpu: Remove verify_access shortcut for KFD BOs" Felix Kuehling

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210501015752.888-13-Felix.Kuehling@amd.com \
    --to=felix.kuehling@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=david.yatsin@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).