From: David Yat Sin <david.yatsin@amd.com>
To: <amd-gfx@lists.freedesktop.org>
Cc: <felix.kuehling@amd.com>, <rajneesh.bhardwaj@amd.com>,
	David Yat Sin <david.yatsin@amd.com>
Subject: [PATCH v2 09/18] drm/amdkfd: CRIU add queues support
Date: Tue, 24 Aug 2021 15:06:30 -0400
Message-ID: <20210824190639.6288-10-david.yatsin@amd.com>
In-Reply-To: <20210824190639.6288-1-david.yatsin@amd.com>

Add support to the existing CRIU ioctls to save the number of queues and
the properties of each queue during checkpoint, and to re-create those
queues on restore.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  16 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  25 +-
 .../amd/amdkfd/kfd_process_queue_manager.c    | 363 ++++++++++++++++++
 3 files changed, 402 insertions(+), 2 deletions(-)
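
Below is a rough illustration, for reviewers, of how a user-space
checkpointer (e.g. a CRIU plugin) would be expected to drive the queue path
added in this patch. It is a minimal sketch only: total_queues,
queues_priv_data_size, num_objects, objects_size and the
bucket/private-data layout come from this series, while the
AMDKFD_IOC_CRIU_* request names and the args "type" field are assumptions
for illustration where this patch does not show them.

	/* Hypothetical user-space sketch, not part of this patch. */
	#include <stdint.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include "kfd_ioctl.h"	/* uapi header from earlier in this series */

	static int checkpoint_queues(int kfd_fd,
				     struct kfd_ioctl_criu_dumper_args *dump)
	{
		struct kfd_ioctl_criu_process_info_args info = {0};

		if (ioctl(kfd_fd, AMDKFD_IOC_CRIU_PROCESS_INFO, &info))
			return -1;

		/* One fixed-size bucket per queue, followed by the
		 * variable-size private data of every queue.
		 */
		dump->type = KFD_CRIU_OBJECT_TYPE_QUEUE;
		dump->num_objects = info.total_queues;
		dump->objects_size = info.total_queues *
				     sizeof(struct kfd_criu_queue_bucket) +
				     info.queues_priv_data_size;
		dump->objects = (uint64_t)(uintptr_t)malloc(dump->objects_size);
		if (!dump->objects)
			return -1;

		return ioctl(kfd_fd, AMDKFD_IOC_CRIU_DUMPER, dump);
	}

On restore, the same blob is handed back unmodified through the restorer
ioctl with KFD_CRIU_OBJECT_TYPE_QUEUE, which kfd_criu_restore_queues below
then consumes bucket by bucket.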

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 821b7663fa5d..7f5dc038030c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2006,6 +2006,8 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
 		ret = criu_dump_bos(p, args);
 		break;
 	case KFD_CRIU_OBJECT_TYPE_QUEUE:
+		ret = kfd_criu_dump_queues(p, args);
+		break;
 	case KFD_CRIU_OBJECT_TYPE_EVENT:
 	case KFD_CRIU_OBJECT_TYPE_DEVICE:
 	case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
@@ -2305,6 +2307,8 @@ static int kfd_ioctl_criu_restorer(struct file *filep,
 		ret = criu_restore_bos(p, args);
 		break;
 	case KFD_CRIU_OBJECT_TYPE_QUEUE:
+		ret = kfd_criu_restore_queues(p, args);
+		break;
 	case KFD_CRIU_OBJECT_TYPE_EVENT:
 	case KFD_CRIU_OBJECT_TYPE_DEVICE:
 	case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
@@ -2380,6 +2384,7 @@ static int kfd_ioctl_criu_process_info(struct file *filep,
 				struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_criu_process_info_args *args = data;
+	uint32_t queues_extra_data_size;
 	int ret = 0;
 
 	pr_debug("Inside %s\n", __func__);
@@ -2399,7 +2404,16 @@ static int kfd_ioctl_criu_process_info(struct file *filep,
 	args->total_bos = get_process_num_bos(p);
 	args->bos_priv_data_size = args->total_bos * sizeof(struct kfd_criu_bo_priv_data);
 
-	dev_dbg(kfd_device, "Num of bos:%llu\n", args->total_bos);
+	ret = kfd_process_get_queue_info(p, &args->total_queues, &queues_extra_data_size);
+	if (ret)
+		goto err_unlock;
+
+	args->queues_priv_data_size = queues_extra_data_size +
+				(args->total_queues * sizeof(struct kfd_criu_queue_priv_data));
+
+	dev_dbg(kfd_device, "Num of bos:%llu queues:%u\n",
+				args->total_bos,
+				args->total_queues);
 err_unlock:
 	mutex_unlock(&p->mutex);
 	return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0b8165729cde..7c0f3fc551c7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1044,13 +1044,36 @@ struct kfd_criu_svm_range_priv_data {
 };
 
 struct kfd_criu_queue_priv_data {
-	uint64_t reserved;
+	uint64_t q_address;
+	uint64_t q_size;
+	uint64_t read_ptr_addr;
+	uint64_t write_ptr_addr;
+	uint64_t doorbell_off;
+	uint64_t eop_ring_buffer_address;
+	uint64_t ctx_save_restore_area_address;
+	uint32_t gpu_id;
+	uint32_t type;
+	uint32_t format;
+	uint32_t q_id;
+	uint32_t priority;
+	uint32_t q_percent;
+	uint32_t doorbell_id;
+	uint32_t is_gws;
+	uint32_t sdma_id;
+	uint32_t eop_ring_buffer_size;
+	uint32_t ctx_save_restore_area_size;
+	uint32_t ctl_stack_size;
+	uint32_t cu_mask_size;
+	uint32_t mqd_size;
 };
 
 struct kfd_criu_event_priv_data {
 	uint64_t reserved;
 };
 
+int kfd_process_get_queue_info(struct kfd_process *p, uint32_t *num_queues, uint32_t *q_data_sizes);
+int kfd_criu_dump_queues(struct kfd_process *p, struct kfd_ioctl_criu_dumper_args *args);
+int kfd_criu_restore_queues(struct kfd_process *p, struct kfd_ioctl_criu_restorer_args *args);
 /* CRIU - End */
 
 /* Queue Context Management */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 95a6c36cea4c..65cec45190b3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -507,6 +507,369 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
 						       save_area_used_size);
 }
 
+static void get_queue_data_sizes(struct kfd_process_device *pdd,
+				struct queue *q,
+				uint32_t *cu_mask_size)
+{
+	*cu_mask_size = sizeof(uint32_t) * (q->properties.cu_mask_count / 32);
+}
+
+int kfd_process_get_queue_info(struct kfd_process *p, uint32_t *num_queues, uint32_t *q_data_sizes)
+{
+	u32 data_sizes = 0;
+	u32 q_index = 0;
+	struct queue *q;
+	int i;
+
+	/* Run over all PDDs of the process */
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+
+		list_for_each_entry(q, &pdd->qpd.queues_list, list) {
+			if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+				q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+				q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+				u32 cu_mask_size;
+
+				get_queue_data_sizes(pdd, q, &cu_mask_size);
+
+				data_sizes += cu_mask_size;
+				q_index++;
+			} else {
+				pr_err("Unsupported queue type (%d)\n", q->properties.type);
+				return -EOPNOTSUPP;
+			}
+		}
+	}
+	*num_queues = q_index;
+	*q_data_sizes = data_sizes;
+
+	return 0;
+}
+
+static void criu_dump_queue(struct kfd_process_device *pdd,
+			   struct queue *q,
+			   struct kfd_criu_queue_bucket *q_bucket,
+			   void *private_data)
+{
+	struct kfd_criu_queue_priv_data *q_data = (struct kfd_criu_queue_priv_data *) private_data;
+	uint8_t *cu_mask;
+
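+	/* Variable-size data (currently only the cu_mask) is stored right
+	 * after the fixed-size part of the private data.
+	 */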
+	cu_mask = (void *)(q_data + 1);
+
+	q_bucket->gpu_id = pdd->dev->id;
+	q_data->type = q->properties.type;
+	q_data->format = q->properties.format;
+	q_data->q_id =  q->properties.queue_id;
+	q_data->q_address = q->properties.queue_address;
+	q_data->q_size = q->properties.queue_size;
+	q_data->priority = q->properties.priority;
+	q_data->q_percent = q->properties.queue_percent;
+	q_data->read_ptr_addr = (uint64_t)q->properties.read_ptr;
+	q_data->write_ptr_addr = (uint64_t)q->properties.write_ptr;
+	q_data->doorbell_id = q->doorbell_id;
+
+	q_data->sdma_id = q->sdma_id;
+	q_data->is_gws = q->properties.is_gws;
+
+	q_data->eop_ring_buffer_address =
+		q->properties.eop_ring_buffer_address;
+
+	q_data->eop_ring_buffer_size = q->properties.eop_ring_buffer_size;
+
+	q_data->ctx_save_restore_area_address =
+		q->properties.ctx_save_restore_area_address;
+
+	q_data->ctx_save_restore_area_size =
+		q->properties.ctx_save_restore_area_size;
+
+	if (q_data->cu_mask_size)
+		memcpy(cu_mask, q->properties.cu_mask, q_data->cu_mask_size);
+
+	pr_debug("Dumping Queue: gpu_id:%x queue_id:%u\n", q_bucket->gpu_id, q_data->q_id);
+}
+
+static int criu_dump_queues_device(struct kfd_process_device *pdd,
+				unsigned int *q_index,
+				unsigned int max_num_queues,
+				struct kfd_criu_queue_bucket *q_buckets,
+				uint8_t *user_priv_data,
+				uint64_t *queues_priv_data_offset)
+{
+	struct queue *q;
+	uint8_t *q_private_data = NULL; /* Local buffer to store individual queue private data */
+	unsigned int q_private_data_size = 0;
+	int ret = 0;
+
+	list_for_each_entry(q, &pdd->qpd.queues_list, list) {
+		struct kfd_criu_queue_bucket q_bucket;
+		struct kfd_criu_queue_priv_data *q_data;
+		uint64_t q_data_size;
+		uint32_t cu_mask_size;
+
+		if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE &&
+			q->properties.type != KFD_QUEUE_TYPE_SDMA &&
+			q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI) {
+
+			pr_err("Unsupported queue type (%d)\n", q->properties.type);
+			return -EOPNOTSUPP;
+		}
+
+		memset(&q_bucket, 0, sizeof(q_bucket));
+
+		get_queue_data_sizes(pdd, q, &cu_mask_size);
+
+		q_data_size = sizeof(*q_data) + cu_mask_size;
+
+		/* Increase local buffer space if needed */
+		if (q_private_data_size < q_data_size) {
+			kfree(q_private_data);
+
+			q_private_data = kzalloc(q_data_size, GFP_KERNEL);
+			if (!q_private_data) {
+				ret = -ENOMEM;
+				break;
+			}
+			q_private_data_size = q_data_size;
+		}
+
+		q_data = (struct kfd_criu_queue_priv_data *)q_private_data;
+
+		q_data->cu_mask_size = cu_mask_size;
+
+		criu_dump_queue(pdd, q, &q_bucket, q_data);
+
+		q_bucket.priv_data_offset = *queues_priv_data_offset;
+		q_bucket.priv_data_size = q_data_size;
+
+		if (copy_to_user((void __user *)(user_priv_data + q_bucket.priv_data_offset),
+				 q_private_data, q_bucket.priv_data_size)) {
+			ret = -EFAULT;
+			break;
+		}
+		*queues_priv_data_offset += q_data_size;
+
+		if (copy_to_user((void __user *)&q_buckets[*q_index],
+				 &q_bucket, sizeof(q_bucket))) {
+			pr_err("Failed to copy queue information to user\n");
+			ret = -EFAULT;
+			break;
+		}
+		*q_index = *q_index + 1;
+	}
+
+	kfree(q_private_data);
+
+	return ret;
+}
+
+int kfd_criu_dump_queues(struct kfd_process *p, struct kfd_ioctl_criu_dumper_args *args)
+{
+	struct kfd_criu_queue_bucket *queue_buckets;
+	uint32_t num_queues, queue_extra_data_sizes;
+	uint64_t queues_priv_data_offset = 0;
+	int ret = 0, pdd_index;
+	unsigned int q_index = 0;
+	void *private_data; /* Pointer to first private data in userspace */
+
+	ret = kfd_process_get_queue_info(p, &num_queues, &queue_extra_data_sizes);
+	if (ret)
+		return ret;
+
+	if (args->num_objects != num_queues) {
+		pr_err("Mismatch in number of queues (current:%u user:%llu)\n",
+							num_queues, args->num_objects);
+		return -EINVAL;
+	}
+
+	if (args->objects_size != queue_extra_data_sizes +
+				  (num_queues * (sizeof(*queue_buckets) +
+						 sizeof(struct kfd_criu_queue_priv_data)))) {
+		pr_err("Invalid objects size for queues\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * The private data size for each queue can vary because it also
+	 * includes the cu_mask. The first queue's private data starts
+	 * right after the array of queue_buckets.
+	 */
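+	/*
+	 * Illustrative layout of args->objects for two queues (the private
+	 * data sizes differ because each includes that queue's cu_mask):
+	 *
+	 * | bucket[0] | bucket[1] | q0 priv + cu_mask | q1 priv + cu_mask |
+	 *
+	 * bucket[n].priv_data_offset locates queue n's blob relative to the
+	 * end of the bucket array.
+	 */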
+
+	queue_buckets = (struct kfd_criu_queue_bucket *)args->objects;
+	private_data = (void *)(queue_buckets + args->num_objects);
+
+	for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
+		struct kfd_process_device *pdd = p->pdds[pdd_index];
+
+		/* criu_dump_queues_device will copy data to user */
+		ret = criu_dump_queues_device(pdd,
+					      &q_index,
+					      args->num_objects,
+					      queue_buckets,
+					      private_data,
+					      &queues_priv_data_offset);
+
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static int set_queue_properties_from_criu(struct queue_properties *qp,
+					  struct kfd_criu_queue_bucket *q_bucket,
+					  struct kfd_criu_queue_priv_data *q_data,
+					  void *cu_mask)
+{
+	qp->is_interop = false;
+	qp->is_gws = q_data->is_gws;
+	qp->queue_percent = q_data->q_percent;
+	qp->priority = q_data->priority;
+	qp->queue_address = q_data->q_address;
+	qp->queue_size = q_data->q_size;
+	qp->read_ptr = (uint32_t *) q_data->read_ptr_addr;
+	qp->write_ptr = (uint32_t *) q_data->write_ptr_addr;
+	qp->eop_ring_buffer_address = q_data->eop_ring_buffer_address;
+	qp->eop_ring_buffer_size = q_data->eop_ring_buffer_size;
+	qp->ctx_save_restore_area_address = q_data->ctx_save_restore_area_address;
+	qp->ctx_save_restore_area_size = q_data->ctx_save_restore_area_size;
+	qp->ctl_stack_size = q_data->ctl_stack_size;
+	qp->type = q_data->type;
+	qp->format = q_data->format;
+
+	if (q_data->cu_mask_size) {
+		qp->cu_mask = kzalloc(q_data->cu_mask_size, GFP_KERNEL);
+		if (!qp->cu_mask)
+			return -ENOMEM;
+
+		/* CU mask is stored after q_data */
+		memcpy(qp->cu_mask, cu_mask, q_data->cu_mask_size);
+		qp->cu_mask_count = (q_data->cu_mask_size / sizeof(uint32_t)) * 32;
+	}
+
+	return 0;
+}
+
+static int criu_restore_queue(struct kfd_process *p,
+			      struct kfd_dev *dev,
+			      struct kfd_process_device *pdd,
+			      struct kfd_criu_queue_bucket *q_bucket,
+			      void *private_data)
+{
+	struct kfd_criu_queue_priv_data *q_data = (struct kfd_criu_queue_priv_data *) private_data;
+	uint8_t *cu_mask;
+	struct queue_properties qp;
+	unsigned int queue_id;
+	int ret = 0;
+
+	pr_debug("Restoring Queue: gpu_id:%x queue_id:%u\n", q_bucket->gpu_id, q_data->q_id);
+
+	/* data stored in this order: cu_mask */
+	cu_mask = (void *)(q_data + 1);
+
+	memset(&qp, 0, sizeof(qp));
+	ret = set_queue_properties_from_criu(&qp, q_bucket, q_data, cu_mask);
+	if (ret)
+		goto err_create_queue;
+
+	print_queue_properties(&qp);
+
+	ret = pqm_create_queue(&p->pqm, dev, NULL, &qp, &queue_id, NULL);
+	if (ret) {
+		pr_err("Failed to create new queue err:%d\n", ret);
+		ret = -EINVAL;
+		goto err_create_queue;
+	}
+
+	pr_debug("Queue id %d was restored successfully\n", queue_id);
+
+	return 0;
+err_create_queue:
+	kfree(qp.cu_mask);
+
+	return ret;
+}
+
+int kfd_criu_restore_queues(struct kfd_process *p, struct kfd_ioctl_criu_restorer_args *args)
+{
+	int ret = 0, i;
+	struct kfd_criu_queue_bucket *user_buckets;
+	uint8_t *all_private_data; /* Pointer to first private data in userspace */
+	uint8_t *q_private_data = NULL; /* Local buffer for individual queue private data */
+	unsigned int q_private_data_size = 0;
+
+	user_buckets = (struct kfd_criu_queue_bucket *)args->objects;
+	all_private_data = (void *)(user_buckets + args->num_objects);
+
+	/*
+	 * The process does not have any queues at this point, but we still
+	 * set all of its DQMs (device queue managers) to the evicted state,
+	 * so that queues created during the restore start out evicted; they
+	 * are only started again by the CRIU resume ioctl.
+	 */
+	ret = kfd_process_evict_queues(p);
+	if (ret)
+		goto exit;
+
+	for (i = 0; i < args->num_objects; i++) {
+		struct kfd_process_device *pdd;
+		struct kfd_dev *dev;
+		struct kfd_criu_queue_bucket q_bucket;
+
+		if (copy_from_user(&q_bucket, (void __user *)&user_buckets[i],
+				   sizeof(struct kfd_criu_queue_bucket))) {
+			ret = -EFAULT;
+			goto exit;
+		}
+
+		/* Increase local buffer space if needed */
+		if (q_bucket.priv_data_size > q_private_data_size) {
+			kfree(q_private_data);
+
+			q_private_data = kmalloc(q_bucket.priv_data_size, GFP_KERNEL);
+			if (!q_private_data) {
+				ret = -ENOMEM;
+				goto exit;
+			}
+			q_private_data_size = q_bucket.priv_data_size;
+		}
+
+		if (copy_from_user(q_private_data,
+				   (void __user *)(all_private_data + q_bucket.priv_data_offset),
+				   q_bucket.priv_data_size)) {
+			ret = -EFAULT;
+			goto exit;
+		}
+
+		dev = kfd_device_by_id(q_bucket.gpu_id);
+		if (!dev) {
+			pr_err("Could not get kfd_dev from gpu_id = 0x%x\n",
+			q_bucket.gpu_id);
+
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		pdd = kfd_get_process_device_data(dev, p);
+		if (!pdd) {
+			pr_err("Failed to get pdd\n");
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		ret = criu_restore_queue(p, dev, pdd, &q_bucket, q_private_data);
+		if (ret) {
+			pr_err("Failed to restore queue (%d)\n", ret);
+			goto exit;
+		}
+
+	}
+
+exit:
+	kfree(q_private_data);
+
+	return ret;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 int pqm_debugfs_mqds(struct seq_file *m, void *data)
-- 
2.17.1


Thread overview: 19+ messages
2021-08-24 19:06 [PATCH v2 00/18] CHECKPOINT RESTORE WITH ROCm David Yat Sin
2021-08-24 19:06 ` [PATCH v2 01/18] x86/configs: CRIU update release defconfig David Yat Sin
2021-08-24 19:06 ` [PATCH v2 02/18] x86/configs: CRIU update debug rock defconfig David Yat Sin
2021-08-24 19:06 ` [PATCH v2 03/18] drm/amdkfd: CRIU Introduce Checkpoint-Restore APIs David Yat Sin
2021-08-24 19:06 ` [PATCH v2 04/18] drm/amdkfd: CRIU Implement KFD process_info ioctl David Yat Sin
2021-08-24 19:06 ` [PATCH v2 05/18] drm/amdkfd: CRIU Implement KFD dumper ioctl David Yat Sin
2021-08-24 19:06 ` [PATCH v2 06/18] drm/amdkfd: CRIU Implement KFD restore ioctl David Yat Sin
2021-08-24 19:06 ` [PATCH v2 07/18] drm/amdkfd: CRIU Implement KFD resume ioctl David Yat Sin
2021-08-24 19:06 ` [PATCH v2 08/18] drm/amdkfd: CRIU Implement KFD pause ioctl David Yat Sin
2021-08-24 19:06 ` David Yat Sin [this message]
2021-08-24 19:06 ` [PATCH v2 10/18] drm/amdkfd: CRIU restore queue ids David Yat Sin
2021-08-24 19:06 ` [PATCH v2 11/18] drm/amdkfd: CRIU restore sdma id for queues David Yat Sin
2021-08-24 19:06 ` [PATCH v2 12/18] drm/amdkfd: CRIU restore queue doorbell id David Yat Sin
2021-08-24 19:06 ` [PATCH v2 13/18] drm/amdkfd: CRIU dump and restore queue mqds David Yat Sin
2021-08-24 19:06 ` [PATCH v2 14/18] drm/amdkfd: CRIU dump/restore queue control stack David Yat Sin
2021-08-24 19:06 ` [PATCH v2 15/18] drm/amdkfd: CRIU dump and restore events David Yat Sin
2021-08-24 19:06 ` [PATCH v2 16/18] drm/amdkfd: CRIU implement gpu_id remapping David Yat Sin
2021-08-24 19:06 ` [PATCH v2 17/18] Revert "drm/amdgpu: Remove verify_access shortcut for KFD BOs" David Yat Sin
2021-08-24 19:06 ` [PATCH v2 18/18] drm/amdkfd: CRIU export kfd bos as prime dmabuf objects David Yat Sin
