All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count
@ 2022-04-18 16:44 David Yat Sin
  2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: David Yat Sin @ 2022-04-18 16:44 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

dqm->gws_queue_count and pdd->qpd.mapped_gws_queue needs to be updated
each time the queue gets evicted.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 83 +++++++++----------
 1 file changed, 37 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index acf4f7975850..198672264492 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -130,19 +130,33 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
 }
 
 static void increment_queue_count(struct device_queue_manager *dqm,
-			enum kfd_queue_type type)
+				  struct qcm_process_device *qpd,
+				  struct queue *q)
 {
 	dqm->active_queue_count++;
-	if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
+	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+	    q->properties.type == KFD_QUEUE_TYPE_DIQ)
 		dqm->active_cp_queue_count++;
+
+	if (q->properties.is_gws) {
+		dqm->gws_queue_count++;
+		qpd->mapped_gws_queue = true;
+	}
 }
 
 static void decrement_queue_count(struct device_queue_manager *dqm,
-			enum kfd_queue_type type)
+				  struct qcm_process_device *qpd,
+				  struct queue *q)
 {
 	dqm->active_queue_count--;
-	if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
+	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+	    q->properties.type == KFD_QUEUE_TYPE_DIQ)
 		dqm->active_cp_queue_count--;
+
+	if (q->properties.is_gws) {
+		dqm->gws_queue_count--;
+		qpd->mapped_gws_queue = false;
+	}
 }
 
 /*
@@ -412,7 +426,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	list_add(&q->list, &qpd->queues_list);
 	qpd->queue_count++;
 	if (q->properties.is_active)
-		increment_queue_count(dqm, q->properties.type);
+		increment_queue_count(dqm, qpd, q);
 
 	/*
 	 * Unconditionally increment this counter, regardless of the queue's
@@ -601,13 +615,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 		deallocate_vmid(dqm, qpd, q);
 	}
 	qpd->queue_count--;
-	if (q->properties.is_active) {
-		decrement_queue_count(dqm, q->properties.type);
-		if (q->properties.is_gws) {
-			dqm->gws_queue_count--;
-			qpd->mapped_gws_queue = false;
-		}
-	}
+	if (q->properties.is_active)
+		decrement_queue_count(dqm, qpd, q);
 
 	return retval;
 }
@@ -700,12 +709,11 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
 	 * dqm->active_queue_count to determine whether a new runlist must be
 	 * uploaded.
 	 */
-	if (q->properties.is_active && !prev_active)
-		increment_queue_count(dqm, q->properties.type);
-	else if (!q->properties.is_active && prev_active)
-		decrement_queue_count(dqm, q->properties.type);
-
-	if (q->gws && !q->properties.is_gws) {
+	if (q->properties.is_active && !prev_active) {
+		increment_queue_count(dqm, &pdd->qpd, q);
+	} else if (!q->properties.is_active && prev_active) {
+		decrement_queue_count(dqm, &pdd->qpd, q);
+	} else if (q->gws && !q->properties.is_gws) {
 		if (q->properties.is_active) {
 			dqm->gws_queue_count++;
 			pdd->qpd.mapped_gws_queue = true;
@@ -767,11 +775,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
 				q->properties.type)];
 		q->properties.is_active = false;
-		decrement_queue_count(dqm, q->properties.type);
-		if (q->properties.is_gws) {
-			dqm->gws_queue_count--;
-			qpd->mapped_gws_queue = false;
-		}
+		decrement_queue_count(dqm, qpd, q);
 
 		if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
 			continue;
@@ -817,7 +821,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 			continue;
 
 		q->properties.is_active = false;
-		decrement_queue_count(dqm, q->properties.type);
+		decrement_queue_count(dqm, qpd, q);
 	}
 	pdd->last_evict_timestamp = get_jiffies_64();
 	retval = execute_queues_cpsch(dqm,
@@ -888,11 +892,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
 				q->properties.type)];
 		q->properties.is_active = true;
-		increment_queue_count(dqm, q->properties.type);
-		if (q->properties.is_gws) {
-			dqm->gws_queue_count++;
-			qpd->mapped_gws_queue = true;
-		}
+		increment_queue_count(dqm, qpd, q);
 
 		if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n"))
 			continue;
@@ -950,7 +950,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 			continue;
 
 		q->properties.is_active = true;
-		increment_queue_count(dqm, q->properties.type);
+		increment_queue_count(dqm, &pdd->qpd, q);
 	}
 	retval = execute_queues_cpsch(dqm,
 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
@@ -1378,7 +1378,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
 			dqm->total_queue_count);
 
 	list_add(&kq->list, &qpd->priv_queue_list);
-	increment_queue_count(dqm, kq->queue->properties.type);
+	increment_queue_count(dqm, qpd, kq->queue);
 	qpd->is_debug = true;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
@@ -1392,7 +1392,7 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
 {
 	dqm_lock(dqm);
 	list_del(&kq->list);
-	decrement_queue_count(dqm, kq->queue->properties.type);
+	decrement_queue_count(dqm, qpd, kq->queue);
 	qpd->is_debug = false;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
 	/*
@@ -1467,7 +1467,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	qpd->queue_count++;
 
 	if (q->properties.is_active) {
-		increment_queue_count(dqm, q->properties.type);
+		increment_queue_count(dqm, qpd, q);
 
 		execute_queues_cpsch(dqm,
 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
@@ -1683,15 +1683,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 	list_del(&q->list);
 	qpd->queue_count--;
 	if (q->properties.is_active) {
-		decrement_queue_count(dqm, q->properties.type);
+		decrement_queue_count(dqm, qpd, q);
 		retval = execute_queues_cpsch(dqm,
 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 		if (retval == -ETIME)
 			qpd->reset_wavefronts = true;
-		if (q->properties.is_gws) {
-			dqm->gws_queue_count--;
-			qpd->mapped_gws_queue = false;
-		}
 	}
 
 	/*
@@ -1932,7 +1928,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	/* Clean all kernel queues */
 	list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
 		list_del(&kq->list);
-		decrement_queue_count(dqm, kq->queue->properties.type);
+		decrement_queue_count(dqm, qpd, kq->queue);
 		qpd->is_debug = false;
 		dqm->total_queue_count--;
 		filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
@@ -1945,13 +1941,8 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 		else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
 			deallocate_sdma_queue(dqm, q);
 
-		if (q->properties.is_active) {
-			decrement_queue_count(dqm, q->properties.type);
-			if (q->properties.is_gws) {
-				dqm->gws_queue_count--;
-				qpd->mapped_gws_queue = false;
-			}
-		}
+		if (q->properties.is_active)
+			decrement_queue_count(dqm, qpd, q);
 
 		dqm->total_queue_count--;
 	}
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
  2022-04-18 16:44 [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
@ 2022-04-18 16:44 ` David Yat Sin
  2022-04-18 20:23   ` Paul Menzel
  2022-04-18 19:01 ` [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
  2022-04-18 20:18 ` Paul Menzel
  2 siblings, 1 reply; 8+ messages in thread
From: David Yat Sin @ 2022-04-18 16:44 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Adding support to checkpoint/restore GWS(Global Wave Sync) queues.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                  |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f36062be9ca8..192dbef04c43 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
 	uint32_t priority;
 	uint32_t q_percent;
 	uint32_t doorbell_id;
-	uint32_t is_gws;
+	uint32_t gws;
 	uint32_t sdma_id;
 	uint32_t eop_ring_buffer_size;
 	uint32_t ctx_save_restore_area_size;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 6eca9509f2e3..4f58e671d39b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct kfd_process_device *pdd,
 	q_data->ctx_save_restore_area_size =
 		q->properties.ctx_save_restore_area_size;
 
+	q_data->gws = !!q->gws;
+
 	ret = pqm_checkpoint_mqd(&pdd->process->pqm, q->properties.queue_id, mqd, ctl_stack);
 	if (ret) {
 		pr_err("Failed checkpoint queue_mqd (%d)\n", ret);
@@ -743,7 +745,6 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
 					  struct kfd_criu_queue_priv_data *q_data)
 {
 	qp->is_interop = false;
-	qp->is_gws = q_data->is_gws;
 	qp->queue_percent = q_data->q_percent;
 	qp->priority = q_data->priority;
 	qp->queue_address = q_data->q_address;
@@ -826,12 +827,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
 				NULL);
 	if (ret) {
 		pr_err("Failed to create new queue err:%d\n", ret);
-		ret = -EINVAL;
+		goto exit;
 	}
 
+	if (q_data->gws)
+		ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
+
 exit:
 	if (ret)
-		pr_err("Failed to create queue (%d)\n", ret);
+		pr_err("Failed to restore queue (%d)\n", ret);
 	else
 		pr_debug("Queue id %d was restored successfully\n", queue_id);
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count
  2022-04-18 16:44 [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
  2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
@ 2022-04-18 19:01 ` Felix Kuehling
  2022-04-18 20:18 ` Paul Menzel
  2 siblings, 0 replies; 8+ messages in thread
From: Felix Kuehling @ 2022-04-18 19:01 UTC (permalink / raw)
  To: David Yat Sin, amd-gfx


Am 2022-04-18 um 12:44 schrieb David Yat Sin:
> dqm->gws_queue_count and pdd->qpd.mapped_gws_queue needs to be updated
> each time the queue gets evicted.
>
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
The series is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 83 +++++++++----------
>   1 file changed, 37 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index acf4f7975850..198672264492 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -130,19 +130,33 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
>   }
>   
>   static void increment_queue_count(struct device_queue_manager *dqm,
> -			enum kfd_queue_type type)
> +				  struct qcm_process_device *qpd,
> +				  struct queue *q)
>   {
>   	dqm->active_queue_count++;
> -	if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
> +	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
> +	    q->properties.type == KFD_QUEUE_TYPE_DIQ)
>   		dqm->active_cp_queue_count++;
> +
> +	if (q->properties.is_gws) {
> +		dqm->gws_queue_count++;
> +		qpd->mapped_gws_queue = true;
> +	}
>   }
>   
>   static void decrement_queue_count(struct device_queue_manager *dqm,
> -			enum kfd_queue_type type)
> +				  struct qcm_process_device *qpd,
> +				  struct queue *q)
>   {
>   	dqm->active_queue_count--;
> -	if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
> +	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
> +	    q->properties.type == KFD_QUEUE_TYPE_DIQ)
>   		dqm->active_cp_queue_count--;
> +
> +	if (q->properties.is_gws) {
> +		dqm->gws_queue_count--;
> +		qpd->mapped_gws_queue = false;
> +	}
>   }
>   
>   /*
> @@ -412,7 +426,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	list_add(&q->list, &qpd->queues_list);
>   	qpd->queue_count++;
>   	if (q->properties.is_active)
> -		increment_queue_count(dqm, q->properties.type);
> +		increment_queue_count(dqm, qpd, q);
>   
>   	/*
>   	 * Unconditionally increment this counter, regardless of the queue's
> @@ -601,13 +615,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>   		deallocate_vmid(dqm, qpd, q);
>   	}
>   	qpd->queue_count--;
> -	if (q->properties.is_active) {
> -		decrement_queue_count(dqm, q->properties.type);
> -		if (q->properties.is_gws) {
> -			dqm->gws_queue_count--;
> -			qpd->mapped_gws_queue = false;
> -		}
> -	}
> +	if (q->properties.is_active)
> +		decrement_queue_count(dqm, qpd, q);
>   
>   	return retval;
>   }
> @@ -700,12 +709,11 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
>   	 * dqm->active_queue_count to determine whether a new runlist must be
>   	 * uploaded.
>   	 */
> -	if (q->properties.is_active && !prev_active)
> -		increment_queue_count(dqm, q->properties.type);
> -	else if (!q->properties.is_active && prev_active)
> -		decrement_queue_count(dqm, q->properties.type);
> -
> -	if (q->gws && !q->properties.is_gws) {
> +	if (q->properties.is_active && !prev_active) {
> +		increment_queue_count(dqm, &pdd->qpd, q);
> +	} else if (!q->properties.is_active && prev_active) {
> +		decrement_queue_count(dqm, &pdd->qpd, q);
> +	} else if (q->gws && !q->properties.is_gws) {
>   		if (q->properties.is_active) {
>   			dqm->gws_queue_count++;
>   			pdd->qpd.mapped_gws_queue = true;
> @@ -767,11 +775,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>   		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   				q->properties.type)];
>   		q->properties.is_active = false;
> -		decrement_queue_count(dqm, q->properties.type);
> -		if (q->properties.is_gws) {
> -			dqm->gws_queue_count--;
> -			qpd->mapped_gws_queue = false;
> -		}
> +		decrement_queue_count(dqm, qpd, q);
>   
>   		if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
>   			continue;
> @@ -817,7 +821,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   			continue;
>   
>   		q->properties.is_active = false;
> -		decrement_queue_count(dqm, q->properties.type);
> +		decrement_queue_count(dqm, qpd, q);
>   	}
>   	pdd->last_evict_timestamp = get_jiffies_64();
>   	retval = execute_queues_cpsch(dqm,
> @@ -888,11 +892,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>   		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   				q->properties.type)];
>   		q->properties.is_active = true;
> -		increment_queue_count(dqm, q->properties.type);
> -		if (q->properties.is_gws) {
> -			dqm->gws_queue_count++;
> -			qpd->mapped_gws_queue = true;
> -		}
> +		increment_queue_count(dqm, qpd, q);
>   
>   		if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n"))
>   			continue;
> @@ -950,7 +950,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   			continue;
>   
>   		q->properties.is_active = true;
> -		increment_queue_count(dqm, q->properties.type);
> +		increment_queue_count(dqm, &pdd->qpd, q);
>   	}
>   	retval = execute_queues_cpsch(dqm,
>   				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> @@ -1378,7 +1378,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
>   			dqm->total_queue_count);
>   
>   	list_add(&kq->list, &qpd->priv_queue_list);
> -	increment_queue_count(dqm, kq->queue->properties.type);
> +	increment_queue_count(dqm, qpd, kq->queue);
>   	qpd->is_debug = true;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   	dqm_unlock(dqm);
> @@ -1392,7 +1392,7 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
>   {
>   	dqm_lock(dqm);
>   	list_del(&kq->list);
> -	decrement_queue_count(dqm, kq->queue->properties.type);
> +	decrement_queue_count(dqm, qpd, kq->queue);
>   	qpd->is_debug = false;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>   	/*
> @@ -1467,7 +1467,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   	qpd->queue_count++;
>   
>   	if (q->properties.is_active) {
> -		increment_queue_count(dqm, q->properties.type);
> +		increment_queue_count(dqm, qpd, q);
>   
>   		execute_queues_cpsch(dqm,
>   				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> @@ -1683,15 +1683,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   	list_del(&q->list);
>   	qpd->queue_count--;
>   	if (q->properties.is_active) {
> -		decrement_queue_count(dqm, q->properties.type);
> +		decrement_queue_count(dqm, qpd, q);
>   		retval = execute_queues_cpsch(dqm,
>   				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   		if (retval == -ETIME)
>   			qpd->reset_wavefronts = true;
> -		if (q->properties.is_gws) {
> -			dqm->gws_queue_count--;
> -			qpd->mapped_gws_queue = false;
> -		}
>   	}
>   
>   	/*
> @@ -1932,7 +1928,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	/* Clean all kernel queues */
>   	list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
>   		list_del(&kq->list);
> -		decrement_queue_count(dqm, kq->queue->properties.type);
> +		decrement_queue_count(dqm, qpd, kq->queue);
>   		qpd->is_debug = false;
>   		dqm->total_queue_count--;
>   		filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
> @@ -1945,13 +1941,8 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   		else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>   			deallocate_sdma_queue(dqm, q);
>   
> -		if (q->properties.is_active) {
> -			decrement_queue_count(dqm, q->properties.type);
> -			if (q->properties.is_gws) {
> -				dqm->gws_queue_count--;
> -				qpd->mapped_gws_queue = false;
> -			}
> -		}
> +		if (q->properties.is_active)
> +			decrement_queue_count(dqm, qpd, q);
>   
>   		dqm->total_queue_count--;
>   	}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count
  2022-04-18 16:44 [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
  2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
  2022-04-18 19:01 ` [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
@ 2022-04-18 20:18 ` Paul Menzel
  2 siblings, 0 replies; 8+ messages in thread
From: Paul Menzel @ 2022-04-18 20:18 UTC (permalink / raw)
  To: David Yat Sin; +Cc: Felix Kühling, amd-gfx

Dear David,


Thank you for your patch.

Am 18.04.22 um 18:44 schrieb David Yat Sin:
> dqm->gws_queue_count and pdd->qpd.mapped_gws_queue needs to be updated

s/needs/need/

> each time the queue gets evicted.

Why?

Do you only change the case, when an element of the queue gets evicted?

Next time, a short note about the implementation would be nice.

No Fixes tag?


Kind regards,

Paul


> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> ---
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 83 +++++++++----------
>   1 file changed, 37 insertions(+), 46 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index acf4f7975850..198672264492 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -130,19 +130,33 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
>   }
>   
>   static void increment_queue_count(struct device_queue_manager *dqm,
> -			enum kfd_queue_type type)
> +				  struct qcm_process_device *qpd,
> +				  struct queue *q)
>   {
>   	dqm->active_queue_count++;
> -	if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
> +	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
> +	    q->properties.type == KFD_QUEUE_TYPE_DIQ)
>   		dqm->active_cp_queue_count++;
> +
> +	if (q->properties.is_gws) {
> +		dqm->gws_queue_count++;
> +		qpd->mapped_gws_queue = true;
> +	}
>   }
>   
>   static void decrement_queue_count(struct device_queue_manager *dqm,
> -			enum kfd_queue_type type)
> +				  struct qcm_process_device *qpd,
> +				  struct queue *q)
>   {
>   	dqm->active_queue_count--;
> -	if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
> +	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
> +	    q->properties.type == KFD_QUEUE_TYPE_DIQ)
>   		dqm->active_cp_queue_count--;
> +
> +	if (q->properties.is_gws) {
> +		dqm->gws_queue_count--;
> +		qpd->mapped_gws_queue = false;
> +	}
>   }
>   
>   /*
> @@ -412,7 +426,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	list_add(&q->list, &qpd->queues_list);
>   	qpd->queue_count++;
>   	if (q->properties.is_active)
> -		increment_queue_count(dqm, q->properties.type);
> +		increment_queue_count(dqm, qpd, q);
>   
>   	/*
>   	 * Unconditionally increment this counter, regardless of the queue's
> @@ -601,13 +615,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>   		deallocate_vmid(dqm, qpd, q);
>   	}
>   	qpd->queue_count--;
> -	if (q->properties.is_active) {
> -		decrement_queue_count(dqm, q->properties.type);
> -		if (q->properties.is_gws) {
> -			dqm->gws_queue_count--;
> -			qpd->mapped_gws_queue = false;
> -		}
> -	}
> +	if (q->properties.is_active)
> +		decrement_queue_count(dqm, qpd, q);
>   
>   	return retval;
>   }
> @@ -700,12 +709,11 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
>   	 * dqm->active_queue_count to determine whether a new runlist must be
>   	 * uploaded.
>   	 */
> -	if (q->properties.is_active && !prev_active)
> -		increment_queue_count(dqm, q->properties.type);
> -	else if (!q->properties.is_active && prev_active)
> -		decrement_queue_count(dqm, q->properties.type);
> -
> -	if (q->gws && !q->properties.is_gws) {
> +	if (q->properties.is_active && !prev_active) {
> +		increment_queue_count(dqm, &pdd->qpd, q);
> +	} else if (!q->properties.is_active && prev_active) {
> +		decrement_queue_count(dqm, &pdd->qpd, q);
> +	} else if (q->gws && !q->properties.is_gws) {
>   		if (q->properties.is_active) {
>   			dqm->gws_queue_count++;
>   			pdd->qpd.mapped_gws_queue = true;
> @@ -767,11 +775,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>   		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   				q->properties.type)];
>   		q->properties.is_active = false;
> -		decrement_queue_count(dqm, q->properties.type);
> -		if (q->properties.is_gws) {
> -			dqm->gws_queue_count--;
> -			qpd->mapped_gws_queue = false;
> -		}
> +		decrement_queue_count(dqm, qpd, q);
>   
>   		if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
>   			continue;
> @@ -817,7 +821,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   			continue;
>   
>   		q->properties.is_active = false;
> -		decrement_queue_count(dqm, q->properties.type);
> +		decrement_queue_count(dqm, qpd, q);
>   	}
>   	pdd->last_evict_timestamp = get_jiffies_64();
>   	retval = execute_queues_cpsch(dqm,
> @@ -888,11 +892,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>   		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   				q->properties.type)];
>   		q->properties.is_active = true;
> -		increment_queue_count(dqm, q->properties.type);
> -		if (q->properties.is_gws) {
> -			dqm->gws_queue_count++;
> -			qpd->mapped_gws_queue = true;
> -		}
> +		increment_queue_count(dqm, qpd, q);
>   
>   		if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n"))
>   			continue;
> @@ -950,7 +950,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   			continue;
>   
>   		q->properties.is_active = true;
> -		increment_queue_count(dqm, q->properties.type);
> +		increment_queue_count(dqm, &pdd->qpd, q);
>   	}
>   	retval = execute_queues_cpsch(dqm,
>   				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> @@ -1378,7 +1378,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
>   			dqm->total_queue_count);
>   
>   	list_add(&kq->list, &qpd->priv_queue_list);
> -	increment_queue_count(dqm, kq->queue->properties.type);
> +	increment_queue_count(dqm, qpd, kq->queue);
>   	qpd->is_debug = true;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   	dqm_unlock(dqm);
> @@ -1392,7 +1392,7 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
>   {
>   	dqm_lock(dqm);
>   	list_del(&kq->list);
> -	decrement_queue_count(dqm, kq->queue->properties.type);
> +	decrement_queue_count(dqm, qpd, kq->queue);
>   	qpd->is_debug = false;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>   	/*
> @@ -1467,7 +1467,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   	qpd->queue_count++;
>   
>   	if (q->properties.is_active) {
> -		increment_queue_count(dqm, q->properties.type);
> +		increment_queue_count(dqm, qpd, q);
>   
>   		execute_queues_cpsch(dqm,
>   				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> @@ -1683,15 +1683,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   	list_del(&q->list);
>   	qpd->queue_count--;
>   	if (q->properties.is_active) {
> -		decrement_queue_count(dqm, q->properties.type);
> +		decrement_queue_count(dqm, qpd, q);
>   		retval = execute_queues_cpsch(dqm,
>   				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   		if (retval == -ETIME)
>   			qpd->reset_wavefronts = true;
> -		if (q->properties.is_gws) {
> -			dqm->gws_queue_count--;
> -			qpd->mapped_gws_queue = false;
> -		}
>   	}
>   
>   	/*
> @@ -1932,7 +1928,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	/* Clean all kernel queues */
>   	list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
>   		list_del(&kq->list);
> -		decrement_queue_count(dqm, kq->queue->properties.type);
> +		decrement_queue_count(dqm, qpd, kq->queue);
>   		qpd->is_debug = false;
>   		dqm->total_queue_count--;
>   		filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
> @@ -1945,13 +1941,8 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   		else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>   			deallocate_sdma_queue(dqm, q);
>   
> -		if (q->properties.is_active) {
> -			decrement_queue_count(dqm, q->properties.type);
> -			if (q->properties.is_gws) {
> -				dqm->gws_queue_count--;
> -				qpd->mapped_gws_queue = false;
> -			}
> -		}
> +		if (q->properties.is_active)
> +			decrement_queue_count(dqm, qpd, q);
>   
>   		dqm->total_queue_count--;
>   	}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
  2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
@ 2022-04-18 20:23   ` Paul Menzel
  2022-04-19  0:04     ` Yat Sin, David
  0 siblings, 1 reply; 8+ messages in thread
From: Paul Menzel @ 2022-04-18 20:23 UTC (permalink / raw)
  To: David Yat Sin; +Cc: Felix Kühling, amd-gfx

Dear David,


Thank you for your patch.

Am 18.04.22 um 18:44 schrieb David Yat Sin:

In the commit message summary, you could reorder some words:

Add CRIU support for GWS queues

> Adding support to checkpoint/restore GWS(Global Wave Sync) queues.

s/Adding/Add/

Please add a space before the (.

How can this be tested?

> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                  |  2 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10 +++++++---
>   2 files changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f36062be9ca8..192dbef04c43 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
>   	uint32_t priority;
>   	uint32_t q_percent;
>   	uint32_t doorbell_id;
> -	uint32_t is_gws;
> +	uint32_t gws;

Why is the new name better?

>   	uint32_t sdma_id;
>   	uint32_t eop_ring_buffer_size;
>   	uint32_t ctx_save_restore_area_size;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 6eca9509f2e3..4f58e671d39b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct kfd_process_device *pdd,
>   	q_data->ctx_save_restore_area_size =
>   		q->properties.ctx_save_restore_area_size;
>   
> +	q_data->gws = !!q->gws;
> +
>   	ret = pqm_checkpoint_mqd(&pdd->process->pqm, q->properties.queue_id, mqd, ctl_stack);
>   	if (ret) {
>   		pr_err("Failed checkpoint queue_mqd (%d)\n", ret);
> @@ -743,7 +745,6 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
>   					  struct kfd_criu_queue_priv_data *q_data)
>   {
>   	qp->is_interop = false;
> -	qp->is_gws = q_data->is_gws;
>   	qp->queue_percent = q_data->q_percent;
>   	qp->priority = q_data->priority;
>   	qp->queue_address = q_data->q_address;
> @@ -826,12 +827,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
>   				NULL);
>   	if (ret) {
>   		pr_err("Failed to create new queue err:%d\n", ret);
> -		ret = -EINVAL;
> +		goto exit;
>   	}
>   
> +	if (q_data->gws)
> +		ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
> +
>   exit:
>   	if (ret)
> -		pr_err("Failed to create queue (%d)\n", ret);
> +		pr_err("Failed to restore queue (%d)\n", ret);

Maybe separate this out, so it can be applied to stable series.

>   	else
>   		pr_debug("Queue id %d was restored successfully\n", queue_id);
>   


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
  2022-04-18 20:23   ` Paul Menzel
@ 2022-04-19  0:04     ` Yat Sin, David
  2022-04-19  6:54       ` Paul Menzel
  0 siblings, 1 reply; 8+ messages in thread
From: Yat Sin, David @ 2022-04-19  0:04 UTC (permalink / raw)
  To: Paul Menzel; +Cc: Kuehling, Felix, amd-gfx



> -----Original Message-----
> From: Paul Menzel <pmenzel@molgen.mpg.de>
> Sent: Monday, April 18, 2022 4:23 PM
> To: Yat Sin, David <David.YatSin@amd.com>
> Cc: amd-gfx@lists.freedesktop.org; Kuehling, Felix
> <Felix.Kuehling@amd.com>
> Subject: Re: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
> 
> Dear David,
> 
> 
> Thank you for your patch.
> 
> Am 18.04.22 um 18:44 schrieb David Yat Sin:
> 
> In the commit message summary, you could reorder some words:
> 
> Add CRIU support for GWS queues
> 
> > Adding support to checkpoint/restore GWS(Global Wave Sync) queues.
> 
> s/Adding/Add/
> 
> Please add a space before the (.
ACK
> 
> How can this be tested?
We have some internal tests that can be used to specifically test this feature.
> 
> > Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                  |  2 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
> +++++++---
> >   2 files changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index f36062be9ca8..192dbef04c43 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
> >   	uint32_t priority;
> >   	uint32_t q_percent;
> >   	uint32_t doorbell_id;
> > -	uint32_t is_gws;
> > +	uint32_t gws;
> 
> Why is the new name better?
The old variable (is_gws) was obtained from the queue_properties structure during checkpoint and is only used temporarily during queue creation, so this variable cannot be used to determine whether a queue has gws enabled. The new variable (gws) is obtained from the queue structure. The name is changed to better reflect this.
> 
> >   	uint32_t sdma_id;
> >   	uint32_t eop_ring_buffer_size;
> >   	uint32_t ctx_save_restore_area_size; diff --git
> > a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > index 6eca9509f2e3..4f58e671d39b 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > @@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct
> kfd_process_device *pdd,
> >   	q_data->ctx_save_restore_area_size =
> >   		q->properties.ctx_save_restore_area_size;
> >
> > +	q_data->gws = !!q->gws;
> > +
> >   	ret = pqm_checkpoint_mqd(&pdd->process->pqm, q-
> >properties.queue_id, mqd, ctl_stack);
> >   	if (ret) {
> >   		pr_err("Failed checkpoint queue_mqd (%d)\n", ret); @@ -
> 743,7
> > +745,6 @@ static void set_queue_properties_from_criu(struct
> queue_properties *qp,
> >   					  struct kfd_criu_queue_priv_data
> *q_data)
> >   {
> >   	qp->is_interop = false;
> > -	qp->is_gws = q_data->is_gws;
> >   	qp->queue_percent = q_data->q_percent;
> >   	qp->priority = q_data->priority;
> >   	qp->queue_address = q_data->q_address; @@ -826,12 +827,15 @@
> int
> > kfd_criu_restore_queue(struct kfd_process *p,
> >   				NULL);
> >   	if (ret) {
> >   		pr_err("Failed to create new queue err:%d\n", ret);
> > -		ret = -EINVAL;
> > +		goto exit;
> >   	}
> >
> > +	if (q_data->gws)
> > +		ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
> > +
> >   exit:
> >   	if (ret)
> > -		pr_err("Failed to create queue (%d)\n", ret);
> > +		pr_err("Failed to restore queue (%d)\n", ret);
> 
> Maybe separate this out, so it can be applied to stable series.
> 
> >   	else
> >   		pr_debug("Queue id %d was restored successfully\n",
> queue_id);
> >
> 
> 
> Kind regards,
> 
> Paul

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
  2022-04-19  0:04     ` Yat Sin, David
@ 2022-04-19  6:54       ` Paul Menzel
  2022-04-19 12:24         ` Yat Sin, David
  0 siblings, 1 reply; 8+ messages in thread
From: Paul Menzel @ 2022-04-19  6:54 UTC (permalink / raw)
  To: David Yat Sin; +Cc: Felix Kühling, amd-gfx


Dear David,


Thank you for sending out v3 of these patches.

Am 19.04.22 um 02:04 schrieb Yat Sin, David:
> 
> 
>> -----Original Message-----
>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>> Sent: Monday, April 18, 2022 4:23 PM

[…]
>> Am 18.04.22 um 18:44 schrieb David Yat Sin:
>>
>> In the commit message summary, you could reorder some words:
>>
>> Add CRIU support for GWS queues
>>
>>> Adding support to checkpoint/restore GWS(Global Wave Sync) queues.
>>
>> s/Adding/Add/

Did you miss the two comments above?

>> Please add a space before the (.
> ACK
>>
>> How can this be tested?
> We have some internal tests that can be used to specifically test this feature.

Nice. Are you going to publish these in the future?

>>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h                  |  2 +-
>>>    drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10 +++++++---
>>>    2 files changed, 8 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index f36062be9ca8..192dbef04c43 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
>>>    	uint32_t priority;
>>>    	uint32_t q_percent;
>>>    	uint32_t doorbell_id;
>>> -	uint32_t is_gws;
>>> +	uint32_t gws;
>>
>> Why is the new name better?
> The old variable (is_gws) was obtained from the queue_properties
> structure during checkpoint and is only used temporarily during queue
> creation, so this variable cannot be used to determine whether a
> queue has gws enabled. The new variable (gws) is obtained from the
> queue structure. The name is changed to better reflect this.

Further down you seem to use it like a boolean though. So a name
reflecting that would be nice.

>>>    	uint32_t sdma_id;
>>>    	uint32_t eop_ring_buffer_size;
>>>    	uint32_t ctx_save_restore_area_size; diff --git
>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index 6eca9509f2e3..4f58e671d39b 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct
>> kfd_process_device *pdd,
>>>    	q_data->ctx_save_restore_area_size =
>>>    		q->properties.ctx_save_restore_area_size;
>>>
>>> +	q_data->gws = !!q->gws;
>>> +
>>>    	ret = pqm_checkpoint_mqd(&pdd->process->pqm, q-> properties.queue_id, mqd, ctl_stack);
>>>    	if (ret) {
>>>    		pr_err("Failed checkpoint queue_mqd (%d)\n", ret); @@ -743,7
>>> +745,6 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
>>>    					  struct kfd_criu_queue_priv_data *q_data)
>>>    {
>>>    	qp->is_interop = false;
>>> -	qp->is_gws = q_data->is_gws;
>>>    	qp->queue_percent = q_data->q_percent;
>>>    	qp->priority = q_data->priority;
>>>    	qp->queue_address = q_data->q_address; @@ -826,12 +827,15 @@
>> int kfd_criu_restore_queue(struct kfd_process *p,
>>>    				NULL);
>>>    	if (ret) {
>>>    		pr_err("Failed to create new queue err:%d\n", ret);
>>> -		ret = -EINVAL;
>>> +		goto exit;
>>>    	}
>>>
>>> +	if (q_data->gws)
>>> +		ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
>>> +
>>>    exit:
>>>    	if (ret)
>>> -		pr_err("Failed to create queue (%d)\n", ret);
>>> +		pr_err("Failed to restore queue (%d)\n", ret);
>>
>> Maybe separate this out, so it can be applied to stable series.

Did you miss this comment?

>>>    	else
>>>    		pr_debug("Queue id %d was restored successfully\n", queue_id);
>>>


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
  2022-04-19  6:54       ` Paul Menzel
@ 2022-04-19 12:24         ` Yat Sin, David
  0 siblings, 0 replies; 8+ messages in thread
From: Yat Sin, David @ 2022-04-19 12:24 UTC (permalink / raw)
  To: Paul Menzel; +Cc: Kuehling, Felix, amd-gfx



> -----Original Message-----
> From: Paul Menzel <pmenzel@molgen.mpg.de>
> Sent: Tuesday, April 19, 2022 2:54 AM
> To: Yat Sin, David <David.YatSin@amd.com>
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
> 
> 
> Dear David,
> 
> 
> Thank you for sending out v3 of these patches.
> 
> Am 19.04.22 um 02:04 schrieb Yat Sin, David:
> >
> >
> >> -----Original Message-----
> >> From: Paul Menzel <pmenzel@molgen.mpg.de>
> >> Sent: Monday, April 18, 2022 4:23 PM
> 
> […]
> >> Am 18.04.22 um 18:44 schrieb David Yat Sin:
> >>
> >> In the commit message summary, you could reorder some words:
> >>
> >> Add CRIU support for GWS queues
> >>
> >>> Adding support to checkpoint/restore GWS(Global Wave Sync) queues.
> >>
> >> s/Adding/Add/
> 
> Did you miss the two comments above?
ACK
> 
> >> Please add a space before the (.
> > ACK
> >>
> >> How can this be tested?
> > We have some internal tests that can be used to specifically test this
> feature.
> 
> Nice. Are you going to publish these in the future?
I think some of these tests depend on other frameworks, so it might not be straightforward to do this.
> 
> >>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> >>> ---
> >>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h                  |  2 +-
> >>>    drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
> +++++++---
> >>>    2 files changed, 8 insertions(+), 4 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> index f36062be9ca8..192dbef04c43 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> @@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
> >>>    	uint32_t priority;
> >>>    	uint32_t q_percent;
> >>>    	uint32_t doorbell_id;
> >>> -	uint32_t is_gws;
> >>> +	uint32_t gws;
> >>
> >> Why is the new name better?
> > The old variable (is_gws) was obtained from the queue_properties
> > structure during checkpoint and is only used temporarily during queue
> > creation, so this variable cannot be used to determine whether a queue
> > has gws enabled. The new variable (gws) is obtained from the queue
> > structure. The name is changed to better reflect this.
> 
> Further down you seem to use it like a boolean though. So a name reflecting
> that would be nice.
To me this is ok. I would rather have the variable name match its source.
> 
> >>>    	uint32_t sdma_id;
> >>>    	uint32_t eop_ring_buffer_size;
> >>>    	uint32_t ctx_save_restore_area_size; diff --git
> >>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> >>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> >>> index 6eca9509f2e3..4f58e671d39b 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> >>> @@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct
> >> kfd_process_device *pdd,
> >>>    	q_data->ctx_save_restore_area_size =
> >>>    		q->properties.ctx_save_restore_area_size;
> >>>
> >>> +	q_data->gws = !!q->gws;
> >>> +
> >>>    	ret = pqm_checkpoint_mqd(&pdd->process->pqm, q->
> properties.queue_id, mqd, ctl_stack);
> >>>    	if (ret) {
> >>>    		pr_err("Failed checkpoint queue_mqd (%d)\n", ret); @@ -
> 743,7
> >>> +745,6 @@ static void set_queue_properties_from_criu(struct
> >>> +queue_properties *qp,
> >>>    					  struct kfd_criu_queue_priv_data
> *q_data)
> >>>    {
> >>>    	qp->is_interop = false;
> >>> -	qp->is_gws = q_data->is_gws;
> >>>    	qp->queue_percent = q_data->q_percent;
> >>>    	qp->priority = q_data->priority;
> >>>    	qp->queue_address = q_data->q_address; @@ -826,12 +827,15 @@
> >> int kfd_criu_restore_queue(struct kfd_process *p,
> >>>    				NULL);
> >>>    	if (ret) {
> >>>    		pr_err("Failed to create new queue err:%d\n", ret);
> >>> -		ret = -EINVAL;
> >>> +		goto exit;
> >>>    	}
> >>>
> >>> +	if (q_data->gws)
> >>> +		ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
> >>> +
> >>>    exit:
> >>>    	if (ret)
> >>> -		pr_err("Failed to create queue (%d)\n", ret);
> >>> +		pr_err("Failed to restore queue (%d)\n", ret);
> >>
> >> Maybe separate this out, so it can be applied to stable series.
> 
> Did you miss this comment?
What do you mean by stable series?

> 
> >>>    	else
> >>>    		pr_debug("Queue id %d was restored successfully\n",
> queue_id);
> >>>
> 
> 
> Kind regards,
> 
> Paul

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2022-04-19 12:24 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-18 16:44 [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
2022-04-18 20:23   ` Paul Menzel
2022-04-19  0:04     ` Yat Sin, David
2022-04-19  6:54       ` Paul Menzel
2022-04-19 12:24         ` Yat Sin, David
2022-04-18 19:01 ` [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
2022-04-18 20:18 ` Paul Menzel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.