* [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count
@ 2022-04-18 16:44 David Yat Sin
2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
` (2 more replies)
0 siblings, 3 replies; 8+ messages in thread
From: David Yat Sin @ 2022-04-18 16:44 UTC (permalink / raw)
To: amd-gfx; +Cc: Felix.Kuehling
dqm->gws_queue_count and pdd->qpd.mapped_gws_queue needs to be updated
each time the queue gets evicted.
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 83 +++++++++----------
1 file changed, 37 insertions(+), 46 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index acf4f7975850..198672264492 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -130,19 +130,33 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
}
static void increment_queue_count(struct device_queue_manager *dqm,
- enum kfd_queue_type type)
+ struct qcm_process_device *qpd,
+ struct queue *q)
{
dqm->active_queue_count++;
- if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+ q->properties.type == KFD_QUEUE_TYPE_DIQ)
dqm->active_cp_queue_count++;
+
+ if (q->properties.is_gws) {
+ dqm->gws_queue_count++;
+ qpd->mapped_gws_queue = true;
+ }
}
static void decrement_queue_count(struct device_queue_manager *dqm,
- enum kfd_queue_type type)
+ struct qcm_process_device *qpd,
+ struct queue *q)
{
dqm->active_queue_count--;
- if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+ q->properties.type == KFD_QUEUE_TYPE_DIQ)
dqm->active_cp_queue_count--;
+
+ if (q->properties.is_gws) {
+ dqm->gws_queue_count--;
+ qpd->mapped_gws_queue = false;
+ }
}
/*
@@ -412,7 +426,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
list_add(&q->list, &qpd->queues_list);
qpd->queue_count++;
if (q->properties.is_active)
- increment_queue_count(dqm, q->properties.type);
+ increment_queue_count(dqm, qpd, q);
/*
* Unconditionally increment this counter, regardless of the queue's
@@ -601,13 +615,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
deallocate_vmid(dqm, qpd, q);
}
qpd->queue_count--;
- if (q->properties.is_active) {
- decrement_queue_count(dqm, q->properties.type);
- if (q->properties.is_gws) {
- dqm->gws_queue_count--;
- qpd->mapped_gws_queue = false;
- }
- }
+ if (q->properties.is_active)
+ decrement_queue_count(dqm, qpd, q);
return retval;
}
@@ -700,12 +709,11 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
* dqm->active_queue_count to determine whether a new runlist must be
* uploaded.
*/
- if (q->properties.is_active && !prev_active)
- increment_queue_count(dqm, q->properties.type);
- else if (!q->properties.is_active && prev_active)
- decrement_queue_count(dqm, q->properties.type);
-
- if (q->gws && !q->properties.is_gws) {
+ if (q->properties.is_active && !prev_active) {
+ increment_queue_count(dqm, &pdd->qpd, q);
+ } else if (!q->properties.is_active && prev_active) {
+ decrement_queue_count(dqm, &pdd->qpd, q);
+ } else if (q->gws && !q->properties.is_gws) {
if (q->properties.is_active) {
dqm->gws_queue_count++;
pdd->qpd.mapped_gws_queue = true;
@@ -767,11 +775,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
q->properties.is_active = false;
- decrement_queue_count(dqm, q->properties.type);
- if (q->properties.is_gws) {
- dqm->gws_queue_count--;
- qpd->mapped_gws_queue = false;
- }
+ decrement_queue_count(dqm, qpd, q);
if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
continue;
@@ -817,7 +821,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
continue;
q->properties.is_active = false;
- decrement_queue_count(dqm, q->properties.type);
+ decrement_queue_count(dqm, qpd, q);
}
pdd->last_evict_timestamp = get_jiffies_64();
retval = execute_queues_cpsch(dqm,
@@ -888,11 +892,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
q->properties.is_active = true;
- increment_queue_count(dqm, q->properties.type);
- if (q->properties.is_gws) {
- dqm->gws_queue_count++;
- qpd->mapped_gws_queue = true;
- }
+ increment_queue_count(dqm, qpd, q);
if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n"))
continue;
@@ -950,7 +950,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
continue;
q->properties.is_active = true;
- increment_queue_count(dqm, q->properties.type);
+ increment_queue_count(dqm, &pdd->qpd, q);
}
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
@@ -1378,7 +1378,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
dqm->total_queue_count);
list_add(&kq->list, &qpd->priv_queue_list);
- increment_queue_count(dqm, kq->queue->properties.type);
+ increment_queue_count(dqm, qpd, kq->queue);
qpd->is_debug = true;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
dqm_unlock(dqm);
@@ -1392,7 +1392,7 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
{
dqm_lock(dqm);
list_del(&kq->list);
- decrement_queue_count(dqm, kq->queue->properties.type);
+ decrement_queue_count(dqm, qpd, kq->queue);
qpd->is_debug = false;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
/*
@@ -1467,7 +1467,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
qpd->queue_count++;
if (q->properties.is_active) {
- increment_queue_count(dqm, q->properties.type);
+ increment_queue_count(dqm, qpd, q);
execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
@@ -1683,15 +1683,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
list_del(&q->list);
qpd->queue_count--;
if (q->properties.is_active) {
- decrement_queue_count(dqm, q->properties.type);
+ decrement_queue_count(dqm, qpd, q);
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
- if (q->properties.is_gws) {
- dqm->gws_queue_count--;
- qpd->mapped_gws_queue = false;
- }
}
/*
@@ -1932,7 +1928,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
/* Clean all kernel queues */
list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
list_del(&kq->list);
- decrement_queue_count(dqm, kq->queue->properties.type);
+ decrement_queue_count(dqm, qpd, kq->queue);
qpd->is_debug = false;
dqm->total_queue_count--;
filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
@@ -1945,13 +1941,8 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
deallocate_sdma_queue(dqm, q);
- if (q->properties.is_active) {
- decrement_queue_count(dqm, q->properties.type);
- if (q->properties.is_gws) {
- dqm->gws_queue_count--;
- qpd->mapped_gws_queue = false;
- }
- }
+ if (q->properties.is_active)
+ decrement_queue_count(dqm, qpd, q);
dqm->total_queue_count--;
}
--
2.30.2
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
2022-04-18 16:44 [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
@ 2022-04-18 16:44 ` David Yat Sin
2022-04-18 20:23 ` Paul Menzel
2022-04-18 19:01 ` [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
2022-04-18 20:18 ` Paul Menzel
2 siblings, 1 reply; 8+ messages in thread
From: David Yat Sin @ 2022-04-18 16:44 UTC (permalink / raw)
To: amd-gfx; +Cc: Felix.Kuehling
Adding support to checkpoint/restore GWS(Global Wave Sync) queues.
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10 +++++++---
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f36062be9ca8..192dbef04c43 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
uint32_t priority;
uint32_t q_percent;
uint32_t doorbell_id;
- uint32_t is_gws;
+ uint32_t gws;
uint32_t sdma_id;
uint32_t eop_ring_buffer_size;
uint32_t ctx_save_restore_area_size;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 6eca9509f2e3..4f58e671d39b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct kfd_process_device *pdd,
q_data->ctx_save_restore_area_size =
q->properties.ctx_save_restore_area_size;
+ q_data->gws = !!q->gws;
+
ret = pqm_checkpoint_mqd(&pdd->process->pqm, q->properties.queue_id, mqd, ctl_stack);
if (ret) {
pr_err("Failed checkpoint queue_mqd (%d)\n", ret);
@@ -743,7 +745,6 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
struct kfd_criu_queue_priv_data *q_data)
{
qp->is_interop = false;
- qp->is_gws = q_data->is_gws;
qp->queue_percent = q_data->q_percent;
qp->priority = q_data->priority;
qp->queue_address = q_data->q_address;
@@ -826,12 +827,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
NULL);
if (ret) {
pr_err("Failed to create new queue err:%d\n", ret);
- ret = -EINVAL;
+ goto exit;
}
+ if (q_data->gws)
+ ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
+
exit:
if (ret)
- pr_err("Failed to create queue (%d)\n", ret);
+ pr_err("Failed to restore queue (%d)\n", ret);
else
pr_debug("Queue id %d was restored successfully\n", queue_id);
--
2.30.2
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count
2022-04-18 16:44 [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
@ 2022-04-18 19:01 ` Felix Kuehling
2022-04-18 20:18 ` Paul Menzel
2 siblings, 0 replies; 8+ messages in thread
From: Felix Kuehling @ 2022-04-18 19:01 UTC (permalink / raw)
To: David Yat Sin, amd-gfx
Am 2022-04-18 um 12:44 schrieb David Yat Sin:
> dqm->gws_queue_count and pdd->qpd.mapped_gws_queue needs to be updated
> each time the queue gets evicted.
>
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
The series is
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 83 +++++++++----------
> 1 file changed, 37 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index acf4f7975850..198672264492 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -130,19 +130,33 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
> }
>
> static void increment_queue_count(struct device_queue_manager *dqm,
> - enum kfd_queue_type type)
> + struct qcm_process_device *qpd,
> + struct queue *q)
> {
> dqm->active_queue_count++;
> - if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
> + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
> + q->properties.type == KFD_QUEUE_TYPE_DIQ)
> dqm->active_cp_queue_count++;
> +
> + if (q->properties.is_gws) {
> + dqm->gws_queue_count++;
> + qpd->mapped_gws_queue = true;
> + }
> }
>
> static void decrement_queue_count(struct device_queue_manager *dqm,
> - enum kfd_queue_type type)
> + struct qcm_process_device *qpd,
> + struct queue *q)
> {
> dqm->active_queue_count--;
> - if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
> + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
> + q->properties.type == KFD_QUEUE_TYPE_DIQ)
> dqm->active_cp_queue_count--;
> +
> + if (q->properties.is_gws) {
> + dqm->gws_queue_count--;
> + qpd->mapped_gws_queue = false;
> + }
> }
>
> /*
> @@ -412,7 +426,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> list_add(&q->list, &qpd->queues_list);
> qpd->queue_count++;
> if (q->properties.is_active)
> - increment_queue_count(dqm, q->properties.type);
> + increment_queue_count(dqm, qpd, q);
>
> /*
> * Unconditionally increment this counter, regardless of the queue's
> @@ -601,13 +615,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
> deallocate_vmid(dqm, qpd, q);
> }
> qpd->queue_count--;
> - if (q->properties.is_active) {
> - decrement_queue_count(dqm, q->properties.type);
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count--;
> - qpd->mapped_gws_queue = false;
> - }
> - }
> + if (q->properties.is_active)
> + decrement_queue_count(dqm, qpd, q);
>
> return retval;
> }
> @@ -700,12 +709,11 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
> * dqm->active_queue_count to determine whether a new runlist must be
> * uploaded.
> */
> - if (q->properties.is_active && !prev_active)
> - increment_queue_count(dqm, q->properties.type);
> - else if (!q->properties.is_active && prev_active)
> - decrement_queue_count(dqm, q->properties.type);
> -
> - if (q->gws && !q->properties.is_gws) {
> + if (q->properties.is_active && !prev_active) {
> + increment_queue_count(dqm, &pdd->qpd, q);
> + } else if (!q->properties.is_active && prev_active) {
> + decrement_queue_count(dqm, &pdd->qpd, q);
> + } else if (q->gws && !q->properties.is_gws) {
> if (q->properties.is_active) {
> dqm->gws_queue_count++;
> pdd->qpd.mapped_gws_queue = true;
> @@ -767,11 +775,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> q->properties.type)];
> q->properties.is_active = false;
> - decrement_queue_count(dqm, q->properties.type);
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count--;
> - qpd->mapped_gws_queue = false;
> - }
> + decrement_queue_count(dqm, qpd, q);
>
> if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
> continue;
> @@ -817,7 +821,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
> continue;
>
> q->properties.is_active = false;
> - decrement_queue_count(dqm, q->properties.type);
> + decrement_queue_count(dqm, qpd, q);
> }
> pdd->last_evict_timestamp = get_jiffies_64();
> retval = execute_queues_cpsch(dqm,
> @@ -888,11 +892,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> q->properties.type)];
> q->properties.is_active = true;
> - increment_queue_count(dqm, q->properties.type);
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count++;
> - qpd->mapped_gws_queue = true;
> - }
> + increment_queue_count(dqm, qpd, q);
>
> if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n"))
> continue;
> @@ -950,7 +950,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
> continue;
>
> q->properties.is_active = true;
> - increment_queue_count(dqm, q->properties.type);
> + increment_queue_count(dqm, &pdd->qpd, q);
> }
> retval = execute_queues_cpsch(dqm,
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> @@ -1378,7 +1378,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
> dqm->total_queue_count);
>
> list_add(&kq->list, &qpd->priv_queue_list);
> - increment_queue_count(dqm, kq->queue->properties.type);
> + increment_queue_count(dqm, qpd, kq->queue);
> qpd->is_debug = true;
> execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> dqm_unlock(dqm);
> @@ -1392,7 +1392,7 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
> {
> dqm_lock(dqm);
> list_del(&kq->list);
> - decrement_queue_count(dqm, kq->queue->properties.type);
> + decrement_queue_count(dqm, qpd, kq->queue);
> qpd->is_debug = false;
> execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> /*
> @@ -1467,7 +1467,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> qpd->queue_count++;
>
> if (q->properties.is_active) {
> - increment_queue_count(dqm, q->properties.type);
> + increment_queue_count(dqm, qpd, q);
>
> execute_queues_cpsch(dqm,
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> @@ -1683,15 +1683,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
> list_del(&q->list);
> qpd->queue_count--;
> if (q->properties.is_active) {
> - decrement_queue_count(dqm, q->properties.type);
> + decrement_queue_count(dqm, qpd, q);
> retval = execute_queues_cpsch(dqm,
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> if (retval == -ETIME)
> qpd->reset_wavefronts = true;
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count--;
> - qpd->mapped_gws_queue = false;
> - }
> }
>
> /*
> @@ -1932,7 +1928,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> /* Clean all kernel queues */
> list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
> list_del(&kq->list);
> - decrement_queue_count(dqm, kq->queue->properties.type);
> + decrement_queue_count(dqm, qpd, kq->queue);
> qpd->is_debug = false;
> dqm->total_queue_count--;
> filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
> @@ -1945,13 +1941,8 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> deallocate_sdma_queue(dqm, q);
>
> - if (q->properties.is_active) {
> - decrement_queue_count(dqm, q->properties.type);
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count--;
> - qpd->mapped_gws_queue = false;
> - }
> - }
> + if (q->properties.is_active)
> + decrement_queue_count(dqm, qpd, q);
>
> dqm->total_queue_count--;
> }
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count
2022-04-18 16:44 [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
2022-04-18 19:01 ` [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
@ 2022-04-18 20:18 ` Paul Menzel
2 siblings, 0 replies; 8+ messages in thread
From: Paul Menzel @ 2022-04-18 20:18 UTC (permalink / raw)
To: David Yat Sin; +Cc: Felix Kühling, amd-gfx
Dear David,
Thank you for your patch.
Am 18.04.22 um 18:44 schrieb David Yat Sin:
> dqm->gws_queue_count and pdd->qpd.mapped_gws_queue needs to be updated
s/needs/need/
> each time the queue gets evicted.
Why?
Do you only change the case, when an element of the queue gets evicted?
Next time, a short note about the implementation would be nice.
No Fixes tag?
Kind regards,
Paul
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> ---
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 83 +++++++++----------
> 1 file changed, 37 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index acf4f7975850..198672264492 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -130,19 +130,33 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
> }
>
> static void increment_queue_count(struct device_queue_manager *dqm,
> - enum kfd_queue_type type)
> + struct qcm_process_device *qpd,
> + struct queue *q)
> {
> dqm->active_queue_count++;
> - if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
> + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
> + q->properties.type == KFD_QUEUE_TYPE_DIQ)
> dqm->active_cp_queue_count++;
> +
> + if (q->properties.is_gws) {
> + dqm->gws_queue_count++;
> + qpd->mapped_gws_queue = true;
> + }
> }
>
> static void decrement_queue_count(struct device_queue_manager *dqm,
> - enum kfd_queue_type type)
> + struct qcm_process_device *qpd,
> + struct queue *q)
> {
> dqm->active_queue_count--;
> - if (type == KFD_QUEUE_TYPE_COMPUTE || type == KFD_QUEUE_TYPE_DIQ)
> + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
> + q->properties.type == KFD_QUEUE_TYPE_DIQ)
> dqm->active_cp_queue_count--;
> +
> + if (q->properties.is_gws) {
> + dqm->gws_queue_count--;
> + qpd->mapped_gws_queue = false;
> + }
> }
>
> /*
> @@ -412,7 +426,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> list_add(&q->list, &qpd->queues_list);
> qpd->queue_count++;
> if (q->properties.is_active)
> - increment_queue_count(dqm, q->properties.type);
> + increment_queue_count(dqm, qpd, q);
>
> /*
> * Unconditionally increment this counter, regardless of the queue's
> @@ -601,13 +615,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
> deallocate_vmid(dqm, qpd, q);
> }
> qpd->queue_count--;
> - if (q->properties.is_active) {
> - decrement_queue_count(dqm, q->properties.type);
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count--;
> - qpd->mapped_gws_queue = false;
> - }
> - }
> + if (q->properties.is_active)
> + decrement_queue_count(dqm, qpd, q);
>
> return retval;
> }
> @@ -700,12 +709,11 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
> * dqm->active_queue_count to determine whether a new runlist must be
> * uploaded.
> */
> - if (q->properties.is_active && !prev_active)
> - increment_queue_count(dqm, q->properties.type);
> - else if (!q->properties.is_active && prev_active)
> - decrement_queue_count(dqm, q->properties.type);
> -
> - if (q->gws && !q->properties.is_gws) {
> + if (q->properties.is_active && !prev_active) {
> + increment_queue_count(dqm, &pdd->qpd, q);
> + } else if (!q->properties.is_active && prev_active) {
> + decrement_queue_count(dqm, &pdd->qpd, q);
> + } else if (q->gws && !q->properties.is_gws) {
> if (q->properties.is_active) {
> dqm->gws_queue_count++;
> pdd->qpd.mapped_gws_queue = true;
> @@ -767,11 +775,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> q->properties.type)];
> q->properties.is_active = false;
> - decrement_queue_count(dqm, q->properties.type);
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count--;
> - qpd->mapped_gws_queue = false;
> - }
> + decrement_queue_count(dqm, qpd, q);
>
> if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
> continue;
> @@ -817,7 +821,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
> continue;
>
> q->properties.is_active = false;
> - decrement_queue_count(dqm, q->properties.type);
> + decrement_queue_count(dqm, qpd, q);
> }
> pdd->last_evict_timestamp = get_jiffies_64();
> retval = execute_queues_cpsch(dqm,
> @@ -888,11 +892,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> q->properties.type)];
> q->properties.is_active = true;
> - increment_queue_count(dqm, q->properties.type);
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count++;
> - qpd->mapped_gws_queue = true;
> - }
> + increment_queue_count(dqm, qpd, q);
>
> if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n"))
> continue;
> @@ -950,7 +950,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
> continue;
>
> q->properties.is_active = true;
> - increment_queue_count(dqm, q->properties.type);
> + increment_queue_count(dqm, &pdd->qpd, q);
> }
> retval = execute_queues_cpsch(dqm,
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> @@ -1378,7 +1378,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
> dqm->total_queue_count);
>
> list_add(&kq->list, &qpd->priv_queue_list);
> - increment_queue_count(dqm, kq->queue->properties.type);
> + increment_queue_count(dqm, qpd, kq->queue);
> qpd->is_debug = true;
> execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> dqm_unlock(dqm);
> @@ -1392,7 +1392,7 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
> {
> dqm_lock(dqm);
> list_del(&kq->list);
> - decrement_queue_count(dqm, kq->queue->properties.type);
> + decrement_queue_count(dqm, qpd, kq->queue);
> qpd->is_debug = false;
> execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> /*
> @@ -1467,7 +1467,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> qpd->queue_count++;
>
> if (q->properties.is_active) {
> - increment_queue_count(dqm, q->properties.type);
> + increment_queue_count(dqm, qpd, q);
>
> execute_queues_cpsch(dqm,
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> @@ -1683,15 +1683,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
> list_del(&q->list);
> qpd->queue_count--;
> if (q->properties.is_active) {
> - decrement_queue_count(dqm, q->properties.type);
> + decrement_queue_count(dqm, qpd, q);
> retval = execute_queues_cpsch(dqm,
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> if (retval == -ETIME)
> qpd->reset_wavefronts = true;
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count--;
> - qpd->mapped_gws_queue = false;
> - }
> }
>
> /*
> @@ -1932,7 +1928,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> /* Clean all kernel queues */
> list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
> list_del(&kq->list);
> - decrement_queue_count(dqm, kq->queue->properties.type);
> + decrement_queue_count(dqm, qpd, kq->queue);
> qpd->is_debug = false;
> dqm->total_queue_count--;
> filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
> @@ -1945,13 +1941,8 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> deallocate_sdma_queue(dqm, q);
>
> - if (q->properties.is_active) {
> - decrement_queue_count(dqm, q->properties.type);
> - if (q->properties.is_gws) {
> - dqm->gws_queue_count--;
> - qpd->mapped_gws_queue = false;
> - }
> - }
> + if (q->properties.is_active)
> + decrement_queue_count(dqm, qpd, q);
>
> dqm->total_queue_count--;
> }
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
@ 2022-04-18 20:23 ` Paul Menzel
2022-04-19 0:04 ` Yat Sin, David
0 siblings, 1 reply; 8+ messages in thread
From: Paul Menzel @ 2022-04-18 20:23 UTC (permalink / raw)
To: David Yat Sin; +Cc: Felix Kühling, amd-gfx
Dear David,
Thank you for your patch.
Am 18.04.22 um 18:44 schrieb David Yat Sin:
In the commit message summary, you could reorder some words:
Add CRIU support for GWS queues
> Adding support to checkpoint/restore GWS(Global Wave Sync) queues.
s/Adding/Add/
Please add a space before the (.
How can this be tested?
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10 +++++++---
> 2 files changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f36062be9ca8..192dbef04c43 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
> uint32_t priority;
> uint32_t q_percent;
> uint32_t doorbell_id;
> - uint32_t is_gws;
> + uint32_t gws;
Why is the new name better?
> uint32_t sdma_id;
> uint32_t eop_ring_buffer_size;
> uint32_t ctx_save_restore_area_size;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 6eca9509f2e3..4f58e671d39b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct kfd_process_device *pdd,
> q_data->ctx_save_restore_area_size =
> q->properties.ctx_save_restore_area_size;
>
> + q_data->gws = !!q->gws;
> +
> ret = pqm_checkpoint_mqd(&pdd->process->pqm, q->properties.queue_id, mqd, ctl_stack);
> if (ret) {
> pr_err("Failed checkpoint queue_mqd (%d)\n", ret);
> @@ -743,7 +745,6 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
> struct kfd_criu_queue_priv_data *q_data)
> {
> qp->is_interop = false;
> - qp->is_gws = q_data->is_gws;
> qp->queue_percent = q_data->q_percent;
> qp->priority = q_data->priority;
> qp->queue_address = q_data->q_address;
> @@ -826,12 +827,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
> NULL);
> if (ret) {
> pr_err("Failed to create new queue err:%d\n", ret);
> - ret = -EINVAL;
> + goto exit;
> }
>
> + if (q_data->gws)
> + ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
> +
> exit:
> if (ret)
> - pr_err("Failed to create queue (%d)\n", ret);
> + pr_err("Failed to restore queue (%d)\n", ret);
Maybe separate this out, so it can be applied to stable series.
> else
> pr_debug("Queue id %d was restored successfully\n", queue_id);
>
Kind regards,
Paul
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
2022-04-18 20:23 ` Paul Menzel
@ 2022-04-19 0:04 ` Yat Sin, David
2022-04-19 6:54 ` Paul Menzel
0 siblings, 1 reply; 8+ messages in thread
From: Yat Sin, David @ 2022-04-19 0:04 UTC (permalink / raw)
To: Paul Menzel; +Cc: Kuehling, Felix, amd-gfx
> -----Original Message-----
> From: Paul Menzel <pmenzel@molgen.mpg.de>
> Sent: Monday, April 18, 2022 4:23 PM
> To: Yat Sin, David <David.YatSin@amd.com>
> Cc: amd-gfx@lists.freedesktop.org; Kuehling, Felix
> <Felix.Kuehling@amd.com>
> Subject: Re: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
>
> Dear David,
>
>
> Thank you for your patch.
>
> Am 18.04.22 um 18:44 schrieb David Yat Sin:
>
> In the commit message summary, you could reorder some words:
>
> Add CRIU support for GWS queues
>
> > Adding support to checkpoint/restore GWS(Global Wave Sync) queues.
>
> s/Adding/Add/
>
> Please add a space before the (.
ACK
>
> How can this be tested?
We have some internal tests that can be used to specifically test this feature.
>
> > Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> > ---
> > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
> > drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
> +++++++---
> > 2 files changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index f36062be9ca8..192dbef04c43 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
> > uint32_t priority;
> > uint32_t q_percent;
> > uint32_t doorbell_id;
> > - uint32_t is_gws;
> > + uint32_t gws;
>
> Why is the new name better?
The old variable (is_gws) was obtained from the queue_properties structure during checkpoint and is only used temporarily during queue creation, so this variable cannot be used to determine whether a queue has gws enabled. The new variable (gws) is obtained from the queue structure. The name is changed to better reflect this.
>
> > uint32_t sdma_id;
> > uint32_t eop_ring_buffer_size;
> > uint32_t ctx_save_restore_area_size; diff --git
> > a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > index 6eca9509f2e3..4f58e671d39b 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > @@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct
> kfd_process_device *pdd,
> > q_data->ctx_save_restore_area_size =
> > q->properties.ctx_save_restore_area_size;
> >
> > + q_data->gws = !!q->gws;
> > +
> > ret = pqm_checkpoint_mqd(&pdd->process->pqm, q-
> >properties.queue_id, mqd, ctl_stack);
> > if (ret) {
> > pr_err("Failed checkpoint queue_mqd (%d)\n", ret); @@ -
> 743,7
> > +745,6 @@ static void set_queue_properties_from_criu(struct
> queue_properties *qp,
> > struct kfd_criu_queue_priv_data
> *q_data)
> > {
> > qp->is_interop = false;
> > - qp->is_gws = q_data->is_gws;
> > qp->queue_percent = q_data->q_percent;
> > qp->priority = q_data->priority;
> > qp->queue_address = q_data->q_address; @@ -826,12 +827,15 @@
> int
> > kfd_criu_restore_queue(struct kfd_process *p,
> > NULL);
> > if (ret) {
> > pr_err("Failed to create new queue err:%d\n", ret);
> > - ret = -EINVAL;
> > + goto exit;
> > }
> >
> > + if (q_data->gws)
> > + ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
> > +
> > exit:
> > if (ret)
> > - pr_err("Failed to create queue (%d)\n", ret);
> > + pr_err("Failed to restore queue (%d)\n", ret);
>
> Maybe separate this out, so it can be applied to stable series.
>
> > else
> > pr_debug("Queue id %d was restored successfully\n",
> queue_id);
> >
>
>
> Kind regards,
>
> Paul
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
2022-04-19 0:04 ` Yat Sin, David
@ 2022-04-19 6:54 ` Paul Menzel
2022-04-19 12:24 ` Yat Sin, David
0 siblings, 1 reply; 8+ messages in thread
From: Paul Menzel @ 2022-04-19 6:54 UTC (permalink / raw)
To: David Yat Sin; +Cc: Felix Kühling, amd-gfx
Dear David,
Thank you for sending out v3 of these patches.
Am 19.04.22 um 02:04 schrieb Yat Sin, David:
>
>
>> -----Original Message-----
>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>> Sent: Monday, April 18, 2022 4:23 PM
[…]
>> Am 18.04.22 um 18:44 schrieb David Yat Sin:
>>
>> In the commit message summary, you could reorder some words:
>>
>> Add CRIU support for GWS queues
>>
>>> Adding support to checkpoint/restore GWS(Global Wave Sync) queues.
>>
>> s/Adding/Add/
Did you miss the two comments above?
>> Please add a space before the (.
> ACK
>>
>> How can this be tested?
> We have some internal tests that can we be used to specifically test this feature.
Nice. Are you going to publish these in the future?
>>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
>>> drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10 +++++++---
>>> 2 files changed, 8 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index f36062be9ca8..192dbef04c43 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
>>> uint32_t priority;
>>> uint32_t q_percent;
>>> uint32_t doorbell_id;
>>> - uint32_t is_gws;
>>> + uint32_t gws;
>>
>> Why is the new name better?
> The old variable (is_gws) was obtained from the queue_properties
> structure during checkpoint and is only used temporarily during queue
> creation, so this variable cannot be used to determine whether a
> queue as gws enabled. The new variable (gws) is obtained from the
> queue structure. The name is changed to better reflect this.
Further down you seem to use it like a boolean though. So a name
reflecting that would be nice.
>>> uint32_t sdma_id;
>>> uint32_t eop_ring_buffer_size;
>>> uint32_t ctx_save_restore_area_size; diff --git
>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index 6eca9509f2e3..4f58e671d39b 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct
>> kfd_process_device *pdd,
>>> q_data->ctx_save_restore_area_size =
>>> q->properties.ctx_save_restore_area_size;
>>>
>>> + q_data->gws = !!q->gws;
>>> +
>>> ret = pqm_checkpoint_mqd(&pdd->process->pqm, q-> properties.queue_id, mqd, ctl_stack);
>>> if (ret) {
>>> pr_err("Failed checkpoint queue_mqd (%d)\n", ret); @@ -743,7
>>> +745,6 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
>>> struct kfd_criu_queue_priv_data *q_data)
>>> {
>>> qp->is_interop = false;
>>> - qp->is_gws = q_data->is_gws;
>>> qp->queue_percent = q_data->q_percent;
>>> qp->priority = q_data->priority;
>>> qp->queue_address = q_data->q_address; @@ -826,12 +827,15 @@
>> int kfd_criu_restore_queue(struct kfd_process *p,
>>> NULL);
>>> if (ret) {
>>> pr_err("Failed to create new queue err:%d\n", ret);
>>> - ret = -EINVAL;
>>> + goto exit;
>>> }
>>>
>>> + if (q_data->gws)
>>> + ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
>>> +
>>> exit:
>>> if (ret)
>>> - pr_err("Failed to create queue (%d)\n", ret);
>>> + pr_err("Failed to restore queue (%d)\n", ret);
>>
>> Maybe separate this out, so it can be applied to stable series.
Did you miss this comment?
>>> else
>>> pr_debug("Queue id %d was restored successfully\n", queue_id);
>>>
Kind regards,
Paul
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
2022-04-19 6:54 ` Paul Menzel
@ 2022-04-19 12:24 ` Yat Sin, David
0 siblings, 0 replies; 8+ messages in thread
From: Yat Sin, David @ 2022-04-19 12:24 UTC (permalink / raw)
To: Paul Menzel; +Cc: Kuehling, Felix, amd-gfx
> -----Original Message-----
> From: Paul Menzel <pmenzel@molgen.mpg.de>
> Sent: Tuesday, April 19, 2022 2:54 AM
> To: Yat Sin, David <David.YatSin@amd.com>
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues
>
>
> Dear David,
>
>
> Thank you for sending out v3 of these patches.
>
> Am 19.04.22 um 02:04 schrieb Yat Sin, David:
> >
> >
> >> -----Original Message-----
> >> From: Paul Menzel <pmenzel@molgen.mpg.de>
> >> Sent: Monday, April 18, 2022 4:23 PM
>
> […]
> >> Am 18.04.22 um 18:44 schrieb David Yat Sin:
> >>
> >> In the commit message summary, you could reorder some words:
> >>
> >> Add CRIU support for GWS queues
> >>
> >>> Adding support to checkpoint/restore GWS(Global Wave Sync) queues.
> >>
> >> s/Adding/Add/
>
> Did you miss the two comments above?
ACK
>
> >> Please add a space before the (.
> > ACK
> >>
> >> How can this be tested?
> > We have some internal tests that can we be used to specifically test this
> feature.
>
> Nice. Are you going to publish these in the future?
I think some of these tests depend on other frameworks, so it might not be straightforward to do this.
>
> >>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> >>> ---
> >>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
> >>> drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
> +++++++---
> >>> 2 files changed, 8 insertions(+), 4 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> index f36062be9ca8..192dbef04c43 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> @@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
> >>> uint32_t priority;
> >>> uint32_t q_percent;
> >>> uint32_t doorbell_id;
> >>> - uint32_t is_gws;
> >>> + uint32_t gws;
> >>
> >> Why is the new name better?
> > The old variable (is_gws) was obtained from the queue_properties
> > structure during checkpoint and is only used temporarily during queue
> > creation, so this variable cannot be used to determine whether a queue
> > as gws enabled. The new variable (gws) is obtained from the queue
> > structure. The name is changed to better reflect this.
>
> Further down you seem to use it like a boolean though. So a name reflecting
> that would be nice.
To me this is ok. I would rather have the variable name match its source.
>
> >>> uint32_t sdma_id;
> >>> uint32_t eop_ring_buffer_size;
> >>> uint32_t ctx_save_restore_area_size; diff --git
> >>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> >>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> >>> index 6eca9509f2e3..4f58e671d39b 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> >>> @@ -636,6 +636,8 @@ static int criu_checkpoint_queue(struct
> >> kfd_process_device *pdd,
> >>> q_data->ctx_save_restore_area_size =
> >>> q->properties.ctx_save_restore_area_size;
> >>>
> >>> + q_data->gws = !!q->gws;
> >>> +
> >>> ret = pqm_checkpoint_mqd(&pdd->process->pqm, q->
> properties.queue_id, mqd, ctl_stack);
> >>> if (ret) {
> >>> pr_err("Failed checkpoint queue_mqd (%d)\n", ret); @@ -
> 743,7
> >>> +745,6 @@ static void set_queue_properties_from_criu(struct
> >>> +queue_properties *qp,
> >>> struct kfd_criu_queue_priv_data
> *q_data)
> >>> {
> >>> qp->is_interop = false;
> >>> - qp->is_gws = q_data->is_gws;
> >>> qp->queue_percent = q_data->q_percent;
> >>> qp->priority = q_data->priority;
> >>> qp->queue_address = q_data->q_address; @@ -826,12 +827,15 @@
> >> int kfd_criu_restore_queue(struct kfd_process *p,
> >>> NULL);
> >>> if (ret) {
> >>> pr_err("Failed to create new queue err:%d\n", ret);
> >>> - ret = -EINVAL;
> >>> + goto exit;
> >>> }
> >>>
> >>> + if (q_data->gws)
> >>> + ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws);
> >>> +
> >>> exit:
> >>> if (ret)
> >>> - pr_err("Failed to create queue (%d)\n", ret);
> >>> + pr_err("Failed to restore queue (%d)\n", ret);
> >>
> >> Maybe separate this out, so it can be applied to stable series.
>
> Did you miss this comment?
What do you mean by stable series?
>
> >>> else
> >>> pr_debug("Queue id %d was restored successfully\n",
> queue_id);
> >>>
>
>
> Kind regards,
>
> Paul
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2022-04-19 12:24 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-18 16:44 [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
2022-04-18 16:44 ` [PATCH v2 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
2022-04-18 20:23 ` Paul Menzel
2022-04-19 0:04 ` Yat Sin, David
2022-04-19 6:54 ` Paul Menzel
2022-04-19 12:24 ` Yat Sin, David
2022-04-18 19:01 ` [PATCH v2 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
2022-04-18 20:18 ` Paul Menzel
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.