All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/amdkfd: Fix GWS queue count
@ 2022-04-13 15:51 David Yat Sin
  2022-04-13 15:51 ` [PATCH 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
  2022-04-14 14:52 ` [PATCH 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
  0 siblings, 2 replies; 7+ messages in thread
From: David Yat Sin @ 2022-04-13 15:51 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

A queue can be inactive during process termination. This would cause
dqm->gws_queue_count to not be decremented. There can only be 1 GWS
queue per device per process, so move the logic out of the loop.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index acf4f7975850..7c107b88d944 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1945,17 +1945,17 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 		else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
 			deallocate_sdma_queue(dqm, q);
 
-		if (q->properties.is_active) {
+		if (q->properties.is_active)
 			decrement_queue_count(dqm, q->properties.type);
-			if (q->properties.is_gws) {
-				dqm->gws_queue_count--;
-				qpd->mapped_gws_queue = false;
-			}
-		}
 
 		dqm->total_queue_count--;
 	}
 
+	if (qpd->mapped_gws_queue) {
+		dqm->gws_queue_count--;
+		qpd->mapped_gws_queue = false;
+	}
+
 	/* Unregister process */
 	list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
 		if (qpd == cur->qpd) {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/2] drm/amdkfd: CRIU add support for GWS queues
  2022-04-13 15:51 [PATCH 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
@ 2022-04-13 15:51 ` David Yat Sin
  2022-04-14 14:52 ` [PATCH 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
  1 sibling, 0 replies; 7+ messages in thread
From: David Yat Sin @ 2022-04-13 15:51 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Adding support to checkpoint/restore GWS(Global Wave Sync) queues.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  4 ++--
 .../amd/amdkfd/kfd_process_queue_manager.c    | 22 ++++++++++++++-----
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e1e2362841f8..b2c72ddd9c1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1377,7 +1377,7 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
 		goto out_unlock;
 	}
 
-	retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
+	retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL, false);
 	mutex_unlock(&p->mutex);
 
 	args->first_gws = 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f36062be9ca8..3ec32675210c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
 	uint32_t priority;
 	uint32_t q_percent;
 	uint32_t doorbell_id;
-	uint32_t is_gws;
+	uint32_t gws;
 	uint32_t sdma_id;
 	uint32_t eop_ring_buffer_size;
 	uint32_t ctx_save_restore_area_size;
@@ -1199,7 +1199,7 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm, unsigned int
 int pqm_update_mqd(struct process_queue_manager *pqm, unsigned int qid,
 			struct mqd_update_info *minfo);
 int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
-			void *gws);
+			void *gws, bool criu_restore);
 struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
 						unsigned int qid);
 struct queue *pqm_get_user_queue(struct process_queue_manager *pqm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 6eca9509f2e3..3eb9d43fdaac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -91,7 +91,7 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
 }
 
 int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
-			void *gws)
+		void *gws, bool criu_restore)
 {
 	struct kfd_dev *dev = NULL;
 	struct process_queue_node *pqn;
@@ -135,8 +135,14 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
 	pqn->q->gws = mem;
 	pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0;
 
-	return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
-							pqn->q, NULL);
+	ret = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
+						    pqn->q, NULL);
+
+	if (!ret && criu_restore) {
+		dev->dqm->gws_queue_count++;
+		pdd->qpd.mapped_gws_queue = true;
+	}
+	return ret;
 }
 
 void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
@@ -636,6 +642,8 @@ static int criu_checkpoint_queue(struct kfd_process_device *pdd,
 	q_data->ctx_save_restore_area_size =
 		q->properties.ctx_save_restore_area_size;
 
+	q_data->gws = !!q->gws;
+
 	ret = pqm_checkpoint_mqd(&pdd->process->pqm, q->properties.queue_id, mqd, ctl_stack);
 	if (ret) {
 		pr_err("Failed checkpoint queue_mqd (%d)\n", ret);
@@ -743,7 +751,6 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
 					  struct kfd_criu_queue_priv_data *q_data)
 {
 	qp->is_interop = false;
-	qp->is_gws = q_data->is_gws;
 	qp->queue_percent = q_data->q_percent;
 	qp->priority = q_data->priority;
 	qp->queue_address = q_data->q_address;
@@ -826,12 +833,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
 				NULL);
 	if (ret) {
 		pr_err("Failed to create new queue err:%d\n", ret);
-		ret = -EINVAL;
+		goto exit;
 	}
 
+	if (q_data->gws)
+		ret = pqm_set_gws(&p->pqm, q_data->q_id, pdd->dev->gws, true);
+
 exit:
 	if (ret)
-		pr_err("Failed to create queue (%d)\n", ret);
+		pr_err("Failed to restore queue (%d)\n", ret);
 	else
 		pr_debug("Queue id %d was restored successfully\n", queue_id);
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
  2022-04-13 15:51 [PATCH 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
  2022-04-13 15:51 ` [PATCH 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
@ 2022-04-14 14:52 ` Felix Kuehling
  2022-04-14 15:08   ` Yat Sin, David
  1 sibling, 1 reply; 7+ messages in thread
From: Felix Kuehling @ 2022-04-14 14:52 UTC (permalink / raw)
  To: David Yat Sin, amd-gfx


Am 2022-04-13 um 11:51 schrieb David Yat Sin:
> Queue can be inactive during process termination. This would cause
> dqm->gws_queue_count to not be decremented. There can only be 1 GWS
> queue per device process so moving the logic out of loop.
>
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++------
>   1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index acf4f7975850..7c107b88d944 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1945,17 +1945,17 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   		else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>   			deallocate_sdma_queue(dqm, q);
>   
> -		if (q->properties.is_active) {
> +		if (q->properties.is_active)
>   			decrement_queue_count(dqm, q->properties.type);
> -			if (q->properties.is_gws) {
> -				dqm->gws_queue_count--;
> -				qpd->mapped_gws_queue = false;
> -			}
> -		}

This looks like the original intention was to decrement the counter for 
inactive GWS queues. This makes sense because this counter is used to 
determine whether the runlist is oversubscribed. Inactive queues are not 
in the runlist, so they should not be counted.

I see that the counter is updated when queues are activated and 
deactivated in update_queue. So IMO this patch is both incorrect and 
unnecessary. Did you see an actual problem with the GWS counter during 
process termination? If so, I'd like to know more to understand the root 
cause.

Regards,
   Felix


>   
>   		dqm->total_queue_count--;
>   	}
>   
> +	if (qpd->mapped_gws_queue) {
> +		dqm->gws_queue_count--;
> +		qpd->mapped_gws_queue = false;
> +	}
> +
>   	/* Unregister process */
>   	list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
>   		if (qpd == cur->qpd) {

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
  2022-04-14 14:52 ` [PATCH 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
@ 2022-04-14 15:08   ` Yat Sin, David
  2022-04-14 15:42     ` Felix Kuehling
  0 siblings, 1 reply; 7+ messages in thread
From: Yat Sin, David @ 2022-04-14 15:08 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx



> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: Thursday, April 14, 2022 10:52 AM
> To: Yat Sin, David <David.YatSin@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
> 
> 
> Am 2022-04-13 um 11:51 schrieb David Yat Sin:
> > Queue can be inactive during process termination. This would cause
> > dqm->gws_queue_count to not be decremented. There can only be 1 GWS
> > queue per device process so moving the logic out of loop.
> >
> > Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> > ---
> >   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++-----
> -
> >   1 file changed, 6 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index acf4f7975850..7c107b88d944 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -1945,17 +1945,17 @@ static int process_termination_cpsch(struct
> device_queue_manager *dqm,
> >   		else if (q->properties.type ==
> KFD_QUEUE_TYPE_SDMA_XGMI)
> >   			deallocate_sdma_queue(dqm, q);
> >
> > -		if (q->properties.is_active) {
> > +		if (q->properties.is_active)
> >   			decrement_queue_count(dqm, q->properties.type);
> > -			if (q->properties.is_gws) {
> > -				dqm->gws_queue_count--;
> > -				qpd->mapped_gws_queue = false;
> > -			}
> > -		}
> 
> This looks like the original intention was to decrement the counter for inactive
> GWS queues. This makes sense because this counter is used to determine
> whether the runlist is oversubscribed. Inactive queues are not in the runlist,
> so they should not be counted.
> 
> I see that the counter is updated when queues are activated and deactivated
> in update_queue. So IMO this patch is both incorrect and unnecessary. Did
> you see an actual problem with the GWS counter during process termination?
> If so, I'd like to know more to understand the root cause.
> 
> Regards,
>    Felix

Yes, when using a unit test (for example KFDGWSTest.Semaphore), 
1. Put a sleep in the middle of the application (after calling hsaKmtAllocQueueGWS)
2. Run the application and kill it while it is in the sleep (the application never calls queue.Destroy()).

Then inside the kernel, dqm->gws_queue_count keeps incrementing each time the experiment is repeated and never goes back to 0. This seems to be because the sleep causes the process to be evicted and q->properties.is_active is false, so the code does not enter that if statement.

Regards,
David

> 
> 
> >
> >   		dqm->total_queue_count--;
> >   	}
> >
> > +	if (qpd->mapped_gws_queue) {
> > +		dqm->gws_queue_count--;
> > +		qpd->mapped_gws_queue = false;
> > +	}
> > +
> >   	/* Unregister process */
> >   	list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
> >   		if (qpd == cur->qpd) {

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
  2022-04-14 15:08   ` Yat Sin, David
@ 2022-04-14 15:42     ` Felix Kuehling
  2022-04-14 17:56       ` Yat Sin, David
  0 siblings, 1 reply; 7+ messages in thread
From: Felix Kuehling @ 2022-04-14 15:42 UTC (permalink / raw)
  To: Yat Sin, David, amd-gfx


Am 2022-04-14 um 11:08 schrieb Yat Sin, David:
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Sent: Thursday, April 14, 2022 10:52 AM
>> To: Yat Sin, David <David.YatSin@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
>>
>>
>> Am 2022-04-13 um 11:51 schrieb David Yat Sin:
>>> Queue can be inactive during process termination. This would cause
>>> dqm->gws_queue_count to not be decremented. There can only be 1 GWS
>>> queue per device process so moving the logic out of loop.
>>>
>>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
>>> ---
>>>    .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++-----
>> -
>>>    1 file changed, 6 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index acf4f7975850..7c107b88d944 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -1945,17 +1945,17 @@ static int process_termination_cpsch(struct
>> device_queue_manager *dqm,
>>>    		else if (q->properties.type ==
>> KFD_QUEUE_TYPE_SDMA_XGMI)
>>>    			deallocate_sdma_queue(dqm, q);
>>>
>>> -		if (q->properties.is_active) {
>>> +		if (q->properties.is_active)
>>>    			decrement_queue_count(dqm, q->properties.type);
>>> -			if (q->properties.is_gws) {
>>> -				dqm->gws_queue_count--;
>>> -				qpd->mapped_gws_queue = false;
>>> -			}
>>> -		}
>> This looks like the original intention was to decrement the counter for inactive
>> GWS queues. This makes sense because this counter is used to determine
>> whether the runlist is oversubscribed. Inactive queues are not in the runlist,
>> so they should not be counted.
>>
>> I see that the counter is updated when queues are activated and deactivated
>> in update_queue. So IMO this patch is both incorrect and unnecessary. Did
>> you see an actual problem with the GWS counter during process termination?
>> If so, I'd like to know more to understand the root cause.
>>
>> Regards,
>>     Felix
> Yes, when using a unit test (for example KFDGWSTest.Semaphore),
> 1. Put a sleep in the middle of the application (after calling hsaKmtAllocQueueGWS)
> 2. Run application and kill the application which it is in sleep (application never calls queue.Destroy()).
>
> Then inside kernel dqm->gws_queue_count keeps incrementing each time the experiment is repeated and never goes back to 0. This seems to be because the sleep causes the process to be evicted and q->properties.is_active is false so code does not enter that if statement.

That's weird. Can you find out why it's not getting there? In the test 
you describe, I would expect the queue to be active, so the counter 
should be decremented by the existing code.

Does the test evict the queues for some reason? is_active gets set to 
false when a queue is evicted. Looks like we're missing code to update 
the gws_queue_count in evict/restore_process_queues_cpsch (it is present 
in evict/restore_process_queues_nocpsch).

Maybe the management of this counter should be moved into 
increment/decrement_queue_count, so we don't need to duplicate it in 
many places.

Regards,
   Felix


>
> Regards,
> David
>
>>>    		dqm->total_queue_count--;
>>>    	}
>>>
>>> +	if (qpd->mapped_gws_queue) {
>>> +		dqm->gws_queue_count--;
>>> +		qpd->mapped_gws_queue = false;
>>> +	}
>>> +
>>>    	/* Unregister process */
>>>    	list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
>>>    		if (qpd == cur->qpd) {

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
  2022-04-14 15:42     ` Felix Kuehling
@ 2022-04-14 17:56       ` Yat Sin, David
  2022-04-14 18:04         ` Felix Kuehling
  0 siblings, 1 reply; 7+ messages in thread
From: Yat Sin, David @ 2022-04-14 17:56 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx



> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: Thursday, April 14, 2022 11:42 AM
> To: Yat Sin, David <David.YatSin@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
> 
> 
> Am 2022-04-14 um 11:08 schrieb Yat Sin, David:
> >> -----Original Message-----
> >> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> >> Sent: Thursday, April 14, 2022 10:52 AM
> >> To: Yat Sin, David <David.YatSin@amd.com>;
> >> amd-gfx@lists.freedesktop.org
> >> Subject: Re: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
> >>
> >>
> >> Am 2022-04-13 um 11:51 schrieb David Yat Sin:
> >>> Queue can be inactive during process termination. This would cause
> >>> dqm->gws_queue_count to not be decremented. There can only be 1
> GWS
> >>> queue per device process so moving the logic out of loop.
> >>>
> >>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> >>> ---
> >>>    .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++-
> ----
> >> -
> >>>    1 file changed, 6 insertions(+), 6 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>> index acf4f7975850..7c107b88d944 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>> @@ -1945,17 +1945,17 @@ static int process_termination_cpsch(struct
> >> device_queue_manager *dqm,
> >>>    		else if (q->properties.type ==
> >> KFD_QUEUE_TYPE_SDMA_XGMI)
> >>>    			deallocate_sdma_queue(dqm, q);
> >>>
> >>> -		if (q->properties.is_active) {
> >>> +		if (q->properties.is_active)
> >>>    			decrement_queue_count(dqm, q->properties.type);
> >>> -			if (q->properties.is_gws) {
> >>> -				dqm->gws_queue_count--;
> >>> -				qpd->mapped_gws_queue = false;
> >>> -			}
> >>> -		}
> >> This looks like the original intention was to decrement the counter
> >> for inactive GWS queues. This makes sense because this counter is
> >> used to determine whether the runlist is oversubscribed. Inactive
> >> queues are not in the runlist, so they should not be counted.
> >>
> >> I see that the counter is updated when queues are activated and
> >> deactivated in update_queue. So IMO this patch is both incorrect and
> >> unnecessary. Did you see an actual problem with the GWS counter during
> process termination?
> >> If so, I'd like to know more to understand the root cause.
> >>
> >> Regards,
> >>     Felix
> > Yes, when using a unit test (for example KFDGWSTest.Semaphore), 1. Put
> > a sleep in the middle of the application (after calling
> > hsaKmtAllocQueueGWS) 2. Run application and kill the application which it
> is in sleep (application never calls queue.Destroy()).
> >
> > Then inside kernel dqm->gws_queue_count keeps incrementing each time
> the experiment is repeated and never goes back to 0. This seems to be
> because the sleep causes the process to be evicted and q-
> >properties.is_active is false so code does not enter that if statement.
> 
> That's weird. Can you find out why it's not getting there? In the test you
> describe, I would expect the queue to be active, so the counter should be
> decremented by the existing code.
> 
> Does the test evict the queues for some reason? is_active gets set to false
> when a queue is evicted. 
Yes, the queue is evicted because of the sleep() call in userspace.

>Looks like we're missing code to update the
> gws_queue_count in evict/restore_process_queues_cpsch (it is present in
> evict/restore_process_queues_nocpsch).

I think this is the actual problem and the increment/decrement should be done there. I did not realize that dqm->gws_queue_count only counts non-evicted queues. I will post a new patch with this change instead.

> 
> Maybe the management of this counter should be moved into
> increment/decrement_queue_count, so we don't need to duplicate it in
> many places.
ACK

Regards,
David

> 
> Regards,
>    Felix
> 
> 
> >
> > Regards,
> > David
> >
> >>>    		dqm->total_queue_count--;
> >>>    	}
> >>>
> >>> +	if (qpd->mapped_gws_queue) {
> >>> +		dqm->gws_queue_count--;
> >>> +		qpd->mapped_gws_queue = false;
> >>> +	}
> >>> +
> >>>    	/* Unregister process */
> >>>    	list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
> >>>    		if (qpd == cur->qpd) {

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
  2022-04-14 17:56       ` Yat Sin, David
@ 2022-04-14 18:04         ` Felix Kuehling
  0 siblings, 0 replies; 7+ messages in thread
From: Felix Kuehling @ 2022-04-14 18:04 UTC (permalink / raw)
  To: Yat Sin, David, amd-gfx


Am 2022-04-14 um 13:56 schrieb Yat Sin, David:
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Sent: Thursday, April 14, 2022 11:42 AM
>> To: Yat Sin, David <David.YatSin@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
>>
>>
>> Am 2022-04-14 um 11:08 schrieb Yat Sin, David:
>>>> -----Original Message-----
>>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>>> Sent: Thursday, April 14, 2022 10:52 AM
>>>> To: Yat Sin, David <David.YatSin@amd.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH 1/2] drm/amdkfd: Fix GWS queue count
>>>>
>>>>
>>>> Am 2022-04-13 um 11:51 schrieb David Yat Sin:
>>>>> Queue can be inactive during process termination. This would cause
>>>>> dqm->gws_queue_count to not be decremented. There can only be 1
>> GWS
>>>>> queue per device process so moving the logic out of loop.
>>>>>
>>>>> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
>>>>> ---
>>>>>     .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++-
>> ----
>>>> -
>>>>>     1 file changed, 6 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> index acf4f7975850..7c107b88d944 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> @@ -1945,17 +1945,17 @@ static int process_termination_cpsch(struct
>>>> device_queue_manager *dqm,
>>>>>     		else if (q->properties.type ==
>>>> KFD_QUEUE_TYPE_SDMA_XGMI)
>>>>>     			deallocate_sdma_queue(dqm, q);
>>>>>
>>>>> -		if (q->properties.is_active) {
>>>>> +		if (q->properties.is_active)
>>>>>     			decrement_queue_count(dqm, q->properties.type);
>>>>> -			if (q->properties.is_gws) {
>>>>> -				dqm->gws_queue_count--;
>>>>> -				qpd->mapped_gws_queue = false;
>>>>> -			}
>>>>> -		}
>>>> This looks like the original intention was to decrement the counter
>>>> for inactive GWS queues. This makes sense because this counter is
>>>> used to determine whether the runlist is oversubscribed. Inactive
>>>> queues are not in the runlist, so they should not be counted.
>>>>
>>>> I see that the counter is updated when queues are activated and
>>>> deactivated in update_queue. So IMO this patch is both incorrect and
>>>> unnecessary. Did you see an actual problem with the GWS counter during
>> process termination?
>>>> If so, I'd like to know more to understand the root cause.
>>>>
>>>> Regards,
>>>>      Felix
>>> Yes, when using a unit test (for example KFDGWSTest.Semaphore), 1. Put
>>> a sleep in the middle of the application (after calling
>>> hsaKmtAllocQueueGWS) 2. Run application and kill the application which it
>> is in sleep (application never calls queue.Destroy()).
>>> Then inside kernel dqm->gws_queue_count keeps incrementing each time
>> the experiment is repeated and never goes back to 0. This seems to be
>> because the sleep causes the process to be evicted and q-
>>> properties.is_active is false so code does not enter that if statement.
>> That's weird. Can you find out why it's not getting there? In the test you
>> describe, I would expect the queue to be active, so the counter should be
>> decremented by the existing code.
>>
>> Does the test evict the queues for some reason? is_active gets set to false
>> when a queue is evicted.
> Yes, the queue is evicted because of the sleep() call in userspace.

Sleep() shouldn't cause queue evictions. Evictions are usually temporary 
due to memory manager events (memory evictions or MMU notifiers). More 
permanent evictions can happen after VM faults. Does the test cause a VM 
fault before the sleep?


>
>> Looks like we're missing code to update the
>> gws_queue_count in evict/restore_process_queues_cpsch (it is present in
>> evict/restore_process_queues_nocpsch).
> I think this is the actual problem and the increment/decrement should be done there. I did not realize the dqm->gws_queue_count only counts not-evicted queues. I will post new patch with this change instead.

Thanks,
   Felix


>
>> Maybe the management of this counter should be moved into
>> increment/decrement_queue_count, so we don't need to duplicate it in
>> many places.
> ACK
>
> Regards,
> David
>
>> Regards,
>>     Felix
>>
>>
>>> Regards,
>>> David
>>>
>>>>>     		dqm->total_queue_count--;
>>>>>     	}
>>>>>
>>>>> +	if (qpd->mapped_gws_queue) {
>>>>> +		dqm->gws_queue_count--;
>>>>> +		qpd->mapped_gws_queue = false;
>>>>> +	}
>>>>> +
>>>>>     	/* Unregister process */
>>>>>     	list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
>>>>>     		if (qpd == cur->qpd) {

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-04-14 18:04 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-13 15:51 [PATCH 1/2] drm/amdkfd: Fix GWS queue count David Yat Sin
2022-04-13 15:51 ` [PATCH 2/2] drm/amdkfd: CRIU add support for GWS queues David Yat Sin
2022-04-14 14:52 ` [PATCH 1/2] drm/amdkfd: Fix GWS queue count Felix Kuehling
2022-04-14 15:08   ` Yat Sin, David
2022-04-14 15:42     ` Felix Kuehling
2022-04-14 17:56       ` Yat Sin, David
2022-04-14 18:04         ` Felix Kuehling

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.