* [PATCH] drm/amdgpu: Always emit GDS switch when GDS/GWS/OA is used
@ 2023-07-07  6:28 ` Friedrich Vock
  0 siblings, 0 replies; 7+ messages in thread
From: Friedrich Vock @ 2023-07-07  6:28 UTC (permalink / raw)
  To: amd-gfx; +Cc: Christian König, Friedrich Vock, stable

During gfxoff, the per-VMID GDS registers are reset and not restored
afterwards. The kernel needs to emit a GDS switch to manually update the
GWS registers in this case. Since gfxoff can happen between any two
submissions and the kernel has no way of knowing, emit the GDS switch
before every submission.

Fixes: 56b0989e29 ("drm/amdgpu: fix GDS/GWS/OA switch handling")
Cc: stable@vger.kernel.org
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2530
Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 22 +++++++---------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 10 ++++++++--
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
index ff1ea99292fb..de73797e9279 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
@@ -165,24 +165,17 @@ bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev,
 		atomic_read(&adev->gpu_reset_counter);
 }

-/* Check if we need to switch to another set of resources */
-static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
-					  struct amdgpu_job *job)
-{
-	return id->gds_base != job->gds_base ||
-		id->gds_size != job->gds_size ||
-		id->gws_base != job->gws_base ||
-		id->gws_size != job->gws_size ||
-		id->oa_base != job->oa_base ||
-		id->oa_size != job->oa_size;
-}
-
 /* Check if the id is compatible with the job */
 static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
 				   struct amdgpu_job *job)
 {
 	return  id->pd_gpu_addr == job->vm_pd_addr &&
-		!amdgpu_vmid_gds_switch_needed(id, job);
+		id->gds_base == job->gds_base &&
+		id->gds_size == job->gds_size &&
+		id->gws_base == job->gws_base &&
+		id->gws_size == job->gws_size &&
+		id->oa_base == job->oa_base &&
+		id->oa_size == job->oa_size;
 }

 /**
@@ -434,7 +427,6 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
 		list_move_tail(&id->list, &id_mgr->ids_lru);
 	}

-	job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
 	if (job->vm_needs_flush) {
 		id->flushed_updates = amdgpu_vm_tlb_seq(vm);
 		dma_fence_put(id->last_flush);
@@ -503,7 +495,7 @@ void amdgpu_vmid_free_reserved(struct amdgpu_device *adev,
  * @vmhub: vmhub type
  * @vmid: vmid number to use
  *
- * Reset saved GDW, GWS and OA to force switch on next flush.
+ * Reset saved GDS, GWS and OA data.
  */
 void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
 		       unsigned vmid)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index a963a25ddd62..2898508b1ce4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -53,7 +53,6 @@ struct amdgpu_job {
 	uint32_t		preamble_status;
 	uint32_t                preemption_status;
 	bool                    vm_needs_flush;
-	bool			gds_switch_needed;
 	bool			spm_update_needed;
 	uint64_t		vm_pd_addr;
 	unsigned		vmid;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 291977b93b1d..61856040cae2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -557,6 +557,12 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
 	}
 }

+/* Check if the job needs a GDS switch */
+static bool amdgpu_vm_need_gds_switch(struct amdgpu_job *job)
+{
+	return job->gds_size || job->gws_size || job->oa_size;
+}
+
 /**
  * amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for job.
  *
@@ -579,7 +585,7 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
 	if (job->vm_needs_flush || ring->has_compute_vm_bug)
 		return true;

-	if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
+	if (ring->funcs->emit_gds_switch && amdgpu_vm_need_gds_switch(job))
 		return true;

 	if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
@@ -609,7 +615,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 	struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
 	bool spm_update_needed = job->spm_update_needed;
 	bool gds_switch_needed = ring->funcs->emit_gds_switch &&
-		job->gds_switch_needed;
+		amdgpu_vm_need_gds_switch(job);
 	bool vm_flush_needed = job->vm_needs_flush;
 	struct dma_fence *fence = NULL;
 	bool pasid_mapping_needed = false;
--
2.41.0


* Re: [PATCH] drm/amdgpu: Always emit GDS switch when GDS/GWS/OA is used
  2023-07-07  6:28 ` Friedrich Vock
@ 2023-07-07  6:56 ` Christian König
  2023-07-07  7:28   ` Friedrich Vock
  0 siblings, 1 reply; 7+ messages in thread
From: Christian König @ 2023-07-07  6:56 UTC (permalink / raw)
  To: Friedrich Vock, amd-gfx; +Cc: stable



On 07.07.23 08:28, Friedrich Vock wrote:
> During gfxoff, the per-VMID GDS registers are reset and not restored
> afterwards.

Hui? Since when? Those registers should be part of the saved ones.

Have you found that by observation?

Thanks,
Christian.


>   The kernel needs to emit a GDS switch to manually update the
> GWS registers in this case. Since gfxoff can happen between any two
> submissions and the kernel has no way of knowing, emit the GDS switch
> before every submission.
>
> Fixes: 56b0989e29 ("drm/amdgpu: fix GDS/GWS/OA switch handling")
> Cc: stable@vger.kernel.org
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2530
> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 22 +++++++---------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 10 ++++++++--
>   3 files changed, 15 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> index ff1ea99292fb..de73797e9279 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> @@ -165,24 +165,17 @@ bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev,
>   		atomic_read(&adev->gpu_reset_counter);
>   }
>
> -/* Check if we need to switch to another set of resources */
> -static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
> -					  struct amdgpu_job *job)
> -{
> -	return id->gds_base != job->gds_base ||
> -		id->gds_size != job->gds_size ||
> -		id->gws_base != job->gws_base ||
> -		id->gws_size != job->gws_size ||
> -		id->oa_base != job->oa_base ||
> -		id->oa_size != job->oa_size;
> -}
> -
>   /* Check if the id is compatible with the job */
>   static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
>   				   struct amdgpu_job *job)
>   {
>   	return  id->pd_gpu_addr == job->vm_pd_addr &&
> -		!amdgpu_vmid_gds_switch_needed(id, job);
> +		id->gds_base == job->gds_base &&
> +		id->gds_size == job->gds_size &&
> +		id->gws_base == job->gws_base &&
> +		id->gws_size == job->gws_size &&
> +		id->oa_base == job->oa_base &&
> +		id->oa_size == job->oa_size;
>   }
>
>   /**
> @@ -434,7 +427,6 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>   		list_move_tail(&id->list, &id_mgr->ids_lru);
>   	}
>
> -	job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
>   	if (job->vm_needs_flush) {
>   		id->flushed_updates = amdgpu_vm_tlb_seq(vm);
>   		dma_fence_put(id->last_flush);
> @@ -503,7 +495,7 @@ void amdgpu_vmid_free_reserved(struct amdgpu_device *adev,
>    * @vmhub: vmhub type
>    * @vmid: vmid number to use
>    *
> - * Reset saved GDW, GWS and OA to force switch on next flush.
> + * Reset saved GDS, GWS and OA data.
>    */
>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>   		       unsigned vmid)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index a963a25ddd62..2898508b1ce4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -53,7 +53,6 @@ struct amdgpu_job {
>   	uint32_t		preamble_status;
>   	uint32_t                preemption_status;
>   	bool                    vm_needs_flush;
> -	bool			gds_switch_needed;
>   	bool			spm_update_needed;
>   	uint64_t		vm_pd_addr;
>   	unsigned		vmid;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 291977b93b1d..61856040cae2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -557,6 +557,12 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
>   	}
>   }
>
> +/* Check if the job needs a GDS switch */
> +static bool amdgpu_vm_need_gds_switch(struct amdgpu_job *job)
> +{
> +	return job->gds_size || job->gws_size || job->oa_size;
> +}
> +
>   /**
>    * amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for job.
>    *
> @@ -579,7 +585,7 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>   	if (job->vm_needs_flush || ring->has_compute_vm_bug)
>   		return true;
>
> -	if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
> +	if (ring->funcs->emit_gds_switch && amdgpu_vm_need_gds_switch(job))
>   		return true;
>
>   	if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
> @@ -609,7 +615,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   	struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>   	bool spm_update_needed = job->spm_update_needed;
>   	bool gds_switch_needed = ring->funcs->emit_gds_switch &&
> -		job->gds_switch_needed;
> +		amdgpu_vm_need_gds_switch(job);
>   	bool vm_flush_needed = job->vm_needs_flush;
>   	struct dma_fence *fence = NULL;
>   	bool pasid_mapping_needed = false;
> --
> 2.41.0
>



* Re: [PATCH] drm/amdgpu: Always emit GDS switch when GDS/GWS/OA is used
  2023-07-07  6:56 ` Christian König
@ 2023-07-07  7:28   ` Friedrich Vock
  2023-07-07  8:21     ` Christian König
  0 siblings, 1 reply; 7+ messages in thread
From: Friedrich Vock @ 2023-07-07  7:28 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: stable

Hi Christian,

On 07.07.23 08:56, Christian König wrote:
>
>
> On 07.07.23 08:28, Friedrich Vock wrote:
>> During gfxoff, the per-VMID GDS registers are reset and not restored
>> afterwards.
>
> Hui? Since when? Those registers should be part of the saved ones.
>
> Have you found that by observation?

Yes. I tested this on my RX 6700 XT and the Steam Deck (Vangogh). In the
bug report I linked, a test program using GWS I developed hangs because
of this.
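
For context, a minimal, hypothetical sketch of what "a test program using GWS"
roughly does through the libdrm amdgpu API; the helper name, the size unit and
the surrounding setup are assumptions for illustration, not taken from the
actual test program:

/*
 * Hypothetical sketch: allocate a GWS resource through libdrm. The kernel
 * derives job->gws_base/gws_size from GWS-domain BOs in the submission's
 * BO list, and that is what ends up in the per-VMID GWS registers.
 * Error handling is minimal; the size unit is an assumption (see amdgpu_drm.h).
 */
#include <amdgpu.h>
#include <amdgpu_drm.h>
#include <stddef.h>

static amdgpu_bo_handle alloc_gws(amdgpu_device_handle dev, unsigned int size)
{
	struct amdgpu_bo_alloc_request req = {0};
	amdgpu_bo_handle bo = NULL;

	req.alloc_size = size;                      /* GWS units, not bytes */
	req.preferred_heap = AMDGPU_GEM_DOMAIN_GWS;

	if (amdgpu_bo_alloc(dev, &req, &bo))
		return NULL;
	return bo;
}

The returned handle is then added to the BO list used for the command
submission, and the shaders access the allocated slots with the ds_gws_*
instructions.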

The hang occurs as soon as the kernel re-uses a VMID on which GWS was
already used once. In the hung state, inspecting the per-VMID GWS
registers shows that the values have been reset to 0.
The hang does not occur when gfxoff is disabled.

Even without causing hangs, you can confirm the behaviour by doing the
following (a rough C sketch of these steps follows the list):
1. Disable gfxoff.
2. Set some GWS registers.
3. Enable gfxoff and wait a bit.
4. Disable gfxoff and read the registers again. The GWS registers have
been reset.
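
A rough C equivalent of the steps above; the debugfs path, its binary u32
write format and the card index are assumptions, and the register inspection
itself is left to an external tool such as umr:

/*
 * Sketch of the reproduction above. Assumptions: the amdgpu_gfxoff debugfs
 * file accepts binary u32 writes, and card 0 is the GPU under test. Reading
 * the per-VMID GWS registers is left to an external tool (e.g. umr), since
 * the register offsets differ between ASICs.
 */
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static void gfxoff_allow(uint32_t allow)
{
	int fd = open("/sys/kernel/debug/dri/0/amdgpu_gfxoff", O_WRONLY);

	if (fd < 0)
		return;
	/* 0 = keep GFX powered on, nonzero = allow gfxoff */
	(void)write(fd, &allow, sizeof(allow));
	close(fd);
}

int main(void)
{
	gfxoff_allow(0);	/* 1. disable gfxoff                           */
	/* 2. set some GWS registers, e.g. by submitting work that uses GWS   */
	gfxoff_allow(1);	/* 3. enable gfxoff ...                        */
	sleep(2);		/*    ... and wait a bit                       */
	gfxoff_allow(0);	/* 4. disable gfxoff and read the registers    */
	/*    again (with umr); on affected parts GWS reads back as 0         */
	return 0;
}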

I performed this test for the GDS_BASE/SIZE registers and it seems these
aren't affected, so it's only GWS that is buggy here.
I should probably make a v2 that combines the behaviour before this
patch for GDS and OA, and the patched behaviour for GWS.
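
Purely as a sketch of that idea (not an actual v2), reusing the names from the
patch above, the check could keep the old compare for GDS/OA while treating
GWS as always needing the switch whenever it is used:

/*
 * Rough sketch of the proposed v2 behaviour: GDS and OA keep the old
 * "only on change" compare, GWS is re-emitted whenever the job uses it,
 * since the GWS registers may have been lost across gfxoff.
 */
static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
					  struct amdgpu_job *job)
{
	if (id->gds_base != job->gds_base ||
	    id->gds_size != job->gds_size ||
	    id->oa_base != job->oa_base ||
	    id->oa_size != job->oa_size)
		return true;

	/* GWS state may not survive gfxoff, so switch whenever GWS is used. */
	return job->gws_size != 0;
}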

I'm not aware of userspace using GWS (yet, I had some ideas for using it
in RADV which is what I've been writing these tests for),
so perhaps the Cc to stable can also be omitted.

Thanks,
Friedrich

>
> Thanks,
> Christian.
>
>
>>   The kernel needs to emit a GDS switch to manually update the
>> GWS registers in this case. Since gfxoff can happen between any two
>> submissions and the kernel has no way of knowing, emit the GDS switch
>> before every submission.
>>
>> Fixes: 56b0989e29 ("drm/amdgpu: fix GDS/GWS/OA switch handling")
>> Cc: stable@vger.kernel.org
>> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2530
>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 22 +++++++---------------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 10 ++++++++--
>>   3 files changed, 15 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>> index ff1ea99292fb..de73797e9279 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>> @@ -165,24 +165,17 @@ bool amdgpu_vmid_had_gpu_reset(struct
>> amdgpu_device *adev,
>>           atomic_read(&adev->gpu_reset_counter);
>>   }
>>
>> -/* Check if we need to switch to another set of resources */
>> -static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
>> -                      struct amdgpu_job *job)
>> -{
>> -    return id->gds_base != job->gds_base ||
>> -        id->gds_size != job->gds_size ||
>> -        id->gws_base != job->gws_base ||
>> -        id->gws_size != job->gws_size ||
>> -        id->oa_base != job->oa_base ||
>> -        id->oa_size != job->oa_size;
>> -}
>> -
>>   /* Check if the id is compatible with the job */
>>   static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
>>                      struct amdgpu_job *job)
>>   {
>>       return  id->pd_gpu_addr == job->vm_pd_addr &&
>> -        !amdgpu_vmid_gds_switch_needed(id, job);
>> +        id->gds_base == job->gds_base &&
>> +        id->gds_size == job->gds_size &&
>> +        id->gws_base == job->gws_base &&
>> +        id->gws_size == job->gws_size &&
>> +        id->oa_base == job->oa_base &&
>> +        id->oa_size == job->oa_size;
>>   }
>>
>>   /**
>> @@ -434,7 +427,6 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct
>> amdgpu_ring *ring,
>>           list_move_tail(&id->list, &id_mgr->ids_lru);
>>       }
>>
>> -    job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
>>       if (job->vm_needs_flush) {
>>           id->flushed_updates = amdgpu_vm_tlb_seq(vm);
>>           dma_fence_put(id->last_flush);
>> @@ -503,7 +495,7 @@ void amdgpu_vmid_free_reserved(struct
>> amdgpu_device *adev,
>>    * @vmhub: vmhub type
>>    * @vmid: vmid number to use
>>    *
>> - * Reset saved GDW, GWS and OA to force switch on next flush.
>> + * Reset saved GDS, GWS and OA data.
>>    */
>>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>>                  unsigned vmid)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index a963a25ddd62..2898508b1ce4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -53,7 +53,6 @@ struct amdgpu_job {
>>       uint32_t        preamble_status;
>>       uint32_t                preemption_status;
>>       bool                    vm_needs_flush;
>> -    bool            gds_switch_needed;
>>       bool            spm_update_needed;
>>       uint64_t        vm_pd_addr;
>>       unsigned        vmid;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 291977b93b1d..61856040cae2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -557,6 +557,12 @@ void amdgpu_vm_check_compute_bug(struct
>> amdgpu_device *adev)
>>       }
>>   }
>>
>> +/* Check if the job needs a GDS switch */
>> +static bool amdgpu_vm_need_gds_switch(struct amdgpu_job *job)
>> +{
>> +    return job->gds_size || job->gws_size || job->oa_size;
>> +}
>> +
>>   /**
>>    * amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for
>> job.
>>    *
>> @@ -579,7 +585,7 @@ bool amdgpu_vm_need_pipeline_sync(struct
>> amdgpu_ring *ring,
>>       if (job->vm_needs_flush || ring->has_compute_vm_bug)
>>           return true;
>>
>> -    if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
>> +    if (ring->funcs->emit_gds_switch && amdgpu_vm_need_gds_switch(job))
>>           return true;
>>
>>       if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
>> @@ -609,7 +615,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
>> struct amdgpu_job *job,
>>       struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>>       bool spm_update_needed = job->spm_update_needed;
>>       bool gds_switch_needed = ring->funcs->emit_gds_switch &&
>> -        job->gds_switch_needed;
>> +        amdgpu_vm_need_gds_switch(job);
>>       bool vm_flush_needed = job->vm_needs_flush;
>>       struct dma_fence *fence = NULL;
>>       bool pasid_mapping_needed = false;
>> --
>> 2.41.0
>>
>


* Re: [PATCH] drm/amdgpu: Always emit GDS switch when GDS/GWS/OA is used
  2023-07-07  7:28   ` Friedrich Vock
@ 2023-07-07  8:21     ` Christian König
  2023-07-20 21:25       ` Friedrich Vock
  0 siblings, 1 reply; 7+ messages in thread
From: Christian König @ 2023-07-07  8:21 UTC (permalink / raw)
  To: Friedrich Vock, amd-gfx; +Cc: stable

On 07.07.23 09:28, Friedrich Vock wrote:
> Hi Christian,
>
> On 07.07.23 08:56, Christian König wrote:
>>
>>
>> On 07.07.23 08:28, Friedrich Vock wrote:
>>> During gfxoff, the per-VMID GDS registers are reset and not restored
>>> afterwards.
>>
>> Hui? Since when? Those registers should be part of the saved ones.
>>
>> Have you found that by observation?
>
> yes. I tested this on my RX 6700 XT and the Steam Deck (Vangogh). In the
> bug report I linked, a test program using GWS I developed hangs because
> of this.
>
> The hang occurs as soon as the kernel re-uses a VMID on which GWS was
> already used once. In the hung state, inspecting the per-VMID GWS
> registers shows that the values have been reset to 0.
> The hang does not occur when gfxoff is disabled.
>
> Even without causing hangs, you can confirm the behaviour by doing the
> following:
> 1. Disable gfxoff.
> 2. Set some GWS registers.
> 3. Enable gfxoff and wait a bit.
> 4. Disable gfxoff and read the registers again. The GWS registers have
> been reset.
>
> I performed this test for the GDS_BASE/SIZE registers and it seems these
> aren't affected, so it's only GWS that is buggy here.

That's most likely a bug in the FW then. I'm going to ask around internally.

> I should probably make a v2 that combines the behaviour before this
> patch for GDS and OA, and the patched behaviour for GWS.

Yeah, that sounds like a good idea to me. But let me ping the fw teams 
first.

>
> I'm not aware of userspace using GWS (yet, I had some ideas for using it
> in RADV which is what I've been writing these tests for),
> so perhaps the Cc to stable can also be omitted.

Depends on what the fw teams say. As far as I know GWS has never been
used widely on Linux.

Could be that they say there is a hw bug and we deprecated it for this 
generation, or it's simply not handled by the fw and the driver needs to 
take care of this (like this patch does) or whatever.

Thanks for the notice,
Christian.

>
> Thanks,
> Friedrich
>
>>
>> Thanks,
>> Christian.
>>
>>
>>>   The kernel needs to emit a GDS switch to manually update the
>>> GWS registers in this case. Since gfxoff can happen between any two
>>> submissions and the kernel has no way of knowing, emit the GDS switch
>>> before every submission.
>>>
>>> Fixes: 56b0989e29 ("drm/amdgpu: fix GDS/GWS/OA switch handling")
>>> Cc: stable@vger.kernel.org
>>> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2530
>>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 22 +++++++---------------
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 10 ++++++++--
>>>   3 files changed, 15 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> index ff1ea99292fb..de73797e9279 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> @@ -165,24 +165,17 @@ bool amdgpu_vmid_had_gpu_reset(struct
>>> amdgpu_device *adev,
>>>           atomic_read(&adev->gpu_reset_counter);
>>>   }
>>>
>>> -/* Check if we need to switch to another set of resources */
>>> -static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
>>> -                      struct amdgpu_job *job)
>>> -{
>>> -    return id->gds_base != job->gds_base ||
>>> -        id->gds_size != job->gds_size ||
>>> -        id->gws_base != job->gws_base ||
>>> -        id->gws_size != job->gws_size ||
>>> -        id->oa_base != job->oa_base ||
>>> -        id->oa_size != job->oa_size;
>>> -}
>>> -
>>>   /* Check if the id is compatible with the job */
>>>   static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
>>>                      struct amdgpu_job *job)
>>>   {
>>>       return  id->pd_gpu_addr == job->vm_pd_addr &&
>>> -        !amdgpu_vmid_gds_switch_needed(id, job);
>>> +        id->gds_base == job->gds_base &&
>>> +        id->gds_size == job->gds_size &&
>>> +        id->gws_base == job->gws_base &&
>>> +        id->gws_size == job->gws_size &&
>>> +        id->oa_base == job->oa_base &&
>>> +        id->oa_size == job->oa_size;
>>>   }
>>>
>>>   /**
>>> @@ -434,7 +427,6 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct
>>> amdgpu_ring *ring,
>>>           list_move_tail(&id->list, &id_mgr->ids_lru);
>>>       }
>>>
>>> -    job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
>>>       if (job->vm_needs_flush) {
>>>           id->flushed_updates = amdgpu_vm_tlb_seq(vm);
>>>           dma_fence_put(id->last_flush);
>>> @@ -503,7 +495,7 @@ void amdgpu_vmid_free_reserved(struct
>>> amdgpu_device *adev,
>>>    * @vmhub: vmhub type
>>>    * @vmid: vmid number to use
>>>    *
>>> - * Reset saved GDW, GWS and OA to force switch on next flush.
>>> + * Reset saved GDS, GWS and OA data.
>>>    */
>>>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>>>                  unsigned vmid)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index a963a25ddd62..2898508b1ce4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -53,7 +53,6 @@ struct amdgpu_job {
>>>       uint32_t        preamble_status;
>>>       uint32_t                preemption_status;
>>>       bool                    vm_needs_flush;
>>> -    bool            gds_switch_needed;
>>>       bool            spm_update_needed;
>>>       uint64_t        vm_pd_addr;
>>>       unsigned        vmid;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index 291977b93b1d..61856040cae2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -557,6 +557,12 @@ void amdgpu_vm_check_compute_bug(struct
>>> amdgpu_device *adev)
>>>       }
>>>   }
>>>
>>> +/* Check if the job needs a GDS switch */
>>> +static bool amdgpu_vm_need_gds_switch(struct amdgpu_job *job)
>>> +{
>>> +    return job->gds_size || job->gws_size || job->oa_size;
>>> +}
>>> +
>>>   /**
>>>    * amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for
>>> job.
>>>    *
>>> @@ -579,7 +585,7 @@ bool amdgpu_vm_need_pipeline_sync(struct
>>> amdgpu_ring *ring,
>>>       if (job->vm_needs_flush || ring->has_compute_vm_bug)
>>>           return true;
>>>
>>> -    if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
>>> +    if (ring->funcs->emit_gds_switch && 
>>> amdgpu_vm_need_gds_switch(job))
>>>           return true;
>>>
>>>       if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
>>> @@ -609,7 +615,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
>>> struct amdgpu_job *job,
>>>       struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>>>       bool spm_update_needed = job->spm_update_needed;
>>>       bool gds_switch_needed = ring->funcs->emit_gds_switch &&
>>> -        job->gds_switch_needed;
>>> +        amdgpu_vm_need_gds_switch(job);
>>>       bool vm_flush_needed = job->vm_needs_flush;
>>>       struct dma_fence *fence = NULL;
>>>       bool pasid_mapping_needed = false;
>>> -- 
>>> 2.41.0
>>>
>>



* Re: [PATCH] drm/amdgpu: Always emit GDS switch when GDS/GWS/OA is used
  2023-07-07  8:21     ` Christian König
@ 2023-07-20 21:25       ` Friedrich Vock
  2023-08-02 10:29         ` Friedrich Vock
  0 siblings, 1 reply; 7+ messages in thread
From: Friedrich Vock @ 2023-07-20 21:25 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: stable

Hi,

On 07.07.23 10:21, Christian König wrote:
> On 07.07.23 09:28, Friedrich Vock wrote:
>> Hi Christian,
>>
>> On 07.07.23 08:56, Christian König wrote:
>>>
>>>
>>> On 07.07.23 08:28, Friedrich Vock wrote:
>>>> During gfxoff, the per-VMID GDS registers are reset and not restored
>>>> afterwards.
>>>
>>> Hui? Since when? Those registers should be part of the saved ones.
>>>
>>> Have you found that by observation?
>>
>> yes. I tested this on my RX 6700 XT and the Steam Deck (Vangogh). In the
>> bug report I linked, a test program using GWS I developed hangs because
>> of this.
>>
>> The hang occurs as soon as the kernel re-uses a VMID on which GWS was
>> already used once. In the hung state, inspecting the per-VMID GWS
>> registers shows that the values have been reset to 0.
>> The hang does not occur when gfxoff is disabled.
>>
>> Even without causing hangs, you can confirm the behaviour by doing the
>> following:
>> 1. Disable gfxoff.
>> 2. Set some GWS registers.
>> 3. Enable gfxoff and wait a bit.
>> 4. Disable gfxoff and read the registers again. The GWS registers have
>> been reset.
>>
>> I performed this test for the GDS_BASE/SIZE registers and it seems these
>> aren't affected, so it's only GWS that is buggy here.
>
> That's most like a bug in the FW then. I'm going to ask around
> internally.

Did the talks with the FW team result in anything yet? It's not that
high-priority, but it'd be nice to know if this is going to be fixed in
the firmware or if I should make a v2 (or if this isn't going to be
fixed at all).

Thanks,
Friedrich

>
>> I should probably make a v2 that combines the behaviour before this
>> patch for GDS and OA, and the patched behaviour for GWS.
>
> Yeah, that sounds like a good idea to me. But let me ping the fw teams
> first.
>
>>
>> I'm not aware of userspace using GWS (yet, I had some ideas for using it
>> in RADV which is what I've been writing these tests for),
>> so perhaps the Cc to stable can also be omitted.
>
> Depends on what the fw teams says. As far as I know GWS has never been
> used widely on Linux.
>
> Could be that they say there is a hw bug and we deprecated it for this
> generation, or it's simply not handled by the fw and the driver needs
> to take care of this (like this patch does) or whatever.
>
> Thanks for the notice,
> Christian.
>
>>
>> Thanks,
>> Friedrich
>>
>>>
>>> Thanks,
>>> Christian.
>>>
>>>
>>>>   The kernel needs to emit a GDS switch to manually update the
>>>> GWS registers in this case. Since gfxoff can happen between any two
>>>> submissions and the kernel has no way of knowing, emit the GDS switch
>>>> before every submission.
>>>>
>>>> Fixes: 56b0989e29 ("drm/amdgpu: fix GDS/GWS/OA switch handling")
>>>> Cc: stable@vger.kernel.org
>>>> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2530
>>>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 22 +++++++---------------
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 10 ++++++++--
>>>>   3 files changed, 15 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> index ff1ea99292fb..de73797e9279 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> @@ -165,24 +165,17 @@ bool amdgpu_vmid_had_gpu_reset(struct
>>>> amdgpu_device *adev,
>>>>           atomic_read(&adev->gpu_reset_counter);
>>>>   }
>>>>
>>>> -/* Check if we need to switch to another set of resources */
>>>> -static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
>>>> -                      struct amdgpu_job *job)
>>>> -{
>>>> -    return id->gds_base != job->gds_base ||
>>>> -        id->gds_size != job->gds_size ||
>>>> -        id->gws_base != job->gws_base ||
>>>> -        id->gws_size != job->gws_size ||
>>>> -        id->oa_base != job->oa_base ||
>>>> -        id->oa_size != job->oa_size;
>>>> -}
>>>> -
>>>>   /* Check if the id is compatible with the job */
>>>>   static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
>>>>                      struct amdgpu_job *job)
>>>>   {
>>>>       return  id->pd_gpu_addr == job->vm_pd_addr &&
>>>> -        !amdgpu_vmid_gds_switch_needed(id, job);
>>>> +        id->gds_base == job->gds_base &&
>>>> +        id->gds_size == job->gds_size &&
>>>> +        id->gws_base == job->gws_base &&
>>>> +        id->gws_size == job->gws_size &&
>>>> +        id->oa_base == job->oa_base &&
>>>> +        id->oa_size == job->oa_size;
>>>>   }
>>>>
>>>>   /**
>>>> @@ -434,7 +427,6 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct
>>>> amdgpu_ring *ring,
>>>>           list_move_tail(&id->list, &id_mgr->ids_lru);
>>>>       }
>>>>
>>>> -    job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
>>>>       if (job->vm_needs_flush) {
>>>>           id->flushed_updates = amdgpu_vm_tlb_seq(vm);
>>>>           dma_fence_put(id->last_flush);
>>>> @@ -503,7 +495,7 @@ void amdgpu_vmid_free_reserved(struct
>>>> amdgpu_device *adev,
>>>>    * @vmhub: vmhub type
>>>>    * @vmid: vmid number to use
>>>>    *
>>>> - * Reset saved GDW, GWS and OA to force switch on next flush.
>>>> + * Reset saved GDS, GWS and OA data.
>>>>    */
>>>>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>>>>                  unsigned vmid)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index a963a25ddd62..2898508b1ce4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -53,7 +53,6 @@ struct amdgpu_job {
>>>>       uint32_t        preamble_status;
>>>>       uint32_t                preemption_status;
>>>>       bool                    vm_needs_flush;
>>>> -    bool            gds_switch_needed;
>>>>       bool            spm_update_needed;
>>>>       uint64_t        vm_pd_addr;
>>>>       unsigned        vmid;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index 291977b93b1d..61856040cae2 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -557,6 +557,12 @@ void amdgpu_vm_check_compute_bug(struct
>>>> amdgpu_device *adev)
>>>>       }
>>>>   }
>>>>
>>>> +/* Check if the job needs a GDS switch */
>>>> +static bool amdgpu_vm_need_gds_switch(struct amdgpu_job *job)
>>>> +{
>>>> +    return job->gds_size || job->gws_size || job->oa_size;
>>>> +}
>>>> +
>>>>   /**
>>>>    * amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for
>>>> job.
>>>>    *
>>>> @@ -579,7 +585,7 @@ bool amdgpu_vm_need_pipeline_sync(struct
>>>> amdgpu_ring *ring,
>>>>       if (job->vm_needs_flush || ring->has_compute_vm_bug)
>>>>           return true;
>>>>
>>>> -    if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
>>>> +    if (ring->funcs->emit_gds_switch &&
>>>> amdgpu_vm_need_gds_switch(job))
>>>>           return true;
>>>>
>>>>       if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
>>>> @@ -609,7 +615,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
>>>> struct amdgpu_job *job,
>>>>       struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>>>>       bool spm_update_needed = job->spm_update_needed;
>>>>       bool gds_switch_needed = ring->funcs->emit_gds_switch &&
>>>> -        job->gds_switch_needed;
>>>> +        amdgpu_vm_need_gds_switch(job);
>>>>       bool vm_flush_needed = job->vm_needs_flush;
>>>>       struct dma_fence *fence = NULL;
>>>>       bool pasid_mapping_needed = false;
>>>> --
>>>> 2.41.0
>>>>
>>>
>


* Re: [PATCH] drm/amdgpu: Always emit GDS switch when GDS/GWS/OA is used
  2023-07-20 21:25       ` Friedrich Vock
@ 2023-08-02 10:29         ` Friedrich Vock
  0 siblings, 0 replies; 7+ messages in thread
From: Friedrich Vock @ 2023-08-02 10:29 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: stable

Gentle ping. Any updates on this yet?

Thanks,
Friedrich

On 20.07.23 23:25, Friedrich Vock wrote:
> Hi,
>
> On 07.07.23 10:21, Christian König wrote:
>> On 07.07.23 09:28, Friedrich Vock wrote:
>>> Hi Christian,
>>>
>>> On 07.07.23 08:56, Christian König wrote:
>>>>
>>>>
>>>> On 07.07.23 08:28, Friedrich Vock wrote:
>>>>> During gfxoff, the per-VMID GDS registers are reset and not restored
>>>>> afterwards.
>>>>
>>>> Hui? Since when? Those registers should be part of the saved ones.
>>>>
>>>> Have you found that by observation?
>>>
>>> yes. I tested this on my RX 6700 XT and the Steam Deck (Vangogh). In
>>> the
>>> bug report I linked, a test program using GWS I developed hangs because
>>> of this.
>>>
>>> The hang occurs as soon as the kernel re-uses a VMID on which GWS was
>>> already used once. In the hung state, inspecting the per-VMID GWS
>>> registers shows that the values have been reset to 0.
>>> The hang does not occur when gfxoff is disabled.
>>>
>>> Even without causing hangs, you can confirm the behaviour by doing the
>>> following:
>>> 1. Disable gfxoff.
>>> 2. Set some GWS registers.
>>> 3. Enable gfxoff and wait a bit.
>>> 4. Disable gfxoff and read the registers again. The GWS registers have
>>> been reset.
>>>
>>> I performed this test for the GDS_BASE/SIZE registers and it seems
>>> these
>>> aren't affected, so it's only GWS that is buggy here.
>>
>> That's most like a bug in the FW then. I'm going to ask around
>> internally.
>
> Did the talks with the FW team result in anything yet? It's not that
> high-priority, but it'd be nice to know if this is going to be fixed in
> the firmware or if I should make a v2 (or if this isn't going to be
> fixed at all).
>
> Thanks,
> Friedrich
>
>>
>>> I should probably make a v2 that combines the behaviour before this
>>> patch for GDS and OA, and the patched behaviour for GWS.
>>
>> Yeah, that sounds like a good idea to me. But let me ping the fw teams
>> first.
>>
>>>
>>> I'm not aware of userspace using GWS (yet, I had some ideas for
>>> using it
>>> in RADV which is what I've been writing these tests for),
>>> so perhaps the Cc to stable can also be omitted.
>>
>> Depends on what the fw teams says. As far as I know GWS has never been
>> used widely on Linux.
>>
>> Could be that they say there is a hw bug and we deprecated it for this
>> generation, or it's simply not handled by the fw and the driver needs
>> to take care of this (like this patch does) or whatever.
>>
>> Thanks for the notice,
>> Christian.
>>
>>>
>>> Thanks,
>>> Friedrich
>>>
>>>>
>>>> Thanks,
>>>> Christian.
>>>>
>>>>
>>>>>   The kernel needs to emit a GDS switch to manually update the
>>>>> GWS registers in this case. Since gfxoff can happen between any two
>>>>> submissions and the kernel has no way of knowing, emit the GDS switch
>>>>> before every submission.
>>>>>
>>>>> Fixes: 56b0989e29 ("drm/amdgpu: fix GDS/GWS/OA switch handling")
>>>>> Cc: stable@vger.kernel.org
>>>>> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2530
>>>>> Signed-off-by: Friedrich Vock <friedrich.vock@gmx.de>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 22 +++++++---------------
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 10 ++++++++--
>>>>>   3 files changed, 15 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>>> index ff1ea99292fb..de73797e9279 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>>> @@ -165,24 +165,17 @@ bool amdgpu_vmid_had_gpu_reset(struct
>>>>> amdgpu_device *adev,
>>>>>           atomic_read(&adev->gpu_reset_counter);
>>>>>   }
>>>>>
>>>>> -/* Check if we need to switch to another set of resources */
>>>>> -static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
>>>>> -                      struct amdgpu_job *job)
>>>>> -{
>>>>> -    return id->gds_base != job->gds_base ||
>>>>> -        id->gds_size != job->gds_size ||
>>>>> -        id->gws_base != job->gws_base ||
>>>>> -        id->gws_size != job->gws_size ||
>>>>> -        id->oa_base != job->oa_base ||
>>>>> -        id->oa_size != job->oa_size;
>>>>> -}
>>>>> -
>>>>>   /* Check if the id is compatible with the job */
>>>>>   static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
>>>>>                      struct amdgpu_job *job)
>>>>>   {
>>>>>       return  id->pd_gpu_addr == job->vm_pd_addr &&
>>>>> -        !amdgpu_vmid_gds_switch_needed(id, job);
>>>>> +        id->gds_base == job->gds_base &&
>>>>> +        id->gds_size == job->gds_size &&
>>>>> +        id->gws_base == job->gws_base &&
>>>>> +        id->gws_size == job->gws_size &&
>>>>> +        id->oa_base == job->oa_base &&
>>>>> +        id->oa_size == job->oa_size;
>>>>>   }
>>>>>
>>>>>   /**
>>>>> @@ -434,7 +427,6 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct
>>>>> amdgpu_ring *ring,
>>>>>           list_move_tail(&id->list, &id_mgr->ids_lru);
>>>>>       }
>>>>>
>>>>> -    job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
>>>>>       if (job->vm_needs_flush) {
>>>>>           id->flushed_updates = amdgpu_vm_tlb_seq(vm);
>>>>>           dma_fence_put(id->last_flush);
>>>>> @@ -503,7 +495,7 @@ void amdgpu_vmid_free_reserved(struct
>>>>> amdgpu_device *adev,
>>>>>    * @vmhub: vmhub type
>>>>>    * @vmid: vmid number to use
>>>>>    *
>>>>> - * Reset saved GDW, GWS and OA to force switch on next flush.
>>>>> + * Reset saved GDS, GWS and OA data.
>>>>>    */
>>>>>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>>>>>                  unsigned vmid)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> index a963a25ddd62..2898508b1ce4 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> @@ -53,7 +53,6 @@ struct amdgpu_job {
>>>>>       uint32_t        preamble_status;
>>>>>       uint32_t                preemption_status;
>>>>>       bool                    vm_needs_flush;
>>>>> -    bool            gds_switch_needed;
>>>>>       bool            spm_update_needed;
>>>>>       uint64_t        vm_pd_addr;
>>>>>       unsigned        vmid;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> index 291977b93b1d..61856040cae2 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> @@ -557,6 +557,12 @@ void amdgpu_vm_check_compute_bug(struct
>>>>> amdgpu_device *adev)
>>>>>       }
>>>>>   }
>>>>>
>>>>> +/* Check if the job needs a GDS switch */
>>>>> +static bool amdgpu_vm_need_gds_switch(struct amdgpu_job *job)
>>>>> +{
>>>>> +    return job->gds_size || job->gws_size || job->oa_size;
>>>>> +}
>>>>> +
>>>>>   /**
>>>>>    * amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for
>>>>> job.
>>>>>    *
>>>>> @@ -579,7 +585,7 @@ bool amdgpu_vm_need_pipeline_sync(struct
>>>>> amdgpu_ring *ring,
>>>>>       if (job->vm_needs_flush || ring->has_compute_vm_bug)
>>>>>           return true;
>>>>>
>>>>> -    if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
>>>>> +    if (ring->funcs->emit_gds_switch &&
>>>>> amdgpu_vm_need_gds_switch(job))
>>>>>           return true;
>>>>>
>>>>>       if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
>>>>> @@ -609,7 +615,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
>>>>> struct amdgpu_job *job,
>>>>>       struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>>>>>       bool spm_update_needed = job->spm_update_needed;
>>>>>       bool gds_switch_needed = ring->funcs->emit_gds_switch &&
>>>>> -        job->gds_switch_needed;
>>>>> +        amdgpu_vm_need_gds_switch(job);
>>>>>       bool vm_flush_needed = job->vm_needs_flush;
>>>>>       struct dma_fence *fence = NULL;
>>>>>       bool pasid_mapping_needed = false;
>>>>> --
>>>>> 2.41.0
>>>>>
>>>>
>>

