* [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
@ 2019-01-21 23:46 Marek Olšák
       [not found] ` <20190121234647.3995-1-maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Marek Olšák @ 2019-01-21 23:46 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

From: Marek Olšák <marek.olsak@amd.com>

I'm not increasing the DRM version because GDS isn't totally without bugs yet.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 17 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 17 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 36 +++++++++++++++++++++++++
 include/uapi/drm/amdgpu_drm.h           |  5 ++++
 5 files changed, 77 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
index ecbcefe49a98..f89f5734d985 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
@@ -30,20 +30,22 @@ struct amdgpu_bo;
 struct amdgpu_gds_asic_info {
 	uint32_t	total_size;
 	uint32_t	gfx_partition_size;
 	uint32_t	cs_partition_size;
 };
 
 struct amdgpu_gds {
 	struct amdgpu_gds_asic_info	mem;
 	struct amdgpu_gds_asic_info	gws;
 	struct amdgpu_gds_asic_info	oa;
+	uint32_t			gds_compute_max_wave_id;
+
 	/* At present, GDS, GWS and OA resources for gfx (graphics)
 	 * are always pre-allocated and available for graphics operation.
 	 * Such resources are shared between all gfx clients.
 	 * TODO: move this operation to user space
 	 * */
 	struct amdgpu_bo*		gds_gfx_bo;
 	struct amdgpu_bo*		gws_gfx_bo;
 	struct amdgpu_bo*		oa_gfx_bo;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 7984292f9282..d971ea914755 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2257,20 +2257,36 @@ static void gfx_v7_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 }
 
 static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 					  struct amdgpu_job *job,
 					  struct amdgpu_ib *ib,
 					  uint32_t flags)
 {
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
 					  (2 << 0) |
 #endif
 					  (ib->gpu_addr & 0xFFFFFFFC));
 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
 	amdgpu_ring_write(ring, control);
 }
 
@@ -5050,20 +5066,21 @@ static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev)
 	adev->gfx.priv_inst_irq.num_types = 1;
 	adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
 }
 
 static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
 {
 	/* init asic gds info */
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
 		adev->gds.mem.cs_partition_size = 4096;
 
 		adev->gds.gws.gfx_partition_size = 4;
 		adev->gds.gws.cs_partition_size = 4;
 
 		adev->gds.oa.gfx_partition_size = 4;
 		adev->gds.oa.cs_partition_size = 1;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index a26747681ed6..dcdae74fc0e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6077,20 +6077,36 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 }
 
 static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 					  struct amdgpu_job *job,
 					  struct amdgpu_ib *ib,
 					  uint32_t flags)
 {
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
 				(2 << 0) |
 #endif
 				(ib->gpu_addr & 0xFFFFFFFC));
 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
 	amdgpu_ring_write(ring, control);
 }
 
@@ -6989,20 +7005,21 @@ static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev)
 {
 	adev->gfx.rlc.funcs = &iceland_rlc_funcs;
 }
 
 static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
 {
 	/* init asic gds info */
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
 		adev->gds.mem.cs_partition_size = 4096;
 
 		adev->gds.gws.gfx_partition_size = 4;
 		adev->gds.gws.cs_partition_size = 4;
 
 		adev->gds.oa.gfx_partition_size = 4;
 		adev->gds.oa.cs_partition_size = 1;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 262ee3cf6f1c..63b898fc0467 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4003,20 +4003,36 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 }
 
 static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 					  struct amdgpu_job *job,
 					  struct amdgpu_ib *ib,
 					  uint32_t flags)
 {
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
 				(2 << 0) |
 #endif
 				lower_32_bits(ib->gpu_addr));
 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
 	amdgpu_ring_write(ring, control);
 }
@@ -4839,20 +4855,40 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
 		adev->gds.mem.total_size = 0x10000;
 		break;
 	case CHIP_RAVEN:
 		adev->gds.mem.total_size = 0x1000;
 		break;
 	default:
 		adev->gds.mem.total_size = 0x10000;
 		break;
 	}
 
+	switch (adev->asic_type) {
+	case CHIP_VEGA10:
+	case CHIP_VEGA20:
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	case CHIP_VEGA12:
+		adev->gds.gds_compute_max_wave_id = 0x27f;
+		break;
+	case CHIP_RAVEN:
+		if (adev->rev_id >= 0x8)
+			adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
+		else
+			adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
+		break;
+	default:
+		/* this really depends on the chip */
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	}
+
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
 		adev->gds.mem.cs_partition_size = 4096;
 
 		adev->gds.gws.gfx_partition_size = 4;
 		adev->gds.gws.cs_partition_size = 4;
 
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index faaad04814e4..662d379ea624 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -561,20 +561,25 @@ union drm_amdgpu_cs {
 /* Preamble flag, which means the IB could be dropped if no context switch */
 #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
 
 /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
 #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
 
 /* The IB fence should do the L2 writeback but not invalidate any shader
  * caches (L2/vL1/sL1/I$). */
 #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
 
+/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
+ * This will reset wave ID counters for the IB.
+ */
+#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
+
 struct drm_amdgpu_cs_chunk_ib {
 	__u32 _pad;
 	/** AMDGPU_IB_FLAG_* */
 	__u32 flags;
 	/** Virtual address to begin IB execution */
 	__u64 va_start;
 	/** Size of submission */
 	__u32 ib_bytes;
 	/** HW IP to submit to */
 	__u32 ip_type;
-- 
2.17.1
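
As a rough illustration of how user space is meant to consume the new
flag (this is not part of the patch; the helper below is a hypothetical
sketch), a user-mode driver would set it on the CS chunk describing a
compute IB that uses GDS ordered append:

#include <stdbool.h>
#include <string.h>
#include <drm/amdgpu_drm.h>

/* Hypothetical helper: fill a compute IB chunk and request the wave ID
 * counter reset when the IB uses GDS ordered append.
 */
static void fill_compute_ib_chunk(struct drm_amdgpu_cs_chunk_ib *ib,
				  __u64 va, __u32 size_bytes,
				  bool uses_gds_ordered_append)
{
	memset(ib, 0, sizeof(*ib));
	ib->va_start = va;
	ib->ib_bytes = size_bytes;
	ib->ip_type = AMDGPU_HW_IP_COMPUTE;
	if (uses_gds_ordered_append)
		ib->flags |= AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
}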

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


* Re: [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
       [not found] ` <20190121234647.3995-1-maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-01-22  8:54   ` Christian König
  0 siblings, 0 replies; 9+ messages in thread
From: Christian König @ 2019-01-22  8:54 UTC (permalink / raw)
  To: Marek Olšák, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 22.01.19 at 00:46, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak@amd.com>
>
> I'm not increasing the DRM version because GDS isn't totally without bugs yet.

Looks mostly good on first glance.

But one thing that is certainly wrong: when you add any
amdgpu_ring_write() call to the emit_ib callback, you also need to
update the estimate of how many DWs emitting an IB can take.

Look out for the structure where gfx_*_ring_emit_ib is used.

Regards,
Christian.
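
(Concretely, the estimate in question is the .emit_ib_size field of the
amdgpu_ring_funcs tables that hook up gfx_*_ring_emit_ib. The v2 patch
quoted later in this thread addresses it by bumping the worst-case DW
count to cover the three extra SET_CONFIG_REG dwords, e.g.:

-       .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
+       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */

and likewise for the gfx v7/v8 compute and KIQ ring funcs.)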

>
> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 17 ++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 17 ++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 36 +++++++++++++++++++++++++
>   include/uapi/drm/amdgpu_drm.h           |  5 ++++
>   5 files changed, 77 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> index ecbcefe49a98..f89f5734d985 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> @@ -30,20 +30,22 @@ struct amdgpu_bo;
>   struct amdgpu_gds_asic_info {
>   	uint32_t	total_size;
>   	uint32_t	gfx_partition_size;
>   	uint32_t	cs_partition_size;
>   };
>   
>   struct amdgpu_gds {
>   	struct amdgpu_gds_asic_info	mem;
>   	struct amdgpu_gds_asic_info	gws;
>   	struct amdgpu_gds_asic_info	oa;
> +	uint32_t			gds_compute_max_wave_id;
> +
>   	/* At present, GDS, GWS and OA resources for gfx (graphics)
>   	 * are always pre-allocated and available for graphics operation.
>   	 * Such resources are shared between all gfx clients.
>   	 * TODO: move this operation to user space
>   	 * */
>   	struct amdgpu_bo*		gds_gfx_bo;
>   	struct amdgpu_bo*		gws_gfx_bo;
>   	struct amdgpu_bo*		oa_gfx_bo;
>   };
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index 7984292f9282..d971ea914755 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -2257,20 +2257,36 @@ static void gfx_v7_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>   }
>   
>   static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>   					  struct amdgpu_job *job,
>   					  struct amdgpu_ib *ib,
>   					  uint32_t flags)
>   {
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>   
> +	/* Currently, there is a high possibility to get wave ID mismatch
> +	 * between ME and GDS, leading to a hw deadlock, because ME generates
> +	 * different wave IDs than the GDS expects. This situation happens
> +	 * randomly when at least 5 compute pipes use GDS ordered append.
> +	 * The wave IDs generated by ME are also wrong after suspend/resume.
> +	 * Those are probably bugs somewhere else in the kernel driver.
> +	 *
> +	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
> +	 * GDS to 0 for this ring (me/pipe).
> +	 */
> +	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> +		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
> +		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> +	}
> +
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>   	amdgpu_ring_write(ring,
>   #ifdef __BIG_ENDIAN
>   					  (2 << 0) |
>   #endif
>   					  (ib->gpu_addr & 0xFFFFFFFC));
>   	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
>   	amdgpu_ring_write(ring, control);
>   }
>   
> @@ -5050,20 +5066,21 @@ static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev)
>   	adev->gfx.priv_inst_irq.num_types = 1;
>   	adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
>   }
>   
>   static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
>   {
>   	/* init asic gds info */
>   	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>   	adev->gds.gws.total_size = 64;
>   	adev->gds.oa.total_size = 16;
> +	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>   
>   	if (adev->gds.mem.total_size == 64 * 1024) {
>   		adev->gds.mem.gfx_partition_size = 4096;
>   		adev->gds.mem.cs_partition_size = 4096;
>   
>   		adev->gds.gws.gfx_partition_size = 4;
>   		adev->gds.gws.cs_partition_size = 4;
>   
>   		adev->gds.oa.gfx_partition_size = 4;
>   		adev->gds.oa.cs_partition_size = 1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index a26747681ed6..dcdae74fc0e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6077,20 +6077,36 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>   }
>   
>   static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>   					  struct amdgpu_job *job,
>   					  struct amdgpu_ib *ib,
>   					  uint32_t flags)
>   {
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>   
> +	/* Currently, there is a high possibility to get wave ID mismatch
> +	 * between ME and GDS, leading to a hw deadlock, because ME generates
> +	 * different wave IDs than the GDS expects. This situation happens
> +	 * randomly when at least 5 compute pipes use GDS ordered append.
> +	 * The wave IDs generated by ME are also wrong after suspend/resume.
> +	 * Those are probably bugs somewhere else in the kernel driver.
> +	 *
> +	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
> +	 * GDS to 0 for this ring (me/pipe).
> +	 */
> +	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> +		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
> +		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> +	}
> +
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>   	amdgpu_ring_write(ring,
>   #ifdef __BIG_ENDIAN
>   				(2 << 0) |
>   #endif
>   				(ib->gpu_addr & 0xFFFFFFFC));
>   	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
>   	amdgpu_ring_write(ring, control);
>   }
>   
> @@ -6989,20 +7005,21 @@ static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev)
>   {
>   	adev->gfx.rlc.funcs = &iceland_rlc_funcs;
>   }
>   
>   static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
>   {
>   	/* init asic gds info */
>   	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>   	adev->gds.gws.total_size = 64;
>   	adev->gds.oa.total_size = 16;
> +	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>   
>   	if (adev->gds.mem.total_size == 64 * 1024) {
>   		adev->gds.mem.gfx_partition_size = 4096;
>   		adev->gds.mem.cs_partition_size = 4096;
>   
>   		adev->gds.gws.gfx_partition_size = 4;
>   		adev->gds.gws.cs_partition_size = 4;
>   
>   		adev->gds.oa.gfx_partition_size = 4;
>   		adev->gds.oa.cs_partition_size = 1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 262ee3cf6f1c..63b898fc0467 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4003,20 +4003,36 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>   }
>   
>   static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>   					  struct amdgpu_job *job,
>   					  struct amdgpu_ib *ib,
>   					  uint32_t flags)
>   {
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>   
> +	/* Currently, there is a high possibility to get wave ID mismatch
> +	 * between ME and GDS, leading to a hw deadlock, because ME generates
> +	 * different wave IDs than the GDS expects. This situation happens
> +	 * randomly when at least 5 compute pipes use GDS ordered append.
> +	 * The wave IDs generated by ME are also wrong after suspend/resume.
> +	 * Those are probably bugs somewhere else in the kernel driver.
> +	 *
> +	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
> +	 * GDS to 0 for this ring (me/pipe).
> +	 */
> +	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> +		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
> +		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> +	}
> +
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>   	BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
>   	amdgpu_ring_write(ring,
>   #ifdef __BIG_ENDIAN
>   				(2 << 0) |
>   #endif
>   				lower_32_bits(ib->gpu_addr));
>   	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
>   	amdgpu_ring_write(ring, control);
>   }
> @@ -4839,20 +4855,40 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
>   		adev->gds.mem.total_size = 0x10000;
>   		break;
>   	case CHIP_RAVEN:
>   		adev->gds.mem.total_size = 0x1000;
>   		break;
>   	default:
>   		adev->gds.mem.total_size = 0x10000;
>   		break;
>   	}
>   
> +	switch (adev->asic_type) {
> +	case CHIP_VEGA10:
> +	case CHIP_VEGA20:
> +		adev->gds.gds_compute_max_wave_id = 0x7ff;
> +		break;
> +	case CHIP_VEGA12:
> +		adev->gds.gds_compute_max_wave_id = 0x27f;
> +		break;
> +	case CHIP_RAVEN:
> +		if (adev->rev_id >= 0x8)
> +			adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
> +		else
> +			adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
> +		break;
> +	default:
> +		/* this really depends on the chip */
> +		adev->gds.gds_compute_max_wave_id = 0x7ff;
> +		break;
> +	}
> +
>   	adev->gds.gws.total_size = 64;
>   	adev->gds.oa.total_size = 16;
>   
>   	if (adev->gds.mem.total_size == 64 * 1024) {
>   		adev->gds.mem.gfx_partition_size = 4096;
>   		adev->gds.mem.cs_partition_size = 4096;
>   
>   		adev->gds.gws.gfx_partition_size = 4;
>   		adev->gds.gws.cs_partition_size = 4;
>   
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index faaad04814e4..662d379ea624 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -561,20 +561,25 @@ union drm_amdgpu_cs {
>   /* Preamble flag, which means the IB could be dropped if no context switch */
>   #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
>   
>   /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
>   #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
>   
>   /* The IB fence should do the L2 writeback but not invalidate any shader
>    * caches (L2/vL1/sL1/I$). */
>   #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
>   
> +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
> + * This will reset wave ID counters for the IB.
> + */
> +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
> +
>   struct drm_amdgpu_cs_chunk_ib {
>   	__u32 _pad;
>   	/** AMDGPU_IB_FLAG_* */
>   	__u32 flags;
>   	/** Virtual address to begin IB execution */
>   	__u64 va_start;
>   	/** Size of submission */
>   	__u32 ib_bytes;
>   	/** HW IP to submit to */
>   	__u32 ip_type;


* Re: [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
       [not found]                 ` <44f785c1-16bf-289c-4502-0a7a3637aa30-5C7GfCeVMHo@public.gmane.org>
@ 2019-02-04 17:08                   ` Alex Deucher
  0 siblings, 0 replies; 9+ messages in thread
From: Alex Deucher @ 2019-02-04 17:08 UTC (permalink / raw)
  To: Koenig, Christian; +Cc: amd-gfx mailing list, Marek Olšák

On Mon, Feb 4, 2019 at 11:09 AM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
>
> On 04.02.19 at 16:55, Marek Olšák wrote:
>
> On Mon, Feb 4, 2019 at 7:42 AM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
>>
>> At least from coding style, backward compatibility etc., this looks sane to me, so feel free to add an Acked-by.
>>
>> But I absolutely can't judge if that is correct from the hardware point of view or not.
>
>
> Our GDS docs say that writing the register resets the wave counter.
>
>>
>>
>> And I think that somebody else looking at this is mandatory for it to be committed.
>
>
> There won't be anybody else. Nobody here really understands GDS, nobody here really cares about GDS, and hw guys weren't helpful with the hangs. If I hadn't discovered this by luck, GDS OA would be unusable on Linux.
>
>
> Great, as long as Alex is ok with this, feel free to go ahead.

Acked-by: Alex Deucher <alexander.deucher@amd.com>

>
> Christian.
>
>
> Marek

* Re: [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
       [not found]             ` <CAAxE2A4vsMRL0FZYBknCTMEbDBNYJZOdOnR32EZqM5iaKX6j3g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2019-02-04 16:00               ` Marek Olšák
@ 2019-02-04 16:09               ` Koenig, Christian
       [not found]                 ` <44f785c1-16bf-289c-4502-0a7a3637aa30-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 9+ messages in thread
From: Koenig, Christian @ 2019-02-04 16:09 UTC (permalink / raw)
  To: Marek Olšák; +Cc: amd-gfx mailing list



On 04.02.19 at 16:55, Marek Olšák wrote:
> On Mon, Feb 4, 2019 at 7:42 AM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
>> At least from coding style, backward compatibility etc., this looks sane to me, so feel free to add an Acked-by.
>>
>> But I absolutely can't judge if that is correct from the hardware point of view or not.
>
> Our GDS docs say that writing the register resets the wave counter.
>
>> And I think that somebody else looking at this is mandatory for it to be committed.
>
> There won't be anybody else. Nobody here really understands GDS, nobody here really cares about GDS, and hw guys weren't helpful with the hangs. If I hadn't discovered this by luck, GDS OA would be unusable on Linux.

Great, as long as Alex is ok with this, feel free to go ahead.

Christian.

> Marek



* Re: [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
       [not found]             ` <CAAxE2A4vsMRL0FZYBknCTMEbDBNYJZOdOnR32EZqM5iaKX6j3g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2019-02-04 16:00               ` Marek Olšák
  2019-02-04 16:09               ` Koenig, Christian
  1 sibling, 0 replies; 9+ messages in thread
From: Marek Olšák @ 2019-02-04 16:00 UTC (permalink / raw)
  To: Christian König; +Cc: amd-gfx mailing list



FYI, when I push this, I'll bump the DRM version.

Marek
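
(For context: user space typically gates a new IB flag on the DRM minor
version it sees at runtime. A minimal sketch of such a check, with a
placeholder minor number since the bump had not landed at the time of
this mail:

#include <stdbool.h>
#include <xf86drm.h>

/* Hypothetical check: only set AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID
 * once the kernel advertises a DRM minor that includes it; 99 is a
 * placeholder, not the real number.
 */
static bool kernel_has_gds_wave_id_reset(int fd)
{
	drmVersionPtr ver = drmGetVersion(fd);
	bool ok;

	if (!ver)
		return false;
	ok = ver->version_minor >= 99; /* placeholder minor */
	drmFreeVersion(ver);
	return ok;
}
)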

On Mon, Feb 4, 2019 at 10:55 AM Marek Olšák <maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> On Mon, Feb 4, 2019 at 7:42 AM Christian König <
> ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>
>> At least from coding style, backward compatibility etc., this looks sane
>> to me, so feel free to add an Acked-by.
>>
>> But I absolutely can't judge if that is correct from the hardware point
>> of view or not.
>>
>
> Our GDS docs say that writing the register resets the wave counter.
>
>
>>
>> And I think that somebody else looking at this is mandatory for it to be
>> committed.
>>
>
> There won't be anybody else. Nobody here really understands GDS, nobody
> here really cares about GDS, and hw guys weren't helpful with the hangs. If
> I hadn't discovered this by luck, GDS OA would be unusable on Linux.
>
> Marek
>


* Re: [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
       [not found]         ` <1b1294f7-1604-fe49-de7f-0e234656eecd-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-02-04 15:55           ` Marek Olšák
       [not found]             ` <CAAxE2A4vsMRL0FZYBknCTMEbDBNYJZOdOnR32EZqM5iaKX6j3g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Marek Olšák @ 2019-02-04 15:55 UTC (permalink / raw)
  To: Christian König; +Cc: amd-gfx mailing list



On Mon, Feb 4, 2019 at 7:42 AM Christian König <
ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> At least from coding style, backward compatibility etc., this looks sane
> to me, so feel free to add an Acked-by.
>
> But I absolutely can't judge if that is correct from the hardware point of
> view or not.
>

Our GDS docs say that writing the register resets the wave counter.


>
> And I think that somebody else looking at this is mandatory for it to be
> committed.
>

There won't be anybody else. Nobody here really understands GDS, nobody
here really cares about GDS, and hw guys weren't helpful with the hangs. If
I hadn't discovered this by luck, GDS OA would be unusable on Linux.

Marek


* Re: [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
       [not found]     ` <CAAxE2A7xs40YZvCQpMmv9X0Z98Sc7+ZEbFJDJLfmkMuWHzuprw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2019-02-04 12:42       ` Christian König
       [not found]         ` <1b1294f7-1604-fe49-de7f-0e234656eecd-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2019-02-04 12:42 UTC (permalink / raw)
  To: Marek Olšák, amd-gfx mailing list



At least from coding style, backward compatibility etc., this looks sane
to me, so feel free to add an Acked-by.

But I absolutely can't judge if that is correct from the hardware point 
of view or not.

And I think that somebody else looking at this is mandatory for it to be 
committed.

Christian.

On 28.01.19 at 18:25, Marek Olšák wrote:
> Ping
>
> On Tue, Jan 22, 2019 at 3:05 PM Marek Olšák <maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>
>     From: Marek Olšák <marek.olsak-5C7GfCeVMHo@public.gmane.org>
>
>     I'm not increasing the DRM version because GDS isn't totally
>     without bugs yet.
>
>     v2: update emit_ib_size
>
>     Signed-off-by: Marek Olšák <marek.olsak-5C7GfCeVMHo@public.gmane.org>
>     ---
>      drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
>      drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 19 +++++++++++-
>      drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 21 +++++++++++--
>      drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 40
>     +++++++++++++++++++++++--
>      include/uapi/drm/amdgpu_drm.h           |  5 ++++
>      5 files changed, 82 insertions(+), 5 deletions(-)
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>     index ecbcefe49a98..f89f5734d985 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>     @@ -30,20 +30,22 @@ struct amdgpu_bo;
>      struct amdgpu_gds_asic_info {
>             uint32_t        total_size;
>             uint32_t        gfx_partition_size;
>             uint32_t        cs_partition_size;
>      };
>
>      struct amdgpu_gds {
>             struct amdgpu_gds_asic_info     mem;
>             struct amdgpu_gds_asic_info     gws;
>             struct amdgpu_gds_asic_info     oa;
>     +       uint32_t gds_compute_max_wave_id;
>     +
>             /* At present, GDS, GWS and OA resources for gfx (graphics)
>              * are always pre-allocated and available for graphics
>     operation.
>              * Such resources are shared between all gfx clients.
>              * TODO: move this operation to user space
>              * */
>             struct amdgpu_bo*               gds_gfx_bo;
>             struct amdgpu_bo*               gws_gfx_bo;
>             struct amdgpu_bo*               oa_gfx_bo;
>      };
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>     b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>     index 7984292f9282..a59e0fdf5a97 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>     @@ -2257,20 +2257,36 @@ static void
>     gfx_v7_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>      }
>
>      static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>                                               struct amdgpu_job *job,
>                                               struct amdgpu_ib *ib,
>                                               uint32_t flags)
>      {
>             unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>             u32 control = INDIRECT_BUFFER_VALID | ib->length_dw |
>     (vmid << 24);
>
>     +       /* Currently, there is a high possibility to get wave ID
>     mismatch
>     +        * between ME and GDS, leading to a hw deadlock, because
>     ME generates
>     +        * different wave IDs than the GDS expects. This situation
>     happens
>     +        * randomly when at least 5 compute pipes use GDS ordered
>     append.
>     +        * The wave IDs generated by ME are also wrong after
>     suspend/resume.
>     +        * Those are probably bugs somewhere else in the kernel
>     driver.
>     +        *
>     +        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters
>     in ME and
>     +        * GDS to 0 for this ring (me/pipe).
>     +        */
>     +       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
>     +               amdgpu_ring_write(ring,
>     PACKET3(PACKET3_SET_CONFIG_REG, 1));
>     +               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID
>     - PACKET3_SET_CONFIG_REG_START);
>     +               amdgpu_ring_write(ring,
>     ring->adev->gds.gds_compute_max_wave_id);
>     +       }
>     +
>             amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>             amdgpu_ring_write(ring,
>      #ifdef __BIG_ENDIAN
>                                               (2 << 0) |
>      #endif
>                                               (ib->gpu_addr &
>     0xFFFFFFFC));
>             amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
>             amdgpu_ring_write(ring, control);
>      }
>
>     @@ -4993,21 +5009,21 @@ static const struct amdgpu_ring_funcs
>     gfx_v7_0_ring_funcs_compute = {
>             .get_rptr = gfx_v7_0_ring_get_rptr,
>             .get_wptr = gfx_v7_0_ring_get_wptr_compute,
>             .set_wptr = gfx_v7_0_ring_set_wptr_compute,
>             .emit_frame_size =
>                     20 + /* gfx_v7_0_ring_emit_gds_switch */
>                     7 + /* gfx_v7_0_ring_emit_hdp_flush */
>                     5 + /* hdp invalidate */
>                     7 + /* gfx_v7_0_ring_emit_pipeline_sync */
>                     CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /*
>     gfx_v7_0_ring_emit_vm_flush */
>                     7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3
>     for user fence, vm fence */
>     -       .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */
>     +       .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */
>             .emit_ib = gfx_v7_0_ring_emit_ib_compute,
>             .emit_fence = gfx_v7_0_ring_emit_fence_compute,
>             .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>             .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
>             .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>             .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>             .test_ring = gfx_v7_0_ring_test_ring,
>             .test_ib = gfx_v7_0_ring_test_ib,
>             .insert_nop = amdgpu_ring_insert_nop,
>             .pad_ib = amdgpu_ring_generic_pad_ib,
>     @@ -5050,20 +5066,21 @@ static void gfx_v7_0_set_irq_funcs(struct
>     amdgpu_device *adev)
>             adev->gfx.priv_inst_irq.num_types = 1;
>             adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
>      }
>
>      static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
>      {
>             /* init asic gds info */
>             adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>             adev->gds.gws.total_size = 64;
>             adev->gds.oa.total_size = 16;
>     +       adev->gds.gds_compute_max_wave_id =
>     RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>
>             if (adev->gds.mem.total_size == 64 * 1024) {
>                     adev->gds.mem.gfx_partition_size = 4096;
>                     adev->gds.mem.cs_partition_size = 4096;
>
>                     adev->gds.gws.gfx_partition_size = 4;
>                     adev->gds.gws.cs_partition_size = 4;
>
>                     adev->gds.oa.gfx_partition_size = 4;
>                     adev->gds.oa.cs_partition_size = 1;
>     diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     index a26747681ed6..b8e50a34bdb3 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     @@ -6077,20 +6077,36 @@ static void
>     gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>      }
>
>      static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>                                               struct amdgpu_job *job,
>                                               struct amdgpu_ib *ib,
>                                               uint32_t flags)
>      {
>             unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>             u32 control = INDIRECT_BUFFER_VALID | ib->length_dw |
>     (vmid << 24);
>
>     +       /* Currently, there is a high possibility to get wave ID
>     mismatch
>     +        * between ME and GDS, leading to a hw deadlock, because
>     ME generates
>     +        * different wave IDs than the GDS expects. This situation
>     happens
>     +        * randomly when at least 5 compute pipes use GDS ordered
>     append.
>     +        * The wave IDs generated by ME are also wrong after
>     suspend/resume.
>     +        * Those are probably bugs somewhere else in the kernel
>     driver.
>     +        *
>     +        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters
>     in ME and
>     +        * GDS to 0 for this ring (me/pipe).
>     +        */
>     +       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
>     +               amdgpu_ring_write(ring,
>     PACKET3(PACKET3_SET_CONFIG_REG, 1));
>     +               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID
>     - PACKET3_SET_CONFIG_REG_START);
>     +               amdgpu_ring_write(ring,
>     ring->adev->gds.gds_compute_max_wave_id);
>     +       }
>     +
>             amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>             amdgpu_ring_write(ring,
>      #ifdef __BIG_ENDIAN
>                                     (2 << 0) |
>      #endif
>                                     (ib->gpu_addr & 0xFFFFFFFC));
>             amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
>             amdgpu_ring_write(ring, control);
>      }
>
>     @@ -6883,21 +6899,21 @@ static const struct amdgpu_ring_funcs
>     gfx_v8_0_ring_funcs_compute = {
>             .get_rptr = gfx_v8_0_ring_get_rptr,
>             .get_wptr = gfx_v8_0_ring_get_wptr_compute,
>             .set_wptr = gfx_v8_0_ring_set_wptr_compute,
>             .emit_frame_size =
>                     20 + /* gfx_v8_0_ring_emit_gds_switch */
>                     7 + /* gfx_v8_0_ring_emit_hdp_flush */
>                     5 + /* hdp_invalidate */
>                     7 + /* gfx_v8_0_ring_emit_pipeline_sync */
>                     VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /*
>     gfx_v8_0_ring_emit_vm_flush */
>                     7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3
>     for user fence, vm fence */
>     -       .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
>     +       .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
>             .emit_ib = gfx_v8_0_ring_emit_ib_compute,
>             .emit_fence = gfx_v8_0_ring_emit_fence_compute,
>             .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
>             .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
>             .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
>             .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
>             .test_ring = gfx_v8_0_ring_test_ring,
>             .test_ib = gfx_v8_0_ring_test_ib,
>             .insert_nop = amdgpu_ring_insert_nop,
>             .pad_ib = amdgpu_ring_generic_pad_ib,
>     @@ -6913,21 +6929,21 @@ static const struct amdgpu_ring_funcs
>     gfx_v8_0_ring_funcs_kiq = {
>             .get_rptr = gfx_v8_0_ring_get_rptr,
>             .get_wptr = gfx_v8_0_ring_get_wptr_compute,
>             .set_wptr = gfx_v8_0_ring_set_wptr_compute,
>             .emit_frame_size =
>                     20 + /* gfx_v8_0_ring_emit_gds_switch */
>                     7 + /* gfx_v8_0_ring_emit_hdp_flush */
>                     5 + /* hdp_invalidate */
>                     7 + /* gfx_v8_0_ring_emit_pipeline_sync */
>                     17 + /* gfx_v8_0_ring_emit_vm_flush */
>                     7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for
>     user fence, vm fence */
>     -       .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
>     +       .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
>             .emit_fence = gfx_v8_0_ring_emit_fence_kiq,
>             .test_ring = gfx_v8_0_ring_test_ring,
>             .insert_nop = amdgpu_ring_insert_nop,
>             .pad_ib = amdgpu_ring_generic_pad_ib,
>             .emit_rreg = gfx_v8_0_ring_emit_rreg,
>             .emit_wreg = gfx_v8_0_ring_emit_wreg,
>      };
>
>      static void gfx_v8_0_set_ring_funcs(struct amdgpu_device *adev)
>      {
>     @@ -6989,20 +7005,21 @@ static void gfx_v8_0_set_rlc_funcs(struct
>     amdgpu_device *adev)
>      {
>             adev->gfx.rlc.funcs = &iceland_rlc_funcs;
>      }
>
>      static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
>      {
>             /* init asic gds info */
>             adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>             adev->gds.gws.total_size = 64;
>             adev->gds.oa.total_size = 16;
>     +       adev->gds.gds_compute_max_wave_id =
>     RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>
>             if (adev->gds.mem.total_size == 64 * 1024) {
>                     adev->gds.mem.gfx_partition_size = 4096;
>                     adev->gds.mem.cs_partition_size = 4096;
>
>                     adev->gds.gws.gfx_partition_size = 4;
>                     adev->gds.gws.cs_partition_size = 4;
>
>                     adev->gds.oa.gfx_partition_size = 4;
>                     adev->gds.oa.cs_partition_size = 1;
>     diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>     b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>     index 262ee3cf6f1c..5533f6e4f4a4 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>     @@ -4003,20 +4003,36 @@ static void
>     gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>      }
>
>      static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>                                               struct amdgpu_job *job,
>                                               struct amdgpu_ib *ib,
>                                               uint32_t flags)
>      {
>             unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>             u32 control = INDIRECT_BUFFER_VALID | ib->length_dw |
>     (vmid << 24);
>
>     +       /* Currently, there is a high possibility to get wave ID
>     mismatch
>     +        * between ME and GDS, leading to a hw deadlock, because
>     ME generates
>     +        * different wave IDs than the GDS expects. This situation
>     happens
>     +        * randomly when at least 5 compute pipes use GDS ordered
>     append.
>     +        * The wave IDs generated by ME are also wrong after
>     suspend/resume.
>     +        * Those are probably bugs somewhere else in the kernel
>     driver.
>     +        *
>     +        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters
>     in ME and
>     +        * GDS to 0 for this ring (me/pipe).
>     +        */
>     +       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
>     +               amdgpu_ring_write(ring,
>     PACKET3(PACKET3_SET_CONFIG_REG, 1));
>     +               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
>     +               amdgpu_ring_write(ring,
>     ring->adev->gds.gds_compute_max_wave_id);
>     +       }
>     +
>             amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>             BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
>             amdgpu_ring_write(ring,
>      #ifdef __BIG_ENDIAN
>                                     (2 << 0) |
>      #endif
>     lower_32_bits(ib->gpu_addr));
>             amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
>             amdgpu_ring_write(ring, control);
>      }
>     @@ -4722,21 +4738,21 @@ static const struct amdgpu_ring_funcs
>     gfx_v9_0_ring_funcs_compute = {
>             .set_wptr = gfx_v9_0_ring_set_wptr_compute,
>             .emit_frame_size =
>                     20 + /* gfx_v9_0_ring_emit_gds_switch */
>                     7 + /* gfx_v9_0_ring_emit_hdp_flush */
>                     5 + /* hdp invalidate */
>                     7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>                     SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                     SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                     2 + /* gfx_v9_0_ring_emit_vm_flush */
>                     8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user
>     fence, vm fence */
>     -       .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
>     +       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>             .emit_ib = gfx_v9_0_ring_emit_ib_compute,
>             .emit_fence = gfx_v9_0_ring_emit_fence,
>             .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
>             .emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
>             .emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
>             .emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
>             .test_ring = gfx_v9_0_ring_test_ring,
>             .test_ib = gfx_v9_0_ring_test_ib,
>             .insert_nop = amdgpu_ring_insert_nop,
>             .pad_ib = amdgpu_ring_generic_pad_ib,
>     @@ -4757,21 +4773,21 @@ static const struct amdgpu_ring_funcs
>     gfx_v9_0_ring_funcs_kiq = {
>             .set_wptr = gfx_v9_0_ring_set_wptr_compute,
>             .emit_frame_size =
>                     20 + /* gfx_v9_0_ring_emit_gds_switch */
>                     7 + /* gfx_v9_0_ring_emit_hdp_flush */
>                     5 + /* hdp invalidate */
>                     7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>                     SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                     SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                     2 + /* gfx_v9_0_ring_emit_vm_flush */
>                     8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for
>     user fence, vm fence */
>     -       .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
>     +       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>             .emit_fence = gfx_v9_0_ring_emit_fence_kiq,
>             .test_ring = gfx_v9_0_ring_test_ring,
>             .insert_nop = amdgpu_ring_insert_nop,
>             .pad_ib = amdgpu_ring_generic_pad_ib,
>             .emit_rreg = gfx_v9_0_ring_emit_rreg,
>             .emit_wreg = gfx_v9_0_ring_emit_wreg,
>             .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>             .emit_reg_write_reg_wait =
>     gfx_v9_0_ring_emit_reg_write_reg_wait,
>      };
>
>     @@ -4839,20 +4855,40 @@ static void gfx_v9_0_set_gds_init(struct
>     amdgpu_device *adev)
>                     adev->gds.mem.total_size = 0x10000;
>                     break;
>             case CHIP_RAVEN:
>                     adev->gds.mem.total_size = 0x1000;
>                     break;
>             default:
>                     adev->gds.mem.total_size = 0x10000;
>                     break;
>             }
>
>     +       switch (adev->asic_type) {
>     +       case CHIP_VEGA10:
>     +       case CHIP_VEGA20:
>     +               adev->gds.gds_compute_max_wave_id = 0x7ff;
>     +               break;
>     +       case CHIP_VEGA12:
>     +               adev->gds.gds_compute_max_wave_id = 0x27f;
>     +               break;
>     +       case CHIP_RAVEN:
>     +               if (adev->rev_id >= 0x8)
>     +                       adev->gds.gds_compute_max_wave_id = 0x77;
>     /* raven2 */
>     +               else
>     +                       adev->gds.gds_compute_max_wave_id = 0x15f;
>     /* raven1 */
>     +               break;
>     +       default:
>     +               /* this really depends on the chip */
>     +               adev->gds.gds_compute_max_wave_id = 0x7ff;
>     +               break;
>     +       }
>     +
>             adev->gds.gws.total_size = 64;
>             adev->gds.oa.total_size = 16;
>
>             if (adev->gds.mem.total_size == 64 * 1024) {
>                     adev->gds.mem.gfx_partition_size = 4096;
>                     adev->gds.mem.cs_partition_size = 4096;
>
>                     adev->gds.gws.gfx_partition_size = 4;
>                     adev->gds.gws.cs_partition_size = 4;
>
>     diff --git a/include/uapi/drm/amdgpu_drm.h
>     b/include/uapi/drm/amdgpu_drm.h
>     index faaad04814e4..662d379ea624 100644
>     --- a/include/uapi/drm/amdgpu_drm.h
>     +++ b/include/uapi/drm/amdgpu_drm.h
>     @@ -561,20 +561,25 @@ union drm_amdgpu_cs {
>      /* Preamble flag, which means the IB could be dropped if no
>     context switch */
>      #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
>
>      /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag
>     detected */
>      #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
>
>      /* The IB fence should do the L2 writeback but not invalidate any
>     shader
>       * caches (L2/vL1/sL1/I$). */
>      #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
>
>     +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before
>     PACKET3_INDIRECT_BUFFER.
>     + * This will reset wave ID counters for the IB.
>     + */
>     +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
>     +
>      struct drm_amdgpu_cs_chunk_ib {
>             __u32 _pad;
>             /** AMDGPU_IB_FLAG_* */
>             __u32 flags;
>             /** Virtual address to begin IB execution */
>             __u64 va_start;
>             /** Size of submission */
>             __u32 ib_bytes;
>             /** HW IP to submit to */
>             __u32 ip_type;
>     -- 
>     2.17.1
>
>

* Re: [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
       [not found] ` <20190122200546.1575-1-maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-01-28 17:25   ` Marek Olšák
       [not found]     ` <CAAxE2A7xs40YZvCQpMmv9X0Z98Sc7+ZEbFJDJLfmkMuWHzuprw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Marek Olšák @ 2019-01-28 17:25 UTC (permalink / raw)
  To: amd-gfx mailing list



Ping

On Tue, Jan 22, 2019 at 3:05 PM Marek Olšák <maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> From: Marek Olšák <marek.olsak-5C7GfCeVMHo@public.gmane.org>
>
> I'm not increasing the DRM version because GDS isn't totally without bugs
> yet.
>
> v2: update emit_ib_size
>
> Signed-off-by: Marek Olšák <marek.olsak-5C7GfCeVMHo@public.gmane.org>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 19 +++++++++++-
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 21 +++++++++++--
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 40 +++++++++++++++++++++++--
>  include/uapi/drm/amdgpu_drm.h           |  5 ++++
>  5 files changed, 82 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> index ecbcefe49a98..f89f5734d985 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> @@ -30,20 +30,22 @@ struct amdgpu_bo;
>  struct amdgpu_gds_asic_info {
>         uint32_t        total_size;
>         uint32_t        gfx_partition_size;
>         uint32_t        cs_partition_size;
>  };
>
>  struct amdgpu_gds {
>         struct amdgpu_gds_asic_info     mem;
>         struct amdgpu_gds_asic_info     gws;
>         struct amdgpu_gds_asic_info     oa;
> +       uint32_t                        gds_compute_max_wave_id;
> +
>         /* At present, GDS, GWS and OA resources for gfx (graphics)
>          * are always pre-allocated and available for graphics operation.
>          * Such resources are shared between all gfx clients.
>          * TODO: move this operation to user space
>          * */
>         struct amdgpu_bo*               gds_gfx_bo;
>         struct amdgpu_bo*               gws_gfx_bo;
>         struct amdgpu_bo*               oa_gfx_bo;
>  };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index 7984292f9282..a59e0fdf5a97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -2257,20 +2257,36 @@ static void gfx_v7_0_ring_emit_ib_gfx(struct
> amdgpu_ring *ring,
>  }
>
>  static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>                                           struct amdgpu_job *job,
>                                           struct amdgpu_ib *ib,
>                                           uint32_t flags)
>  {
>         unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>         u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>
> +       /* Currently, there is a high possibility to get wave ID mismatch
> +        * between ME and GDS, leading to a hw deadlock, because ME
> generates
> +        * different wave IDs than the GDS expects. This situation happens
> +        * randomly when at least 5 compute pipes use GDS ordered append.
> +        * The wave IDs generated by ME are also wrong after
> suspend/resume.
> +        * Those are probably bugs somewhere else in the kernel driver.
> +        *
> +        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME
> and
> +        * GDS to 0 for this ring (me/pipe).
> +        */
> +       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> +               amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG,
> 1));
> +               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID -
> PACKET3_SET_CONFIG_REG_START);
> +               amdgpu_ring_write(ring,
> ring->adev->gds.gds_compute_max_wave_id);
> +       }
> +
>         amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>         amdgpu_ring_write(ring,
>  #ifdef __BIG_ENDIAN
>                                           (2 << 0) |
>  #endif
>                                           (ib->gpu_addr & 0xFFFFFFFC));
>         amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
>         amdgpu_ring_write(ring, control);
>  }
>
> @@ -4993,21 +5009,21 @@ static const struct amdgpu_ring_funcs
> gfx_v7_0_ring_funcs_compute = {
>         .get_rptr = gfx_v7_0_ring_get_rptr,
>         .get_wptr = gfx_v7_0_ring_get_wptr_compute,
>         .set_wptr = gfx_v7_0_ring_set_wptr_compute,
>         .emit_frame_size =
>                 20 + /* gfx_v7_0_ring_emit_gds_switch */
>                 7 + /* gfx_v7_0_ring_emit_hdp_flush */
>                 5 + /* hdp invalidate */
>                 7 + /* gfx_v7_0_ring_emit_pipeline_sync */
>                 CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
>                 7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
> -       .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */
> +       .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */
>         .emit_ib = gfx_v7_0_ring_emit_ib_compute,
>         .emit_fence = gfx_v7_0_ring_emit_fence_compute,
>         .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>         .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
>         .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>         .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>         .test_ring = gfx_v7_0_ring_test_ring,
>         .test_ib = gfx_v7_0_ring_test_ib,
>         .insert_nop = amdgpu_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
> @@ -5050,20 +5066,21 @@ static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev)
>         adev->gfx.priv_inst_irq.num_types = 1;
>         adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
>  }
>
>  static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
>  {
>         /* init asci gds info */
>         adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>         adev->gds.gws.total_size = 64;
>         adev->gds.oa.total_size = 16;
> +       adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>
>         if (adev->gds.mem.total_size == 64 * 1024) {
>                 adev->gds.mem.gfx_partition_size = 4096;
>                 adev->gds.mem.cs_partition_size = 4096;
>
>                 adev->gds.gws.gfx_partition_size = 4;
>                 adev->gds.gws.cs_partition_size = 4;
>
>                 adev->gds.oa.gfx_partition_size = 4;
>                 adev->gds.oa.cs_partition_size = 1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index a26747681ed6..b8e50a34bdb3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6077,20 +6077,36 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>  }
>
>  static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>                                           struct amdgpu_job *job,
>                                           struct amdgpu_ib *ib,
>                                           uint32_t flags)
>  {
>         unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>         u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>
> +       /* Currently, there is a high chance of getting a wave ID mismatch
> +        * between ME and GDS, leading to a hw deadlock, because ME generates
> +        * different wave IDs than the GDS expects. This happens randomly
> +        * when at least 5 compute pipes use GDS ordered append. The wave IDs
> +        * generated by ME are also wrong after suspend/resume. These are
> +        * probably bugs somewhere else in the kernel driver.
> +        *
> +        * Writing GDS_COMPUTE_MAX_WAVE_ID resets the wave ID counters in ME
> +        * and GDS to 0 for this ring (me/pipe).
> +        */
> +       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> +               amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> +               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
> +               amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> +       }
> +
>         amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>         amdgpu_ring_write(ring,
>  #ifdef __BIG_ENDIAN
>                                 (2 << 0) |
>  #endif
>                                 (ib->gpu_addr & 0xFFFFFFFC));
>         amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
>         amdgpu_ring_write(ring, control);
>  }
>
> @@ -6883,21 +6899,21 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
>         .get_rptr = gfx_v8_0_ring_get_rptr,
>         .get_wptr = gfx_v8_0_ring_get_wptr_compute,
>         .set_wptr = gfx_v8_0_ring_set_wptr_compute,
>         .emit_frame_size =
>                 20 + /* gfx_v8_0_ring_emit_gds_switch */
>                 7 + /* gfx_v8_0_ring_emit_hdp_flush */
>                 5 + /* hdp_invalidate */
>                 7 + /* gfx_v8_0_ring_emit_pipeline_sync */
>                 VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
>                 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
> -       .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
> +       .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
>         .emit_ib = gfx_v8_0_ring_emit_ib_compute,
>         .emit_fence = gfx_v8_0_ring_emit_fence_compute,
>         .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
>         .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
>         .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
>         .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
>         .test_ring = gfx_v8_0_ring_test_ring,
>         .test_ib = gfx_v8_0_ring_test_ib,
>         .insert_nop = amdgpu_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
> @@ -6913,21 +6929,21 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
>         .get_rptr = gfx_v8_0_ring_get_rptr,
>         .get_wptr = gfx_v8_0_ring_get_wptr_compute,
>         .set_wptr = gfx_v8_0_ring_set_wptr_compute,
>         .emit_frame_size =
>                 20 + /* gfx_v8_0_ring_emit_gds_switch */
>                 7 + /* gfx_v8_0_ring_emit_hdp_flush */
>                 5 + /* hdp_invalidate */
>                 7 + /* gfx_v8_0_ring_emit_pipeline_sync */
>                 17 + /* gfx_v8_0_ring_emit_vm_flush */
>                 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
> -       .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
> +       .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
>         .emit_fence = gfx_v8_0_ring_emit_fence_kiq,
>         .test_ring = gfx_v8_0_ring_test_ring,
>         .insert_nop = amdgpu_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
>         .emit_rreg = gfx_v8_0_ring_emit_rreg,
>         .emit_wreg = gfx_v8_0_ring_emit_wreg,
>  };
>
>  static void gfx_v8_0_set_ring_funcs(struct amdgpu_device *adev)
>  {
> @@ -6989,20 +7005,21 @@ static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev)
>  {
>         adev->gfx.rlc.funcs = &iceland_rlc_funcs;
>  }
>
>  static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
>  {
>         /* init asci gds info */
>         adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>         adev->gds.gws.total_size = 64;
>         adev->gds.oa.total_size = 16;
> +       adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>
>         if (adev->gds.mem.total_size == 64 * 1024) {
>                 adev->gds.mem.gfx_partition_size = 4096;
>                 adev->gds.mem.cs_partition_size = 4096;
>
>                 adev->gds.gws.gfx_partition_size = 4;
>                 adev->gds.gws.cs_partition_size = 4;
>
>                 adev->gds.oa.gfx_partition_size = 4;
>                 adev->gds.oa.cs_partition_size = 1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 262ee3cf6f1c..5533f6e4f4a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4003,20 +4003,36 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>  }
>
>  static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>                                           struct amdgpu_job *job,
>                                           struct amdgpu_ib *ib,
>                                           uint32_t flags)
>  {
>         unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>         u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>
> +       /* Currently, there is a high chance of getting a wave ID mismatch
> +        * between ME and GDS, leading to a hw deadlock, because ME generates
> +        * different wave IDs than the GDS expects. This happens randomly
> +        * when at least 5 compute pipes use GDS ordered append. The wave IDs
> +        * generated by ME are also wrong after suspend/resume. These are
> +        * probably bugs somewhere else in the kernel driver.
> +        *
> +        * Writing GDS_COMPUTE_MAX_WAVE_ID resets the wave ID counters in ME
> +        * and GDS to 0 for this ring (me/pipe).
> +        */
> +       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> +               amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> +               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
> +               amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> +       }
> +
>         amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>         BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
>         amdgpu_ring_write(ring,
>  #ifdef __BIG_ENDIAN
>                                 (2 << 0) |
>  #endif
>                                 lower_32_bits(ib->gpu_addr));
>         amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
>         amdgpu_ring_write(ring, control);
>  }
> @@ -4722,21 +4738,21 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>         .set_wptr = gfx_v9_0_ring_set_wptr_compute,
>         .emit_frame_size =
>                 20 + /* gfx_v9_0_ring_emit_gds_switch */
>                 7 + /* gfx_v9_0_ring_emit_hdp_flush */
>                 5 + /* hdp invalidate */
>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> -       .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
> +       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>         .emit_ib = gfx_v9_0_ring_emit_ib_compute,
>         .emit_fence = gfx_v9_0_ring_emit_fence,
>         .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
>         .emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
>         .emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
>         .emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
>         .test_ring = gfx_v9_0_ring_test_ring,
>         .test_ib = gfx_v9_0_ring_test_ib,
>         .insert_nop = amdgpu_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
> @@ -4757,21 +4773,21 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>         .set_wptr = gfx_v9_0_ring_set_wptr_compute,
>         .emit_frame_size =
>                 20 + /* gfx_v9_0_ring_emit_gds_switch */
>                 7 + /* gfx_v9_0_ring_emit_hdp_flush */
>                 5 + /* hdp invalidate */
>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
> -       .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
> +       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>         .emit_fence = gfx_v9_0_ring_emit_fence_kiq,
>         .test_ring = gfx_v9_0_ring_test_ring,
>         .insert_nop = amdgpu_ring_insert_nop,
>         .pad_ib = amdgpu_ring_generic_pad_ib,
>         .emit_rreg = gfx_v9_0_ring_emit_rreg,
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>  };
>
> @@ -4839,20 +4855,40 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
>                 adev->gds.mem.total_size = 0x10000;
>                 break;
>         case CHIP_RAVEN:
>                 adev->gds.mem.total_size = 0x1000;
>                 break;
>         default:
>                 adev->gds.mem.total_size = 0x10000;
>                 break;
>         }
>
> +       switch (adev->asic_type) {
> +       case CHIP_VEGA10:
> +       case CHIP_VEGA20:
> +               adev->gds.gds_compute_max_wave_id = 0x7ff;
> +               break;
> +       case CHIP_VEGA12:
> +               adev->gds.gds_compute_max_wave_id = 0x27f;
> +               break;
> +       case CHIP_RAVEN:
> +               if (adev->rev_id >= 0x8)
> +                       adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
> +               else
> +                       adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
> +               break;
> +       default:
> +               /* this really depends on the chip */
> +               adev->gds.gds_compute_max_wave_id = 0x7ff;
> +               break;
> +       }
> +
>         adev->gds.gws.total_size = 64;
>         adev->gds.oa.total_size = 16;
>
>         if (adev->gds.mem.total_size == 64 * 1024) {
>                 adev->gds.mem.gfx_partition_size = 4096;
>                 adev->gds.mem.cs_partition_size = 4096;
>
>                 adev->gds.gws.gfx_partition_size = 4;
>                 adev->gds.gws.cs_partition_size = 4;
>
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index faaad04814e4..662d379ea624 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -561,20 +561,25 @@ union drm_amdgpu_cs {
>  /* Preamble flag, which means the IB could be dropped if no context switch */
>  #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
>
>  /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
>  #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
>
>  /* The IB fence should do the L2 writeback but not invalidate any shader
>   * caches (L2/vL1/sL1/I$). */
>  #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
>
> +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
> + * This will reset wave ID counters for the IB.
> + */
> +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
> +
>  struct drm_amdgpu_cs_chunk_ib {
>         __u32 _pad;
>         /** AMDGPU_IB_FLAG_* */
>         __u32 flags;
>         /** Virtual address to begin IB execution */
>         __u64 va_start;
>         /** Size of submission */
>         __u32 ib_bytes;
>         /** HW IP to submit to */
>         __u32 ip_type;
> --
> 2.17.1
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
@ 2019-01-22 20:05 Marek Olšák
       [not found] ` <20190122200546.1575-1-maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Marek Olšák @ 2019-01-22 20:05 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

From: Marek Olšák <marek.olsak@amd.com>

I'm not increasing the DRM version because GDS is still not bug-free.

v2: update emit_ib_size

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 19 +++++++++++-
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 21 +++++++++++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 40 +++++++++++++++++++++++--
 include/uapi/drm/amdgpu_drm.h           |  5 ++++
 5 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
index ecbcefe49a98..f89f5734d985 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
@@ -30,20 +30,22 @@ struct amdgpu_bo;
 struct amdgpu_gds_asic_info {
 	uint32_t	total_size;
 	uint32_t	gfx_partition_size;
 	uint32_t	cs_partition_size;
 };
 
 struct amdgpu_gds {
 	struct amdgpu_gds_asic_info	mem;
 	struct amdgpu_gds_asic_info	gws;
 	struct amdgpu_gds_asic_info	oa;
+	uint32_t			gds_compute_max_wave_id;
+
 	/* At present, GDS, GWS and OA resources for gfx (graphics)
 	 * is always pre-allocated and available for graphics operation.
 	 * Such resource is shared between all gfx clients.
 	 * TODO: move this operation to user space
 	 * */
 	struct amdgpu_bo*		gds_gfx_bo;
 	struct amdgpu_bo*		gws_gfx_bo;
 	struct amdgpu_bo*		oa_gfx_bo;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 7984292f9282..a59e0fdf5a97 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2257,20 +2257,36 @@ static void gfx_v7_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 }
 
 static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 					  struct amdgpu_job *job,
 					  struct amdgpu_ib *ib,
 					  uint32_t flags)
 {
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high chance of getting a wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This happens randomly
+	 * when at least 5 compute pipes use GDS ordered append. The wave IDs
+	 * generated by ME are also wrong after suspend/resume. These are
+	 * probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets the wave ID counters in ME
+	 * and GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
 					  (2 << 0) |
 #endif
 					  (ib->gpu_addr & 0xFFFFFFFC));
 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
 	amdgpu_ring_write(ring, control);
 }
 
@@ -4993,21 +5009,21 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
 	.get_rptr = gfx_v7_0_ring_get_rptr,
 	.get_wptr = gfx_v7_0_ring_get_wptr_compute,
 	.set_wptr = gfx_v7_0_ring_set_wptr_compute,
 	.emit_frame_size =
 		20 + /* gfx_v7_0_ring_emit_gds_switch */
 		7 + /* gfx_v7_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
 		7 + /* gfx_v7_0_ring_emit_pipeline_sync */
 		CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v7_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v7_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v7_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v7_0_ring_emit_fence_compute,
 	.emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
 	.emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
 	.emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
 	.emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v7_0_ring_test_ring,
 	.test_ib = gfx_v7_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
@@ -5050,20 +5066,21 @@ static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev)
 	adev->gfx.priv_inst_irq.num_types = 1;
 	adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
 }
 
 static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
 {
 	/* init asci gds info */
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
 		adev->gds.mem.cs_partition_size = 4096;
 
 		adev->gds.gws.gfx_partition_size = 4;
 		adev->gds.gws.cs_partition_size = 4;
 
 		adev->gds.oa.gfx_partition_size = 4;
 		adev->gds.oa.cs_partition_size = 1;
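
As a sanity check on the emit_ib_size bumps in this patch (4 -> 7), the
dword math can be re-derived in a standalone sketch. The PACKET3() header
layout and the SET_CONFIG_REG opcode below are assumptions mirroring the
CIK/VI headers, not something taken from this patch:

#include <stdint.h>
#include <stdio.h>

/* Assumed PM4 type-3 header layout, mirroring the kernel's PACKET3()
 * macro: type in bits [31:30], count in [29:16], opcode in [15:8]. */
#define PACKET_TYPE3		3u
#define PACKET3(op, n)		((PACKET_TYPE3 << 30) | \
				 (((n) & 0x3fffu) << 16) | \
				 (((op) & 0xffu) << 8))
#define PACKET3_SET_CONFIG_REG	0x68u	/* assumed CIK/VI opcode */

int main(void)
{
	/* The workaround emits 3 dwords: header, register offset, value. */
	printf("SET_CONFIG_REG header: 0x%08x\n",
	       PACKET3(PACKET3_SET_CONFIG_REG, 1));
	/* The INDIRECT_BUFFER packet is 4 dwords (header, addr low, addr
	 * high, control), so the worst case is 3 + 4 = 7 dwords per emit. */
	printf("worst-case emit_ib dwords: %u\n", 3u + 4u);
	return 0;
}
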
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index a26747681ed6..b8e50a34bdb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6077,20 +6077,36 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 }
 
 static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 					  struct amdgpu_job *job,
 					  struct amdgpu_ib *ib,
 					  uint32_t flags)
 {
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high chance of getting a wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This happens randomly
+	 * when at least 5 compute pipes use GDS ordered append. The wave IDs
+	 * generated by ME are also wrong after suspend/resume. These are
+	 * probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets the wave ID counters in ME
+	 * and GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
 				(2 << 0) |
 #endif
 				(ib->gpu_addr & 0xFFFFFFFC));
 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
 	amdgpu_ring_write(ring, control);
 }
 
@@ -6883,21 +6899,21 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
 	.get_rptr = gfx_v8_0_ring_get_rptr,
 	.get_wptr = gfx_v8_0_ring_get_wptr_compute,
 	.set_wptr = gfx_v8_0_ring_set_wptr_compute,
 	.emit_frame_size =
 		20 + /* gfx_v8_0_ring_emit_gds_switch */
 		7 + /* gfx_v8_0_ring_emit_hdp_flush */
 		5 + /* hdp_invalidate */
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v8_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v8_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v8_0_ring_emit_fence_compute,
 	.emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
 	.emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
 	.emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
 	.emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v8_0_ring_test_ring,
 	.test_ib = gfx_v8_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
@@ -6913,21 +6929,21 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
 	.get_rptr = gfx_v8_0_ring_get_rptr,
 	.get_wptr = gfx_v8_0_ring_get_wptr_compute,
 	.set_wptr = gfx_v8_0_ring_set_wptr_compute,
 	.emit_frame_size =
 		20 + /* gfx_v8_0_ring_emit_gds_switch */
 		7 + /* gfx_v8_0_ring_emit_hdp_flush */
 		5 + /* hdp_invalidate */
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		17 + /* gfx_v8_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v8_0_ring_emit_ib_compute */
 	.emit_fence = gfx_v8_0_ring_emit_fence_kiq,
 	.test_ring = gfx_v8_0_ring_test_ring,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_rreg = gfx_v8_0_ring_emit_rreg,
 	.emit_wreg = gfx_v8_0_ring_emit_wreg,
 };
 
 static void gfx_v8_0_set_ring_funcs(struct amdgpu_device *adev)
 {
@@ -6989,20 +7005,21 @@ static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev)
 {
 	adev->gfx.rlc.funcs = &iceland_rlc_funcs;
 }
 
 static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
 {
 	/* init asci gds info */
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
 		adev->gds.mem.cs_partition_size = 4096;
 
 		adev->gds.gws.gfx_partition_size = 4;
 		adev->gds.gws.cs_partition_size = 4;
 
 		adev->gds.oa.gfx_partition_size = 4;
 		adev->gds.oa.cs_partition_size = 1;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 262ee3cf6f1c..5533f6e4f4a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4003,20 +4003,36 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 }
 
 static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 					  struct amdgpu_job *job,
 					  struct amdgpu_ib *ib,
 					  uint32_t flags)
 {
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high chance of getting a wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This happens randomly
+	 * when at least 5 compute pipes use GDS ordered append. The wave IDs
+	 * generated by ME are also wrong after suspend/resume. These are
+	 * probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets the wave ID counters in ME
+	 * and GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
 				(2 << 0) |
 #endif
 				lower_32_bits(ib->gpu_addr));
 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
 	amdgpu_ring_write(ring, control);
 }
@@ -4722,21 +4738,21 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.set_wptr = gfx_v9_0_ring_set_wptr_compute,
 	.emit_frame_size =
 		20 + /* gfx_v9_0_ring_emit_gds_switch */
 		7 + /* gfx_v9_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v9_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v9_0_ring_emit_fence,
 	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
 	.emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
 	.emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
 	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v9_0_ring_test_ring,
 	.test_ib = gfx_v9_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
@@ -4757,21 +4773,21 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
 	.set_wptr = gfx_v9_0_ring_set_wptr_compute,
 	.emit_frame_size =
 		20 + /* gfx_v9_0_ring_emit_gds_switch */
 		7 + /* gfx_v9_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
 	.emit_fence = gfx_v9_0_ring_emit_fence_kiq,
 	.test_ring = gfx_v9_0_ring_test_ring,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_rreg = gfx_v9_0_ring_emit_rreg,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
 };
 
@@ -4839,20 +4855,40 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
 		adev->gds.mem.total_size = 0x10000;
 		break;
 	case CHIP_RAVEN:
 		adev->gds.mem.total_size = 0x1000;
 		break;
 	default:
 		adev->gds.mem.total_size = 0x10000;
 		break;
 	}
 
+	switch (adev->asic_type) {
+	case CHIP_VEGA10:
+	case CHIP_VEGA20:
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	case CHIP_VEGA12:
+		adev->gds.gds_compute_max_wave_id = 0x27f;
+		break;
+	case CHIP_RAVEN:
+		if (adev->rev_id >= 0x8)
+			adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
+		else
+			adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
+		break;
+	default:
+		/* this really depends on the chip */
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	}
+
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
 		adev->gds.mem.cs_partition_size = 4096;
 
 		adev->gds.gws.gfx_partition_size = 4;
 		adev->gds.gws.cs_partition_size = 4;
 
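The per-ASIC constants above read as (number of wave ID slots) - 1, which
is what the register name suggests, though the patch does not spell it
out; treat that interpretation as an assumption. Converting them back to
wave counts is plain arithmetic:

#include <stdio.h>

int main(void)
{
	/* max_wave_id + 1 == wave ID slots; the hex values are the ones
	 * from gfx_v9_0_set_gds_init() above. */
	printf("vega10/vega20: %d waves\n", 0x7ff + 1);	/* 2048 */
	printf("vega12:        %d waves\n", 0x27f + 1);	/*  640 */
	printf("raven1:        %d waves\n", 0x15f + 1);	/*  352 */
	printf("raven2:        %d waves\n", 0x77 + 1);	/*  120 */
	return 0;
}
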
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index faaad04814e4..662d379ea624 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -561,20 +561,25 @@ union drm_amdgpu_cs {
 /* Preamble flag, which means the IB could be dropped if no context switch */
 #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
 
 /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
 #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
 
 /* The IB fence should do the L2 writeback but not invalidate any shader
  * caches (L2/vL1/sL1/I$). */
 #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
 
+/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
+ * This will reset wave ID counters for the IB.
+ */
+#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
+
 struct drm_amdgpu_cs_chunk_ib {
 	__u32 _pad;
 	/** AMDGPU_IB_FLAG_* */
 	__u32 flags;
 	/** Virtual address to begin IB execution */
 	__u64 va_start;
 	/** Size of submission */
 	__u32 ib_bytes;
 	/** HW IP to submit to */
 	__u32 ip_type;
-- 
2.17.1
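
For completeness, this is roughly how userspace would opt in when
submitting a compute IB that uses GDS ordered append. Only the chunk
layout and the flag come from the uapi hunk above; the helper and its
arguments are hypothetical:

#include <stdint.h>
#include <string.h>
#include <drm/amdgpu_drm.h>

/* Hypothetical helper: fill an IB chunk that asks the kernel to reset
 * the ME/GDS wave ID counters before this IB executes. */
static void fill_ordered_append_ib(struct drm_amdgpu_cs_chunk_ib *ib,
				   uint64_t va, uint32_t size_bytes)
{
	memset(ib, 0, sizeof(*ib));
	ib->va_start = va;
	ib->ib_bytes = size_bytes;
	ib->ip_type  = AMDGPU_HW_IP_COMPUTE;
	/* Without this flag, the emit paths above skip the
	 * GDS_COMPUTE_MAX_WAVE_ID write entirely. */
	ib->flags    = AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
}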

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-02-04 17:08 UTC | newest]

Thread overview: 9+ messages
-- links below jump to the message on this page --
2019-01-21 23:46 [PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues Marek Olšák
     [not found] ` <20190121234647.3995-1-maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-01-22  8:54   ` Christian König
2019-01-22 20:05 Marek Olšák
     [not found] ` <20190122200546.1575-1-maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-01-28 17:25   ` Marek Olšák
     [not found]     ` <CAAxE2A7xs40YZvCQpMmv9X0Z98Sc7+ZEbFJDJLfmkMuWHzuprw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2019-02-04 12:42       ` Christian König
     [not found]         ` <1b1294f7-1604-fe49-de7f-0e234656eecd-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-02-04 15:55           ` Marek Olšák
     [not found]             ` <CAAxE2A4vsMRL0FZYBknCTMEbDBNYJZOdOnR32EZqM5iaKX6j3g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2019-02-04 16:00               ` Marek Olšák
2019-02-04 16:09               ` Koenig, Christian
     [not found]                 ` <44f785c1-16bf-289c-4502-0a7a3637aa30-5C7GfCeVMHo@public.gmane.org>
2019-02-04 17:08                   ` Alex Deucher
