Ping On Tue, Jan 22, 2019 at 3:05 PM Marek Olšák wrote: > From: Marek Olšák > > I'm not increasing the DRM version because GDS isn't totally without bugs > yet. > > v2: update emit_ib_size > > Signed-off-by: Marek Olšák > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 19 +++++++++++- > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 21 +++++++++++-- > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 40 +++++++++++++++++++++++-- > include/uapi/drm/amdgpu_drm.h | 5 ++++ > 5 files changed, 82 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h > index ecbcefe49a98..f89f5734d985 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h > @@ -30,20 +30,22 @@ struct amdgpu_bo; > struct amdgpu_gds_asic_info { > uint32_t total_size; > uint32_t gfx_partition_size; > uint32_t cs_partition_size; > }; > > struct amdgpu_gds { > struct amdgpu_gds_asic_info mem; > struct amdgpu_gds_asic_info gws; > struct amdgpu_gds_asic_info oa; > + uint32_t gds_compute_max_wave_id; > + > /* At present, GDS, GWS and OA resources for gfx (graphics) > * is always pre-allocated and available for graphics operation. > * Such resource is shared between all gfx clients. > * TODO: move this operation to user space > * */ > struct amdgpu_bo* gds_gfx_bo; > struct amdgpu_bo* gws_gfx_bo; > struct amdgpu_bo* oa_gfx_bo; > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > index 7984292f9282..a59e0fdf5a97 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > @@ -2257,20 +2257,36 @@ static void gfx_v7_0_ring_emit_ib_gfx(struct > amdgpu_ring *ring, > } > > static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring, > struct amdgpu_job *job, > struct amdgpu_ib *ib, > uint32_t flags) > { > unsigned vmid = AMDGPU_JOB_GET_VMID(job); > u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); > > + /* Currently, there is a high possibility to get wave ID mismatch > + * between ME and GDS, leading to a hw deadlock, because ME > generates > + * different wave IDs than the GDS expects. This situation happens > + * randomly when at least 5 compute pipes use GDS ordered append. > + * The wave IDs generated by ME are also wrong after > suspend/resume. > + * Those are probably bugs somewhere else in the kernel driver. > + * > + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME > and > + * GDS to 0 for this ring (me/pipe). > + */ > + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { > + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, > 1)); > + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - > PACKET3_SET_CONFIG_REG_START); > + amdgpu_ring_write(ring, > ring->adev->gds.gds_compute_max_wave_id); > + } > + > amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); > amdgpu_ring_write(ring, > #ifdef __BIG_ENDIAN > (2 << 0) | > #endif > (ib->gpu_addr & 0xFFFFFFFC)); > amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF); > amdgpu_ring_write(ring, control); > } > > @@ -4993,21 +5009,21 @@ static const struct amdgpu_ring_funcs > gfx_v7_0_ring_funcs_compute = { > .get_rptr = gfx_v7_0_ring_get_rptr, > .get_wptr = gfx_v7_0_ring_get_wptr_compute, > .set_wptr = gfx_v7_0_ring_set_wptr_compute, > .emit_frame_size = > 20 + /* gfx_v7_0_ring_emit_gds_switch */ > 7 + /* gfx_v7_0_ring_emit_hdp_flush */ > 5 + /* hdp invalidate */ > 7 + /* gfx_v7_0_ring_emit_pipeline_sync */ > CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* > gfx_v7_0_ring_emit_vm_flush */ > 7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user > fence, vm fence */ > - .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */ > + .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */ > .emit_ib = gfx_v7_0_ring_emit_ib_compute, > .emit_fence = gfx_v7_0_ring_emit_fence_compute, > .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync, > .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush, > .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch, > .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush, > .test_ring = gfx_v7_0_ring_test_ring, > .test_ib = gfx_v7_0_ring_test_ib, > .insert_nop = amdgpu_ring_insert_nop, > .pad_ib = amdgpu_ring_generic_pad_ib, > @@ -5050,20 +5066,21 @@ static void gfx_v7_0_set_irq_funcs(struct > amdgpu_device *adev) > adev->gfx.priv_inst_irq.num_types = 1; > adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs; > } > > static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev) > { > /* init asci gds info */ > adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); > adev->gds.gws.total_size = 64; > adev->gds.oa.total_size = 16; > + adev->gds.gds_compute_max_wave_id = > RREG32(mmGDS_COMPUTE_MAX_WAVE_ID); > > if (adev->gds.mem.total_size == 64 * 1024) { > adev->gds.mem.gfx_partition_size = 4096; > adev->gds.mem.cs_partition_size = 4096; > > adev->gds.gws.gfx_partition_size = 4; > adev->gds.gws.cs_partition_size = 4; > > adev->gds.oa.gfx_partition_size = 4; > adev->gds.oa.cs_partition_size = 1; > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index a26747681ed6..b8e50a34bdb3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -6077,20 +6077,36 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct > amdgpu_ring *ring, > } > > static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring, > struct amdgpu_job *job, > struct amdgpu_ib *ib, > uint32_t flags) > { > unsigned vmid = AMDGPU_JOB_GET_VMID(job); > u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); > > + /* Currently, there is a high possibility to get wave ID mismatch > + * between ME and GDS, leading to a hw deadlock, because ME > generates > + * different wave IDs than the GDS expects. This situation happens > + * randomly when at least 5 compute pipes use GDS ordered append. > + * The wave IDs generated by ME are also wrong after > suspend/resume. > + * Those are probably bugs somewhere else in the kernel driver. > + * > + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME > and > + * GDS to 0 for this ring (me/pipe). > + */ > + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { > + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, > 1)); > + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - > PACKET3_SET_CONFIG_REG_START); > + amdgpu_ring_write(ring, > ring->adev->gds.gds_compute_max_wave_id); > + } > + > amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); > amdgpu_ring_write(ring, > #ifdef __BIG_ENDIAN > (2 << 0) | > #endif > (ib->gpu_addr & 0xFFFFFFFC)); > amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF); > amdgpu_ring_write(ring, control); > } > > @@ -6883,21 +6899,21 @@ static const struct amdgpu_ring_funcs > gfx_v8_0_ring_funcs_compute = { > .get_rptr = gfx_v8_0_ring_get_rptr, > .get_wptr = gfx_v8_0_ring_get_wptr_compute, > .set_wptr = gfx_v8_0_ring_set_wptr_compute, > .emit_frame_size = > 20 + /* gfx_v8_0_ring_emit_gds_switch */ > 7 + /* gfx_v8_0_ring_emit_hdp_flush */ > 5 + /* hdp_invalidate */ > 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ > VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* > gfx_v8_0_ring_emit_vm_flush */ > 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user > fence, vm fence */ > - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ > + .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */ > .emit_ib = gfx_v8_0_ring_emit_ib_compute, > .emit_fence = gfx_v8_0_ring_emit_fence_compute, > .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync, > .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush, > .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch, > .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush, > .test_ring = gfx_v8_0_ring_test_ring, > .test_ib = gfx_v8_0_ring_test_ib, > .insert_nop = amdgpu_ring_insert_nop, > .pad_ib = amdgpu_ring_generic_pad_ib, > @@ -6913,21 +6929,21 @@ static const struct amdgpu_ring_funcs > gfx_v8_0_ring_funcs_kiq = { > .get_rptr = gfx_v8_0_ring_get_rptr, > .get_wptr = gfx_v8_0_ring_get_wptr_compute, > .set_wptr = gfx_v8_0_ring_set_wptr_compute, > .emit_frame_size = > 20 + /* gfx_v8_0_ring_emit_gds_switch */ > 7 + /* gfx_v8_0_ring_emit_hdp_flush */ > 5 + /* hdp_invalidate */ > 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ > 17 + /* gfx_v8_0_ring_emit_vm_flush */ > 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user > fence, vm fence */ > - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ > + .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */ > .emit_fence = gfx_v8_0_ring_emit_fence_kiq, > .test_ring = gfx_v8_0_ring_test_ring, > .insert_nop = amdgpu_ring_insert_nop, > .pad_ib = amdgpu_ring_generic_pad_ib, > .emit_rreg = gfx_v8_0_ring_emit_rreg, > .emit_wreg = gfx_v8_0_ring_emit_wreg, > }; > > static void gfx_v8_0_set_ring_funcs(struct amdgpu_device *adev) > { > @@ -6989,20 +7005,21 @@ static void gfx_v8_0_set_rlc_funcs(struct > amdgpu_device *adev) > { > adev->gfx.rlc.funcs = &iceland_rlc_funcs; > } > > static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev) > { > /* init asci gds info */ > adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); > adev->gds.gws.total_size = 64; > adev->gds.oa.total_size = 16; > + adev->gds.gds_compute_max_wave_id = > RREG32(mmGDS_COMPUTE_MAX_WAVE_ID); > > if (adev->gds.mem.total_size == 64 * 1024) { > adev->gds.mem.gfx_partition_size = 4096; > adev->gds.mem.cs_partition_size = 4096; > > adev->gds.gws.gfx_partition_size = 4; > adev->gds.gws.cs_partition_size = 4; > > adev->gds.oa.gfx_partition_size = 4; > adev->gds.oa.cs_partition_size = 1; > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > index 262ee3cf6f1c..5533f6e4f4a4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > @@ -4003,20 +4003,36 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct > amdgpu_ring *ring, > } > > static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring, > struct amdgpu_job *job, > struct amdgpu_ib *ib, > uint32_t flags) > { > unsigned vmid = AMDGPU_JOB_GET_VMID(job); > u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); > > + /* Currently, there is a high possibility to get wave ID mismatch > + * between ME and GDS, leading to a hw deadlock, because ME > generates > + * different wave IDs than the GDS expects. This situation happens > + * randomly when at least 5 compute pipes use GDS ordered append. > + * The wave IDs generated by ME are also wrong after > suspend/resume. > + * Those are probably bugs somewhere else in the kernel driver. > + * > + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME > and > + * GDS to 0 for this ring (me/pipe). > + */ > + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { > + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, > 1)); > + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID); > + amdgpu_ring_write(ring, > ring->adev->gds.gds_compute_max_wave_id); > + } > + > amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); > BUG_ON(ib->gpu_addr & 0x3); /* Dword align */ > amdgpu_ring_write(ring, > #ifdef __BIG_ENDIAN > (2 << 0) | > #endif > lower_32_bits(ib->gpu_addr)); > amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr)); > amdgpu_ring_write(ring, control); > } > @@ -4722,21 +4738,21 @@ static const struct amdgpu_ring_funcs > gfx_v9_0_ring_funcs_compute = { > .set_wptr = gfx_v9_0_ring_set_wptr_compute, > .emit_frame_size = > 20 + /* gfx_v9_0_ring_emit_gds_switch */ > 7 + /* gfx_v9_0_ring_emit_hdp_flush */ > 5 + /* hdp invalidate */ > 7 + /* gfx_v9_0_ring_emit_pipeline_sync */ > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + > SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + > 2 + /* gfx_v9_0_ring_emit_vm_flush */ > 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, > vm fence */ > - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ > + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ > .emit_ib = gfx_v9_0_ring_emit_ib_compute, > .emit_fence = gfx_v9_0_ring_emit_fence, > .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync, > .emit_vm_flush = gfx_v9_0_ring_emit_vm_flush, > .emit_gds_switch = gfx_v9_0_ring_emit_gds_switch, > .emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush, > .test_ring = gfx_v9_0_ring_test_ring, > .test_ib = gfx_v9_0_ring_test_ib, > .insert_nop = amdgpu_ring_insert_nop, > .pad_ib = amdgpu_ring_generic_pad_ib, > @@ -4757,21 +4773,21 @@ static const struct amdgpu_ring_funcs > gfx_v9_0_ring_funcs_kiq = { > .set_wptr = gfx_v9_0_ring_set_wptr_compute, > .emit_frame_size = > 20 + /* gfx_v9_0_ring_emit_gds_switch */ > 7 + /* gfx_v9_0_ring_emit_hdp_flush */ > 5 + /* hdp invalidate */ > 7 + /* gfx_v9_0_ring_emit_pipeline_sync */ > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + > SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + > 2 + /* gfx_v9_0_ring_emit_vm_flush */ > 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user > fence, vm fence */ > - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ > + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ > .emit_fence = gfx_v9_0_ring_emit_fence_kiq, > .test_ring = gfx_v9_0_ring_test_ring, > .insert_nop = amdgpu_ring_insert_nop, > .pad_ib = amdgpu_ring_generic_pad_ib, > .emit_rreg = gfx_v9_0_ring_emit_rreg, > .emit_wreg = gfx_v9_0_ring_emit_wreg, > .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait, > .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait, > }; > > @@ -4839,20 +4855,40 @@ static void gfx_v9_0_set_gds_init(struct > amdgpu_device *adev) > adev->gds.mem.total_size = 0x10000; > break; > case CHIP_RAVEN: > adev->gds.mem.total_size = 0x1000; > break; > default: > adev->gds.mem.total_size = 0x10000; > break; > } > > + switch (adev->asic_type) { > + case CHIP_VEGA10: > + case CHIP_VEGA20: > + adev->gds.gds_compute_max_wave_id = 0x7ff; > + break; > + case CHIP_VEGA12: > + adev->gds.gds_compute_max_wave_id = 0x27f; > + break; > + case CHIP_RAVEN: > + if (adev->rev_id >= 0x8) > + adev->gds.gds_compute_max_wave_id = 0x77; /* > raven2 */ > + else > + adev->gds.gds_compute_max_wave_id = 0x15f; /* > raven1 */ > + break; > + default: > + /* this really depends on the chip */ > + adev->gds.gds_compute_max_wave_id = 0x7ff; > + break; > + } > + > adev->gds.gws.total_size = 64; > adev->gds.oa.total_size = 16; > > if (adev->gds.mem.total_size == 64 * 1024) { > adev->gds.mem.gfx_partition_size = 4096; > adev->gds.mem.cs_partition_size = 4096; > > adev->gds.gws.gfx_partition_size = 4; > adev->gds.gws.cs_partition_size = 4; > > diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h > index faaad04814e4..662d379ea624 100644 > --- a/include/uapi/drm/amdgpu_drm.h > +++ b/include/uapi/drm/amdgpu_drm.h > @@ -561,20 +561,25 @@ union drm_amdgpu_cs { > /* Preamble flag, which means the IB could be dropped if no context > switch */ > #define AMDGPU_IB_FLAG_PREAMBLE (1<<1) > > /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */ > #define AMDGPU_IB_FLAG_PREEMPT (1<<2) > > /* The IB fence should do the L2 writeback but not invalidate any shader > * caches (L2/vL1/sL1/I$). */ > #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) > > +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. > + * This will reset wave ID counters for the IB. > + */ > +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) > + > struct drm_amdgpu_cs_chunk_ib { > __u32 _pad; > /** AMDGPU_IB_FLAG_* */ > __u32 flags; > /** Virtual address to begin IB execution */ > __u64 va_start; > /** Size of submission */ > __u32 ib_bytes; > /** HW IP to submit to */ > __u32 ip_type; > -- > 2.17.1 > >