* [RFC PATCH] drm/amdgpu: add support for user trap handlers
@ 2020-08-24 11:49 Samuel Pitoiset
  2020-08-24 18:17 ` Marek Olšák
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Samuel Pitoiset @ 2020-08-24 11:49 UTC (permalink / raw)
  To: amd-gfx; +Cc: alexander.deucher, Samuel Pitoiset

A trap handler can be used by userspace to catch shader exceptions
like divide by zero, memory violations etc.

On GFX6-GFX8, the registers used to configure TBA/TMA aren't
privileged, while on GFX9+ they are per-VMID and privileged, so
only the KMD can configure them.

This introduces a new CS chunk that can be used to set the
TBA/TMA virtual addresses at submit time.

TODO:
- add GFX6, GFX7 and GFX10 support
- rebase on top of amd-staging-drm-next (this branch currently
hangs my GPU at boot)

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
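Not part of the patch, just an illustration of the proposed uAPI: a
userspace submitter could attach the new chunk to its CS ioctl roughly
as sketched below. The helper name and the surrounding submission code
are hypothetical; only the chunk layout comes from this patch.

#include <stdint.h>
#include <string.h>
#include <drm/amdgpu_drm.h>

/* Fill one CS chunk that requests the given trap handler (TBA) and
 * trap memory (TMA) virtual addresses for this submission. Both
 * addresses must be non-zero, otherwise the kernel rejects the chunk.
 * The drm_amdgpu_cs_chunk_trap storage must stay alive until the CS
 * ioctl has been issued. */
static void fill_trap_chunk(struct drm_amdgpu_cs_chunk *chunk,
			    struct drm_amdgpu_cs_chunk_trap *trap,
			    uint64_t tba_va, uint64_t tma_va)
{
	memset(trap, 0, sizeof(*trap));
	trap->tba_addr = tba_va;	/* VA of the trap handler code */
	trap->tma_addr = tma_va;	/* VA of the trap handler memory */

	chunk->chunk_id   = AMDGPU_CHUNK_ID_TRAP;
	chunk->length_dw  = sizeof(*trap) / 4;
	chunk->chunk_data = (uint64_t)(uintptr_t)trap;
}
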
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 +++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 15 ++++++++-
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 42 ++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++
 include/uapi/drm/amdgpu_drm.h            |  8 +++++
 9 files changed, 126 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index a512ccbc4dea..6ca5c4912e3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p,
 	return r;
 }
 
+static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
+				     struct drm_amdgpu_cs_chunk_trap *data,
+				     uint64_t *tba_addr, uint64_t *tma_addr)
+{
+	if (!data->tba_addr || !data->tma_addr)
+		return -EINVAL;
+
+	*tba_addr = data->tba_addr;
+	*tma_addr = data->tma_addr;
+
+	return 0;
+}
+
 static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
@@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 	uint64_t *chunk_array;
 	unsigned size, num_ibs = 0;
 	uint32_t uf_offset = 0;
+	uint64_t tba_addr = 0, tma_addr = 0;
 	int i;
 	int ret;
 
@@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 
 			break;
 
+		case AMDGPU_CHUNK_ID_TRAP:
+			size = sizeof(struct drm_amdgpu_cs_chunk_trap);
+			if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
+				ret = -EINVAL;
+				goto free_partial_kdata;
+			}
+
+			ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
+							&tba_addr, &tma_addr);
+			if (ret)
+				goto free_partial_kdata;
+			break;
+
 		case AMDGPU_CHUNK_ID_DEPENDENCIES:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
@@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 
 	if (p->uf_entry.tv.bo)
 		p->job->uf_addr = uf_offset;
+
+	p->job->tba_addr = tba_addr;
+	p->job->tma_addr = tma_addr;
+
 	kfree(chunk_array);
 
 	/* Use this opportunity to fill in task info for the vm */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 26127c7d2f32..1e703119e4c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -88,9 +88,10 @@
  * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for correctness
  * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
  * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
+ * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	39
+#define KMS_DRIVER_MINOR	40
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
index 8e58325bbca2..fd0d56724b4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
@@ -58,6 +58,10 @@ struct amdgpu_vmid {
 	uint32_t		oa_base;
 	uint32_t		oa_size;
 
+	/* user trap */
+	uint64_t		tba_addr;
+	uint64_t		tma_addr;
+
 	unsigned		pasid;
 	struct dma_fence	*pasid_mapping;
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 81caac9b958a..b8ed5b13ea44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -62,6 +62,10 @@ struct amdgpu_job {
 	/* user fence handling */
 	uint64_t		uf_addr;
 	uint64_t		uf_sequence;
+
+	/* user trap handling */
+	uint64_t		tba_addr;
+	uint64_t		tma_addr;
 };
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index da871d84b742..1f165a6295d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
 	void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
 	int (*preempt_ib)(struct amdgpu_ring *ring);
 	void (*emit_mem_sync)(struct amdgpu_ring *ring);
+	void (*emit_trap_handler)(struct amdgpu_ring *ring,
+				  uint32_t vmid,
+				  uint64_t tba_addr, uint64_t tma_addr);
 };
 
 struct amdgpu_ring {
@@ -265,6 +268,7 @@ struct amdgpu_ring {
 #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr))
 #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags))
 #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
+#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 71e005cf2952..24916082de0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1076,6 +1076,9 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 		id->gws_size != job->gws_size ||
 		id->oa_base != job->oa_base ||
 		id->oa_size != job->oa_size);
+	bool trap_handler_needed = ring->funcs->emit_trap_handler && (
+		id->tba_addr != job->tba_addr ||
+		id->tma_addr != job->tma_addr);
 	bool vm_flush_needed = job->vm_needs_flush;
 	struct dma_fence *fence = NULL;
 	bool pasid_mapping_needed = false;
@@ -1088,6 +1091,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 
 	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
 		gds_switch_needed = true;
+		trap_handler_needed = true;
 		vm_flush_needed = true;
 		pasid_mapping_needed = true;
 	}
@@ -1099,12 +1103,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 	mutex_unlock(&id_mgr->lock);
 
 	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
+	trap_handler_needed &= !!ring->funcs->emit_trap_handler;
 	vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
 			job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
 	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
 		ring->funcs->emit_wreg;
 
-	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
+	if (!vm_flush_needed && !gds_switch_needed &&
+	    !trap_handler_needed && !need_pipe_sync)
 		return 0;
 
 	if (ring->funcs->init_cond_exec)
@@ -1158,6 +1164,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 					    job->oa_size);
 	}
 
+	if (ring->funcs->emit_trap_handler && trap_handler_needed) {
+		id->tba_addr = job->tba_addr;
+		id->tma_addr = job->tma_addr;
+		amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
+					      job->tma_addr);
+	}
+
 	if (ring->funcs->patch_cond_exec)
 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 33f1c4a46ebe..59db577e8c8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -5222,6 +5222,40 @@ static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
 }
 
+static void gfx_v8_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
+					    uint32_t vmid,
+					    uint64_t tba_addr,
+					    uint64_t tma_addr)
+{
+	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
+		static const u32 regs[] = {
+			mmSPI_SHADER_TBA_LO_PS,
+			mmSPI_SHADER_TBA_LO_VS,
+			mmSPI_SHADER_TBA_LO_GS,
+			mmSPI_SHADER_TBA_LO_ES,
+			mmSPI_SHADER_TBA_LO_HS,
+			mmSPI_SHADER_TBA_LO_LS,
+		};
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(regs); i++) {
+			amdgpu_ring_write(ring, PACKET3(PACKET3_SET_SH_REG, 4));
+			amdgpu_ring_write(ring, regs[i] - PACKET3_SET_SH_REG_START);
+			amdgpu_ring_write(ring, lower_32_bits(tba_addr >> 8));
+			amdgpu_ring_write(ring, upper_32_bits(tba_addr >> 8));
+			amdgpu_ring_write(ring, lower_32_bits(tma_addr >> 8));
+			amdgpu_ring_write(ring, upper_32_bits(tma_addr >> 8));
+		}
+	} else {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_SH_REG, 4));
+		amdgpu_ring_write(ring, mmCOMPUTE_TBA_LO - PACKET3_SET_SH_REG_START);
+		amdgpu_ring_write(ring, lower_32_bits(tba_addr >> 8));
+		amdgpu_ring_write(ring, upper_32_bits(tba_addr >> 8));
+		amdgpu_ring_write(ring, lower_32_bits(tma_addr >> 8));
+		amdgpu_ring_write(ring, upper_32_bits(tma_addr >> 8));
+	}
+}
+
 static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
 {
 	WREG32(mmSQ_IND_INDEX,
@@ -6890,7 +6924,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 		5 + /* HDP_INVL */
 		12 + 12 + /* FENCE x2 */
 		2 + /* SWITCH_BUFFER */
-		5, /* SURFACE_SYNC */
+		5 + /* SURFACE_SYNC */
+		36, /* gfx_v8_0_ring_emit_trap_handler */
 	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_gfx */
 	.emit_ib = gfx_v8_0_ring_emit_ib_gfx,
 	.emit_fence = gfx_v8_0_ring_emit_fence_gfx,
@@ -6909,6 +6944,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.emit_wreg = gfx_v8_0_ring_emit_wreg,
 	.soft_recovery = gfx_v8_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v8_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
@@ -6926,7 +6962,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
 		7 + 7 + 7 + /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
-		7, /* gfx_v8_0_emit_mem_sync_compute */
+		7 + /* gfx_v8_0_emit_mem_sync_compute */
+		6, /* gfx_v8_0_emit_trap_handler */
 	.emit_ib_size =	7, /* gfx_v8_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v8_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v8_0_ring_emit_fence_compute,
@@ -6940,6 +6977,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_wreg = gfx_v8_0_ring_emit_wreg,
 	.emit_mem_sync = gfx_v8_0_emit_mem_sync_compute,
+	.emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index cb9d60a4e05e..4fc00f196085 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4162,6 +4162,23 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 				   (1 << (oa_size + oa_base)) - (1 << oa_base));
 }
 
+static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
+					    uint32_t vmid,
+					    uint64_t tba_addr,
+					    uint64_t tma_addr)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	mutex_lock(&adev->srbm_mutex);
+	soc15_grbm_select(adev, 0, 0, 0, vmid);
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
+	soc15_grbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+}
+
 static const u32 vgpr_init_compute_shader[] =
 {
 	0xb07c0000, 0xbe8000ff,
@@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
 	.soft_recovery = gfx_v9_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
@@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 3218576e109d..7eae264adb5d 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
 #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES	0x07
 #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
 #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
+#define AMDGPU_CHUNK_ID_TRAP            0x0a
 
 struct drm_amdgpu_cs_chunk {
 	__u32		chunk_id;
@@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
        __u64 point;
 };
 
+struct drm_amdgpu_cs_chunk_trap {
+	/** Trap Base Address */
+       __u64 tba_addr;
+	/** Trap Memory Address */
+       __u64 tma_addr;
+};
+
 #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ	0
 #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD	1
 #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD	2
-- 
2.28.0


* Re: [RFC PATCH] drm/amdgpu: add support for user trap handlers
  2020-08-24 11:49 [RFC PATCH] drm/amdgpu: add support for user trap handlers Samuel Pitoiset
@ 2020-08-24 18:17 ` Marek Olšák
  2020-08-25  7:04   ` Samuel Pitoiset
  2020-08-24 18:33 ` Alex Deucher
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 16+ messages in thread
From: Marek Olšák @ 2020-08-24 18:17 UTC (permalink / raw)
  To: Samuel Pitoiset; +Cc: Deucher, Alexander, amd-gfx mailing list


SET_SH_REG won't work with CP register shadowing. You need to use
WRITE_DATA or WREG32.

Marek
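
A rough sketch of that direction for gfx8 (untested, not part of the
patch): route the updates through the ring's existing emit_wreg
callback, which emits WRITE_DATA, instead of SET_SH_REG. It assumes
the CP can reach these SH registers that way, and that the TBA/TMA
LO/HI registers are consecutive per stage, as the patch already
assumes; the compute path would write mmCOMPUTE_TBA_LO and friends
the same way.

static void gfx_v8_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
					    uint32_t vmid,
					    uint64_t tba_addr,
					    uint64_t tma_addr)
{
	/* Same per-stage register list as the patch; TBA_LO, TBA_HI,
	 * TMA_LO and TMA_HI are written as four consecutive registers. */
	static const u32 regs[] = {
		mmSPI_SHADER_TBA_LO_PS,
		mmSPI_SHADER_TBA_LO_VS,
		mmSPI_SHADER_TBA_LO_GS,
		mmSPI_SHADER_TBA_LO_ES,
		mmSPI_SHADER_TBA_LO_HS,
		mmSPI_SHADER_TBA_LO_LS,
	};
	const u32 vals[] = {
		lower_32_bits(tba_addr >> 8),
		upper_32_bits(tba_addr >> 8),
		lower_32_bits(tma_addr >> 8),
		upper_32_bits(tma_addr >> 8),
	};
	int i, j;

	for (i = 0; i < ARRAY_SIZE(regs); i++)
		for (j = 0; j < ARRAY_SIZE(vals); j++)
			/* goes out as a WRITE_DATA packet on gfx8 */
			amdgpu_ring_emit_wreg(ring, regs[i] + j, vals[j]);
}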


* Re: [RFC PATCH] drm/amdgpu: add support for user trap handlers
  2020-08-24 11:49 [RFC PATCH] drm/amdgpu: add support for user trap handlers Samuel Pitoiset
  2020-08-24 18:17 ` Marek Olšák
@ 2020-08-24 18:33 ` Alex Deucher
  2020-08-24 21:32   ` Alex Deucher
  2020-08-25 14:07 ` [PATCH v2] " Samuel Pitoiset
  2021-05-06  6:54 ` [PATCH v3] " Samuel Pitoiset
  3 siblings, 1 reply; 16+ messages in thread
From: Alex Deucher @ 2020-08-24 18:33 UTC (permalink / raw)
  To: Samuel Pitoiset; +Cc: Deucher, Alexander, amd-gfx list

On Mon, Aug 24, 2020 at 7:57 AM Samuel Pitoiset
<samuel.pitoiset@gmail.com> wrote:
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index cb9d60a4e05e..4fc00f196085 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4162,6 +4162,23 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>                                    (1 << (oa_size + oa_base)) - (1 << oa_base));
>  }
>
> +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
> +                                           uint32_t vmid,
> +                                           uint64_t tba_addr,
> +                                           uint64_t tma_addr)
> +{
> +       struct amdgpu_device *adev = ring->adev;
> +
> +       mutex_lock(&adev->srbm_mutex);
> +       soc15_grbm_select(adev, 0, 0, 0, vmid);
> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8));
> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
> +       soc15_grbm_select(adev, 0, 0, 0, 0);
> +       mutex_unlock(&adev->srbm_mutex);

This won't work.  This updates registers via MMIO using the CPU.  We
need to have the registers updated asynchronously via the CP so they
get updated when the specific jobs are executed by the engine.  VMIDs
are shared resources and are assigned dynamically via the kernel
driver.  If you update via MMIO the changes take effect immediately
rather than when the actual work is scheduled on the engine.
Unfortunately, at the moment, I don't see a way to do this with the CP
with the packets that are currently available.

Alex



* Re: [RFC PATCH] drm/amdgpu: add support for user trap handlers
  2020-08-24 18:33 ` Alex Deucher
@ 2020-08-24 21:32   ` Alex Deucher
  2020-08-25  7:06     ` Samuel Pitoiset
  0 siblings, 1 reply; 16+ messages in thread
From: Alex Deucher @ 2020-08-24 21:32 UTC (permalink / raw)
  To: Samuel Pitoiset; +Cc: Deucher, Alexander, amd-gfx list

On Mon, Aug 24, 2020 at 2:33 PM Alex Deucher <alexdeucher@gmail.com> wrote:
>
> On Mon, Aug 24, 2020 at 7:57 AM Samuel Pitoiset
> <samuel.pitoiset@gmail.com> wrote:
> >         }
> > @@ -1099,12 +1103,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> >         mutex_unlock(&id_mgr->lock);
> >
> >         gds_switch_needed &= !!ring->funcs->emit_gds_switch;
> > +       trap_handler_needed &= !!ring->funcs->emit_trap_handler;
> >         vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
> >                         job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
> >         pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
> >                 ring->funcs->emit_wreg;
> >
> > -       if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
> > +       if (!vm_flush_needed && !gds_switch_needed &&
> > +           !trap_handler_needed && !need_pipe_sync)
> >                 return 0;
> >
> >         if (ring->funcs->init_cond_exec)
> > @@ -1158,6 +1164,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> >                                             job->oa_size);
> >         }
> >
> > +       if (ring->funcs->emit_trap_handler && trap_handler_needed) {
> > +               id->tba_addr = job->tba_addr;
> > +               id->tma_addr = job->tma_addr;
> > +               amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
> > +                                             job->tma_addr);
> > +       }
> > +
> >         if (ring->funcs->patch_cond_exec)
> >                 amdgpu_ring_patch_cond_exec(ring, patch_offset);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > index 33f1c4a46ebe..59db577e8c8f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> > @@ -5222,6 +5222,40 @@ static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
> >         amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
> >  }
> >
> > +static void gfx_v8_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
> > +                                           uint32_t vmid,
> > +                                           uint64_t tba_addr,
> > +                                           uint64_t tma_addr)
> > +{
> > +       if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
> > +               static const u32 regs[] = {
> > +                       mmSPI_SHADER_TBA_LO_PS,
> > +                       mmSPI_SHADER_TBA_LO_VS,
> > +                       mmSPI_SHADER_TBA_LO_GS,
> > +                       mmSPI_SHADER_TBA_LO_ES,
> > +                       mmSPI_SHADER_TBA_LO_HS,
> > +                       mmSPI_SHADER_TBA_LO_LS,
> > +               };
> > +               int i;
> > +
> > +               for (i = 0; i < ARRAY_SIZE(regs); i++) {
> > +                       amdgpu_ring_write(ring, PACKET3(PACKET3_SET_SH_REG, 4));
> > +                       amdgpu_ring_write(ring, regs[i] - PACKET3_SET_SH_REG_START);
> > +                       amdgpu_ring_write(ring, lower_32_bits(tba_addr >> 8));
> > +                       amdgpu_ring_write(ring, upper_32_bits(tba_addr >> 8));
> > +                       amdgpu_ring_write(ring, lower_32_bits(tma_addr >> 8));
> > +                       amdgpu_ring_write(ring, upper_32_bits(tma_addr >> 8));
> > +               }
> > +       } else {
> > +               amdgpu_ring_write(ring, PACKET3(PACKET3_SET_SH_REG, 4));
> > +               amdgpu_ring_write(ring, mmCOMPUTE_TBA_LO - PACKET3_SET_SH_REG_START);
> > +               amdgpu_ring_write(ring, lower_32_bits(tba_addr >> 8));
> > +               amdgpu_ring_write(ring, upper_32_bits(tba_addr >> 8));
> > +               amdgpu_ring_write(ring, lower_32_bits(tma_addr >> 8));
> > +               amdgpu_ring_write(ring, upper_32_bits(tma_addr >> 8));
> > +       }
> > +}
> > +
> >  static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
> >  {
> >         WREG32(mmSQ_IND_INDEX,
> > @@ -6890,7 +6924,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
> >                 5 + /* HDP_INVL */
> >                 12 + 12 + /* FENCE x2 */
> >                 2 + /* SWITCH_BUFFER */
> > -               5, /* SURFACE_SYNC */
> > +               5 + /* SURFACE_SYNC */
> > +               36, /* gfx_v8_0_ring_emit_trap_handler */
> >         .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_gfx */
> >         .emit_ib = gfx_v8_0_ring_emit_ib_gfx,
> >         .emit_fence = gfx_v8_0_ring_emit_fence_gfx,
> > @@ -6909,6 +6944,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
> >         .emit_wreg = gfx_v8_0_ring_emit_wreg,
> >         .soft_recovery = gfx_v8_0_ring_soft_recovery,
> >         .emit_mem_sync = gfx_v8_0_emit_mem_sync,
> > +       .emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
> >  };
> >
> >  static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
> > @@ -6926,7 +6962,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
> >                 7 + /* gfx_v8_0_ring_emit_pipeline_sync */
> >                 VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
> >                 7 + 7 + 7 + /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
> > -               7, /* gfx_v8_0_emit_mem_sync_compute */
> > +               7 + /* gfx_v8_0_emit_mem_sync_compute */
> > +               6, /* gfx_v8_0_emit_trap_handler */
> >         .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
> >         .emit_ib = gfx_v8_0_ring_emit_ib_compute,
> >         .emit_fence = gfx_v8_0_ring_emit_fence_compute,
> > @@ -6940,6 +6977,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
> >         .pad_ib = amdgpu_ring_generic_pad_ib,
> >         .emit_wreg = gfx_v8_0_ring_emit_wreg,
> >         .emit_mem_sync = gfx_v8_0_emit_mem_sync_compute,
> > +       .emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
> >  };
> >
> >  static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index cb9d60a4e05e..4fc00f196085 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -4162,6 +4162,23 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
> >                                    (1 << (oa_size + oa_base)) - (1 << oa_base));
> >  }
> >
> > +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
> > +                                           uint32_t vmid,
> > +                                           uint64_t tba_addr,
> > +                                           uint64_t tma_addr)
> > +{
> > +       struct amdgpu_device *adev = ring->adev;
> > +
> > +       mutex_lock(&adev->srbm_mutex);
> > +       soc15_grbm_select(adev, 0, 0, 0, vmid);
> > +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
> > +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8));
> > +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
> > +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
> > +       soc15_grbm_select(adev, 0, 0, 0, 0);
> > +       mutex_unlock(&adev->srbm_mutex);
>
> This won't work.  This updates registers via MMIO using the CPU.  We
> need to have the registers updated asynchronously via the CP so they
> get updated when the specific jobs are executed by the engine.  vmid's
> are shared resources and are assigned dynamically via the kernel
> driver.  If you update via MMIO the changes take effect immediately
> rather than when the actual work is scheduled on the engine.
> Unfortunately, at the moment, I don't see a way to do this with the CP
> with the packets that are currently available.

One option might be to do this via MMIO, but only support it when
using a reserved vmid.
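
Just to illustrate the idea (this is not something the patch implements),
a rough sketch of how the CS parser could gate the chunk on a reserved
VMID; the vm->reserved_vmid[] bookkeeping and the AMDGPU_GFXHUB_0 index
are taken from the current amdgpu code and should be treated as
assumptions here:

	case AMDGPU_CHUNK_ID_TRAP:
		/* Only accept the trap chunk when this context holds a
		 * reserved VMID, so an MMIO write of TBA/TMA can never
		 * leak into another process sharing the VMID pool.
		 */
		if (!fpriv->vm.reserved_vmid[AMDGPU_GFXHUB_0]) {
			ret = -EINVAL;
			goto free_partial_kdata;
		}
		...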

Alex


>
> Alex
>
>
> > +}
> > +
> >  static const u32 vgpr_init_compute_shader[] =
> >  {
> >         0xb07c0000, 0xbe8000ff,
> > @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> >         .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> >         .soft_recovery = gfx_v9_0_ring_soft_recovery,
> >         .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> > +       .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
> >  };
> >
> >  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> > @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> >         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> >         .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> >         .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> > +       .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
> >  };
> >
> >  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> > diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> > index 3218576e109d..7eae264adb5d 100644
> > --- a/include/uapi/drm/amdgpu_drm.h
> > +++ b/include/uapi/drm/amdgpu_drm.h
> > @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
> >  #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07
> >  #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
> >  #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
> > +#define AMDGPU_CHUNK_ID_TRAP            0x0a
> >
> >  struct drm_amdgpu_cs_chunk {
> >         __u32           chunk_id;
> > @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
> >         __u64 point;
> >  };
> >
> > +struct drm_amdgpu_cs_chunk_trap {
> > +       /** Trap Base Address */
> > +       __u64 tba_addr;
> > +       /** Trap Memory Address */
> > +       __u64 tma_addr;
> > +};
> > +
> >  #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ     0
> >  #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD  1
> >  #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD        2
> > --
> > 2.28.0
> >
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH] drm/amdgpu: add support for user trap handlers
  2020-08-24 18:17 ` Marek Olšák
@ 2020-08-25  7:04   ` Samuel Pitoiset
  0 siblings, 0 replies; 16+ messages in thread
From: Samuel Pitoiset @ 2020-08-25  7:04 UTC (permalink / raw)
  To: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 20820 bytes --]


On 8/24/20 8:17 PM, Marek Olšák wrote:
> SET_SH_REG won't work with CP register shadowing. You need to use 
> WRITE_DATA or WREG32.
You are right, will fix.
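For reference, a minimal sketch of what the fix could look like for the
compute-ring case, assuming the ring's existing emit_wreg callback (which
is a WRITE_DATA wrapper on GFX8) and the mmCOMPUTE_TBA/TMA register names
from the GFX8 headers:

	/* Program TBA/TMA through WRITE_DATA instead of SET_SH_REG so the
	 * writes remain valid with CP register shadowing enabled.
	 */
	amdgpu_ring_emit_wreg(ring, mmCOMPUTE_TBA_LO, lower_32_bits(tba_addr >> 8));
	amdgpu_ring_emit_wreg(ring, mmCOMPUTE_TBA_HI, upper_32_bits(tba_addr >> 8));
	amdgpu_ring_emit_wreg(ring, mmCOMPUTE_TMA_LO, lower_32_bits(tma_addr >> 8));
	amdgpu_ring_emit_wreg(ring, mmCOMPUTE_TMA_HI, upper_32_bits(tma_addr >> 8));

The gfx ring would need the same treatment for each SPI_SHADER_TBA/TMA
bank, at the cost of a larger emit_frame_size.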
>
> Marek
>
> On Mon, Aug 24, 2020 at 7:57 AM Samuel Pitoiset 
> <samuel.pitoiset@gmail.com> wrote:
>
>     A trap handler can be used by userspace to catch shader exceptions
>     like divide by zero, memory violations etc.
>
>     On GFX6-GFX8, the registers used to configure TBA/TMA aren't
>     privileged while on GFX9+ they are per VMID and privileged,
>     so that only the KMD can configure them.
>
>     This introduces a new CS chunk that can be used to set the
>     TBA/TMA virtual address at submit time.
>
>     TODO:
>     - add GFX 6,7 and 10 support
>     - rebase on top of amd-staging-drm-next (this branch currently
>     hangs my GPU at boot)
>
>     Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
>     ---
>      drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 +++++++++++++++++
>      drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 +-
>      drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
>      drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
>      drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 15 ++++++++-
>      drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 42
>     ++++++++++++++++++++++--
>      drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++
>      include/uapi/drm/amdgpu_drm.h            |  8 +++++
>      9 files changed, 126 insertions(+), 4 deletions(-)
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>     index a512ccbc4dea..6ca5c4912e3a 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>     @@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct
>     amdgpu_cs_parser *p,
>             return r;
>      }
>
>     +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
>     +                                    struct
>     drm_amdgpu_cs_chunk_trap *data,
>     +                                    uint64_t *tba_addr, uint64_t
>     *tma_addr)
>     +{
>     +       if (!data->tba_addr || !data->tma_addr)
>     +               return -EINVAL;
>     +
>     +       *tba_addr = data->tba_addr;
>     +       *tma_addr = data->tma_addr;
>     +
>     +       return 0;
>     +}
>     +
>      static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
>     union drm_amdgpu_cs *cs)
>      {
>             struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>     @@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct
>     amdgpu_cs_parser *p, union drm_amdgpu_cs
>             uint64_t *chunk_array;
>             unsigned size, num_ibs = 0;
>             uint32_t uf_offset = 0;
>     +       uint64_t tba_addr = 0, tma_addr = 0;
>             int i;
>             int ret;
>
>     @@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct
>     amdgpu_cs_parser *p, union drm_amdgpu_cs
>
>                             break;
>
>     +               case AMDGPU_CHUNK_ID_TRAP:
>     +                       size = sizeof(struct
>     drm_amdgpu_cs_chunk_trap);
>     +                       if (p->chunks[i].length_dw *
>     sizeof(uint32_t) < size) {
>     +                               ret = -EINVAL;
>     +                               goto free_partial_kdata;
>     +                       }
>     +
>     +                       ret = amdgpu_cs_user_trap_chunk(p,
>     p->chunks[i].kdata,
>     +                                                       &tba_addr, &tma_addr);
>     +                       if (ret)
>     +                               goto free_partial_kdata;
>     +                       break;
>     +
>                     case AMDGPU_CHUNK_ID_DEPENDENCIES:
>                     case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
>                     case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
>     @@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct
>     amdgpu_cs_parser *p, union drm_amdgpu_cs
>
>             if (p->uf_entry.tv.bo)
>                     p->job->uf_addr = uf_offset;
>     +
>     +       p->job->tba_addr = tba_addr;
>     +       p->job->tma_addr = tma_addr;
>     +
>             kfree(chunk_array);
>
>             /* Use this opportunity to fill in task info for the vm */
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>     index 26127c7d2f32..1e703119e4c2 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>     @@ -88,9 +88,10 @@
>       * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for
>     correctness
>       * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
>       * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
>     + * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
>       */
>      #define KMS_DRIVER_MAJOR       3
>     -#define KMS_DRIVER_MINOR       39
>     +#define KMS_DRIVER_MINOR       40
>      #define KMS_DRIVER_PATCHLEVEL  0
>
>      int amdgpu_vram_limit = 0;
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>     index 8e58325bbca2..fd0d56724b4d 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>     @@ -58,6 +58,10 @@ struct amdgpu_vmid {
>             uint32_t                oa_base;
>             uint32_t                oa_size;
>
>     +       /* user trap */
>     +       uint64_t                tba_addr;
>     +       uint64_t                tma_addr;
>     +
>             unsigned                pasid;
>             struct dma_fence        *pasid_mapping;
>      };
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>     index 81caac9b958a..b8ed5b13ea44 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>     @@ -62,6 +62,10 @@ struct amdgpu_job {
>             /* user fence handling */
>             uint64_t                uf_addr;
>             uint64_t                uf_sequence;
>     +
>     +       /* user trap handling */
>     +       uint64_t                tba_addr;
>     +       uint64_t                tma_addr;
>      };
>
>      int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>     index da871d84b742..1f165a6295d9 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>     @@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
>             void (*soft_recovery)(struct amdgpu_ring *ring, unsigned
>     vmid);
>             int (*preempt_ib)(struct amdgpu_ring *ring);
>             void (*emit_mem_sync)(struct amdgpu_ring *ring);
>     +       void (*emit_trap_handler)(struct amdgpu_ring *ring,
>     +                                 uint32_t vmid,
>     +                                 uint64_t tba_addr, uint64_t
>     tma_addr);
>      };
>
>      struct amdgpu_ring {
>     @@ -265,6 +268,7 @@ struct amdgpu_ring {
>      #define amdgpu_ring_emit_vm_flush(r, vmid, addr)
>     (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>      #define amdgpu_ring_emit_fence(r, addr, seq, flags)
>     (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>      #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as)
>     (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws),
>     (ab), (as))
>     +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma)
>     (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
>      #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>      #define amdgpu_ring_emit_switch_buffer(r)
>     (r)->funcs->emit_switch_buffer((r))
>      #define amdgpu_ring_emit_cntxcntl(r, d)
>     (r)->funcs->emit_cntxcntl((r), (d))
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>     index 71e005cf2952..24916082de0b 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>     @@ -1076,6 +1076,9 @@ int amdgpu_vm_flush(struct amdgpu_ring
>     *ring, struct amdgpu_job *job,
>                     id->gws_size != job->gws_size ||
>                     id->oa_base != job->oa_base ||
>                     id->oa_size != job->oa_size);
>     +       bool trap_handler_needed = ring->funcs->emit_trap_handler && (
>     +               id->tba_addr != job->tba_addr ||
>     +               id->tma_addr != job->tma_addr);
>             bool vm_flush_needed = job->vm_needs_flush;
>             struct dma_fence *fence = NULL;
>             bool pasid_mapping_needed = false;
>     @@ -1088,6 +1091,7 @@ int amdgpu_vm_flush(struct amdgpu_ring
>     *ring, struct amdgpu_job *job,
>
>             if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>                     gds_switch_needed = true;
>     +               trap_handler_needed = true;
>                     vm_flush_needed = true;
>                     pasid_mapping_needed = true;
>             }
>     @@ -1099,12 +1103,14 @@ int amdgpu_vm_flush(struct amdgpu_ring
>     *ring, struct amdgpu_job *job,
>             mutex_unlock(&id_mgr->lock);
>
>             gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>     +       trap_handler_needed &= !!ring->funcs->emit_trap_handler;
>             vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
>                             job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>             pasid_mapping_needed &=
>     adev->gmc.gmc_funcs->emit_pasid_mapping &&
>                     ring->funcs->emit_wreg;
>
>     -       if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>     +       if (!vm_flush_needed && !gds_switch_needed &&
>     +           !trap_handler_needed && !need_pipe_sync)
>                     return 0;
>
>             if (ring->funcs->init_cond_exec)
>     @@ -1158,6 +1164,13 @@ int amdgpu_vm_flush(struct amdgpu_ring
>     *ring, struct amdgpu_job *job,
>                                                 job->oa_size);
>             }
>
>     +       if (ring->funcs->emit_trap_handler && trap_handler_needed) {
>     +               id->tba_addr = job->tba_addr;
>     +               id->tma_addr = job->tma_addr;
>     +               amdgpu_ring_emit_trap_handler(ring, job->vmid,
>     job->tba_addr,
>     +                                             job->tma_addr);
>     +       }
>     +
>             if (ring->funcs->patch_cond_exec)
>                     amdgpu_ring_patch_cond_exec(ring, patch_offset);
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     index 33f1c4a46ebe..59db577e8c8f 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>     @@ -5222,6 +5222,40 @@ static void
>     gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>             amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 <<
>     oa_base));
>      }
>
>     +static void gfx_v8_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
>     +                                           uint32_t vmid,
>     +                                           uint64_t tba_addr,
>     +                                           uint64_t tma_addr)
>     +{
>     +       if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
>     +               static const u32 regs[] = {
>     +                       mmSPI_SHADER_TBA_LO_PS,
>     +                       mmSPI_SHADER_TBA_LO_VS,
>     +                       mmSPI_SHADER_TBA_LO_GS,
>     +                       mmSPI_SHADER_TBA_LO_ES,
>     +                       mmSPI_SHADER_TBA_LO_HS,
>     +                       mmSPI_SHADER_TBA_LO_LS,
>     +               };
>     +               int i;
>     +
>     +               for (i = 0; i < ARRAY_SIZE(regs); i++) {
>     +                       amdgpu_ring_write(ring,
>     PACKET3(PACKET3_SET_SH_REG, 4));
>     +                       amdgpu_ring_write(ring, regs[i] -
>     PACKET3_SET_SH_REG_START);
>     +                       amdgpu_ring_write(ring,
>     lower_32_bits(tba_addr >> 8));
>     +                       amdgpu_ring_write(ring,
>     upper_32_bits(tba_addr >> 8));
>     +                       amdgpu_ring_write(ring,
>     lower_32_bits(tma_addr >> 8));
>     +                       amdgpu_ring_write(ring,
>     upper_32_bits(tma_addr >> 8));
>     +               }
>     +       } else {
>     +               amdgpu_ring_write(ring,
>     PACKET3(PACKET3_SET_SH_REG, 4));
>     +               amdgpu_ring_write(ring, mmCOMPUTE_TBA_LO -
>     PACKET3_SET_SH_REG_START);
>     +               amdgpu_ring_write(ring, lower_32_bits(tba_addr >> 8));
>     +               amdgpu_ring_write(ring, upper_32_bits(tba_addr >> 8));
>     +               amdgpu_ring_write(ring, lower_32_bits(tma_addr >> 8));
>     +               amdgpu_ring_write(ring, upper_32_bits(tma_addr >> 8));
>     +       }
>     +}
>     +
>      static uint32_t wave_read_ind(struct amdgpu_device *adev,
>     uint32_t simd, uint32_t wave, uint32_t address)
>      {
>             WREG32(mmSQ_IND_INDEX,
>     @@ -6890,7 +6924,8 @@ static const struct amdgpu_ring_funcs
>     gfx_v8_0_ring_funcs_gfx = {
>                     5 + /* HDP_INVL */
>                     12 + 12 + /* FENCE x2 */
>                     2 + /* SWITCH_BUFFER */
>     -               5, /* SURFACE_SYNC */
>     +               5 + /* SURFACE_SYNC */
>     +               36, /* gfx_v8_0_ring_emit_trap_handler */
>             .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_gfx */
>             .emit_ib = gfx_v8_0_ring_emit_ib_gfx,
>             .emit_fence = gfx_v8_0_ring_emit_fence_gfx,
>     @@ -6909,6 +6944,7 @@ static const struct amdgpu_ring_funcs
>     gfx_v8_0_ring_funcs_gfx = {
>             .emit_wreg = gfx_v8_0_ring_emit_wreg,
>             .soft_recovery = gfx_v8_0_ring_soft_recovery,
>             .emit_mem_sync = gfx_v8_0_emit_mem_sync,
>     +       .emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
>      };
>
>      static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
>     @@ -6926,7 +6962,8 @@ static const struct amdgpu_ring_funcs
>     gfx_v8_0_ring_funcs_compute = {
>                     7 + /* gfx_v8_0_ring_emit_pipeline_sync */
>                     VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /*
>     gfx_v8_0_ring_emit_vm_flush */
>                     7 + 7 + 7 + /* gfx_v8_0_ring_emit_fence_compute x3
>     for user fence, vm fence */
>     -               7, /* gfx_v8_0_emit_mem_sync_compute */
>     +               7 + /* gfx_v8_0_emit_mem_sync_compute */
>     +               6, /* gfx_v8_0_emit_trap_handler */
>             .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
>             .emit_ib = gfx_v8_0_ring_emit_ib_compute,
>             .emit_fence = gfx_v8_0_ring_emit_fence_compute,
>     @@ -6940,6 +6977,7 @@ static const struct amdgpu_ring_funcs
>     gfx_v8_0_ring_funcs_compute = {
>             .pad_ib = amdgpu_ring_generic_pad_ib,
>             .emit_wreg = gfx_v8_0_ring_emit_wreg,
>             .emit_mem_sync = gfx_v8_0_emit_mem_sync_compute,
>     +       .emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
>      };
>
>      static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
>     diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>     b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>     index cb9d60a4e05e..4fc00f196085 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>     @@ -4162,6 +4162,23 @@ static void
>     gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>                                        (1 << (oa_size + oa_base)) - (1
>     << oa_base));
>      }
>
>     +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
>     +                                           uint32_t vmid,
>     +                                           uint64_t tba_addr,
>     +                                           uint64_t tma_addr)
>     +{
>     +       struct amdgpu_device *adev = ring->adev;
>     +
>     +       mutex_lock(&adev->srbm_mutex);
>     +       soc15_grbm_select(adev, 0, 0, 0, vmid);
>     +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO,
>     lower_32_bits(tba_addr >> 8));
>     +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI,
>     upper_32_bits(tba_addr >> 8));
>     +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO,
>     lower_32_bits(tma_addr >> 8));
>     +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI,
>     upper_32_bits(tma_addr >> 8));
>     +       soc15_grbm_select(adev, 0, 0, 0, 0);
>     +       mutex_unlock(&adev->srbm_mutex);
>     +}
>     +
>      static const u32 vgpr_init_compute_shader[] =
>      {
>             0xb07c0000, 0xbe8000ff,
>     @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs
>     gfx_v9_0_ring_funcs_gfx = {
>             .emit_reg_write_reg_wait =
>     gfx_v9_0_ring_emit_reg_write_reg_wait,
>             .soft_recovery = gfx_v9_0_ring_soft_recovery,
>             .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>     +       .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>      };
>
>      static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>     @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs
>     gfx_v9_0_ring_funcs_compute = {
>             .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>             .emit_reg_write_reg_wait =
>     gfx_v9_0_ring_emit_reg_write_reg_wait,
>             .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>     +       .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>      };
>
>      static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>     diff --git a/include/uapi/drm/amdgpu_drm.h
>     b/include/uapi/drm/amdgpu_drm.h
>     index 3218576e109d..7eae264adb5d 100644
>     --- a/include/uapi/drm/amdgpu_drm.h
>     +++ b/include/uapi/drm/amdgpu_drm.h
>     @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
>      #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07
>      #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
>      #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
>     +#define AMDGPU_CHUNK_ID_TRAP            0x0a
>
>      struct drm_amdgpu_cs_chunk {
>             __u32           chunk_id;
>     @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
>             __u64 point;
>      };
>
>     +struct drm_amdgpu_cs_chunk_trap {
>     +       /** Trap Base Address */
>     +       __u64 tba_addr;
>     +       /** Trap Memory Address */
>     +       __u64 tma_addr;
>     +};
>     +
>      #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ     0
>      #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD  1
>      #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD        2
>     -- 
>     2.28.0
>
>     _______________________________________________
>     amd-gfx mailing list
>     amd-gfx@lists.freedesktop.org
>     https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[-- Attachment #1.2: Type: text/html, Size: 26726 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH] drm/amdgpu: add support for user trap handlers
  2020-08-24 21:32   ` Alex Deucher
@ 2020-08-25  7:06     ` Samuel Pitoiset
  2020-08-25 13:13       ` Alex Deucher
  0 siblings, 1 reply; 16+ messages in thread
From: Samuel Pitoiset @ 2020-08-25  7:06 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Deucher, Alexander, amd-gfx list


On 8/24/20 11:32 PM, Alex Deucher wrote:
> On Mon, Aug 24, 2020 at 2:33 PM Alex Deucher <alexdeucher@gmail.com> wrote:
>> On Mon, Aug 24, 2020 at 7:57 AM Samuel Pitoiset
>> <samuel.pitoiset@gmail.com> wrote:
>>> A trap handler can be used by userspace to catch shader exceptions
>>> like divide by zero, memory violations etc.
>>>
>>> On GFX6-GFX8, the registers used to configure TBA/TMA aren't
>>> privileged while on GFX9+ they are per VMID and privileged,
>>> so that only the KMD can configure them.
>>>
>>> This introduces a new CS chunk that can be used to set the
>>> TBA/TMA virtual address at submit time.
>>>
>>> TODO:
>>> - add GFX 6,7 and 10 support
>>> - rebase on top of amd-staging-drm-next (this branch currently
>>> hangs my GPU at boot)
>>>
>>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 +++++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 +-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 15 ++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 42 ++++++++++++++++++++++--
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++
>>>   include/uapi/drm/amdgpu_drm.h            |  8 +++++
>>>   9 files changed, 126 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index a512ccbc4dea..6ca5c4912e3a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p,
>>>          return r;
>>>   }
>>>
>>> +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
>>> +                                    struct drm_amdgpu_cs_chunk_trap *data,
>>> +                                    uint64_t *tba_addr, uint64_t *tma_addr)
>>> +{
>>> +       if (!data->tba_addr || !data->tma_addr)
>>> +               return -EINVAL;
>>> +
>>> +       *tba_addr = data->tba_addr;
>>> +       *tma_addr = data->tma_addr;
>>> +
>>> +       return 0;
>>> +}
>>> +
>>>   static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs)
>>>   {
>>>          struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>>> @@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>          uint64_t *chunk_array;
>>>          unsigned size, num_ibs = 0;
>>>          uint32_t uf_offset = 0;
>>> +       uint64_t tba_addr = 0, tma_addr = 0;
>>>          int i;
>>>          int ret;
>>>
>>> @@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>
>>>                          break;
>>>
>>> +               case AMDGPU_CHUNK_ID_TRAP:
>>> +                       size = sizeof(struct drm_amdgpu_cs_chunk_trap);
>>> +                       if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
>>> +                               ret = -EINVAL;
>>> +                               goto free_partial_kdata;
>>> +                       }
>>> +
>>> +                       ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
>>> +                                                       &tba_addr, &tma_addr);
>>> +                       if (ret)
>>> +                               goto free_partial_kdata;
>>> +                       break;
>>> +
>>>                  case AMDGPU_CHUNK_ID_DEPENDENCIES:
>>>                  case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
>>>                  case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
>>> @@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>
>>>          if (p->uf_entry.tv.bo)
>>>                  p->job->uf_addr = uf_offset;
>>> +
>>> +       p->job->tba_addr = tba_addr;
>>> +       p->job->tma_addr = tma_addr;
>>> +
>>>          kfree(chunk_array);
>>>
>>>          /* Use this opportunity to fill in task info for the vm */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 26127c7d2f32..1e703119e4c2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -88,9 +88,10 @@
>>>    * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for correctness
>>>    * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
>>>    * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
>>> + * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
>>>    */
>>>   #define KMS_DRIVER_MAJOR       3
>>> -#define KMS_DRIVER_MINOR       39
>>> +#define KMS_DRIVER_MINOR       40
>>>   #define KMS_DRIVER_PATCHLEVEL  0
>>>
>>>   int amdgpu_vram_limit = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> index 8e58325bbca2..fd0d56724b4d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> @@ -58,6 +58,10 @@ struct amdgpu_vmid {
>>>          uint32_t                oa_base;
>>>          uint32_t                oa_size;
>>>
>>> +       /* user trap */
>>> +       uint64_t                tba_addr;
>>> +       uint64_t                tma_addr;
>>> +
>>>          unsigned                pasid;
>>>          struct dma_fence        *pasid_mapping;
>>>   };
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index 81caac9b958a..b8ed5b13ea44 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -62,6 +62,10 @@ struct amdgpu_job {
>>>          /* user fence handling */
>>>          uint64_t                uf_addr;
>>>          uint64_t                uf_sequence;
>>> +
>>> +       /* user trap handling */
>>> +       uint64_t                tba_addr;
>>> +       uint64_t                tma_addr;
>>>   };
>>>
>>>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index da871d84b742..1f165a6295d9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
>>>          void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>>          int (*preempt_ib)(struct amdgpu_ring *ring);
>>>          void (*emit_mem_sync)(struct amdgpu_ring *ring);
>>> +       void (*emit_trap_handler)(struct amdgpu_ring *ring,
>>> +                                 uint32_t vmid,
>>> +                                 uint64_t tba_addr, uint64_t tma_addr);
>>>   };
>>>
>>>   struct amdgpu_ring {
>>> @@ -265,6 +268,7 @@ struct amdgpu_ring {
>>>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>>>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>>>   #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
>>> +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
>>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index 71e005cf2952..24916082de0b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -1076,6 +1076,9 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>                  id->gws_size != job->gws_size ||
>>>                  id->oa_base != job->oa_base ||
>>>                  id->oa_size != job->oa_size);
>>> +       bool trap_handler_needed = ring->funcs->emit_trap_handler && (
>>> +               id->tba_addr != job->tba_addr ||
>>> +               id->tma_addr != job->tma_addr);
>>>          bool vm_flush_needed = job->vm_needs_flush;
>>>          struct dma_fence *fence = NULL;
>>>          bool pasid_mapping_needed = false;
>>> @@ -1088,6 +1091,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>
>>>          if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>>>                  gds_switch_needed = true;
>>> +               trap_handler_needed = true;
>>>                  vm_flush_needed = true;
>>>                  pasid_mapping_needed = true;
>>>          }
>>> @@ -1099,12 +1103,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>          mutex_unlock(&id_mgr->lock);
>>>
>>>          gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>>> +       trap_handler_needed &= !!ring->funcs->emit_trap_handler;
>>>          vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
>>>                          job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>>>          pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
>>>                  ring->funcs->emit_wreg;
>>>
>>> -       if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>> +       if (!vm_flush_needed && !gds_switch_needed &&
>>> +           !trap_handler_needed && !need_pipe_sync)
>>>                  return 0;
>>>
>>>          if (ring->funcs->init_cond_exec)
>>> @@ -1158,6 +1164,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>                                              job->oa_size);
>>>          }
>>>
>>> +       if (ring->funcs->emit_trap_handler && trap_handler_needed) {
>>> +               id->tba_addr = job->tba_addr;
>>> +               id->tma_addr = job->tma_addr;
>>> +               amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
>>> +                                             job->tma_addr);
>>> +       }
>>> +
>>>          if (ring->funcs->patch_cond_exec)
>>>                  amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 33f1c4a46ebe..59db577e8c8f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -5222,6 +5222,40 @@ static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>          amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>   }
>>>
>>> +static void gfx_v8_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
>>> +                                           uint32_t vmid,
>>> +                                           uint64_t tba_addr,
>>> +                                           uint64_t tma_addr)
>>> +{
>>> +       if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
>>> +               static const u32 regs[] = {
>>> +                       mmSPI_SHADER_TBA_LO_PS,
>>> +                       mmSPI_SHADER_TBA_LO_VS,
>>> +                       mmSPI_SHADER_TBA_LO_GS,
>>> +                       mmSPI_SHADER_TBA_LO_ES,
>>> +                       mmSPI_SHADER_TBA_LO_HS,
>>> +                       mmSPI_SHADER_TBA_LO_LS,
>>> +               };
>>> +               int i;
>>> +
>>> +               for (i = 0; i < ARRAY_SIZE(regs); i++) {
>>> +                       amdgpu_ring_write(ring, PACKET3(PACKET3_SET_SH_REG, 4));
>>> +                       amdgpu_ring_write(ring, regs[i] - PACKET3_SET_SH_REG_START);
>>> +                       amdgpu_ring_write(ring, lower_32_bits(tba_addr >> 8));
>>> +                       amdgpu_ring_write(ring, upper_32_bits(tba_addr >> 8));
>>> +                       amdgpu_ring_write(ring, lower_32_bits(tma_addr >> 8));
>>> +                       amdgpu_ring_write(ring, upper_32_bits(tma_addr >> 8));
>>> +               }
>>> +       } else {
>>> +               amdgpu_ring_write(ring, PACKET3(PACKET3_SET_SH_REG, 4));
>>> +               amdgpu_ring_write(ring, mmCOMPUTE_TBA_LO - PACKET3_SET_SH_REG_START);
>>> +               amdgpu_ring_write(ring, lower_32_bits(tba_addr >> 8));
>>> +               amdgpu_ring_write(ring, upper_32_bits(tba_addr >> 8));
>>> +               amdgpu_ring_write(ring, lower_32_bits(tma_addr >> 8));
>>> +               amdgpu_ring_write(ring, upper_32_bits(tma_addr >> 8));
>>> +       }
>>> +}
>>> +
>>>   static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
>>>   {
>>>          WREG32(mmSQ_IND_INDEX,
>>> @@ -6890,7 +6924,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>                  5 + /* HDP_INVL */
>>>                  12 + 12 + /* FENCE x2 */
>>>                  2 + /* SWITCH_BUFFER */
>>> -               5, /* SURFACE_SYNC */
>>> +               5 + /* SURFACE_SYNC */
>>> +               36, /* gfx_v8_0_ring_emit_trap_handler */
>>>          .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_gfx */
>>>          .emit_ib = gfx_v8_0_ring_emit_ib_gfx,
>>>          .emit_fence = gfx_v8_0_ring_emit_fence_gfx,
>>> @@ -6909,6 +6944,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>          .emit_wreg = gfx_v8_0_ring_emit_wreg,
>>>          .soft_recovery = gfx_v8_0_ring_soft_recovery,
>>>          .emit_mem_sync = gfx_v8_0_emit_mem_sync,
>>> +       .emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
>>> @@ -6926,7 +6962,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
>>>                  7 + /* gfx_v8_0_ring_emit_pipeline_sync */
>>>                  VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
>>>                  7 + 7 + 7 + /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
>>> -               7, /* gfx_v8_0_emit_mem_sync_compute */
>>> +               7 + /* gfx_v8_0_emit_mem_sync_compute */
>>> +               6, /* gfx_v8_0_emit_trap_handler */
>>>          .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
>>>          .emit_ib = gfx_v8_0_ring_emit_ib_compute,
>>>          .emit_fence = gfx_v8_0_ring_emit_fence_compute,
>>> @@ -6940,6 +6977,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
>>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>>          .emit_wreg = gfx_v8_0_ring_emit_wreg,
>>>          .emit_mem_sync = gfx_v8_0_emit_mem_sync_compute,
>>> +       .emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index cb9d60a4e05e..4fc00f196085 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4162,6 +4162,23 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>                                     (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>   }
>>>
>>> +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
>>> +                                           uint32_t vmid,
>>> +                                           uint64_t tba_addr,
>>> +                                           uint64_t tma_addr)
>>> +{
>>> +       struct amdgpu_device *adev = ring->adev;
>>> +
>>> +       mutex_lock(&adev->srbm_mutex);
>>> +       soc15_grbm_select(adev, 0, 0, 0, vmid);
>>> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
>>> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8));
>>> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
>>> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
>>> +       soc15_grbm_select(adev, 0, 0, 0, 0);
>>> +       mutex_unlock(&adev->srbm_mutex);
>> This won't work.  This updates registers via MMIO using the CPU.  We
>> need to have the registers updated asynchronously via the CP so they
>> get updated when the specific jobs are executed by the engine.  VMIDs
>> are shared resources and are assigned dynamically by the kernel
>> driver.  If you update via MMIO the changes take effect immediately
>> rather than when the actual work is scheduled on the engine.
>> Unfortunately, at the moment, I don't see a way to do this with the CP
>> with the packets that are currently available.
> One option might be to do this via MMIO, but only support it when
> using a reserved vmid.

Hmm, yes, that's completely broken actually. Thanks for the explanation, 
that makes total sense.

So there is currently no way to configure these registers via the CP. Do
you have any plans to make that possible?

I will have a look at the reserved VMID approach.
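
For context, the reserved-VMID path already has uapi: userspace can ask
for a dedicated VMID through DRM_IOCTL_AMDGPU_VM. A rough sketch of the
UMD side, assuming the existing libdrm_amdgpu wrappers
amdgpu_vm_reserve_vmid()/amdgpu_vm_unreserve_vmid():

	#include <amdgpu.h>
	#include <amdgpu_drm.h>

	/* Reserve a dedicated VMID before relying on MMIO-programmed
	 * TBA/TMA, and release it again once the trap handler is no
	 * longer needed.
	 */
	static int submit_with_trap_handler(amdgpu_device_handle dev)
	{
		int r = amdgpu_vm_reserve_vmid(dev, 0);
		if (r)
			return r;	/* no reserved VMID: skip trap handler */

		/* ... build and submit the CS with an AMDGPU_CHUNK_ID_TRAP chunk ... */

		return amdgpu_vm_unreserve_vmid(dev, 0);
	}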

>
> Alex
>
>
>> Alex
>>
>>
>>> +}
>>> +
>>>   static const u32 vgpr_init_compute_shader[] =
>>>   {
>>>          0xb07c0000, 0xbe8000ff,
>>> @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>          .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>          .soft_recovery = gfx_v9_0_ring_soft_recovery,
>>>          .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>> +       .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>> @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>          .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>>          .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>          .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>> +       .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>>> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
>>> index 3218576e109d..7eae264adb5d 100644
>>> --- a/include/uapi/drm/amdgpu_drm.h
>>> +++ b/include/uapi/drm/amdgpu_drm.h
>>> @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
>>>   #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07
>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
>>> +#define AMDGPU_CHUNK_ID_TRAP            0x0a
>>>
>>>   struct drm_amdgpu_cs_chunk {
>>>          __u32           chunk_id;
>>> @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
>>>          __u64 point;
>>>   };
>>>
>>> +struct drm_amdgpu_cs_chunk_trap {
>>> +       /** Trap Base Address */
>>> +       __u64 tba_addr;
>>> +       /** Trap Memory Address */
>>> +       __u64 tma_addr;
>>> +};
>>> +
>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ     0
>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD  1
>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD        2
>>> --
>>> 2.28.0
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH] drm/amdgpu: add support for user trap handlers
  2020-08-25  7:06     ` Samuel Pitoiset
@ 2020-08-25 13:13       ` Alex Deucher
  0 siblings, 0 replies; 16+ messages in thread
From: Alex Deucher @ 2020-08-25 13:13 UTC (permalink / raw)
  To: Samuel Pitoiset; +Cc: Deucher, Alexander, amd-gfx list

On Tue, Aug 25, 2020 at 3:06 AM Samuel Pitoiset
<samuel.pitoiset@gmail.com> wrote:
>
>
> On 8/24/20 11:32 PM, Alex Deucher wrote:
> > On Mon, Aug 24, 2020 at 2:33 PM Alex Deucher <alexdeucher@gmail.com> wrote:
> >> On Mon, Aug 24, 2020 at 7:57 AM Samuel Pitoiset
> >> <samuel.pitoiset@gmail.com> wrote:
> >>> A trap handler can be used by userspace to catch shader exceptions
> >>> like divide by zero, memory violations etc.
> >>>
> >>> On GFX6-GFX8, the registers used to configure TBA/TMA aren't
> >>> privileged while on GFX9+ they are per VMID and privileged,
> >>> so that only the KMD can configure them.
> >>>
> >>> This introduces a new CS chunk that can be used to set the
> >>> TBA/TMA virtual address at submit time.
> >>>
> >>> TODO:
> >>> - add GFX 6,7 and 10 support
> >>> - rebase on top of amd-staging-drm-next (this branch currently
> >>> hangs my GPU at boot)
> >>>
> >>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
> >>> ---
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 +++++++++++++++++
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 +-
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 15 ++++++++-
> >>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 42 ++++++++++++++++++++++--
> >>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++
> >>>   include/uapi/drm/amdgpu_drm.h            |  8 +++++
> >>>   9 files changed, 126 insertions(+), 4 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> >>> index a512ccbc4dea..6ca5c4912e3a 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> >>> @@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p,
> >>>          return r;
> >>>   }
> >>>
> >>> +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
> >>> +                                    struct drm_amdgpu_cs_chunk_trap *data,
> >>> +                                    uint64_t *tba_addr, uint64_t *tma_addr)
> >>> +{
> >>> +       if (!data->tba_addr || !data->tma_addr)
> >>> +               return -EINVAL;
> >>> +
> >>> +       *tba_addr = data->tba_addr;
> >>> +       *tma_addr = data->tma_addr;
> >>> +
> >>> +       return 0;
> >>> +}
> >>> +
> >>>   static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs)
> >>>   {
> >>>          struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> >>> @@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
> >>>          uint64_t *chunk_array;
> >>>          unsigned size, num_ibs = 0;
> >>>          uint32_t uf_offset = 0;
> >>> +       uint64_t tba_addr = 0, tma_addr = 0;
> >>>          int i;
> >>>          int ret;
> >>>
> >>> @@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
> >>>
> >>>                          break;
> >>>
> >>> +               case AMDGPU_CHUNK_ID_TRAP:
> >>> +                       size = sizeof(struct drm_amdgpu_cs_chunk_trap);
> >>> +                       if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
> >>> +                               ret = -EINVAL;
> >>> +                               goto free_partial_kdata;
> >>> +                       }
> >>> +
> >>> +                       ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
> >>> +                                                       &tba_addr, &tma_addr);
> >>> +                       if (ret)
> >>> +                               goto free_partial_kdata;
> >>> +                       break;
> >>> +
> >>>                  case AMDGPU_CHUNK_ID_DEPENDENCIES:
> >>>                  case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
> >>>                  case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
> >>> @@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
> >>>
> >>>          if (p->uf_entry.tv.bo)
> >>>                  p->job->uf_addr = uf_offset;
> >>> +
> >>> +       p->job->tba_addr = tba_addr;
> >>> +       p->job->tma_addr = tma_addr;
> >>> +
> >>>          kfree(chunk_array);
> >>>
> >>>          /* Use this opportunity to fill in task info for the vm */
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>> index 26127c7d2f32..1e703119e4c2 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>> @@ -88,9 +88,10 @@
> >>>    * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for correctness
> >>>    * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
> >>>    * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
> >>> + * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
> >>>    */
> >>>   #define KMS_DRIVER_MAJOR       3
> >>> -#define KMS_DRIVER_MINOR       39
> >>> +#define KMS_DRIVER_MINOR       40
> >>>   #define KMS_DRIVER_PATCHLEVEL  0
> >>>
> >>>   int amdgpu_vram_limit = 0;
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> >>> index 8e58325bbca2..fd0d56724b4d 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> >>> @@ -58,6 +58,10 @@ struct amdgpu_vmid {
> >>>          uint32_t                oa_base;
> >>>          uint32_t                oa_size;
> >>>
> >>> +       /* user trap */
> >>> +       uint64_t                tba_addr;
> >>> +       uint64_t                tma_addr;
> >>> +
> >>>          unsigned                pasid;
> >>>          struct dma_fence        *pasid_mapping;
> >>>   };
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> index 81caac9b958a..b8ed5b13ea44 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >>> @@ -62,6 +62,10 @@ struct amdgpu_job {
> >>>          /* user fence handling */
> >>>          uint64_t                uf_addr;
> >>>          uint64_t                uf_sequence;
> >>> +
> >>> +       /* user trap handling */
> >>> +       uint64_t                tba_addr;
> >>> +       uint64_t                tma_addr;
> >>>   };
> >>>
> >>>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> index da871d84b742..1f165a6295d9 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> @@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
> >>>          void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
> >>>          int (*preempt_ib)(struct amdgpu_ring *ring);
> >>>          void (*emit_mem_sync)(struct amdgpu_ring *ring);
> >>> +       void (*emit_trap_handler)(struct amdgpu_ring *ring,
> >>> +                                 uint32_t vmid,
> >>> +                                 uint64_t tba_addr, uint64_t tma_addr);
> >>>   };
> >>>
> >>>   struct amdgpu_ring {
> >>> @@ -265,6 +268,7 @@ struct amdgpu_ring {
> >>>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr))
> >>>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags))
> >>>   #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
> >>> +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
> >>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
> >>>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
> >>>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >>> index 71e005cf2952..24916082de0b 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >>> @@ -1076,6 +1076,9 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> >>>                  id->gws_size != job->gws_size ||
> >>>                  id->oa_base != job->oa_base ||
> >>>                  id->oa_size != job->oa_size);
> >>> +       bool trap_handler_needed = ring->funcs->emit_trap_handler && (
> >>> +               id->tba_addr != job->tba_addr ||
> >>> +               id->tma_addr != job->tma_addr);
> >>>          bool vm_flush_needed = job->vm_needs_flush;
> >>>          struct dma_fence *fence = NULL;
> >>>          bool pasid_mapping_needed = false;
> >>> @@ -1088,6 +1091,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> >>>
> >>>          if (amdgpu_vmid_had_gpu_reset(adev, id)) {
> >>>                  gds_switch_needed = true;
> >>> +               trap_handler_needed = true;
> >>>                  vm_flush_needed = true;
> >>>                  pasid_mapping_needed = true;
> >>>          }
> >>> @@ -1099,12 +1103,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> >>>          mutex_unlock(&id_mgr->lock);
> >>>
> >>>          gds_switch_needed &= !!ring->funcs->emit_gds_switch;
> >>> +       trap_handler_needed &= !!ring->funcs->emit_trap_handler;
> >>>          vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
> >>>                          job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
> >>>          pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
> >>>                  ring->funcs->emit_wreg;
> >>>
> >>> -       if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
> >>> +       if (!vm_flush_needed && !gds_switch_needed &&
> >>> +           !trap_handler_needed && !need_pipe_sync)
> >>>                  return 0;
> >>>
> >>>          if (ring->funcs->init_cond_exec)
> >>> @@ -1158,6 +1164,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> >>>                                              job->oa_size);
> >>>          }
> >>>
> >>> +       if (ring->funcs->emit_trap_handler && trap_handler_needed) {
> >>> +               id->tba_addr = job->tba_addr;
> >>> +               id->tma_addr = job->tma_addr;
> >>> +               amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
> >>> +                                             job->tma_addr);
> >>> +       }
> >>> +
> >>>          if (ring->funcs->patch_cond_exec)
> >>>                  amdgpu_ring_patch_cond_exec(ring, patch_offset);
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >>> index 33f1c4a46ebe..59db577e8c8f 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >>> @@ -5222,6 +5222,40 @@ static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
> >>>          amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
> >>>   }
> >>>
> >>> +static void gfx_v8_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
> >>> +                                           uint32_t vmid,
> >>> +                                           uint64_t tba_addr,
> >>> +                                           uint64_t tma_addr)
> >>> +{
> >>> +       if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
> >>> +               static const u32 regs[] = {
> >>> +                       mmSPI_SHADER_TBA_LO_PS,
> >>> +                       mmSPI_SHADER_TBA_LO_VS,
> >>> +                       mmSPI_SHADER_TBA_LO_GS,
> >>> +                       mmSPI_SHADER_TBA_LO_ES,
> >>> +                       mmSPI_SHADER_TBA_LO_HS,
> >>> +                       mmSPI_SHADER_TBA_LO_LS,
> >>> +               };
> >>> +               int i;
> >>> +
> >>> +               for (i = 0; i < ARRAY_SIZE(regs); i++) {
> >>> +                       amdgpu_ring_write(ring, PACKET3(PACKET3_SET_SH_REG, 4));
> >>> +                       amdgpu_ring_write(ring, regs[i] - PACKET3_SET_SH_REG_START);
> >>> +                       amdgpu_ring_write(ring, lower_32_bits(tba_addr >> 8));
> >>> +                       amdgpu_ring_write(ring, upper_32_bits(tba_addr >> 8));
> >>> +                       amdgpu_ring_write(ring, lower_32_bits(tma_addr >> 8));
> >>> +                       amdgpu_ring_write(ring, upper_32_bits(tma_addr >> 8));
> >>> +               }
> >>> +       } else {
> >>> +               amdgpu_ring_write(ring, PACKET3(PACKET3_SET_SH_REG, 4));
> >>> +               amdgpu_ring_write(ring, mmCOMPUTE_TBA_LO - PACKET3_SET_SH_REG_START);
> >>> +               amdgpu_ring_write(ring, lower_32_bits(tba_addr >> 8));
> >>> +               amdgpu_ring_write(ring, upper_32_bits(tba_addr >> 8));
> >>> +               amdgpu_ring_write(ring, lower_32_bits(tma_addr >> 8));
> >>> +               amdgpu_ring_write(ring, upper_32_bits(tma_addr >> 8));
> >>> +       }
> >>> +}
> >>> +
> >>>   static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
> >>>   {
> >>>          WREG32(mmSQ_IND_INDEX,
> >>> @@ -6890,7 +6924,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
> >>>                  5 + /* HDP_INVL */
> >>>                  12 + 12 + /* FENCE x2 */
> >>>                  2 + /* SWITCH_BUFFER */
> >>> -               5, /* SURFACE_SYNC */
> >>> +               5 + /* SURFACE_SYNC */
> >>> +               36, /* gfx_v8_0_ring_emit_trap_handler */
> >>>          .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_gfx */
> >>>          .emit_ib = gfx_v8_0_ring_emit_ib_gfx,
> >>>          .emit_fence = gfx_v8_0_ring_emit_fence_gfx,
> >>> @@ -6909,6 +6944,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
> >>>          .emit_wreg = gfx_v8_0_ring_emit_wreg,
> >>>          .soft_recovery = gfx_v8_0_ring_soft_recovery,
> >>>          .emit_mem_sync = gfx_v8_0_emit_mem_sync,
> >>> +       .emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
> >>>   };
> >>>
> >>>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
> >>> @@ -6926,7 +6962,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
> >>>                  7 + /* gfx_v8_0_ring_emit_pipeline_sync */
> >>>                  VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
> >>>                  7 + 7 + 7 + /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
> >>> -               7, /* gfx_v8_0_emit_mem_sync_compute */
> >>> +               7 + /* gfx_v8_0_emit_mem_sync_compute */
> >>> +               6, /* gfx_v8_0_emit_trap_handler */
> >>>          .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
> >>>          .emit_ib = gfx_v8_0_ring_emit_ib_compute,
> >>>          .emit_fence = gfx_v8_0_ring_emit_fence_compute,
> >>> @@ -6940,6 +6977,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
> >>>          .pad_ib = amdgpu_ring_generic_pad_ib,
> >>>          .emit_wreg = gfx_v8_0_ring_emit_wreg,
> >>>          .emit_mem_sync = gfx_v8_0_emit_mem_sync_compute,
> >>> +       .emit_trap_handler = gfx_v8_0_ring_emit_trap_handler,
> >>>   };
> >>>
> >>>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >>> index cb9d60a4e05e..4fc00f196085 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >>> @@ -4162,6 +4162,23 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
> >>>                                     (1 << (oa_size + oa_base)) - (1 << oa_base));
> >>>   }
> >>>
> >>> +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
> >>> +                                           uint32_t vmid,
> >>> +                                           uint64_t tba_addr,
> >>> +                                           uint64_t tma_addr)
> >>> +{
> >>> +       struct amdgpu_device *adev = ring->adev;
> >>> +
> >>> +       mutex_lock(&adev->srbm_mutex);
> >>> +       soc15_grbm_select(adev, 0, 0, 0, vmid);
> >>> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
> >>> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8));
> >>> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
> >>> +       WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
> >>> +       soc15_grbm_select(adev, 0, 0, 0, 0);
> >>> +       mutex_unlock(&adev->srbm_mutex);
> >> This won't work.  This updates registers via MMIO using the CPU.  We
> >> need to have the registers updated asynchronously via the CP so they
> >> get updated when the specific jobs are executed by the engine.  vmid's
> >> are shared resources and are assigned dynamically via the kernel
> >> driver.  If you update via MMIO the changes take effect immediately
> >> rather than when the actual work is scheduled on the engine.
> >> Unfortunately, at the moment, I don't see a way to do this with the CP
> >> with the packets that are currently available.
> > One option might be to do this via MMIO, but only support it when
> > using a reserved vmid.
>
> Hmm, yes, that's completely broken actually. Thanks for the explanation,
> that makes total sense.
>
> So, no way to configure these registers via the CP at the moment. Do you
> have any plans?

I have a dialog going with the CP team to see what we can do.  Stay tuned.

Alex


>
> I will have a look at the reserved vmid.
>
> >
> > Alex
> >
> >
> >> Alex
> >>
> >>
> >>> +}
> >>> +
> >>>   static const u32 vgpr_init_compute_shader[] =
> >>>   {
> >>>          0xb07c0000, 0xbe8000ff,
> >>> @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> >>>          .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> >>>          .soft_recovery = gfx_v9_0_ring_soft_recovery,
> >>>          .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> >>> +       .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
> >>>   };
> >>>
> >>>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> >>> @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> >>>          .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> >>>          .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> >>>          .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> >>> +       .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
> >>>   };
> >>>
> >>>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> >>> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> >>> index 3218576e109d..7eae264adb5d 100644
> >>> --- a/include/uapi/drm/amdgpu_drm.h
> >>> +++ b/include/uapi/drm/amdgpu_drm.h
> >>> @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
> >>>   #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07
> >>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
> >>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
> >>> +#define AMDGPU_CHUNK_ID_TRAP            0x0a
> >>>
> >>>   struct drm_amdgpu_cs_chunk {
> >>>          __u32           chunk_id;
> >>> @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
> >>>          __u64 point;
> >>>   };
> >>>
> >>> +struct drm_amdgpu_cs_chunk_trap {
> >>> +       /** Trap Base Address */
> >>> +       __u64 tba_addr;
> >>> +       /** Trap Memory Address */
> >>> +       __u64 tma_addr;
> >>> +};
> >>> +
> >>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ     0
> >>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD  1
> >>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD        2
> >>> --
> >>> 2.28.0
> >>>
> >>> _______________________________________________
> >>> amd-gfx mailing list
> >>> amd-gfx@lists.freedesktop.org
> >>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread
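
For illustration, a minimal sketch of the reserved-VMID restriction Alex
suggests above, written as a hypothetical helper called from the CS path
before a TRAP chunk is accepted. The reserved_vmid array is the same one
the later v2 patch checks through update_spm_vmid_needed, and
AMDGPU_GFXHUB_0 exists in the amdgpu VM code; the helper name and its
placement are made up here:

static int amdgpu_cs_check_trap_vmid(struct amdgpu_vm *vm)
{
	/* The GFX9+ TBA/TMA registers are programmed per VMID with MMIO,
	 * so the VMID must be owned exclusively by this VM for the write
	 * to be safe against other processes sharing the VMID.
	 */
	if (!vm->reserved_vmid[AMDGPU_GFXHUB_0])
		return -EINVAL;

	return 0;
}

The v2 patch below achieves a similar effect implicitly by only programming
the registers when a VMID is reserved.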

* [PATCH v2] drm/amdgpu: add support for user trap handlers
  2020-08-24 11:49 [RFC PATCH] drm/amdgpu: add support for user trap handlers Samuel Pitoiset
  2020-08-24 18:17 ` Marek Olšák
  2020-08-24 18:33 ` Alex Deucher
@ 2020-08-25 14:07 ` Samuel Pitoiset
  2020-08-28  7:57   ` Christian König
  2021-05-06  6:54 ` [PATCH v3] " Samuel Pitoiset
  3 siblings, 1 reply; 16+ messages in thread
From: Samuel Pitoiset @ 2020-08-25 14:07 UTC (permalink / raw)
  To: amd-gfx; +Cc: Samuel Pitoiset

A trap handler can be used by userspace to catch shader exceptions
like divide by zero, memory violations etc.

On GFX6-GFX8, the registers used to configure TBA/TMA aren't
privileged and can be configured from userspace.

On GFX9+ they are per VMID and privileged, only the KMD can
configure them. At the moment, we don't know how to set them
via the CP, so they are only emitted if a VMID is reserved.

This introduces a new CS chunk that can be used to set the
TBA/TMA virtual address at submit time.

TODO:
- rebase on top of amd-staging-drm-next (this branch currently
hangs my GPU at boot)

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 20 +++++++++++++--
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 20 +++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++++++
 include/uapi/drm/amdgpu_drm.h            |  8 ++++++
 9 files changed, 110 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index a512ccbc4dea..6ca5c4912e3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p,
 	return r;
 }
 
+static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
+				     struct drm_amdgpu_cs_chunk_trap *data,
+				     uint64_t *tba_addr, uint64_t *tma_addr)
+{
+	if (!data->tba_addr || !data->tma_addr)
+		return -EINVAL;
+
+	*tba_addr = data->tba_addr;
+	*tma_addr = data->tma_addr;
+
+	return 0;
+}
+
 static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
@@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 	uint64_t *chunk_array;
 	unsigned size, num_ibs = 0;
 	uint32_t uf_offset = 0;
+	uint64_t tba_addr = 0, tma_addr = 0;
 	int i;
 	int ret;
 
@@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 
 			break;
 
+		case AMDGPU_CHUNK_ID_TRAP:
+			size = sizeof(struct drm_amdgpu_cs_chunk_trap);
+			if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
+				ret = -EINVAL;
+				goto free_partial_kdata;
+			}
+
+			ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
+							&tba_addr, &tma_addr);
+			if (ret)
+				goto free_partial_kdata;
+			break;
+
 		case AMDGPU_CHUNK_ID_DEPENDENCIES:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
@@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 
 	if (p->uf_entry.tv.bo)
 		p->job->uf_addr = uf_offset;
+
+	p->job->tba_addr = tba_addr;
+	p->job->tma_addr = tma_addr;
+
 	kfree(chunk_array);
 
 	/* Use this opportunity to fill in task info for the vm */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 26127c7d2f32..1e703119e4c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -88,9 +88,10 @@
  * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for correctness
  * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
  * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
+ * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	39
+#define KMS_DRIVER_MINOR	40
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
index 8e58325bbca2..fd0d56724b4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
@@ -58,6 +58,10 @@ struct amdgpu_vmid {
 	uint32_t		oa_base;
 	uint32_t		oa_size;
 
+	/* user trap */
+	uint64_t		tba_addr;
+	uint64_t		tma_addr;
+
 	unsigned		pasid;
 	struct dma_fence	*pasid_mapping;
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 81caac9b958a..b8ed5b13ea44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -62,6 +62,10 @@ struct amdgpu_job {
 	/* user fence handling */
 	uint64_t		uf_addr;
 	uint64_t		uf_sequence;
+
+	/* user trap handling */
+	uint64_t		tba_addr;
+	uint64_t		tma_addr;
 };
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index da871d84b742..1f165a6295d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
 	void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
 	int (*preempt_ib)(struct amdgpu_ring *ring);
 	void (*emit_mem_sync)(struct amdgpu_ring *ring);
+	void (*emit_trap_handler)(struct amdgpu_ring *ring,
+				  uint32_t vmid,
+				  uint64_t tba_addr, uint64_t tma_addr);
 };
 
 struct amdgpu_ring {
@@ -265,6 +268,7 @@ struct amdgpu_ring {
 #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr))
 #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags))
 #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
+#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 71e005cf2952..6b619bb03777 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1079,15 +1079,22 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 	bool vm_flush_needed = job->vm_needs_flush;
 	struct dma_fence *fence = NULL;
 	bool pasid_mapping_needed = false;
+	bool trap_handler_needed = false;
 	unsigned patch_offset = 0;
 	bool update_spm_vmid_needed = (job->vm && (job->vm->reserved_vmid[vmhub] != NULL));
 	int r;
 
-	if (update_spm_vmid_needed && adev->gfx.rlc.funcs->update_spm_vmid)
+	if (update_spm_vmid_needed && adev->gfx.rlc.funcs->update_spm_vmid) {
 		adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid);
 
+		trap_handler_needed = ring->funcs->emit_trap_handler && (
+			id->tba_addr != job->tba_addr ||
+			id->tma_addr != job->tma_addr);
+	}
+
 	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
 		gds_switch_needed = true;
+		trap_handler_needed = true;
 		vm_flush_needed = true;
 		pasid_mapping_needed = true;
 	}
@@ -1099,12 +1106,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 	mutex_unlock(&id_mgr->lock);
 
 	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
+	trap_handler_needed &= !!ring->funcs->emit_trap_handler;
 	vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
 			job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
 	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
 		ring->funcs->emit_wreg;
 
-	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
+	if (!vm_flush_needed && !gds_switch_needed &&
+	    !trap_handler_needed && !need_pipe_sync)
 		return 0;
 
 	if (ring->funcs->init_cond_exec)
@@ -1158,6 +1167,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 					    job->oa_size);
 	}
 
+	if (ring->funcs->emit_trap_handler && trap_handler_needed) {
+		id->tba_addr = job->tba_addr;
+		id->tma_addr = job->tma_addr;
+		amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
+					      job->tma_addr);
+	}
+
 	if (ring->funcs->patch_cond_exec)
 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 65997ffaed45..f864b217589f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7141,6 +7141,24 @@ static void gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 				    (1 << (oa_size + oa_base)) - (1 << oa_base));
 }
 
+static void gfx_v10_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
+					    uint32_t vmid,
+					    uint64_t tba_addr,
+					    uint64_t tma_addr)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	mutex_lock(&adev->srbm_mutex);
+	nv_grbm_select(adev, 0, 0, 0, vmid);
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8) |
+				1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
+	nv_grbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+}
+
 static int gfx_v10_0_early_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -8530,6 +8548,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
 	.soft_recovery = gfx_v10_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
@@ -8566,6 +8585,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
 	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index cb9d60a4e05e..4fc00f196085 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4162,6 +4162,23 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 				   (1 << (oa_size + oa_base)) - (1 << oa_base));
 }
 
+static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
+					    uint32_t vmid,
+					    uint64_t tba_addr,
+					    uint64_t tma_addr)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	mutex_lock(&adev->srbm_mutex);
+	soc15_grbm_select(adev, 0, 0, 0, vmid);
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
+	soc15_grbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+}
+
 static const u32 vgpr_init_compute_shader[] =
 {
 	0xb07c0000, 0xbe8000ff,
@@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
 	.soft_recovery = gfx_v9_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
@@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 3218576e109d..7eae264adb5d 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
 #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES	0x07
 #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
 #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
+#define AMDGPU_CHUNK_ID_TRAP            0x0a
 
 struct drm_amdgpu_cs_chunk {
 	__u32		chunk_id;
@@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
        __u64 point;
 };
 
+struct drm_amdgpu_cs_chunk_trap {
+	/** Trap Base Address */
+       __u64 tba_addr;
+	/** Trap Memory Address */
+       __u64 tma_addr;
+};
+
 #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ	0
 #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD	1
 #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD	2
-- 
2.28.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread
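
For reference, a userspace-side sketch of how a UMD could fill the new chunk
for a submission. struct drm_amdgpu_cs_chunk_trap and AMDGPU_CHUNK_ID_TRAP
come from the patch above; the helper name and how the chunk array is
assembled around it are only illustrative:

#include <stdint.h>
#include <string.h>
#include <drm/amdgpu_drm.h>

/* Illustrative only: fill one TRAP chunk for an amdgpu CS submission. */
static void fill_trap_chunk(struct drm_amdgpu_cs_chunk *chunk,
			    struct drm_amdgpu_cs_chunk_trap *trap,
			    uint64_t tba_va, uint64_t tma_va)
{
	memset(trap, 0, sizeof(*trap));
	trap->tba_addr = tba_va;	/* GPU VA of the trap handler code */
	trap->tma_addr = tma_va;	/* GPU VA of the trap memory buffer */

	chunk->chunk_id   = AMDGPU_CHUNK_ID_TRAP;
	chunk->length_dw  = sizeof(*trap) / 4;
	chunk->chunk_data = (uintptr_t)trap;
}

The kernel rejects the chunk if either address is zero, matching the check
in amdgpu_cs_user_trap_chunk() above.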

* Re: [PATCH v2] drm/amdgpu: add support for user trap handlers
  2020-08-25 14:07 ` [PATCH v2] " Samuel Pitoiset
@ 2020-08-28  7:57   ` Christian König
  2020-08-28  8:14     ` Samuel Pitoiset
  0 siblings, 1 reply; 16+ messages in thread
From: Christian König @ 2020-08-28  7:57 UTC (permalink / raw)
  To: Samuel Pitoiset, amd-gfx

On 25.08.20 at 16:07, Samuel Pitoiset wrote:
> A trap handler can be used by userspace to catch shader exceptions
> like divide by zero, memory violations etc.
>
> On GFX6-GFX8, the registers used to configure TBA/TMA aren't
> privileged and can be configured from userpace.
>
> On GFX9+ they are per VMID and privileged, only the KMD can
> configure them. At the moment, we don't know how to set them
> via the CP, so they are only emitted if a VMID is reserved.
>
> This introduces a new CS chunk that can be used to set the
> TBA/TMA virtual address at submit time.
>
> TODO:
> - rebase on top of amd-staging-drm-next (this branch currently
> hangs my GPU at boot)

Please split that up into multiple patches. The first one adding the 
general infrastructure and the following one the implementation for gfx9 
and gfx10.

And maybe even add support for this on gfx6-8, even if it is not 
strictly necessary? It looks trivial to implement and would give 
userspace more uniform handling across generations.

A few more comments below.

>
> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 20 +++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 20 +++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++++++
>   include/uapi/drm/amdgpu_drm.h            |  8 ++++++
>   9 files changed, 110 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index a512ccbc4dea..6ca5c4912e3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p,
>   	return r;
>   }
>   
> +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
> +				     struct drm_amdgpu_cs_chunk_trap *data,
> +				     uint64_t *tba_addr, uint64_t *tma_addr)
> +{
> +	if (!data->tba_addr || !data->tma_addr)
> +		return -EINVAL;
> +
> +	*tba_addr = data->tba_addr;
> +	*tma_addr = data->tma_addr;
> +
> +	return 0;
> +}
> +
>   static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs)
>   {
>   	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> @@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   	uint64_t *chunk_array;
>   	unsigned size, num_ibs = 0;
>   	uint32_t uf_offset = 0;
> +	uint64_t tba_addr = 0, tma_addr = 0;
>   	int i;
>   	int ret;
>   
> @@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   
>   			break;
>   
> +		case AMDGPU_CHUNK_ID_TRAP:
> +			size = sizeof(struct drm_amdgpu_cs_chunk_trap);
> +			if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
> +				ret = -EINVAL;
> +				goto free_partial_kdata;
> +			}
> +
> +			ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
> +							&tba_addr, &tma_addr);
> +			if (ret)
> +				goto free_partial_kdata;
> +			break;
> +
>   		case AMDGPU_CHUNK_ID_DEPENDENCIES:
>   		case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
>   		case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
> @@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   
>   	if (p->uf_entry.tv.bo)
>   		p->job->uf_addr = uf_offset;
> +
> +	p->job->tba_addr = tba_addr;
> +	p->job->tma_addr = tma_addr;
> +
>   	kfree(chunk_array);
>   
>   	/* Use this opportunity to fill in task info for the vm */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 26127c7d2f32..1e703119e4c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -88,9 +88,10 @@
>    * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for correctness
>    * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
>    * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
> + * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
>    */
>   #define KMS_DRIVER_MAJOR	3
> -#define KMS_DRIVER_MINOR	39
> +#define KMS_DRIVER_MINOR	40
>   #define KMS_DRIVER_PATCHLEVEL	0
>   
>   int amdgpu_vram_limit = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> index 8e58325bbca2..fd0d56724b4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> @@ -58,6 +58,10 @@ struct amdgpu_vmid {
>   	uint32_t		oa_base;
>   	uint32_t		oa_size;
>   
> +	/* user trap */
> +	uint64_t		tba_addr;
> +	uint64_t		tma_addr;
> +
>   	unsigned		pasid;
>   	struct dma_fence	*pasid_mapping;
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 81caac9b958a..b8ed5b13ea44 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -62,6 +62,10 @@ struct amdgpu_job {
>   	/* user fence handling */
>   	uint64_t		uf_addr;
>   	uint64_t		uf_sequence;
> +
> +	/* user trap handling */
> +	uint64_t		tba_addr;
> +	uint64_t		tma_addr;
>   };
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index da871d84b742..1f165a6295d9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
>   	void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>   	int (*preempt_ib)(struct amdgpu_ring *ring);
>   	void (*emit_mem_sync)(struct amdgpu_ring *ring);
> +	void (*emit_trap_handler)(struct amdgpu_ring *ring,
> +				  uint32_t vmid,
> +				  uint64_t tba_addr, uint64_t tma_addr);
>   };
>   
>   struct amdgpu_ring {
> @@ -265,6 +268,7 @@ struct amdgpu_ring {
>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>   #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
> +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 71e005cf2952..6b619bb03777 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1079,15 +1079,22 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   	bool vm_flush_needed = job->vm_needs_flush;
>   	struct dma_fence *fence = NULL;
>   	bool pasid_mapping_needed = false;
> +	bool trap_handler_needed = false;
>   	unsigned patch_offset = 0;
>   	bool update_spm_vmid_needed = (job->vm && (job->vm->reserved_vmid[vmhub] != NULL));
>   	int r;
>   
> -	if (update_spm_vmid_needed && adev->gfx.rlc.funcs->update_spm_vmid)
> +	if (update_spm_vmid_needed && adev->gfx.rlc.funcs->update_spm_vmid) {
>   		adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid);
>   
> +		trap_handler_needed = ring->funcs->emit_trap_handler && (
> +			id->tba_addr != job->tba_addr ||
> +			id->tma_addr != job->tma_addr);

That's probably not such a good idea since it makes the trap handler 
depend on the VMID reservation.

> +	}
> +
>   	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>   		gds_switch_needed = true;
> +		trap_handler_needed = true;
>   		vm_flush_needed = true;
>   		pasid_mapping_needed = true;
>   	}
> @@ -1099,12 +1106,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   	mutex_unlock(&id_mgr->lock);
>   
>   	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
> +	trap_handler_needed &= !!ring->funcs->emit_trap_handler;
>   	vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
>   			job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>   	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
>   		ring->funcs->emit_wreg;
>   
> -	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
> +	if (!vm_flush_needed && !gds_switch_needed &&
> +	    !trap_handler_needed && !need_pipe_sync)
>   		return 0;
>   
>   	if (ring->funcs->init_cond_exec)
> @@ -1158,6 +1167,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   					    job->oa_size);
>   	}
>   
> +	if (ring->funcs->emit_trap_handler && trap_handler_needed) {
> +		id->tba_addr = job->tba_addr;
> +		id->tma_addr = job->tma_addr;
> +		amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
> +					      job->tma_addr);
> +	}
> +

Well that doesn't seem to make sense at all here.

>   	if (ring->funcs->patch_cond_exec)
>   		amdgpu_ring_patch_cond_exec(ring, patch_offset);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 65997ffaed45..f864b217589f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -7141,6 +7141,24 @@ static void gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>   				    (1 << (oa_size + oa_base)) - (1 << oa_base));
>   }
>   
> +static void gfx_v10_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
> +					    uint32_t vmid,
> +					    uint64_t tba_addr,
> +					    uint64_t tma_addr)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +
> +	mutex_lock(&adev->srbm_mutex);
> +	nv_grbm_select(adev, 0, 0, 0, vmid);
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8) |
> +				1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
> +	nv_grbm_select(adev, 0, 0, 0, 0);
> +	mutex_unlock(&adev->srbm_mutex);

This is not emitting the trap handler update to the ring, but writing it 
directly to the registers.

Regards,
Christian.

> +}
> +
>   static int gfx_v10_0_early_init(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -8530,6 +8548,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
>   	.soft_recovery = gfx_v10_0_ring_soft_recovery,
>   	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
> +	.emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> @@ -8566,6 +8585,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
>   	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
> +	.emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index cb9d60a4e05e..4fc00f196085 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4162,6 +4162,23 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>   				   (1 << (oa_size + oa_base)) - (1 << oa_base));
>   }
>   
> +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
> +					    uint32_t vmid,
> +					    uint64_t tba_addr,
> +					    uint64_t tma_addr)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +
> +	mutex_lock(&adev->srbm_mutex);
> +	soc15_grbm_select(adev, 0, 0, 0, vmid);
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8));
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
> +	soc15_grbm_select(adev, 0, 0, 0, 0);
> +	mutex_unlock(&adev->srbm_mutex);
> +}
> +
>   static const u32 vgpr_init_compute_shader[] =
>   {
>   	0xb07c0000, 0xbe8000ff,
> @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>   	.soft_recovery = gfx_v9_0_ring_soft_recovery,
>   	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
> +	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>   	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
> +	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 3218576e109d..7eae264adb5d 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
>   #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES	0x07
>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
> +#define AMDGPU_CHUNK_ID_TRAP            0x0a
>   
>   struct drm_amdgpu_cs_chunk {
>   	__u32		chunk_id;
> @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
>          __u64 point;
>   };
>   
> +struct drm_amdgpu_cs_chunk_trap {
> +	/** Trap Base Address */
> +       __u64 tba_addr;
> +	/** Trap Memory Address */
> +       __u64 tma_addr;
> +};
> +
>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ	0
>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD	1
>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD	2

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread
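
To make Christian's last point concrete, an asynchronous version would queue
the register writes on the ring so they only take effect when the CP reaches
them, roughly like the sketch below. This is purely illustrative and does not
work today: emit_wreg writes a register for the currently active context and
cannot address the per-VMID SQ_SHADER_TBA/TMA copies, which is why the vmid
argument goes unused and why the thread is waiting on input from the CP team.

static void gfx_v9_0_ring_emit_trap_handler_cp(struct amdgpu_ring *ring,
					       uint32_t vmid,
					       uint64_t tba_addr,
					       uint64_t tma_addr)
{
	/* Deferred: these packets execute when the job runs on the ring,
	 * unlike the MMIO writes in the patch, which apply immediately.
	 */
	amdgpu_ring_emit_wreg(ring,
			      SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TBA_LO),
			      lower_32_bits(tba_addr >> 8));
	amdgpu_ring_emit_wreg(ring,
			      SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TBA_HI),
			      upper_32_bits(tba_addr >> 8));
	amdgpu_ring_emit_wreg(ring,
			      SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TMA_LO),
			      lower_32_bits(tma_addr >> 8));
	amdgpu_ring_emit_wreg(ring,
			      SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TMA_HI),
			      upper_32_bits(tma_addr >> 8));
}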

* Re: [PATCH v2] drm/amdgpu: add support for user trap handlers
  2020-08-28  7:57   ` Christian König
@ 2020-08-28  8:14     ` Samuel Pitoiset
  2020-08-28  8:23       ` Christian König
  0 siblings, 1 reply; 16+ messages in thread
From: Samuel Pitoiset @ 2020-08-28  8:14 UTC (permalink / raw)
  To: christian.koenig, amd-gfx


On 8/28/20 9:57 AM, Christian König wrote:
> On 25.08.20 at 16:07, Samuel Pitoiset wrote:
>> A trap handler can be used by userspace to catch shader exceptions
>> like divide by zero, memory violations etc.
>>
>> On GFX6-GFX8, the registers used to configure TBA/TMA aren't
>> privileged and can be configured from userpace.
>>
>> On GFX9+ they are per VMID and privileged, only the KMD can
>> configure them. At the moment, we don't know how to set them
>> via the CP, so they are only emitted if a VMID is reserved.
>>
>> This introduces a new CS chunk that can be used to set the
>> TBA/TMA virtual address at submit time.
>>
>> TODO:
>> - rebase on top of amd-staging-drm-next (this branch currently
>> hangs my GPU at boot)
>
> Please split that up into multiple patches. The first one adding the 
> general infrastructure and the following one the implementation for 
> gfx9 and gfx10.
Sounds good.
>
> And maybe even support this for gfx6-8 even if it is not necessary? 
> Looks trivial to implement and would give userspace a more uniform 
> handling for this.
v1 added gfx6-8 support but I removed it in v2 as requested by Alex 
because it's useless and might be better for preemption.
>
> A few more comments below.
>
>>
>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 ++++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 ++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 20 +++++++++++++--
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 20 +++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++++++
>>   include/uapi/drm/amdgpu_drm.h            |  8 ++++++
>>   9 files changed, 110 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index a512ccbc4dea..6ca5c4912e3a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct 
>> amdgpu_cs_parser *p,
>>       return r;
>>   }
>>   +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
>> +                     struct drm_amdgpu_cs_chunk_trap *data,
>> +                     uint64_t *tba_addr, uint64_t *tma_addr)
>> +{
>> +    if (!data->tba_addr || !data->tma_addr)
>> +        return -EINVAL;
>> +
>> +    *tba_addr = data->tba_addr;
>> +    *tma_addr = data->tma_addr;
>> +
>> +    return 0;
>> +}
>> +
>>   static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union 
>> drm_amdgpu_cs *cs)
>>   {
>>       struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>> @@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct 
>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>       uint64_t *chunk_array;
>>       unsigned size, num_ibs = 0;
>>       uint32_t uf_offset = 0;
>> +    uint64_t tba_addr = 0, tma_addr = 0;
>>       int i;
>>       int ret;
>>   @@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct 
>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>                 break;
>>   +        case AMDGPU_CHUNK_ID_TRAP:
>> +            size = sizeof(struct drm_amdgpu_cs_chunk_trap);
>> +            if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
>> +                ret = -EINVAL;
>> +                goto free_partial_kdata;
>> +            }
>> +
>> +            ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
>> +                            &tba_addr, &tma_addr);
>> +            if (ret)
>> +                goto free_partial_kdata;
>> +            break;
>> +
>>           case AMDGPU_CHUNK_ID_DEPENDENCIES:
>>           case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
>>           case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
>> @@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct 
>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>         if (p->uf_entry.tv.bo)
>>           p->job->uf_addr = uf_offset;
>> +
>> +    p->job->tba_addr = tba_addr;
>> +    p->job->tma_addr = tma_addr;
>> +
>>       kfree(chunk_array);
>>         /* Use this opportunity to fill in task info for the vm */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 26127c7d2f32..1e703119e4c2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -88,9 +88,10 @@
>>    * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for 
>> correctness
>>    * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
>>    * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
>> + * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
>>    */
>>   #define KMS_DRIVER_MAJOR    3
>> -#define KMS_DRIVER_MINOR    39
>> +#define KMS_DRIVER_MINOR    40
>>   #define KMS_DRIVER_PATCHLEVEL    0
>>     int amdgpu_vram_limit = 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>> index 8e58325bbca2..fd0d56724b4d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>> @@ -58,6 +58,10 @@ struct amdgpu_vmid {
>>       uint32_t        oa_base;
>>       uint32_t        oa_size;
>>   +    /* user trap */
>> +    uint64_t        tba_addr;
>> +    uint64_t        tma_addr;
>> +
>>       unsigned        pasid;
>>       struct dma_fence    *pasid_mapping;
>>   };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index 81caac9b958a..b8ed5b13ea44 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -62,6 +62,10 @@ struct amdgpu_job {
>>       /* user fence handling */
>>       uint64_t        uf_addr;
>>       uint64_t        uf_sequence;
>> +
>> +    /* user trap handling */
>> +    uint64_t        tba_addr;
>> +    uint64_t        tma_addr;
>>   };
>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index da871d84b742..1f165a6295d9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
>>       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>       int (*preempt_ib)(struct amdgpu_ring *ring);
>>       void (*emit_mem_sync)(struct amdgpu_ring *ring);
>> +    void (*emit_trap_handler)(struct amdgpu_ring *ring,
>> +                  uint32_t vmid,
>> +                  uint64_t tba_addr, uint64_t tma_addr);
>>   };
>>     struct amdgpu_ring {
>> @@ -265,6 +268,7 @@ struct amdgpu_ring {
>>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) 
>> (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) 
>> (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>>   #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) 
>> (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), 
>> (as))
>> +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) 
>> (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>   #define amdgpu_ring_emit_switch_buffer(r) 
>> (r)->funcs->emit_switch_buffer((r))
>>   #define amdgpu_ring_emit_cntxcntl(r, d) 
>> (r)->funcs->emit_cntxcntl((r), (d))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 71e005cf2952..6b619bb03777 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1079,15 +1079,22 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, 
>> struct amdgpu_job *job,
>>       bool vm_flush_needed = job->vm_needs_flush;
>>       struct dma_fence *fence = NULL;
>>       bool pasid_mapping_needed = false;
>> +    bool trap_handler_needed = false;
>>       unsigned patch_offset = 0;
>>       bool update_spm_vmid_needed = (job->vm && 
>> (job->vm->reserved_vmid[vmhub] != NULL));
>>       int r;
>>   -    if (update_spm_vmid_needed && 
>> adev->gfx.rlc.funcs->update_spm_vmid)
>> +    if (update_spm_vmid_needed && 
>> adev->gfx.rlc.funcs->update_spm_vmid) {
>>           adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid);
>>   +        trap_handler_needed = ring->funcs->emit_trap_handler && (
>> +            id->tba_addr != job->tba_addr ||
>> +            id->tma_addr != job->tma_addr);
>
> That's probably not such a good idea since it makes the trap handler 
> depend on the VMID reservation.
>
>> +    }
>> +
>>       if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>>           gds_switch_needed = true;
>> +        trap_handler_needed = true;
>>           vm_flush_needed = true;
>>           pasid_mapping_needed = true;
>>       }
>> @@ -1099,12 +1106,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, 
>> struct amdgpu_job *job,
>>       mutex_unlock(&id_mgr->lock);
>>         gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>> +    trap_handler_needed &= !!ring->funcs->emit_trap_handler;
>>       vm_flush_needed &= !!ring->funcs->emit_vm_flush &&
>>               job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>>       pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
>>           ring->funcs->emit_wreg;
>>   -    if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>> +    if (!vm_flush_needed && !gds_switch_needed &&
>> +        !trap_handler_needed && !need_pipe_sync)
>>           return 0;
>>         if (ring->funcs->init_cond_exec)
>> @@ -1158,6 +1167,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, 
>> struct amdgpu_job *job,
>>                           job->oa_size);
>>       }
>>   +    if (ring->funcs->emit_trap_handler && trap_handler_needed) {
>> +        id->tba_addr = job->tba_addr;
>> +        id->tma_addr = job->tma_addr;
>> +        amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
>> +                          job->tma_addr);
>> +    }
>> +
>
> Well that doesn't seem to make sense at all here.
>
>>       if (ring->funcs->patch_cond_exec)
>>           amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 65997ffaed45..f864b217589f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7141,6 +7141,24 @@ static void 
>> gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>                       (1 << (oa_size + oa_base)) - (1 << oa_base));
>>   }
>>   +static void gfx_v10_0_ring_emit_trap_handler(struct amdgpu_ring 
>> *ring,
>> +                        uint32_t vmid,
>> +                        uint64_t tba_addr,
>> +                        uint64_t tma_addr)
>> +{
>> +    struct amdgpu_device *adev = ring->adev;
>> +
>> +    mutex_lock(&adev->srbm_mutex);
>> +    nv_grbm_select(adev, 0, 0, 0, vmid);
>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>> lower_32_bits(tba_addr >> 8));
>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>> upper_32_bits(tba_addr >> 8) |
>> +                1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>> lower_32_bits(tma_addr >> 8));
>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>> upper_32_bits(tma_addr >> 8));
>> +    nv_grbm_select(adev, 0, 0, 0, 0);
>> +    mutex_unlock(&adev->srbm_mutex);
>
> This is not emitting the trap handler update to the ring, but writing 
> it directly to the registers.

This uses direct register writes from the driver because we don't know
how to emit them via the CP. This is also why the update is only done
when a VMID is reserved.

I think Alex is having a discussion with the CP team about that.
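
For reference, the userspace side only has to pass the two addresses
through the new chunk. A rough sketch of that (the helper name is made
up, tba_va/tma_va are placeholder GPU virtual addresses, and buffer
setup plus the actual CS submission are omitted):

#include <stdint.h>
#include <amdgpu_drm.h>   /* libdrm copy of the uAPI header */

/* Sketch only: tba_va/tma_va are placeholder GPU virtual addresses of
 * the trap handler code and the trap memory buffer. Both must be
 * non-zero, otherwise the kernel rejects the chunk with -EINVAL. */
static void fill_trap_chunk(struct drm_amdgpu_cs_chunk *chunk,
                            struct drm_amdgpu_cs_chunk_trap *data,
                            uint64_t tba_va, uint64_t tma_va)
{
        data->tba_addr = tba_va;
        data->tma_addr = tma_va;

        chunk->chunk_id   = AMDGPU_CHUNK_ID_TRAP;
        chunk->length_dw  = sizeof(*data) / 4;
        chunk->chunk_data = (uint64_t)(uintptr_t)data;
}

The chunk is then appended to the chunk array of the CS ioctl alongside
the usual IB and fence chunks.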

>
> Regards,
> Christian.
>
>> +}
>> +
>>   static int gfx_v10_0_early_init(void *handle)
>>   {
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> @@ -8530,6 +8548,7 @@ static const struct amdgpu_ring_funcs 
>> gfx_v10_0_ring_funcs_gfx = {
>>       .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
>>       .soft_recovery = gfx_v10_0_ring_soft_recovery,
>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>   };
>>     static const struct amdgpu_ring_funcs 
>> gfx_v10_0_ring_funcs_compute = {
>> @@ -8566,6 +8585,7 @@ static const struct amdgpu_ring_funcs 
>> gfx_v10_0_ring_funcs_compute = {
>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>       .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>   };
>>     static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index cb9d60a4e05e..4fc00f196085 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4162,6 +4162,23 @@ static void 
>> gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>                      (1 << (oa_size + oa_base)) - (1 << oa_base));
>>   }
>>   +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
>> +                        uint32_t vmid,
>> +                        uint64_t tba_addr,
>> +                        uint64_t tma_addr)
>> +{
>> +    struct amdgpu_device *adev = ring->adev;
>> +
>> +    mutex_lock(&adev->srbm_mutex);
>> +    soc15_grbm_select(adev, 0, 0, 0, vmid);
>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>> lower_32_bits(tba_addr >> 8));
>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>> upper_32_bits(tba_addr >> 8));
>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>> lower_32_bits(tma_addr >> 8));
>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>> upper_32_bits(tma_addr >> 8));
>> +    soc15_grbm_select(adev, 0, 0, 0, 0);
>> +    mutex_unlock(&adev->srbm_mutex);
>> +}
>> +
>>   static const u32 vgpr_init_compute_shader[] =
>>   {
>>       0xb07c0000, 0xbe8000ff,
>> @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs 
>> gfx_v9_0_ring_funcs_gfx = {
>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>>       .soft_recovery = gfx_v9_0_ring_soft_recovery,
>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>   };
>>     static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute 
>> = {
>> @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs 
>> gfx_v9_0_ring_funcs_compute = {
>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>   };
>>     static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>> diff --git a/include/uapi/drm/amdgpu_drm.h 
>> b/include/uapi/drm/amdgpu_drm.h
>> index 3218576e109d..7eae264adb5d 100644
>> --- a/include/uapi/drm/amdgpu_drm.h
>> +++ b/include/uapi/drm/amdgpu_drm.h
>> @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
>>   #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES    0x07
>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
>> +#define AMDGPU_CHUNK_ID_TRAP            0x0a
>>     struct drm_amdgpu_cs_chunk {
>>       __u32        chunk_id;
>> @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
>>          __u64 point;
>>   };
>>   +struct drm_amdgpu_cs_chunk_trap {
>> +    /** Trap Base Address */
>> +       __u64 tba_addr;
>> +    /** Trap Memory Address */
>> +       __u64 tma_addr;
>> +};
>> +
>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ    0
>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD    1
>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD    2
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add support for user trap handlers
  2020-08-28  8:14     ` Samuel Pitoiset
@ 2020-08-28  8:23       ` Christian König
  2020-08-28  8:25         ` Samuel Pitoiset
  2020-09-23 12:52         ` Samuel Pitoiset
  0 siblings, 2 replies; 16+ messages in thread
From: Christian König @ 2020-08-28  8:23 UTC (permalink / raw)
  To: Samuel Pitoiset, amd-gfx

Am 28.08.20 um 10:14 schrieb Samuel Pitoiset:
>
> On 8/28/20 9:57 AM, Christian König wrote:
>> Am 25.08.20 um 16:07 schrieb Samuel Pitoiset:
>>> A trap handler can be used by userspace to catch shader exceptions
>>> like divide by zero, memory violations etc.
>>>
>>> On GFX6-GFX8, the registers used to configure TBA/TMA aren't
>>> privileged and can be configured from userspace.
>>>
>>> On GFX9+ they are per VMID and privileged, only the KMD can
>>> configure them. At the moment, we don't know how to set them
>>> via the CP, so they are only emitted if a VMID is reserved.
>>>
>>> This introduces a new CS chunk that can be used to set the
>>> TBA/TMA virtual address at submit time.
>>>
>>> TODO:
>>> - rebase on top of amd-staging-drm-next (this branch currently
>>> hangs my GPU at boot)
>>
>> Please split that up into multiple patches. The first one adding the 
>> general infrastructure and the following one the implementation for 
>> gfx9 and gfx10.
> Sounds good.
>>
>> And maybe even support this for gfx6-8 even if it is not necessary? 
>> Looks trivial to implement and would give userspace a more uniform 
>> handling for this.
> v1 added gfx6-8 support but I removed it in v2 as requested by Alex 
> because it's useless and might be better for preemption.
>>
>> A few more comments below.
>>
>>>
>>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 
>>> ++++++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 ++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 20 +++++++++++++--
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 20 +++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++++++
>>>   include/uapi/drm/amdgpu_drm.h            |  8 ++++++
>>>   9 files changed, 110 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index a512ccbc4dea..6ca5c4912e3a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct 
>>> amdgpu_cs_parser *p,
>>>       return r;
>>>   }
>>>   +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
>>> +                     struct drm_amdgpu_cs_chunk_trap *data,
>>> +                     uint64_t *tba_addr, uint64_t *tma_addr)
>>> +{
>>> +    if (!data->tba_addr || !data->tma_addr)
>>> +        return -EINVAL;
>>> +
>>> +    *tba_addr = data->tba_addr;
>>> +    *tma_addr = data->tma_addr;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>>   static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union 
>>> drm_amdgpu_cs *cs)
>>>   {
>>>       struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>>> @@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct 
>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>       uint64_t *chunk_array;
>>>       unsigned size, num_ibs = 0;
>>>       uint32_t uf_offset = 0;
>>> +    uint64_t tba_addr = 0, tma_addr = 0;
>>>       int i;
>>>       int ret;
>>>   @@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct 
>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>                 break;
>>>   +        case AMDGPU_CHUNK_ID_TRAP:
>>> +            size = sizeof(struct drm_amdgpu_cs_chunk_trap);
>>> +            if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
>>> +                ret = -EINVAL;
>>> +                goto free_partial_kdata;
>>> +            }
>>> +
>>> +            ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
>>> +                            &tba_addr, &tma_addr);
>>> +            if (ret)
>>> +                goto free_partial_kdata;
>>> +            break;
>>> +
>>>           case AMDGPU_CHUNK_ID_DEPENDENCIES:
>>>           case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
>>>           case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
>>> @@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct 
>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>         if (p->uf_entry.tv.bo)
>>>           p->job->uf_addr = uf_offset;
>>> +
>>> +    p->job->tba_addr = tba_addr;
>>> +    p->job->tma_addr = tma_addr;
>>> +
>>>       kfree(chunk_array);
>>>         /* Use this opportunity to fill in task info for the vm */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 26127c7d2f32..1e703119e4c2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -88,9 +88,10 @@
>>>    * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for 
>>> correctness
>>>    * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
>>>    * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
>>> + * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
>>>    */
>>>   #define KMS_DRIVER_MAJOR    3
>>> -#define KMS_DRIVER_MINOR    39
>>> +#define KMS_DRIVER_MINOR    40
>>>   #define KMS_DRIVER_PATCHLEVEL    0
>>>     int amdgpu_vram_limit = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> index 8e58325bbca2..fd0d56724b4d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> @@ -58,6 +58,10 @@ struct amdgpu_vmid {
>>>       uint32_t        oa_base;
>>>       uint32_t        oa_size;
>>>   +    /* user trap */
>>> +    uint64_t        tba_addr;
>>> +    uint64_t        tma_addr;
>>> +
>>>       unsigned        pasid;
>>>       struct dma_fence    *pasid_mapping;
>>>   };
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index 81caac9b958a..b8ed5b13ea44 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -62,6 +62,10 @@ struct amdgpu_job {
>>>       /* user fence handling */
>>>       uint64_t        uf_addr;
>>>       uint64_t        uf_sequence;
>>> +
>>> +    /* user trap handling */
>>> +    uint64_t        tba_addr;
>>> +    uint64_t        tma_addr;
>>>   };
>>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index da871d84b742..1f165a6295d9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
>>>       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>>       int (*preempt_ib)(struct amdgpu_ring *ring);
>>>       void (*emit_mem_sync)(struct amdgpu_ring *ring);
>>> +    void (*emit_trap_handler)(struct amdgpu_ring *ring,
>>> +                  uint32_t vmid,
>>> +                  uint64_t tba_addr, uint64_t tma_addr);
>>>   };
>>>     struct amdgpu_ring {
>>> @@ -265,6 +268,7 @@ struct amdgpu_ring {
>>>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) 
>>> (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>>>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) 
>>> (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>>>   #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) 
>>> (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), 
>>> (as))
>>> +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) 
>>> (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
>>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>   #define amdgpu_ring_emit_switch_buffer(r) 
>>> (r)->funcs->emit_switch_buffer((r))
>>>   #define amdgpu_ring_emit_cntxcntl(r, d) 
>>> (r)->funcs->emit_cntxcntl((r), (d))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index 71e005cf2952..6b619bb03777 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -1079,15 +1079,22 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>> *ring, struct amdgpu_job *job,
>>>       bool vm_flush_needed = job->vm_needs_flush;
>>>       struct dma_fence *fence = NULL;
>>>       bool pasid_mapping_needed = false;
>>> +    bool trap_handler_needed = false;
>>>       unsigned patch_offset = 0;
>>>       bool update_spm_vmid_needed = (job->vm && 
>>> (job->vm->reserved_vmid[vmhub] != NULL));
>>>       int r;
>>>   -    if (update_spm_vmid_needed && 
>>> adev->gfx.rlc.funcs->update_spm_vmid)
>>> +    if (update_spm_vmid_needed && 
>>> adev->gfx.rlc.funcs->update_spm_vmid) {
>>>           adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid);
>>>   +        trap_handler_needed = ring->funcs->emit_trap_handler && (
>>> +            id->tba_addr != job->tba_addr ||
>>> +            id->tma_addr != job->tma_addr);
>>
>> That's probably not such a good idea since it makes the trap handler 
>> depend on the VMID reservation.
>>
>>> +    }
>>> +
>>>       if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>>>           gds_switch_needed = true;
>>> +        trap_handler_needed = true;
>>>           vm_flush_needed = true;
>>>           pasid_mapping_needed = true;
>>>       }
>>> @@ -1099,12 +1106,14 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>> *ring, struct amdgpu_job *job,
>>>       mutex_unlock(&id_mgr->lock);
>>>         gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>>> +    trap_handler_needed &= !!ring->funcs->emit_trap_handler;
>>>       vm_flush_needed &= !!ring->funcs->emit_vm_flush &&
>>>               job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>>>       pasid_mapping_needed &= 
>>> adev->gmc.gmc_funcs->emit_pasid_mapping &&
>>>           ring->funcs->emit_wreg;
>>>   -    if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>> +    if (!vm_flush_needed && !gds_switch_needed &&
>>> +        !trap_handler_needed && !need_pipe_sync)
>>>           return 0;
>>>         if (ring->funcs->init_cond_exec)
>>> @@ -1158,6 +1167,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, 
>>> struct amdgpu_job *job,
>>>                           job->oa_size);
>>>       }
>>>   +    if (ring->funcs->emit_trap_handler && trap_handler_needed) {
>>> +        id->tba_addr = job->tba_addr;
>>> +        id->tma_addr = job->tma_addr;
>>> +        amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
>>> +                          job->tma_addr);
>>> +    }
>>> +
>>
>> Well that doesn't seem to make sense at all here.
>>
>>>       if (ring->funcs->patch_cond_exec)
>>>           amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index 65997ffaed45..f864b217589f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -7141,6 +7141,24 @@ static void 
>>> gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>                       (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>   }
>>>   +static void gfx_v10_0_ring_emit_trap_handler(struct amdgpu_ring 
>>> *ring,
>>> +                        uint32_t vmid,
>>> +                        uint64_t tba_addr,
>>> +                        uint64_t tma_addr)
>>> +{
>>> +    struct amdgpu_device *adev = ring->adev;
>>> +
>>> +    mutex_lock(&adev->srbm_mutex);
>>> +    nv_grbm_select(adev, 0, 0, 0, vmid);
>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>>> lower_32_bits(tba_addr >> 8));
>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>>> upper_32_bits(tba_addr >> 8) |
>>> +                1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>>> lower_32_bits(tma_addr >> 8));
>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>>> upper_32_bits(tma_addr >> 8));
>>> +    nv_grbm_select(adev, 0, 0, 0, 0);
>>> +    mutex_unlock(&adev->srbm_mutex);
>>
>> This is not emitting the trap handler update to the ring, but writing 
>> it directly to the registers.
>
> This uses direct driver access because we don't know how to emit them 
> via CP. This is also why it's only emitted if a VMID is reserved.
>
> I think Alex is having a discussion with the CP team about that.

Ah! Ok in this case please keep the patch on hold until this is cleared.

But even when the VMID is reserved this could cause big problems if 
userspace decides to change the trap addresses on the fly.

So we can't really do it like this without first waiting for the
hardware to be idle, which would cause a massive performance loss.

Regards,
Christian.

>
>>
>> Regards,
>> Christian.
>>
>>> +}
>>> +
>>>   static int gfx_v10_0_early_init(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> @@ -8530,6 +8548,7 @@ static const struct amdgpu_ring_funcs 
>>> gfx_v10_0_ring_funcs_gfx = {
>>>       .emit_reg_write_reg_wait = 
>>> gfx_v10_0_ring_emit_reg_write_reg_wait,
>>>       .soft_recovery = gfx_v10_0_ring_soft_recovery,
>>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>>   };
>>>     static const struct amdgpu_ring_funcs 
>>> gfx_v10_0_ring_funcs_compute = {
>>> @@ -8566,6 +8585,7 @@ static const struct amdgpu_ring_funcs 
>>> gfx_v10_0_ring_funcs_compute = {
>>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>>       .emit_reg_write_reg_wait = 
>>> gfx_v10_0_ring_emit_reg_write_reg_wait,
>>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>>   };
>>>     static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index cb9d60a4e05e..4fc00f196085 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4162,6 +4162,23 @@ static void 
>>> gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>                      (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>   }
>>>   +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring 
>>> *ring,
>>> +                        uint32_t vmid,
>>> +                        uint64_t tba_addr,
>>> +                        uint64_t tma_addr)
>>> +{
>>> +    struct amdgpu_device *adev = ring->adev;
>>> +
>>> +    mutex_lock(&adev->srbm_mutex);
>>> +    soc15_grbm_select(adev, 0, 0, 0, vmid);
>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>>> lower_32_bits(tba_addr >> 8));
>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>>> upper_32_bits(tba_addr >> 8));
>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>>> lower_32_bits(tma_addr >> 8));
>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>>> upper_32_bits(tma_addr >> 8));
>>> +    soc15_grbm_select(adev, 0, 0, 0, 0);
>>> +    mutex_unlock(&adev->srbm_mutex);
>>> +}
>>> +
>>>   static const u32 vgpr_init_compute_shader[] =
>>>   {
>>>       0xb07c0000, 0xbe8000ff,
>>> @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs 
>>> gfx_v9_0_ring_funcs_gfx = {
>>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>       .soft_recovery = gfx_v9_0_ring_soft_recovery,
>>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>   };
>>>     static const struct amdgpu_ring_funcs 
>>> gfx_v9_0_ring_funcs_compute = {
>>> @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs 
>>> gfx_v9_0_ring_funcs_compute = {
>>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>   };
>>>     static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>>> diff --git a/include/uapi/drm/amdgpu_drm.h 
>>> b/include/uapi/drm/amdgpu_drm.h
>>> index 3218576e109d..7eae264adb5d 100644
>>> --- a/include/uapi/drm/amdgpu_drm.h
>>> +++ b/include/uapi/drm/amdgpu_drm.h
>>> @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
>>>   #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES    0x07
>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
>>> +#define AMDGPU_CHUNK_ID_TRAP            0x0a
>>>     struct drm_amdgpu_cs_chunk {
>>>       __u32        chunk_id;
>>> @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
>>>          __u64 point;
>>>   };
>>>   +struct drm_amdgpu_cs_chunk_trap {
>>> +    /** Trap Base Address */
>>> +       __u64 tba_addr;
>>> +    /** Trap Memory Address */
>>> +       __u64 tma_addr;
>>> +};
>>> +
>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ    0
>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD    1
>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD    2
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add support for user trap handlers
  2020-08-28  8:23       ` Christian König
@ 2020-08-28  8:25         ` Samuel Pitoiset
  2020-09-23 12:52         ` Samuel Pitoiset
  1 sibling, 0 replies; 16+ messages in thread
From: Samuel Pitoiset @ 2020-08-28  8:25 UTC (permalink / raw)
  To: Christian König, amd-gfx


On 8/28/20 10:23 AM, Christian König wrote:
> Am 28.08.20 um 10:14 schrieb Samuel Pitoiset:
>>
>> On 8/28/20 9:57 AM, Christian König wrote:
>>> Am 25.08.20 um 16:07 schrieb Samuel Pitoiset:
>>>> A trap handler can be used by userspace to catch shader exceptions
>>>> like divide by zero, memory violations etc.
>>>>
>>>> On GFX6-GFX8, the registers used to configure TBA/TMA aren't
>>>> privileged and can be configured from userspace.
>>>>
>>>> On GFX9+ they are per VMID and privileged, only the KMD can
>>>> configure them. At the moment, we don't know how to set them
>>>> via the CP, so they are only emitted if a VMID is reserved.
>>>>
>>>> This introduces a new CS chunk that can be used to set the
>>>> TBA/TMA virtual address at submit time.
>>>>
>>>> TODO:
>>>> - rebase on top of amd-staging-drm-next (this branch currently
>>>> hangs my GPU at boot)
>>>
>>> Please split that up into multiple patches. The first one adding the 
>>> general infrastructure and the following one the implementation for 
>>> gfx9 and gfx10.
>> Sounds good.
>>>
>>> And maybe even support this for gfx6-8 even if it is not necessary? 
>>> Looks trivial to implement and would give userspace a more uniform 
>>> handling for this.
>> v1 added gfx6-8 support but I removed it in v2 as requested by Alex 
>> because it's useless and might be better for preemption.
>>>
>>> A few more comments below.
>>>
>>>>
>>>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 
>>>> ++++++++++++++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 ++-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 20 +++++++++++++--
>>>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 20 +++++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++++++
>>>>   include/uapi/drm/amdgpu_drm.h            |  8 ++++++
>>>>   9 files changed, 110 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> index a512ccbc4dea..6ca5c4912e3a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> @@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct 
>>>> amdgpu_cs_parser *p,
>>>>       return r;
>>>>   }
>>>>   +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
>>>> +                     struct drm_amdgpu_cs_chunk_trap *data,
>>>> +                     uint64_t *tba_addr, uint64_t *tma_addr)
>>>> +{
>>>> +    if (!data->tba_addr || !data->tma_addr)
>>>> +        return -EINVAL;
>>>> +
>>>> +    *tba_addr = data->tba_addr;
>>>> +    *tma_addr = data->tma_addr;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>   static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, 
>>>> union drm_amdgpu_cs *cs)
>>>>   {
>>>>       struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>>>> @@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct 
>>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>>       uint64_t *chunk_array;
>>>>       unsigned size, num_ibs = 0;
>>>>       uint32_t uf_offset = 0;
>>>> +    uint64_t tba_addr = 0, tma_addr = 0;
>>>>       int i;
>>>>       int ret;
>>>>   @@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct 
>>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>>                 break;
>>>>   +        case AMDGPU_CHUNK_ID_TRAP:
>>>> +            size = sizeof(struct drm_amdgpu_cs_chunk_trap);
>>>> +            if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
>>>> +                ret = -EINVAL;
>>>> +                goto free_partial_kdata;
>>>> +            }
>>>> +
>>>> +            ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
>>>> +                            &tba_addr, &tma_addr);
>>>> +            if (ret)
>>>> +                goto free_partial_kdata;
>>>> +            break;
>>>> +
>>>>           case AMDGPU_CHUNK_ID_DEPENDENCIES:
>>>>           case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
>>>>           case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
>>>> @@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct 
>>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>>         if (p->uf_entry.tv.bo)
>>>>           p->job->uf_addr = uf_offset;
>>>> +
>>>> +    p->job->tba_addr = tba_addr;
>>>> +    p->job->tma_addr = tma_addr;
>>>> +
>>>>       kfree(chunk_array);
>>>>         /* Use this opportunity to fill in task info for the vm */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index 26127c7d2f32..1e703119e4c2 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -88,9 +88,10 @@
>>>>    * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for 
>>>> correctness
>>>>    * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
>>>>    * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
>>>> + * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
>>>>    */
>>>>   #define KMS_DRIVER_MAJOR    3
>>>> -#define KMS_DRIVER_MINOR    39
>>>> +#define KMS_DRIVER_MINOR    40
>>>>   #define KMS_DRIVER_PATCHLEVEL    0
>>>>     int amdgpu_vram_limit = 0;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> index 8e58325bbca2..fd0d56724b4d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> @@ -58,6 +58,10 @@ struct amdgpu_vmid {
>>>>       uint32_t        oa_base;
>>>>       uint32_t        oa_size;
>>>>   +    /* user trap */
>>>> +    uint64_t        tba_addr;
>>>> +    uint64_t        tma_addr;
>>>> +
>>>>       unsigned        pasid;
>>>>       struct dma_fence    *pasid_mapping;
>>>>   };
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index 81caac9b958a..b8ed5b13ea44 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -62,6 +62,10 @@ struct amdgpu_job {
>>>>       /* user fence handling */
>>>>       uint64_t        uf_addr;
>>>>       uint64_t        uf_sequence;
>>>> +
>>>> +    /* user trap handling */
>>>> +    uint64_t        tba_addr;
>>>> +    uint64_t        tma_addr;
>>>>   };
>>>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index da871d84b742..1f165a6295d9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
>>>>       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>>>       int (*preempt_ib)(struct amdgpu_ring *ring);
>>>>       void (*emit_mem_sync)(struct amdgpu_ring *ring);
>>>> +    void (*emit_trap_handler)(struct amdgpu_ring *ring,
>>>> +                  uint32_t vmid,
>>>> +                  uint64_t tba_addr, uint64_t tma_addr);
>>>>   };
>>>>     struct amdgpu_ring {
>>>> @@ -265,6 +268,7 @@ struct amdgpu_ring {
>>>>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) 
>>>> (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>>>>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) 
>>>> (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>>>>   #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) 
>>>> (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), 
>>>> (as))
>>>> +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) 
>>>> (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
>>>>   #define amdgpu_ring_emit_hdp_flush(r) 
>>>> (r)->funcs->emit_hdp_flush((r))
>>>>   #define amdgpu_ring_emit_switch_buffer(r) 
>>>> (r)->funcs->emit_switch_buffer((r))
>>>>   #define amdgpu_ring_emit_cntxcntl(r, d) 
>>>> (r)->funcs->emit_cntxcntl((r), (d))
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index 71e005cf2952..6b619bb03777 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -1079,15 +1079,22 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>>> *ring, struct amdgpu_job *job,
>>>>       bool vm_flush_needed = job->vm_needs_flush;
>>>>       struct dma_fence *fence = NULL;
>>>>       bool pasid_mapping_needed = false;
>>>> +    bool trap_handler_needed = false;
>>>>       unsigned patch_offset = 0;
>>>>       bool update_spm_vmid_needed = (job->vm && 
>>>> (job->vm->reserved_vmid[vmhub] != NULL));
>>>>       int r;
>>>>   -    if (update_spm_vmid_needed && 
>>>> adev->gfx.rlc.funcs->update_spm_vmid)
>>>> +    if (update_spm_vmid_needed && 
>>>> adev->gfx.rlc.funcs->update_spm_vmid) {
>>>>           adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid);
>>>>   +        trap_handler_needed = ring->funcs->emit_trap_handler && (
>>>> +            id->tba_addr != job->tba_addr ||
>>>> +            id->tma_addr != job->tma_addr);
>>>
>>> That's probably not such a good idea since it makes the trap handler 
>>> depend on the VMID reservation.
>>>
>>>> +    }
>>>> +
>>>>       if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>>>>           gds_switch_needed = true;
>>>> +        trap_handler_needed = true;
>>>>           vm_flush_needed = true;
>>>>           pasid_mapping_needed = true;
>>>>       }
>>>> @@ -1099,12 +1106,14 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>>> *ring, struct amdgpu_job *job,
>>>>       mutex_unlock(&id_mgr->lock);
>>>>         gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>>>> +    trap_handler_needed &= !!ring->funcs->emit_trap_handler;
>>>>       vm_flush_needed &= !!ring->funcs->emit_vm_flush &&
>>>>               job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>>>>       pasid_mapping_needed &= 
>>>> adev->gmc.gmc_funcs->emit_pasid_mapping &&
>>>>           ring->funcs->emit_wreg;
>>>>   -    if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>>> +    if (!vm_flush_needed && !gds_switch_needed &&
>>>> +        !trap_handler_needed && !need_pipe_sync)
>>>>           return 0;
>>>>         if (ring->funcs->init_cond_exec)
>>>> @@ -1158,6 +1167,13 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>>> *ring, struct amdgpu_job *job,
>>>>                           job->oa_size);
>>>>       }
>>>>   +    if (ring->funcs->emit_trap_handler && trap_handler_needed) {
>>>> +        id->tba_addr = job->tba_addr;
>>>> +        id->tma_addr = job->tma_addr;
>>>> +        amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
>>>> +                          job->tma_addr);
>>>> +    }
>>>> +
>>>
>>> Well that doesn't seem to make sense at all here.
>>>
>>>>       if (ring->funcs->patch_cond_exec)
>>>>           amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index 65997ffaed45..f864b217589f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -7141,6 +7141,24 @@ static void 
>>>> gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>>                       (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>>   }
>>>>   +static void gfx_v10_0_ring_emit_trap_handler(struct amdgpu_ring 
>>>> *ring,
>>>> +                        uint32_t vmid,
>>>> +                        uint64_t tba_addr,
>>>> +                        uint64_t tma_addr)
>>>> +{
>>>> +    struct amdgpu_device *adev = ring->adev;
>>>> +
>>>> +    mutex_lock(&adev->srbm_mutex);
>>>> +    nv_grbm_select(adev, 0, 0, 0, vmid);
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>>>> lower_32_bits(tba_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>>>> upper_32_bits(tba_addr >> 8) |
>>>> +                1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>>>> lower_32_bits(tma_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>>>> upper_32_bits(tma_addr >> 8));
>>>> +    nv_grbm_select(adev, 0, 0, 0, 0);
>>>> +    mutex_unlock(&adev->srbm_mutex);
>>>
>>> This is not emitting the trap handler update to the ring, but 
>>> writing it directly to the registers.
>>
>> This uses direct driver access because we don't know how to emit them 
>> via CP. This is also why it's only emitted if a VMID is reserved.
>>
>> I think Alex is having a discussion with the CP team about that.
>
> Ah! Ok in this case please keep the patch on hold until this is cleared.
>
> But even when the VMID is reserved this could cause big problems if 
> userspace decides to change the trap addresses on the fly.
>
> So we can't really do it like this without waiting for the hardware to 
> be idle and causing massive performance loss.

Yes, it's quite experimental. Hopefully we will find a way to set those
registers via the CP, and the problem will be solved! :-)
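
Just to illustrate the direction I mean (a hypothetical sketch only,
nothing confirmed by the CP team, and the function name is made up): if
the firmware ever accepts these writes from the ring, the existing
emit_wreg callback could in principle carry them, something like the
fragment below. The missing piece is exactly the per-VMID selection,
which this cannot express.

/* Hypothetical sketch: assumes the CP would accept WRITE_DATA packets
 * to the per-VMID SQ_SHADER_TBA/TMA registers, which is exactly the
 * open question. It also has no way to select the target VMID from the
 * ring, so the vmid argument is unused here. */
static void gfx_v9_0_ring_emit_trap_handler_cp(struct amdgpu_ring *ring,
                                               uint32_t vmid,
                                               uint64_t tba_addr,
                                               uint64_t tma_addr)
{
        amdgpu_ring_emit_wreg(ring,
                              SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TBA_LO),
                              lower_32_bits(tba_addr >> 8));
        amdgpu_ring_emit_wreg(ring,
                              SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TBA_HI),
                              upper_32_bits(tba_addr >> 8));
        amdgpu_ring_emit_wreg(ring,
                              SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TMA_LO),
                              lower_32_bits(tma_addr >> 8));
        amdgpu_ring_emit_wreg(ring,
                              SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TMA_HI),
                              upper_32_bits(tma_addr >> 8));
}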

>
> Regards,
> Christian.
>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>> +}
>>>> +
>>>>   static int gfx_v10_0_early_init(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> @@ -8530,6 +8548,7 @@ static const struct amdgpu_ring_funcs 
>>>> gfx_v10_0_ring_funcs_gfx = {
>>>>       .emit_reg_write_reg_wait = 
>>>> gfx_v10_0_ring_emit_reg_write_reg_wait,
>>>>       .soft_recovery = gfx_v10_0_ring_soft_recovery,
>>>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>>>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>>>   };
>>>>     static const struct amdgpu_ring_funcs 
>>>> gfx_v10_0_ring_funcs_compute = {
>>>> @@ -8566,6 +8585,7 @@ static const struct amdgpu_ring_funcs 
>>>> gfx_v10_0_ring_funcs_compute = {
>>>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>>>       .emit_reg_write_reg_wait = 
>>>> gfx_v10_0_ring_emit_reg_write_reg_wait,
>>>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>>>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>>>   };
>>>>     static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index cb9d60a4e05e..4fc00f196085 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -4162,6 +4162,23 @@ static void 
>>>> gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>>                      (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>>   }
>>>>   +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring 
>>>> *ring,
>>>> +                        uint32_t vmid,
>>>> +                        uint64_t tba_addr,
>>>> +                        uint64_t tma_addr)
>>>> +{
>>>> +    struct amdgpu_device *adev = ring->adev;
>>>> +
>>>> +    mutex_lock(&adev->srbm_mutex);
>>>> +    soc15_grbm_select(adev, 0, 0, 0, vmid);
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>>>> lower_32_bits(tba_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>>>> upper_32_bits(tba_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>>>> lower_32_bits(tma_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>>>> upper_32_bits(tma_addr >> 8));
>>>> +    soc15_grbm_select(adev, 0, 0, 0, 0);
>>>> +    mutex_unlock(&adev->srbm_mutex);
>>>> +}
>>>> +
>>>>   static const u32 vgpr_init_compute_shader[] =
>>>>   {
>>>>       0xb07c0000, 0xbe8000ff,
>>>> @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs 
>>>> gfx_v9_0_ring_funcs_gfx = {
>>>>       .emit_reg_write_reg_wait = 
>>>> gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>>       .soft_recovery = gfx_v9_0_ring_soft_recovery,
>>>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>>   };
>>>>     static const struct amdgpu_ring_funcs 
>>>> gfx_v9_0_ring_funcs_compute = {
>>>> @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs 
>>>> gfx_v9_0_ring_funcs_compute = {
>>>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>>>       .emit_reg_write_reg_wait = 
>>>> gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>>   };
>>>>     static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>>>> diff --git a/include/uapi/drm/amdgpu_drm.h 
>>>> b/include/uapi/drm/amdgpu_drm.h
>>>> index 3218576e109d..7eae264adb5d 100644
>>>> --- a/include/uapi/drm/amdgpu_drm.h
>>>> +++ b/include/uapi/drm/amdgpu_drm.h
>>>> @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
>>>>   #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES    0x07
>>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
>>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
>>>> +#define AMDGPU_CHUNK_ID_TRAP            0x0a
>>>>     struct drm_amdgpu_cs_chunk {
>>>>       __u32        chunk_id;
>>>> @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
>>>>          __u64 point;
>>>>   };
>>>>   +struct drm_amdgpu_cs_chunk_trap {
>>>> +    /** Trap Base Address */
>>>> +       __u64 tba_addr;
>>>> +    /** Trap Memory Address */
>>>> +       __u64 tma_addr;
>>>> +};
>>>> +
>>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ    0
>>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD    1
>>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD    2
>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add support for user trap handlers
  2020-08-28  8:23       ` Christian König
  2020-08-28  8:25         ` Samuel Pitoiset
@ 2020-09-23 12:52         ` Samuel Pitoiset
  2020-09-23 12:59           ` Christian König
  1 sibling, 1 reply; 16+ messages in thread
From: Samuel Pitoiset @ 2020-09-23 12:52 UTC (permalink / raw)
  To: Christian König, amd-gfx


On 8/28/20 10:23 AM, Christian König wrote:
> Am 28.08.20 um 10:14 schrieb Samuel Pitoiset:
>>
>> On 8/28/20 9:57 AM, Christian König wrote:
>>> Am 25.08.20 um 16:07 schrieb Samuel Pitoiset:
>>>> A trap handler can be used by userspace to catch shader exceptions
>>>> like divide by zero, memory violations etc.
>>>>
>>>> On GFX6-GFX8, the registers used to configure TBA/TMA aren't
>>>> privileged and can be configured from userspace.
>>>>
>>>> On GFX9+ they are per VMID and privileged, only the KMD can
>>>> configure them. At the moment, we don't know how to set them
>>>> via the CP, so they are only emitted if a VMID is reserved.
>>>>
>>>> This introduces a new CS chunk that can be used to set the
>>>> TBA/TMA virtual address at submit time.
>>>>
>>>> TODO:
>>>> - rebase on top of amd-staging-drm-next (this branch currently
>>>> hangs my GPU at boot)
>>>
>>> Please split that up into multiple patches. The first one adding the 
>>> general infrastructure and the following one the implementation for 
>>> gfx9 and gfx10.
>> Sounds good.
>>>
>>> And maybe even support this for gfx6-8 even if it is not necessary? 
>>> Looks trivial to implement and would give userspace a more uniform 
>>> handling for this.
>> v1 added gfx6-8 support but I removed it in v2 as requested by Alex 
>> because it's useless and might be better for preemption.
>>>
>>> A few more comments below.
>>>
>>>>
>>>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 
>>>> ++++++++++++++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 ++-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 20 +++++++++++++--
>>>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 20 +++++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++++++
>>>>   include/uapi/drm/amdgpu_drm.h            |  8 ++++++
>>>>   9 files changed, 110 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> index a512ccbc4dea..6ca5c4912e3a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> @@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct 
>>>> amdgpu_cs_parser *p,
>>>>       return r;
>>>>   }
>>>>   +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
>>>> +                     struct drm_amdgpu_cs_chunk_trap *data,
>>>> +                     uint64_t *tba_addr, uint64_t *tma_addr)
>>>> +{
>>>> +    if (!data->tba_addr || !data->tma_addr)
>>>> +        return -EINVAL;
>>>> +
>>>> +    *tba_addr = data->tba_addr;
>>>> +    *tma_addr = data->tma_addr;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>   static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, 
>>>> union drm_amdgpu_cs *cs)
>>>>   {
>>>>       struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>>>> @@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct 
>>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>>       uint64_t *chunk_array;
>>>>       unsigned size, num_ibs = 0;
>>>>       uint32_t uf_offset = 0;
>>>> +    uint64_t tba_addr = 0, tma_addr = 0;
>>>>       int i;
>>>>       int ret;
>>>>   @@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct 
>>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>>                 break;
>>>>   +        case AMDGPU_CHUNK_ID_TRAP:
>>>> +            size = sizeof(struct drm_amdgpu_cs_chunk_trap);
>>>> +            if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
>>>> +                ret = -EINVAL;
>>>> +                goto free_partial_kdata;
>>>> +            }
>>>> +
>>>> +            ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
>>>> +                            &tba_addr, &tma_addr);
>>>> +            if (ret)
>>>> +                goto free_partial_kdata;
>>>> +            break;
>>>> +
>>>>           case AMDGPU_CHUNK_ID_DEPENDENCIES:
>>>>           case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
>>>>           case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
>>>> @@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct 
>>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>>         if (p->uf_entry.tv.bo)
>>>>           p->job->uf_addr = uf_offset;
>>>> +
>>>> +    p->job->tba_addr = tba_addr;
>>>> +    p->job->tma_addr = tma_addr;
>>>> +
>>>>       kfree(chunk_array);
>>>>         /* Use this opportunity to fill in task info for the vm */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index 26127c7d2f32..1e703119e4c2 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -88,9 +88,10 @@
>>>>    * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for 
>>>> correctness
>>>>    * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
>>>>    * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
>>>> + * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
>>>>    */
>>>>   #define KMS_DRIVER_MAJOR    3
>>>> -#define KMS_DRIVER_MINOR    39
>>>> +#define KMS_DRIVER_MINOR    40
>>>>   #define KMS_DRIVER_PATCHLEVEL    0
>>>>     int amdgpu_vram_limit = 0;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> index 8e58325bbca2..fd0d56724b4d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> @@ -58,6 +58,10 @@ struct amdgpu_vmid {
>>>>       uint32_t        oa_base;
>>>>       uint32_t        oa_size;
>>>>   +    /* user trap */
>>>> +    uint64_t        tba_addr;
>>>> +    uint64_t        tma_addr;
>>>> +
>>>>       unsigned        pasid;
>>>>       struct dma_fence    *pasid_mapping;
>>>>   };
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index 81caac9b958a..b8ed5b13ea44 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -62,6 +62,10 @@ struct amdgpu_job {
>>>>       /* user fence handling */
>>>>       uint64_t        uf_addr;
>>>>       uint64_t        uf_sequence;
>>>> +
>>>> +    /* user trap handling */
>>>> +    uint64_t        tba_addr;
>>>> +    uint64_t        tma_addr;
>>>>   };
>>>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index da871d84b742..1f165a6295d9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
>>>>       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>>>       int (*preempt_ib)(struct amdgpu_ring *ring);
>>>>       void (*emit_mem_sync)(struct amdgpu_ring *ring);
>>>> +    void (*emit_trap_handler)(struct amdgpu_ring *ring,
>>>> +                  uint32_t vmid,
>>>> +                  uint64_t tba_addr, uint64_t tma_addr);
>>>>   };
>>>>     struct amdgpu_ring {
>>>> @@ -265,6 +268,7 @@ struct amdgpu_ring {
>>>>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) 
>>>> (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>>>>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) 
>>>> (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>>>>   #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) 
>>>> (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), 
>>>> (as))
>>>> +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) 
>>>> (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
>>>>   #define amdgpu_ring_emit_hdp_flush(r) 
>>>> (r)->funcs->emit_hdp_flush((r))
>>>>   #define amdgpu_ring_emit_switch_buffer(r) 
>>>> (r)->funcs->emit_switch_buffer((r))
>>>>   #define amdgpu_ring_emit_cntxcntl(r, d) 
>>>> (r)->funcs->emit_cntxcntl((r), (d))
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index 71e005cf2952..6b619bb03777 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -1079,15 +1079,22 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>>> *ring, struct amdgpu_job *job,
>>>>       bool vm_flush_needed = job->vm_needs_flush;
>>>>       struct dma_fence *fence = NULL;
>>>>       bool pasid_mapping_needed = false;
>>>> +    bool trap_handler_needed = false;
>>>>       unsigned patch_offset = 0;
>>>>       bool update_spm_vmid_needed = (job->vm && 
>>>> (job->vm->reserved_vmid[vmhub] != NULL));
>>>>       int r;
>>>>   -    if (update_spm_vmid_needed && 
>>>> adev->gfx.rlc.funcs->update_spm_vmid)
>>>> +    if (update_spm_vmid_needed && 
>>>> adev->gfx.rlc.funcs->update_spm_vmid) {
>>>>           adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid);
>>>>   +        trap_handler_needed = ring->funcs->emit_trap_handler && (
>>>> +            id->tba_addr != job->tba_addr ||
>>>> +            id->tma_addr != job->tma_addr);
>>>
>>> That's probably not such a good idea since it makes the trap handler 
>>> depend on the VMID reservation.
>>>
>>>> +    }
>>>> +
>>>>       if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>>>>           gds_switch_needed = true;
>>>> +        trap_handler_needed = true;
>>>>           vm_flush_needed = true;
>>>>           pasid_mapping_needed = true;
>>>>       }
>>>> @@ -1099,12 +1106,14 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>>> *ring, struct amdgpu_job *job,
>>>>       mutex_unlock(&id_mgr->lock);
>>>>         gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>>>> +    trap_handler_needed &= !!ring->funcs->emit_trap_handler;
>>>>       vm_flush_needed &= !!ring->funcs->emit_vm_flush &&
>>>>               job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>>>>       pasid_mapping_needed &= 
>>>> adev->gmc.gmc_funcs->emit_pasid_mapping &&
>>>>           ring->funcs->emit_wreg;
>>>>   -    if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>>> +    if (!vm_flush_needed && !gds_switch_needed &&
>>>> +        !trap_handler_needed && !need_pipe_sync)
>>>>           return 0;
>>>>         if (ring->funcs->init_cond_exec)
>>>> @@ -1158,6 +1167,13 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>>> *ring, struct amdgpu_job *job,
>>>>                           job->oa_size);
>>>>       }
>>>>   +    if (ring->funcs->emit_trap_handler && trap_handler_needed) {
>>>> +        id->tba_addr = job->tba_addr;
>>>> +        id->tma_addr = job->tma_addr;
>>>> +        amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
>>>> +                          job->tma_addr);
>>>> +    }
>>>> +
>>>
>>> Well that doesn't seem to make sense at all here.
>>>
>>>>       if (ring->funcs->patch_cond_exec)
>>>>           amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index 65997ffaed45..f864b217589f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -7141,6 +7141,24 @@ static void 
>>>> gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>>                       (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>>   }
>>>>   +static void gfx_v10_0_ring_emit_trap_handler(struct amdgpu_ring 
>>>> *ring,
>>>> +                        uint32_t vmid,
>>>> +                        uint64_t tba_addr,
>>>> +                        uint64_t tma_addr)
>>>> +{
>>>> +    struct amdgpu_device *adev = ring->adev;
>>>> +
>>>> +    mutex_lock(&adev->srbm_mutex);
>>>> +    nv_grbm_select(adev, 0, 0, 0, vmid);
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>>>> lower_32_bits(tba_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>>>> upper_32_bits(tba_addr >> 8) |
>>>> +                1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>>>> lower_32_bits(tma_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>>>> upper_32_bits(tma_addr >> 8));
>>>> +    nv_grbm_select(adev, 0, 0, 0, 0);
>>>> +    mutex_unlock(&adev->srbm_mutex);
>>>
>>> This is not emitting the trap handler update to the ring, but 
>>> writing it directly to the registers.
>>
>> This uses direct driver access because we don't know how to emit them 
>> via CP. This is also why it's only emitted if a VMID is reserved.
>>
>> I think Alex is having a discussion with the CP team about that.
>
> Ah! Ok in this case please keep the patch on hold until this is cleared.
>
> But even when the VMID is reserved this could cause big problems if 
> userspace decides to change the trap addresses on the fly.
>
> So we can't really do it like this without waiting for the hardware to 
> be idle and causing massive performance loss.

So, according to Alex, nothing is going to happen on the CP side for
current GFX9-10 ASICs, but we would still like to be able to use this
behind the VMID reservation logic.
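
For completeness, the VMID reservation this relies on is already exposed
through the VM ioctl; a minimal userspace sketch (raw libdrm call, error
handling omitted, drm_fd assumed to be an open render node):

#include <xf86drm.h>
#include <amdgpu_drm.h>   /* libdrm copy of the uAPI header */

/* Sketch only: reserve a VMID for this DRM file so the kernel can
 * safely program the per-VMID TBA/TMA registers for it. */
static int reserve_vmid(int drm_fd)
{
        union drm_amdgpu_vm args = {
                .in = { .op = AMDGPU_VM_OP_RESERVE_VMID, .flags = 0 },
        };

        return drmCommandWriteRead(drm_fd, DRM_AMDGPU_VM,
                                   &args, sizeof(args));
}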

Do you have any more comments on this patch?

>
> Regards,
> Christian.
>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>> +}
>>>> +
>>>>   static int gfx_v10_0_early_init(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> @@ -8530,6 +8548,7 @@ static const struct amdgpu_ring_funcs 
>>>> gfx_v10_0_ring_funcs_gfx = {
>>>>       .emit_reg_write_reg_wait = 
>>>> gfx_v10_0_ring_emit_reg_write_reg_wait,
>>>>       .soft_recovery = gfx_v10_0_ring_soft_recovery,
>>>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>>>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>>>   };
>>>>     static const struct amdgpu_ring_funcs 
>>>> gfx_v10_0_ring_funcs_compute = {
>>>> @@ -8566,6 +8585,7 @@ static const struct amdgpu_ring_funcs 
>>>> gfx_v10_0_ring_funcs_compute = {
>>>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>>>       .emit_reg_write_reg_wait = 
>>>> gfx_v10_0_ring_emit_reg_write_reg_wait,
>>>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>>>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>>>   };
>>>>     static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index cb9d60a4e05e..4fc00f196085 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -4162,6 +4162,23 @@ static void 
>>>> gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>>                      (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>>   }
>>>>   +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring 
>>>> *ring,
>>>> +                        uint32_t vmid,
>>>> +                        uint64_t tba_addr,
>>>> +                        uint64_t tma_addr)
>>>> +{
>>>> +    struct amdgpu_device *adev = ring->adev;
>>>> +
>>>> +    mutex_lock(&adev->srbm_mutex);
>>>> +    soc15_grbm_select(adev, 0, 0, 0, vmid);
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>>>> lower_32_bits(tba_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>>>> upper_32_bits(tba_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>>>> lower_32_bits(tma_addr >> 8));
>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>>>> upper_32_bits(tma_addr >> 8));
>>>> +    soc15_grbm_select(adev, 0, 0, 0, 0);
>>>> +    mutex_unlock(&adev->srbm_mutex);
>>>> +}
>>>> +
>>>>   static const u32 vgpr_init_compute_shader[] =
>>>>   {
>>>>       0xb07c0000, 0xbe8000ff,
>>>> @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs 
>>>> gfx_v9_0_ring_funcs_gfx = {
>>>>       .emit_reg_write_reg_wait = 
>>>> gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>>       .soft_recovery = gfx_v9_0_ring_soft_recovery,
>>>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>>   };
>>>>     static const struct amdgpu_ring_funcs 
>>>> gfx_v9_0_ring_funcs_compute = {
>>>> @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs 
>>>> gfx_v9_0_ring_funcs_compute = {
>>>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>>>       .emit_reg_write_reg_wait = 
>>>> gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>>   };
>>>>     static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>>>> diff --git a/include/uapi/drm/amdgpu_drm.h 
>>>> b/include/uapi/drm/amdgpu_drm.h
>>>> index 3218576e109d..7eae264adb5d 100644
>>>> --- a/include/uapi/drm/amdgpu_drm.h
>>>> +++ b/include/uapi/drm/amdgpu_drm.h
>>>> @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
>>>>   #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES    0x07
>>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
>>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
>>>> +#define AMDGPU_CHUNK_ID_TRAP            0x0a
>>>>     struct drm_amdgpu_cs_chunk {
>>>>       __u32        chunk_id;
>>>> @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
>>>>          __u64 point;
>>>>   };
>>>>   +struct drm_amdgpu_cs_chunk_trap {
>>>> +    /** Trap Base Address */
>>>> +       __u64 tba_addr;
>>>> +    /** Trap Memory Address */
>>>> +       __u64 tma_addr;
>>>> +};
>>>> +
>>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ    0
>>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD    1
>>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD    2
>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add support for user trap handlers
  2020-09-23 12:52         ` Samuel Pitoiset
@ 2020-09-23 12:59           ` Christian König
  0 siblings, 0 replies; 16+ messages in thread
From: Christian König @ 2020-09-23 12:59 UTC (permalink / raw)
  To: Samuel Pitoiset, Christian König, amd-gfx

On 23.09.20 at 14:52, Samuel Pitoiset wrote:
>
> On 8/28/20 10:23 AM, Christian König wrote:
>> On 28.08.20 at 10:14, Samuel Pitoiset wrote:
>>>
>>> On 8/28/20 9:57 AM, Christian König wrote:
>>>> On 25.08.20 at 16:07, Samuel Pitoiset wrote:
>>>>> A trap handler can be used by userspace to catch shader exceptions
>>>>> like divide by zero, memory violations etc.
>>>>>
>>>>> On GFX6-GFX8, the registers used to configure TBA/TMA aren't
>>>>> privileged and can be configured from userspace.
>>>>>
>>>>> On GFX9+ they are per VMID and privileged; only the KMD can
>>>>> configure them. At the moment, we don't know how to set them
>>>>> via the CP, so they are only emitted if a VMID is reserved.
>>>>>
>>>>> This introduces a new CS chunk that can be used to set the
>>>>> TBA/TMA virtual address at submit time.
>>>>>
>>>>> TODO:
>>>>> - rebase on top of amd-staging-drm-next (this branch currently
>>>>> hangs my GPU at boot)
>>>>
>>>> Please split that up into multiple patches. The first one adding 
>>>> the general infrastructure and the following one the implementation 
>>>> for gfx9 and gfx10.
>>> Sounds good.
>>>>
>>>> And maybe even support this for gfx6-8 even if it is not necessary? 
>>>> Looks trivial to implement and would give userspace a more uniform 
>>>> handling for this.
>>> v1 added gfx6-8 support, but I removed it in v2 as requested by Alex
>>> because it's not needed there and leaving it out might be better for
>>> preemption.
>>>>
>>>> A few more comments below.
>>>>
>>>>>
>>>>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 
>>>>> ++++++++++++++++++++++++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 ++-
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 +++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 +++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 +++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 20 +++++++++++++--
>>>>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 20 +++++++++++++++
>>>>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 +++++++++++++++
>>>>>   include/uapi/drm/amdgpu_drm.h            |  8 ++++++
>>>>>   9 files changed, 110 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> index a512ccbc4dea..6ca5c4912e3a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> @@ -104,6 +104,19 @@ static int amdgpu_cs_bo_handles_chunk(struct 
>>>>> amdgpu_cs_parser *p,
>>>>>       return r;
>>>>>   }
>>>>>   +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
>>>>> +                     struct drm_amdgpu_cs_chunk_trap *data,
>>>>> +                     uint64_t *tba_addr, uint64_t *tma_addr)
>>>>> +{
>>>>> +    if (!data->tba_addr || !data->tma_addr)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    *tba_addr = data->tba_addr;
>>>>> +    *tma_addr = data->tma_addr;
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>>   static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, 
>>>>> union drm_amdgpu_cs *cs)
>>>>>   {
>>>>>       struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>>>>> @@ -112,6 +125,7 @@ static int amdgpu_cs_parser_init(struct 
>>>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>>>       uint64_t *chunk_array;
>>>>>       unsigned size, num_ibs = 0;
>>>>>       uint32_t uf_offset = 0;
>>>>> +    uint64_t tba_addr = 0, tma_addr = 0;
>>>>>       int i;
>>>>>       int ret;
>>>>>   @@ -214,6 +228,19 @@ static int amdgpu_cs_parser_init(struct 
>>>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>>>                 break;
>>>>>   +        case AMDGPU_CHUNK_ID_TRAP:
>>>>> +            size = sizeof(struct drm_amdgpu_cs_chunk_trap);
>>>>> +            if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
>>>>> +                ret = -EINVAL;
>>>>> +                goto free_partial_kdata;
>>>>> +            }
>>>>> +
>>>>> +            ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
>>>>> +                            &tba_addr, &tma_addr);
>>>>> +            if (ret)
>>>>> +                goto free_partial_kdata;
>>>>> +            break;
>>>>> +
>>>>>           case AMDGPU_CHUNK_ID_DEPENDENCIES:
>>>>>           case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
>>>>>           case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
>>>>> @@ -239,6 +266,10 @@ static int amdgpu_cs_parser_init(struct 
>>>>> amdgpu_cs_parser *p, union drm_amdgpu_cs
>>>>>         if (p->uf_entry.tv.bo)
>>>>>           p->job->uf_addr = uf_offset;
>>>>> +
>>>>> +    p->job->tba_addr = tba_addr;
>>>>> +    p->job->tma_addr = tma_addr;
>>>>> +
>>>>>       kfree(chunk_array);
>>>>>         /* Use this opportunity to fill in task info for the vm */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> index 26127c7d2f32..1e703119e4c2 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> @@ -88,9 +88,10 @@
>>>>>    * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for 
>>>>> correctness
>>>>>    * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
>>>>>    * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
>>>>> + * - 3.40.0 - Add AMDGPU_CHUNK_ID_TRAP
>>>>>    */
>>>>>   #define KMS_DRIVER_MAJOR    3
>>>>> -#define KMS_DRIVER_MINOR    39
>>>>> +#define KMS_DRIVER_MINOR    40
>>>>>   #define KMS_DRIVER_PATCHLEVEL    0
>>>>>     int amdgpu_vram_limit = 0;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>>> index 8e58325bbca2..fd0d56724b4d 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>>> @@ -58,6 +58,10 @@ struct amdgpu_vmid {
>>>>>       uint32_t        oa_base;
>>>>>       uint32_t        oa_size;
>>>>>   +    /* user trap */
>>>>> +    uint64_t        tba_addr;
>>>>> +    uint64_t        tma_addr;
>>>>> +
>>>>>       unsigned        pasid;
>>>>>       struct dma_fence    *pasid_mapping;
>>>>>   };
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> index 81caac9b958a..b8ed5b13ea44 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> @@ -62,6 +62,10 @@ struct amdgpu_job {
>>>>>       /* user fence handling */
>>>>>       uint64_t        uf_addr;
>>>>>       uint64_t        uf_sequence;
>>>>> +
>>>>> +    /* user trap handling */
>>>>> +    uint64_t        tba_addr;
>>>>> +    uint64_t        tma_addr;
>>>>>   };
>>>>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned 
>>>>> num_ibs,
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> index da871d84b742..1f165a6295d9 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> @@ -197,6 +197,9 @@ struct amdgpu_ring_funcs {
>>>>>       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>>>>       int (*preempt_ib)(struct amdgpu_ring *ring);
>>>>>       void (*emit_mem_sync)(struct amdgpu_ring *ring);
>>>>> +    void (*emit_trap_handler)(struct amdgpu_ring *ring,
>>>>> +                  uint32_t vmid,
>>>>> +                  uint64_t tba_addr, uint64_t tma_addr);
>>>>>   };
>>>>>     struct amdgpu_ring {
>>>>> @@ -265,6 +268,7 @@ struct amdgpu_ring {
>>>>>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) 
>>>>> (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>>>>>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) 
>>>>> (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>>>>>   #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, 
>>>>> as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), 
>>>>> (ab), (as))
>>>>> +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) 
>>>>> (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
>>>>>   #define amdgpu_ring_emit_hdp_flush(r) 
>>>>> (r)->funcs->emit_hdp_flush((r))
>>>>>   #define amdgpu_ring_emit_switch_buffer(r) 
>>>>> (r)->funcs->emit_switch_buffer((r))
>>>>>   #define amdgpu_ring_emit_cntxcntl(r, d) 
>>>>> (r)->funcs->emit_cntxcntl((r), (d))
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> index 71e005cf2952..6b619bb03777 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> @@ -1079,15 +1079,22 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>>>> *ring, struct amdgpu_job *job,
>>>>>       bool vm_flush_needed = job->vm_needs_flush;
>>>>>       struct dma_fence *fence = NULL;
>>>>>       bool pasid_mapping_needed = false;
>>>>> +    bool trap_handler_needed = false;
>>>>>       unsigned patch_offset = 0;
>>>>>       bool update_spm_vmid_needed = (job->vm && 
>>>>> (job->vm->reserved_vmid[vmhub] != NULL));
>>>>>       int r;
>>>>>   -    if (update_spm_vmid_needed && 
>>>>> adev->gfx.rlc.funcs->update_spm_vmid)
>>>>> +    if (update_spm_vmid_needed && 
>>>>> adev->gfx.rlc.funcs->update_spm_vmid) {
>>>>>           adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid);
>>>>>   +        trap_handler_needed = ring->funcs->emit_trap_handler && (
>>>>> +            id->tba_addr != job->tba_addr ||
>>>>> +            id->tma_addr != job->tma_addr);
>>>>
>>>> That's probably not such a good idea since it makes the trap 
>>>> handler depend on the VMID reservation.
>>>>
>>>>> +    }
>>>>> +
>>>>>       if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>>>>>           gds_switch_needed = true;
>>>>> +        trap_handler_needed = true;
>>>>>           vm_flush_needed = true;
>>>>>           pasid_mapping_needed = true;
>>>>>       }
>>>>> @@ -1099,12 +1106,14 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>>>> *ring, struct amdgpu_job *job,
>>>>>       mutex_unlock(&id_mgr->lock);
>>>>>         gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>>>>> +    trap_handler_needed &= !!ring->funcs->emit_trap_handler;
>>>>>       vm_flush_needed &= !!ring->funcs->emit_vm_flush &&
>>>>>               job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>>>>>       pasid_mapping_needed &= 
>>>>> adev->gmc.gmc_funcs->emit_pasid_mapping &&
>>>>>           ring->funcs->emit_wreg;
>>>>>   -    if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>>>> +    if (!vm_flush_needed && !gds_switch_needed &&
>>>>> +        !trap_handler_needed && !need_pipe_sync)
>>>>>           return 0;
>>>>>         if (ring->funcs->init_cond_exec)
>>>>> @@ -1158,6 +1167,13 @@ int amdgpu_vm_flush(struct amdgpu_ring 
>>>>> *ring, struct amdgpu_job *job,
>>>>>                           job->oa_size);
>>>>>       }
>>>>>   +    if (ring->funcs->emit_trap_handler && trap_handler_needed) {
>>>>> +        id->tba_addr = job->tba_addr;
>>>>> +        id->tma_addr = job->tma_addr;
>>>>> +        amdgpu_ring_emit_trap_handler(ring, job->vmid, 
>>>>> job->tba_addr,
>>>>> +                          job->tma_addr);
>>>>> +    }
>>>>> +
>>>>
>>>> Well that doesn't seem to make sense at all here.
>>>>
>>>>>       if (ring->funcs->patch_cond_exec)
>>>>>           amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> index 65997ffaed45..f864b217589f 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> @@ -7141,6 +7141,24 @@ static void 
>>>>> gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>>>                       (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>>>   }
>>>>>   +static void gfx_v10_0_ring_emit_trap_handler(struct amdgpu_ring 
>>>>> *ring,
>>>>> +                        uint32_t vmid,
>>>>> +                        uint64_t tba_addr,
>>>>> +                        uint64_t tma_addr)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = ring->adev;
>>>>> +
>>>>> +    mutex_lock(&adev->srbm_mutex);
>>>>> +    nv_grbm_select(adev, 0, 0, 0, vmid);
>>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>>>>> lower_32_bits(tba_addr >> 8));
>>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>>>>> upper_32_bits(tba_addr >> 8) |
>>>>> +                1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
>>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>>>>> lower_32_bits(tma_addr >> 8));
>>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>>>>> upper_32_bits(tma_addr >> 8));
>>>>> +    nv_grbm_select(adev, 0, 0, 0, 0);
>>>>> +    mutex_unlock(&adev->srbm_mutex);
>>>>
>>>> This is not emitting the trap handler update to the ring, but 
>>>> writing it directly to the registers.
>>>
>>> This uses direct register access from the driver because we don't know
>>> how to emit these writes via the CP. This is also why they are only
>>> emitted if a VMID is reserved.
>>>
>>> I think Alex is having a discussion with the CP team about that.
>>
>> Ah! Ok in this case please keep the patch on hold until this is cleared.
>>
>> But even when the VMID is reserved, this could cause big problems if 
>> userspace decides to change the trap addresses on the fly.
>>
>> So we can't really do it like this without waiting for the hardware 
>> to be idle, and that would cause a massive performance loss.
>
> So, according to Alex, nothing will happen for current GFX9-10 ASICs 
> but we would like to be able to use that behind the vmid reservation 
> logic.
>
> Do you have any more comments on this patch?

In this case please completely drop this patch and use some approach 
where you update the TRAP handler for the reserved VMID only.

Regards,
Christian.

>
>>
>> Regards,
>> Christian.
>>
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> +}
>>>>> +
>>>>>   static int gfx_v10_0_early_init(void *handle)
>>>>>   {
>>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>> @@ -8530,6 +8548,7 @@ static const struct amdgpu_ring_funcs 
>>>>> gfx_v10_0_ring_funcs_gfx = {
>>>>>       .emit_reg_write_reg_wait = 
>>>>> gfx_v10_0_ring_emit_reg_write_reg_wait,
>>>>>       .soft_recovery = gfx_v10_0_ring_soft_recovery,
>>>>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>>>>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>>>>   };
>>>>>     static const struct amdgpu_ring_funcs 
>>>>> gfx_v10_0_ring_funcs_compute = {
>>>>> @@ -8566,6 +8585,7 @@ static const struct amdgpu_ring_funcs 
>>>>> gfx_v10_0_ring_funcs_compute = {
>>>>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>>>>       .emit_reg_write_reg_wait = 
>>>>> gfx_v10_0_ring_emit_reg_write_reg_wait,
>>>>>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>>>>> +    .emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>>>>>   };
>>>>>     static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq 
>>>>> = {
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> index cb9d60a4e05e..4fc00f196085 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> @@ -4162,6 +4162,23 @@ static void 
>>>>> gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>>>                      (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>>>   }
>>>>>   +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring 
>>>>> *ring,
>>>>> +                        uint32_t vmid,
>>>>> +                        uint64_t tba_addr,
>>>>> +                        uint64_t tma_addr)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = ring->adev;
>>>>> +
>>>>> +    mutex_lock(&adev->srbm_mutex);
>>>>> +    soc15_grbm_select(adev, 0, 0, 0, vmid);
>>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, 
>>>>> lower_32_bits(tba_addr >> 8));
>>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, 
>>>>> upper_32_bits(tba_addr >> 8));
>>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, 
>>>>> lower_32_bits(tma_addr >> 8));
>>>>> +    WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, 
>>>>> upper_32_bits(tma_addr >> 8));
>>>>> +    soc15_grbm_select(adev, 0, 0, 0, 0);
>>>>> +    mutex_unlock(&adev->srbm_mutex);
>>>>> +}
>>>>> +
>>>>>   static const u32 vgpr_init_compute_shader[] =
>>>>>   {
>>>>>       0xb07c0000, 0xbe8000ff,
>>>>> @@ -6720,6 +6737,7 @@ static const struct amdgpu_ring_funcs 
>>>>> gfx_v9_0_ring_funcs_gfx = {
>>>>>       .emit_reg_write_reg_wait = 
>>>>> gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>>>       .soft_recovery = gfx_v9_0_ring_soft_recovery,
>>>>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>>>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>>>   };
>>>>>     static const struct amdgpu_ring_funcs 
>>>>> gfx_v9_0_ring_funcs_compute = {
>>>>> @@ -6756,6 +6774,7 @@ static const struct amdgpu_ring_funcs 
>>>>> gfx_v9_0_ring_funcs_compute = {
>>>>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>>>>       .emit_reg_write_reg_wait = 
>>>>> gfx_v9_0_ring_emit_reg_write_reg_wait,
>>>>>       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>>>>> +    .emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>>>>>   };
>>>>>     static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>>>>> diff --git a/include/uapi/drm/amdgpu_drm.h 
>>>>> b/include/uapi/drm/amdgpu_drm.h
>>>>> index 3218576e109d..7eae264adb5d 100644
>>>>> --- a/include/uapi/drm/amdgpu_drm.h
>>>>> +++ b/include/uapi/drm/amdgpu_drm.h
>>>>> @@ -551,6 +551,7 @@ struct drm_amdgpu_gem_va {
>>>>>   #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES    0x07
>>>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
>>>>>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
>>>>> +#define AMDGPU_CHUNK_ID_TRAP            0x0a
>>>>>     struct drm_amdgpu_cs_chunk {
>>>>>       __u32        chunk_id;
>>>>> @@ -645,6 +646,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
>>>>>          __u64 point;
>>>>>   };
>>>>>   +struct drm_amdgpu_cs_chunk_trap {
>>>>> +    /** Trap Base Address */
>>>>> +       __u64 tba_addr;
>>>>> +    /** Trap Memory Address */
>>>>> +       __u64 tma_addr;
>>>>> +};
>>>>> +
>>>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ    0
>>>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD    1
>>>>>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD    2
>>>>
>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v3] drm/amdgpu: add support for user trap handlers
  2020-08-24 11:49 [RFC PATCH] drm/amdgpu: add support for user trap handlers Samuel Pitoiset
                   ` (2 preceding siblings ...)
  2020-08-25 14:07 ` [PATCH v2] " Samuel Pitoiset
@ 2021-05-06  6:54 ` Samuel Pitoiset
  2021-05-06  7:09   ` Samuel Pitoiset
  3 siblings, 1 reply; 16+ messages in thread
From: Samuel Pitoiset @ 2021-05-06  6:54 UTC (permalink / raw)
  To: amd-gfx; +Cc: Samuel Pitoiset

A trap handler can be used by userspace to catch shader exceptions
like divide by zero, memory violations etc.

On GFX6-GFX8, the registers used to configure TBA/TMA aren't
privileged and can be configured from userspace.

On GFX9+ they are per VMID and privileged; only the KMD can
configure them. At the moment, we don't know how to set them
via the CP, so they are only emitted if a VMID is reserved.

This introduces a new CS chunk that can be used to set the
TBA/TMA virtual address at submit time.
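
Purely as an illustration of the new uapi (not part of this patch),
userspace could reserve a VMID and attach the new chunk to a submission
roughly as sketched below. Everything except AMDGPU_CHUNK_ID_TRAP and
struct drm_amdgpu_cs_chunk_trap is pre-existing amdgpu uapi, the helper
names are invented for the example, and BO list setup, error handling and
the usual libdrm wrappers are omitted:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include "amdgpu_drm.h"	/* uapi header touched by this patch */

/* Reserve a VMID for the process VM; on GFX9+ the kernel only programs
 * TBA/TMA for submissions when a VMID is reserved. */
static int example_reserve_vmid(int fd)
{
	union drm_amdgpu_vm vm;

	memset(&vm, 0, sizeof(vm));
	vm.in.op = AMDGPU_VM_OP_RESERVE_VMID;
	return ioctl(fd, DRM_IOCTL_AMDGPU_VM, &vm);
}

/* Attach an AMDGPU_CHUNK_ID_TRAP chunk to a CS submission. ib_data is a
 * prepared IB chunk; tba_addr/tma_addr are the GPU virtual addresses of
 * the trap handler code and trap handler memory. */
static int example_submit_with_trap(int fd, uint32_t ctx_id,
				    struct drm_amdgpu_cs_chunk_ib *ib_data,
				    uint64_t tba_addr, uint64_t tma_addr)
{
	struct drm_amdgpu_cs_chunk_trap trap_data = {
		.tba_addr = tba_addr,
		.tma_addr = tma_addr,
	};
	struct drm_amdgpu_cs_chunk chunks[2];
	uint64_t chunk_ptrs[2];
	union drm_amdgpu_cs cs;

	chunks[0].chunk_id = AMDGPU_CHUNK_ID_IB;
	chunks[0].length_dw = sizeof(*ib_data) / 4;
	chunks[0].chunk_data = (uintptr_t)ib_data;

	chunks[1].chunk_id = AMDGPU_CHUNK_ID_TRAP;
	chunks[1].length_dw = sizeof(trap_data) / 4;
	chunks[1].chunk_data = (uintptr_t)&trap_data;

	chunk_ptrs[0] = (uintptr_t)&chunks[0];
	chunk_ptrs[1] = (uintptr_t)&chunks[1];

	memset(&cs, 0, sizeof(cs));
	cs.in.ctx_id = ctx_id;
	cs.in.num_chunks = 2;
	cs.in.chunks = (uintptr_t)chunk_ptrs;

	return ioctl(fd, DRM_IOCTL_AMDGPU_CS, &cs);
}

The kernel side then picks the addresses up in amdgpu_cs_parser_init() and
programs them from amdgpu_vm_flush(), as shown in the diff below.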

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 ++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 20 +++++++++-
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 47 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 ++++++++++
 include/uapi/drm/amdgpu_drm.h            |  8 ++++
 9 files changed, 136 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 90136f9dedd6..0cc9f5eb0484 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -103,6 +103,19 @@ static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p,
 	return r;
 }
 
+static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
+				     struct drm_amdgpu_cs_chunk_trap *data,
+				     uint64_t *tba_addr, uint64_t *tma_addr)
+{
+	if (!data->tba_addr || !data->tma_addr)
+		return -EINVAL;
+
+	*tba_addr = data->tba_addr;
+	*tma_addr = data->tma_addr;
+
+	return 0;
+}
+
 static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
@@ -111,6 +124,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 	uint64_t *chunk_array;
 	unsigned size, num_ibs = 0;
 	uint32_t uf_offset = 0;
+	uint64_t tba_addr = 0, tma_addr = 0;
 	int i;
 	int ret;
 
@@ -213,6 +227,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 
 			break;
 
+		case AMDGPU_CHUNK_ID_TRAP:
+			size = sizeof(struct drm_amdgpu_cs_chunk_trap);
+			if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
+				ret = -EINVAL;
+				goto free_partial_kdata;
+			}
+
+			ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
+							&tba_addr, &tma_addr);
+			if (ret)
+				goto free_partial_kdata;
+			break;
+
 		case AMDGPU_CHUNK_ID_DEPENDENCIES:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
@@ -238,6 +265,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 
 	if (p->uf_entry.tv.bo)
 		p->job->uf_addr = uf_offset;
+
+	p->job->tba_addr = tba_addr;
+	p->job->tma_addr = tma_addr;
+
 	kvfree(chunk_array);
 
 	/* Use this opportunity to fill in task info for the vm */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d4a40cd0fe09..21c3b6eaf359 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -94,9 +94,10 @@
  * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
  * - 3.40.0 - Add AMDGPU_IDS_FLAGS_TMZ
  * - 3.41.0 - Add video codec query
+ * - 3.42.0 - Add AMDGPU_CHUNK_ID_TRAP
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	41
+#define KMS_DRIVER_MINOR	42
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
index 0c3b4fa1f936..d165970ffdd7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
@@ -58,6 +58,10 @@ struct amdgpu_vmid {
 	uint32_t		oa_base;
 	uint32_t		oa_size;
 
+	/* user trap */
+	uint64_t		tba_addr;
+	uint64_t		tma_addr;
+
 	unsigned		pasid;
 	struct dma_fence	*pasid_mapping;
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 81caac9b958a..b8ed5b13ea44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -62,6 +62,10 @@ struct amdgpu_job {
 	/* user fence handling */
 	uint64_t		uf_addr;
 	uint64_t		uf_sequence;
+
+	/* user trap handling */
+	uint64_t		tba_addr;
+	uint64_t		tma_addr;
 };
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index ca1622835296..550d71b2a1a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -199,6 +199,8 @@ struct amdgpu_ring_funcs {
 	int (*preempt_ib)(struct amdgpu_ring *ring);
 	void (*emit_mem_sync)(struct amdgpu_ring *ring);
 	void (*emit_wave_limit)(struct amdgpu_ring *ring, bool enable);
+	void (*emit_trap_handler)(struct amdgpu_ring *ring, uint32_t vmid,
+				  uint64_t tba_addr, uint64_t tma_addr);
 };
 
 struct amdgpu_ring {
@@ -263,6 +265,7 @@ struct amdgpu_ring {
 #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr))
 #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags))
 #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
+#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index f2513be72980..53552f7cc66b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1107,15 +1107,22 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 	bool vm_flush_needed = job->vm_needs_flush;
 	struct dma_fence *fence = NULL;
 	bool pasid_mapping_needed = false;
+	bool trap_handler_needed = false;
 	unsigned patch_offset = 0;
 	bool update_spm_vmid_needed = (job->vm && (job->vm->reserved_vmid[vmhub] != NULL));
 	int r;
 
-	if (update_spm_vmid_needed && adev->gfx.rlc.funcs->update_spm_vmid)
+	if (update_spm_vmid_needed && adev->gfx.rlc.funcs->update_spm_vmid) {
 		adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid);
 
+		trap_handler_needed = ring->funcs->emit_trap_handler && (
+			id->tba_addr != job->tba_addr ||
+			id->tma_addr != job->tma_addr);
+	}
+
 	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
 		gds_switch_needed = true;
+		trap_handler_needed = true;
 		vm_flush_needed = true;
 		pasid_mapping_needed = true;
 	}
@@ -1127,12 +1134,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 	mutex_unlock(&id_mgr->lock);
 
 	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
+	trap_handler_needed &= !!ring->funcs->emit_trap_handler;
 	vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
 			job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
 	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
 		ring->funcs->emit_wreg;
 
-	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
+	if (!vm_flush_needed && !gds_switch_needed &&
+	    !trap_handler_needed && !need_pipe_sync)
 		return 0;
 
 	if (ring->funcs->init_cond_exec)
@@ -1186,6 +1195,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
 					    job->oa_size);
 	}
 
+	if (ring->funcs->emit_trap_handler && trap_handler_needed) {
+		id->tba_addr = job->tba_addr;
+		id->tma_addr = job->tma_addr;
+		amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
+					      job->tma_addr);
+	}
+
 	if (ring->funcs->patch_cond_exec)
 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 49fd10a15707..26bbde676cf0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -177,6 +177,15 @@
 #define mmGC_THROTTLE_CTRL_Sienna_Cichlid              0x2030
 #define mmGC_THROTTLE_CTRL_Sienna_Cichlid_BASE_IDX     0
 
+#define mmSQ_SHADER_TBA_LO_Sienna_Cichlid 0x10b2
+#define mmSQ_SHADER_TBA_LO_Sienna_Cichlid_BASE_IDX 0
+#define mmSQ_SHADER_TBA_HI_Sienna_Cichlid 0x10b3
+#define mmSQ_SHADER_TBA_HI_Sienna_Cichlid_BASE_IDX 0
+#define mmSQ_SHADER_TMA_LO_Sienna_Cichlid 0x10b4
+#define mmSQ_SHADER_TMA_LO_Sienna_Cichlid_BASE_IDX 0
+#define mmSQ_SHADER_TMA_HI_Sienna_Cichlid 0x10b5
+#define mmSQ_SHADER_TMA_HI_Sienna_Cichlid_BASE_IDX 0
+
 #define GFX_RLCG_GC_WRITE_OLD	(0x8 << 28)
 #define GFX_RLCG_GC_WRITE	(0x0 << 28)
 #define GFX_RLCG_GC_READ	(0x1 << 28)
@@ -7531,6 +7540,42 @@ static void gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 				    (1 << (oa_size + oa_base)) - (1 << oa_base));
 }
 
+static void gfx_v10_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
+					    uint32_t vmid,
+					    uint64_t tba_addr,
+					    uint64_t tma_addr)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	mutex_lock(&adev->srbm_mutex);
+	nv_grbm_select(adev, 0, 0, 0, vmid);
+	switch (adev->asic_type) {
+	case CHIP_SIENNA_CICHLID:
+	case CHIP_NAVY_FLOUNDER:
+	case CHIP_VANGOGH:
+	case CHIP_DIMGREY_CAVEFISH:
+		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO_Sienna_Cichlid,
+				 lower_32_bits(tba_addr >> 8));
+		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI_Sienna_Cichlid,
+				 upper_32_bits(tba_addr >> 8) |
+					1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
+		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO_Sienna_Cichlid,
+				 lower_32_bits(tma_addr >> 8));
+		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI_Sienna_Cichlid,
+				 upper_32_bits(tma_addr >> 8));
+		break;
+	default:
+		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
+		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8) |
+					1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
+		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
+		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
+		break;
+	}
+	nv_grbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+}
+
 static int gfx_v10_0_early_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -9014,6 +9059,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
 	.soft_recovery = gfx_v10_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
@@ -9050,6 +9096,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
 	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 16a3b279a9ef..417ac13a23b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4250,6 +4250,23 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 				   (1 << (oa_size + oa_base)) - (1 << oa_base));
 }
 
+static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
+					    uint32_t vmid,
+					    uint64_t tba_addr,
+					    uint64_t tma_addr)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	mutex_lock(&adev->srbm_mutex);
+	soc15_grbm_select(adev, 0, 0, 0, vmid);
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
+	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
+	soc15_grbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+}
+
 static const u32 vgpr_init_compute_shader[] =
 {
 	0xb07c0000, 0xbe8000ff,
@@ -6879,6 +6896,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
 	.soft_recovery = gfx_v9_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
+	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
@@ -6918,6 +6936,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
 	.emit_wave_limit = gfx_v9_0_emit_wave_limit,
+	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 2b487a8d2727..17322ad4754e 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -552,6 +552,7 @@ struct drm_amdgpu_gem_va {
 #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES	0x07
 #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
 #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
+#define AMDGPU_CHUNK_ID_TRAP            0x0a
 
 struct drm_amdgpu_cs_chunk {
 	__u32		chunk_id;
@@ -646,6 +647,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
        __u64 point;
 };
 
+struct drm_amdgpu_cs_chunk_trap {
+	/** Trap Base Address */
+       __u64 tba_addr;
+	/** Trap Memory Address */
+       __u64 tma_addr;
+};
+
 #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ	0
 #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD	1
 #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD	2
-- 
2.31.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v3] drm/amdgpu: add support for user trap handlers
  2021-05-06  6:54 ` [PATCH v3] " Samuel Pitoiset
@ 2021-05-06  7:09   ` Samuel Pitoiset
  0 siblings, 0 replies; 16+ messages in thread
From: Samuel Pitoiset @ 2021-05-06  7:09 UTC (permalink / raw)
  To: Samuel Pitoiset, amd-gfx

Added GFX10.3 support.

I re-tested this on GFX9 and it still works, but on GFX10+ the trap
handler is never reached. Is there something obviously wrong in this patch?

Thanks!

On 5/6/21 8:54 AM, Samuel Pitoiset wrote:
> A trap handler can be used by userspace to catch shader exceptions
> like divide by zero, memory violations etc.
>
> On GFX6-GFX8, the registers used to configure TBA/TMA aren't
> privileged and can be configured from userspace.
>
> On GFX9+ they are per VMID and privileged; only the KMD can
> configure them. At the moment, we don't know how to set them
> via the CP, so they are only emitted if a VMID is reserved.
>
> This introduces a new CS chunk that can be used to set the
> TBA/TMA virtual address at submit time.
>
> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 31 ++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h  |  4 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h  |  4 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 20 +++++++++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 47 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 19 ++++++++++
>   include/uapi/drm/amdgpu_drm.h            |  8 ++++
>   9 files changed, 136 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 90136f9dedd6..0cc9f5eb0484 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -103,6 +103,19 @@ static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p,
>   	return r;
>   }
>   
> +static int amdgpu_cs_user_trap_chunk(struct amdgpu_cs_parser *p,
> +				     struct drm_amdgpu_cs_chunk_trap *data,
> +				     uint64_t *tba_addr, uint64_t *tma_addr)
> +{
> +	if (!data->tba_addr || !data->tma_addr)
> +		return -EINVAL;
> +
> +	*tba_addr = data->tba_addr;
> +	*tma_addr = data->tma_addr;
> +
> +	return 0;
> +}
> +
>   static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs)
>   {
>   	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> @@ -111,6 +124,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   	uint64_t *chunk_array;
>   	unsigned size, num_ibs = 0;
>   	uint32_t uf_offset = 0;
> +	uint64_t tba_addr = 0, tma_addr = 0;
>   	int i;
>   	int ret;
>   
> @@ -213,6 +227,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   
>   			break;
>   
> +		case AMDGPU_CHUNK_ID_TRAP:
> +			size = sizeof(struct drm_amdgpu_cs_chunk_trap);
> +			if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
> +				ret = -EINVAL;
> +				goto free_partial_kdata;
> +			}
> +
> +			ret = amdgpu_cs_user_trap_chunk(p, p->chunks[i].kdata,
> +							&tba_addr, &tma_addr);
> +			if (ret)
> +				goto free_partial_kdata;
> +			break;
> +
>   		case AMDGPU_CHUNK_ID_DEPENDENCIES:
>   		case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
>   		case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
> @@ -238,6 +265,10 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   
>   	if (p->uf_entry.tv.bo)
>   		p->job->uf_addr = uf_offset;
> +
> +	p->job->tba_addr = tba_addr;
> +	p->job->tma_addr = tma_addr;
> +
>   	kvfree(chunk_array);
>   
>   	/* Use this opportunity to fill in task info for the vm */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index d4a40cd0fe09..21c3b6eaf359 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -94,9 +94,10 @@
>    * - 3.39.0 - DMABUF implicit sync does a full pipeline sync
>    * - 3.40.0 - Add AMDGPU_IDS_FLAGS_TMZ
>    * - 3.41.0 - Add video codec query
> + * - 3.42.0 - Add AMDGPU_CHUNK_ID_TRAP
>    */
>   #define KMS_DRIVER_MAJOR	3
> -#define KMS_DRIVER_MINOR	41
> +#define KMS_DRIVER_MINOR	42
>   #define KMS_DRIVER_PATCHLEVEL	0
>   
>   int amdgpu_vram_limit;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> index 0c3b4fa1f936..d165970ffdd7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> @@ -58,6 +58,10 @@ struct amdgpu_vmid {
>   	uint32_t		oa_base;
>   	uint32_t		oa_size;
>   
> +	/* user trap */
> +	uint64_t		tba_addr;
> +	uint64_t		tma_addr;
> +
>   	unsigned		pasid;
>   	struct dma_fence	*pasid_mapping;
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 81caac9b958a..b8ed5b13ea44 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -62,6 +62,10 @@ struct amdgpu_job {
>   	/* user fence handling */
>   	uint64_t		uf_addr;
>   	uint64_t		uf_sequence;
> +
> +	/* user trap handling */
> +	uint64_t		tba_addr;
> +	uint64_t		tma_addr;
>   };
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index ca1622835296..550d71b2a1a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -199,6 +199,8 @@ struct amdgpu_ring_funcs {
>   	int (*preempt_ib)(struct amdgpu_ring *ring);
>   	void (*emit_mem_sync)(struct amdgpu_ring *ring);
>   	void (*emit_wave_limit)(struct amdgpu_ring *ring, bool enable);
> +	void (*emit_trap_handler)(struct amdgpu_ring *ring, uint32_t vmid,
> +				  uint64_t tba_addr, uint64_t tma_addr);
>   };
>   
>   struct amdgpu_ring {
> @@ -263,6 +265,7 @@ struct amdgpu_ring {
>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>   #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
> +#define amdgpu_ring_emit_trap_handler(r, v, tba, tma) (r)->funcs->emit_trap_handler((r), (v), (tba), (tma))
>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index f2513be72980..53552f7cc66b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1107,15 +1107,22 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   	bool vm_flush_needed = job->vm_needs_flush;
>   	struct dma_fence *fence = NULL;
>   	bool pasid_mapping_needed = false;
> +	bool trap_handler_needed = false;
>   	unsigned patch_offset = 0;
>   	bool update_spm_vmid_needed = (job->vm && (job->vm->reserved_vmid[vmhub] != NULL));
>   	int r;
>   
> -	if (update_spm_vmid_needed && adev->gfx.rlc.funcs->update_spm_vmid)
> +	if (update_spm_vmid_needed && adev->gfx.rlc.funcs->update_spm_vmid) {
>   		adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid);
>   
> +		trap_handler_needed = ring->funcs->emit_trap_handler && (
> +			id->tba_addr != job->tba_addr ||
> +			id->tma_addr != job->tma_addr);
> +	}
> +
>   	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>   		gds_switch_needed = true;
> +		trap_handler_needed = true;
>   		vm_flush_needed = true;
>   		pasid_mapping_needed = true;
>   	}
> @@ -1127,12 +1134,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   	mutex_unlock(&id_mgr->lock);
>   
>   	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
> +	trap_handler_needed &= !!ring->funcs->emit_trap_handler;
>   	vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
>   			job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>   	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
>   		ring->funcs->emit_wreg;
>   
> -	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
> +	if (!vm_flush_needed && !gds_switch_needed &&
> +	    !trap_handler_needed && !need_pipe_sync)
>   		return 0;
>   
>   	if (ring->funcs->init_cond_exec)
> @@ -1186,6 +1195,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   					    job->oa_size);
>   	}
>   
> +	if (ring->funcs->emit_trap_handler && trap_handler_needed) {
> +		id->tba_addr = job->tba_addr;
> +		id->tma_addr = job->tma_addr;
> +		amdgpu_ring_emit_trap_handler(ring, job->vmid, job->tba_addr,
> +					      job->tma_addr);
> +	}
> +
>   	if (ring->funcs->patch_cond_exec)
>   		amdgpu_ring_patch_cond_exec(ring, patch_offset);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 49fd10a15707..26bbde676cf0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -177,6 +177,15 @@
>   #define mmGC_THROTTLE_CTRL_Sienna_Cichlid              0x2030
>   #define mmGC_THROTTLE_CTRL_Sienna_Cichlid_BASE_IDX     0
>   
> +#define mmSQ_SHADER_TBA_LO_Sienna_Cichlid 0x10b2
> +#define mmSQ_SHADER_TBA_LO_Sienna_Cichlid_BASE_IDX 0
> +#define mmSQ_SHADER_TBA_HI_Sienna_Cichlid 0x10b3
> +#define mmSQ_SHADER_TBA_HI_Sienna_Cichlid_BASE_IDX 0
> +#define mmSQ_SHADER_TMA_LO_Sienna_Cichlid 0x10b4
> +#define mmSQ_SHADER_TMA_LO_Sienna_Cichlid_BASE_IDX 0
> +#define mmSQ_SHADER_TMA_HI_Sienna_Cichlid 0x10b5
> +#define mmSQ_SHADER_TMA_HI_Sienna_Cichlid_BASE_IDX 0
> +
>   #define GFX_RLCG_GC_WRITE_OLD	(0x8 << 28)
>   #define GFX_RLCG_GC_WRITE	(0x0 << 28)
>   #define GFX_RLCG_GC_READ	(0x1 << 28)
> @@ -7531,6 +7540,42 @@ static void gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>   				    (1 << (oa_size + oa_base)) - (1 << oa_base));
>   }
>   
> +static void gfx_v10_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
> +					    uint32_t vmid,
> +					    uint64_t tba_addr,
> +					    uint64_t tma_addr)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +
> +	mutex_lock(&adev->srbm_mutex);
> +	nv_grbm_select(adev, 0, 0, 0, vmid);
> +	switch (adev->asic_type) {
> +	case CHIP_SIENNA_CICHLID:
> +	case CHIP_NAVY_FLOUNDER:
> +	case CHIP_VANGOGH:
> +	case CHIP_DIMGREY_CAVEFISH:
> +		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO_Sienna_Cichlid,
> +				 lower_32_bits(tba_addr >> 8));
> +		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI_Sienna_Cichlid,
> +				 upper_32_bits(tba_addr >> 8) |
> +					1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
> +		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO_Sienna_Cichlid,
> +				 lower_32_bits(tma_addr >> 8));
> +		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI_Sienna_Cichlid,
> +				 upper_32_bits(tma_addr >> 8));
> +		break;
> +	default:
> +		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
> +		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8) |
> +					1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT);
> +		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
> +		WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
> +		break;
> +	}
> +	nv_grbm_select(adev, 0, 0, 0, 0);
> +	mutex_unlock(&adev->srbm_mutex);
> +}
> +
>   static int gfx_v10_0_early_init(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -9014,6 +9059,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
>   	.soft_recovery = gfx_v10_0_ring_soft_recovery,
>   	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
> +	.emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> @@ -9050,6 +9096,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
>   	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
> +	.emit_trap_handler = gfx_v10_0_ring_emit_trap_handler,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 16a3b279a9ef..417ac13a23b1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4250,6 +4250,23 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>   				   (1 << (oa_size + oa_base)) - (1 << oa_base));
>   }
>   
> +static void gfx_v9_0_ring_emit_trap_handler(struct amdgpu_ring *ring,
> +					    uint32_t vmid,
> +					    uint64_t tba_addr,
> +					    uint64_t tma_addr)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +
> +	mutex_lock(&adev->srbm_mutex);
> +	soc15_grbm_select(adev, 0, 0, 0, vmid);
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_LO, lower_32_bits(tba_addr >> 8));
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TBA_HI, upper_32_bits(tba_addr >> 8));
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_LO, lower_32_bits(tma_addr >> 8));
> +	WREG32_SOC15_RLC(GC, 0, mmSQ_SHADER_TMA_HI, upper_32_bits(tma_addr >> 8));
> +	soc15_grbm_select(adev, 0, 0, 0, 0);
> +	mutex_unlock(&adev->srbm_mutex);
> +}
> +
>   static const u32 vgpr_init_compute_shader[] =
>   {
>   	0xb07c0000, 0xbe8000ff,
> @@ -6879,6 +6896,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>   	.soft_recovery = gfx_v9_0_ring_soft_recovery,
>   	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
> +	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> @@ -6918,6 +6936,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>   	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
>   	.emit_wave_limit = gfx_v9_0_emit_wave_limit,
> +	.emit_trap_handler = gfx_v9_0_ring_emit_trap_handler,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 2b487a8d2727..17322ad4754e 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -552,6 +552,7 @@ struct drm_amdgpu_gem_va {
>   #define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES	0x07
>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
>   #define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
> +#define AMDGPU_CHUNK_ID_TRAP            0x0a
>   
>   struct drm_amdgpu_cs_chunk {
>   	__u32		chunk_id;
> @@ -646,6 +647,13 @@ struct drm_amdgpu_cs_chunk_syncobj {
>          __u64 point;
>   };
>   
> +struct drm_amdgpu_cs_chunk_trap {
> +	/** Trap Base Address */
> +       __u64 tba_addr;
> +	/** Trap Memory Address */
> +       __u64 tma_addr;
> +};
> +
>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ	0
>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD	1
>   #define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD	2
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2021-05-06  7:09 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-24 11:49 [RFC PATCH] drm/amdgpu: add support for user trap handlers Samuel Pitoiset
2020-08-24 18:17 ` Marek Olšák
2020-08-25  7:04   ` Samuel Pitoiset
2020-08-24 18:33 ` Alex Deucher
2020-08-24 21:32   ` Alex Deucher
2020-08-25  7:06     ` Samuel Pitoiset
2020-08-25 13:13       ` Alex Deucher
2020-08-25 14:07 ` [PATCH v2] " Samuel Pitoiset
2020-08-28  7:57   ` Christian König
2020-08-28  8:14     ` Samuel Pitoiset
2020-08-28  8:23       ` Christian König
2020-08-28  8:25         ` Samuel Pitoiset
2020-09-23 12:52         ` Samuel Pitoiset
2020-09-23 12:59           ` Christian König
2021-05-06  6:54 ` [PATCH v3] " Samuel Pitoiset
2021-05-06  7:09   ` Samuel Pitoiset
