amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging
@ 2023-01-25 19:53 Jonathan Kim
  2023-01-25 19:53 ` [PATCH 01/32] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (31 more replies)
  0 siblings, 32 replies; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

AMDGPU kernel upstream support for debugging of compute ISA.

Current production ROCm GDB interface for ISA debugging:
https://rocmdocs.amd.com/en/latest/ROCm_Tools/ROCgdb.html

WIP upstream source for ROCm GDB API, ROC Kernel and ROC Thunk can be referenced here:
https://github.com/ROCm-Developer-Tools/ROCdbgapi/tree/wip-dbgapi
https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/tree/wip-dbgapi
https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/tree/wip-dbgapi



^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH 01/32] drm/amdkfd: add debug and runtime enable interface
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-02-16 22:16   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 02/32] drm/amdkfd: display debug capabilities Jonathan Kim
                   ` (30 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Introduce the GPU debug operations interface.

For ROCm-GDB to extend the GNU Debugger's ability to inspect the AMD GPU
instruction set, provide the necessary interface to allow the debugger
to set the HW debug mode and query exceptions per HSA queue, process or
device.

The runtime_enable interface coordinates exception handling with the
HSA runtime.

Usage is available in the kern docs at uapi/linux/kfd_ioctl.h.

v2: was previously reviewed but removed deprecated wave launch modes
(kill and disable).
Also remove non-needed dbg flag option.
Add revision and subvendor info to debug device snapshot entry.
Add trap on wave start and end override option.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  48 ++
 include/uapi/linux/kfd_ioctl.h           | 663 ++++++++++++++++++++++-
 2 files changed, 710 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f79b8e964140..d3b019e64093 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2645,6 +2645,48 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
 	return ret;
 }
 
+static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
+{
+	return 0;
+}
+
+static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_dbg_trap_args *args = data;
+	int r = 0;
+
+	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Debugging does not support sched_policy %i", sched_policy);
+		return -EINVAL;
+	}
+
+	switch (args->op) {
+	case KFD_IOC_DBG_TRAP_ENABLE:
+	case KFD_IOC_DBG_TRAP_DISABLE:
+	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
+	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
+	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
+	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
+	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
+	case KFD_IOC_DBG_TRAP_SET_FLAGS:
+	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
+	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
+	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
+	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
+		pr_warn("Debugging not supported yet\n");
+		r = -EACCES;
+		break;
+	default:
+		pr_err("Invalid option: %i\n", args->op);
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
 			    .cmd_drv = 0, .name = #ioctl}
@@ -2754,6 +2796,12 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
 			kfd_ioctl_get_available_memory, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_RUNTIME_ENABLE,
+			kfd_ioctl_runtime_enable, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
+			kfd_ioctl_set_debug_trap, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 42b60198b6c5..9ef4eed45c19 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -109,6 +109,32 @@ struct kfd_ioctl_get_available_memory_args {
 	__u32 pad;
 };
 
+struct kfd_dbg_device_info_entry {
+	__u64 exception_status;
+	__u64 lds_base;
+	__u64 lds_limit;
+	__u64 scratch_base;
+	__u64 scratch_limit;
+	__u64 gpuvm_base;
+	__u64 gpuvm_limit;
+	__u32 gpu_id;
+	__u32 location_id;
+	__u32 vendor_id;
+	__u32 device_id;
+	__u32 revision_id;
+	__u32 subsystem_vendor_id;
+	__u32 subsystem_device_id;
+	__u32 fw_version;
+	__u32 gfx_target_version;
+	__u32 simd_count;
+	__u32 max_waves_per_simd;
+	__u32 array_count;
+	__u32 simd_arrays_per_engine;
+	__u32 capability;
+	__u32 debug_prop;
+	__u32 pad;
+};
+
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
 #define KFD_IOC_CACHE_POLICY_COHERENT 0
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
@@ -766,6 +792,635 @@ struct kfd_ioctl_set_xnack_mode_args {
 	__s32 xnack_enabled;
 };
 
+/* Wave launch override modes */
+enum kfd_dbg_trap_override_mode {
+	KFD_DBG_TRAP_OVERRIDE_OR = 0,
+	KFD_DBG_TRAP_OVERRIDE_REPLACE = 1
+};
+
+/* Wave launch overrides */
+enum kfd_dbg_trap_mask {
+	KFD_DBG_TRAP_MASK_FP_INVALID = 1,
+	KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL = 2,
+	KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO = 4,
+	KFD_DBG_TRAP_MASK_FP_OVERFLOW = 8,
+	KFD_DBG_TRAP_MASK_FP_UNDERFLOW = 16,
+	KFD_DBG_TRAP_MASK_FP_INEXACT = 32,
+	KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO = 64,
+	KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH = 128,
+	KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION = 256,
+	KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START = (1 << 30),
+	KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END = (1 << 31)
+};
+
+/* Wave launch modes */
+enum kfd_dbg_trap_wave_launch_mode {
+	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL = 0,
+	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT = 1,
+	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG = 3
+};
+
+/* Address watch modes */
+enum kfd_dbg_trap_address_watch_mode {
+	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_READ = 0,
+	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_NONREAD = 1,
+	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ATOMIC = 2,
+	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ALL = 3
+};
+
+/* Additional wave settings */
+enum kfd_dbg_trap_flags {
+	KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1,
+};
+
+/* Trap exceptions */
+enum kfd_dbg_trap_exception_code {
+	EC_NONE = 0,
+	/* per queue */
+	EC_QUEUE_WAVE_ABORT = 1,
+	EC_QUEUE_WAVE_TRAP = 2,
+	EC_QUEUE_WAVE_MATH_ERROR = 3,
+	EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION = 4,
+	EC_QUEUE_WAVE_MEMORY_VIOLATION = 5,
+	EC_QUEUE_WAVE_APERTURE_VIOLATION = 6,
+	EC_QUEUE_PACKET_DISPATCH_DIM_INVALID = 16,
+	EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID = 17,
+	EC_QUEUE_PACKET_DISPATCH_CODE_INVALID = 18,
+	EC_QUEUE_PACKET_RESERVED = 19,
+	EC_QUEUE_PACKET_UNSUPPORTED = 20,
+	EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
+	EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
+	EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
+	EC_QUEUE_PREEMPTION_ERROR = 30,
+	EC_QUEUE_NEW = 31,
+	/* per device */
+	EC_DEVICE_QUEUE_DELETE = 32,
+	EC_DEVICE_MEMORY_VIOLATION = 33,
+	EC_DEVICE_RAS_ERROR = 34,
+	EC_DEVICE_FATAL_HALT = 35,
+	EC_DEVICE_NEW = 36,
+	/* per process */
+	EC_PROCESS_RUNTIME = 48,
+	EC_PROCESS_DEVICE_REMOVE = 49,
+	EC_MAX
+};
+
+/* Bit (ecode - 1) for a kfd_dbg_trap_exception_code; ecode must not be EC_NONE (0) */
+#define KFD_EC_MASK(ecode)	(1ULL << ((ecode) - 1))
+
+/* Masks for exception code type checks below */
+#define KFD_EC_MASK_QUEUE	(KFD_EC_MASK(EC_QUEUE_WAVE_ABORT) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_TRAP) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_MATH_ERROR) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_MEMORY_VIOLATION) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_APERTURE_VIOLATION) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED)	|	\
+				 KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR)	|	\
+				 KFD_EC_MASK(EC_QUEUE_NEW))
+#define KFD_EC_MASK_DEVICE	(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |		\
+				 KFD_EC_MASK(EC_DEVICE_RAS_ERROR) |		\
+				 KFD_EC_MASK(EC_DEVICE_FATAL_HALT) |		\
+				 KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION) |	\
+				 KFD_EC_MASK(EC_DEVICE_NEW))
+#define KFD_EC_MASK_PROCESS	(KFD_EC_MASK(EC_PROCESS_RUNTIME) |	\
+				 KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE))
+
+/* Checks for exception code types for KFD search */
+#define KFD_DBG_EC_TYPE_IS_QUEUE(ecode)					\
+			(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_QUEUE))
+#define KFD_DBG_EC_TYPE_IS_DEVICE(ecode)				\
+			(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_DEVICE))
+#define KFD_DBG_EC_TYPE_IS_PROCESS(ecode)				\
+			(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_PROCESS))
+
+
+/* Runtime enable states */
+enum kfd_dbg_runtime_state {
+	DEBUG_RUNTIME_STATE_DISABLED = 0,
+	DEBUG_RUNTIME_STATE_ENABLED = 1,
+	DEBUG_RUNTIME_STATE_ENABLED_BUSY = 2,
+	DEBUG_RUNTIME_STATE_ENABLED_ERROR = 3
+};
+
+/* Runtime enable status */
+struct kfd_runtime_info {
+	__u64 r_debug;
+	__u32 runtime_state;
+	__u32 ttmp_setup;
+};
+
+/* Enable modes for runtime enable */
+#define KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK	1
+#define KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK	2
+
+/**
+ * kfd_ioctl_runtime_enable_args - Arguments for runtime enable
+ *
+ * Coordinates debug exception signalling and debug device enablement with runtime.
+ *
+ * @r_debug - pointer to user struct for sharing information between ROCr and the debugger
+ * @mode_mask - mask to set mode
+ *	KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK - enable runtime for debugging, otherwise disable
+ *	KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK - enable trap temporary setup (ignore on disable)
+ *
+ * Return - 0 on SUCCESS.
+ *	  - EBUSY if runtime enable call already pending.
+ *	  - EEXIST if user queues already active prior to call.
+ *	    If process is debug enabled, runtime enable will enable debug devices and
+ *	    wait for debugger process to send runtime exception EC_PROCESS_RUNTIME
+ *	    to unblock - see kfd_ioctl_dbg_trap_args.
+ *
+ */
+struct kfd_ioctl_runtime_enable_args {
+	__u64 r_debug;
+	__u32 mode_mask;
+};
+
+/* Queue information */
+struct kfd_queue_snapshot_entry {
+	__u64 exception_status;
+	__u64 ring_base_address;
+	__u64 write_pointer_address;
+	__u64 read_pointer_address;
+	__u64 ctx_save_restore_address;
+	__u32 queue_id;
+	__u32 gpu_id;
+	__u32 ring_size;
+	__u32 queue_type;
+	__u32 ctx_save_restore_area_size;
+	__u32 reserved;
+};
+
+/* Queue status return for suspend/resume; masks are unsigned so bit 31 is a defined shift */
+#define KFD_DBG_QUEUE_ERROR_BIT		30
+#define KFD_DBG_QUEUE_INVALID_BIT	31
+#define KFD_DBG_QUEUE_ERROR_MASK	(1U << KFD_DBG_QUEUE_ERROR_BIT)
+#define KFD_DBG_QUEUE_INVALID_MASK	(1U << KFD_DBG_QUEUE_INVALID_BIT)
+
+/* Context save area header information */
+struct kfd_context_save_area_header {
+	__u32 control_stack_offset;
+	__u32 control_stack_size;
+	__u32 wave_state_offset;
+	__u32 wave_state_size;
+	__u32 debug_offset;
+	__u32 debug_size;
+	__u64 err_payload_addr;
+	__u32 err_event_id;
+	__u32 reserved1;
+};
+
+/*
+ * Debug operations
+ *
+ * For specifics on usage and return values, see documentation per operation
+ * below.  Otherwise, generic error returns apply:
+ *	- ESRCH if the process to debug does not exist.
+ *
+ *	- EINVAL (with KFD_IOC_DBG_TRAP_ENABLE exempt) if operation
+ *		 KFD_IOC_DBG_TRAP_ENABLE has not succeeded prior.
+ *		 Also returns this error if GPU hardware scheduling is not supported.
+ *
+ *	- EPERM (with KFD_IOC_DBG_TRAP_DISABLE exempt) if target process is not
+ *		 PTRACE_ATTACHED.  KFD_IOC_DBG_TRAP_DISABLE is exempt to allow
+ *		 clean up of debug mode as long as process is debug enabled.
+ *
+ *	- EACCES if any DBG_HW_OP (debug hardware operation) is requested when
+ *		 AMDKFD_IOC_RUNTIME_ENABLE has not succeeded prior.
+ *
+ *	- ENODEV if any GPU does not support debugging on a DBG_HW_OP call.
+ *
+ *	- Other errors may be returned when a DBG_HW_OP occurs while the GPU
+ *	  is in a fatal state.
+ *
+ */
+enum kfd_dbg_trap_operations {
+	KFD_IOC_DBG_TRAP_ENABLE = 0,
+	KFD_IOC_DBG_TRAP_DISABLE = 1,
+	KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT = 2,
+	KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED = 3,
+	KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE = 4,  /* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE = 5,      /* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_SUSPEND_QUEUES = 6,		/* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_RESUME_QUEUES = 7,		/* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH = 8,	/* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH = 9,	/* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_SET_FLAGS = 10,
+	KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT = 11,
+	KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO = 12,
+	KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT = 13,
+	KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT = 14
+};
+
+/**
+ * kfd_ioctl_dbg_trap_enable_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_ENABLE.
+ *
+ *     Enables debug session for target process. Call @op KFD_IOC_DBG_TRAP_DISABLE in
+ *     kfd_ioctl_dbg_trap_args to disable debug session.
+ *
+ *     @exception_mask (IN)	- exceptions to raise to the debugger
+ *     @rinfo_ptr      (IN)	- pointer to runtime info buffer (see kfd_runtime_info)
+ *     @rinfo_size     (IN/OUT)	- size of runtime info buffer in bytes
+ *     @dbg_fd	       (IN)	- fd the KFD will use to notify the debugger of raised
+ *				  exceptions set in exception_mask.
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *		Copies KFD saved kfd_runtime_info to @rinfo_ptr on enable.
+ *		Size of kfd_runtime_info saved by the KFD is returned in @rinfo_size.
+ *            - EBADF if KFD cannot get a reference to dbg_fd.
+ *            - EFAULT if KFD cannot copy runtime info to rinfo_ptr.
+ *            - EINVAL if target process is already debug enabled.
+ *
+ */
+struct kfd_ioctl_dbg_trap_enable_args {
+	__u64 exception_mask;
+	__u64 rinfo_ptr;
+	__u32 rinfo_size;
+	__u32 dbg_fd;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_send_runtime_event_args
+ *
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT.
+ *     Raises exceptions to runtime.
+ *
+ *     @exception_mask (IN) - exceptions to raise to runtime
+ *     @gpu_id	       (IN) - target device id
+ *     @queue_id       (IN) - target queue id
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *	      - ENODEV if gpu_id not found.
+ *		If exception_mask contains EC_PROCESS_RUNTIME, unblocks pending
+ *		AMDKFD_IOC_RUNTIME_ENABLE call - see kfd_ioctl_runtime_enable_args.
+ *		All other exceptions are raised to runtime through err_payload_addr.
+ *		See kfd_context_save_area_header.
+ */
+struct kfd_ioctl_dbg_trap_send_runtime_event_args {
+	__u64 exception_mask;
+	__u32 gpu_id;
+	__u32 queue_id;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_exceptions_enabled_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED
+ *     Set new exceptions to be raised to the debugger.
+ *
+ *     @exception_mask (IN) - new exceptions to raise to the debugger
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ */
+struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args {
+	__u64 exception_mask;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_wave_launch_override_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE
+ *     Enable HW exceptions to raise trap.
+ *
+ *     @override_mode	     (IN)     - see kfd_dbg_trap_override_mode
+ *     @enable_mask	     (IN/OUT) - reference kfd_dbg_trap_mask.
+ *					IN is the override modes requested to be enabled.
+ *					OUT is referenced in Return below.
+ *     @support_request_mask (IN/OUT) - reference kfd_dbg_trap_mask.
+ *					IN is the override modes requested for support check.
+ *					OUT is referenced in Return below.
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *		Previous enablement is returned in @enable_mask.
+ *		Actual override support is returned in @support_request_mask.
+ *	      - EINVAL if override mode is not supported.
+ *	      - EACCES if trap support requested is not actually supported.
+ *		i.e. enable_mask (IN) is not a subset of support_request_mask (OUT).
+ *		Otherwise it is considered a generic error (see kfd_dbg_trap_operations).
+ */
+struct kfd_ioctl_dbg_trap_set_wave_launch_override_args {
+	__u32 override_mode;
+	__u32 enable_mask;
+	__u32 support_request_mask;
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_wave_launch_mode_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE
+ *     Set wave launch mode.
+ *
+ *     @launch_mode (IN) - see kfd_dbg_trap_wave_launch_mode
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ */
+struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args {
+	__u32 launch_mode;
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_suspend_queues_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SUSPEND_QUEUES
+ *     Suspend queues.
+ *
+ *     @exception_mask	(IN) - raised exceptions to clear
+ *     @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
+ *			       to suspend
+ *     @num_queues	(IN) - number of queues to suspend in @queue_array_ptr
+ *     @grace_period	(IN) - wave time allowance before preemption
+ *			       per 1K GPU clock cycle unit
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Destruction of a suspended queue is blocked until the queue is
+ *     resumed.  This allows the debugger to access queue information and
+ *     its context save area without running into a race condition on
+ *     queue destruction.
+ *     Automatically copies per queue context save area header information
+ *     into the save area base
+ *     (see kfd_queue_snapshot_entry and kfd_context_save_area_header).
+ *
+ *     Return - Number of queues suspended on SUCCESS.
+ *		KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK masked
+ *		for each queue id in @queue_array_ptr array reports unsuccessful
+ *		suspend reason.
+ *		KFD_DBG_QUEUE_ERROR_MASK = HW failure.
+ *		KFD_DBG_QUEUE_INVALID_MASK = queue does not exist, is new or
+ *		is being destroyed.
+ */
+struct kfd_ioctl_dbg_trap_suspend_queues_args {
+	__u64 exception_mask;
+	__u64 queue_array_ptr;
+	__u32 num_queues;
+	__u32 grace_period;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_resume_queues_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_RESUME_QUEUES
+ *     Resume queues.
+ *
+ *     @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
+ *			       to resume
+ *     @num_queues	(IN) - number of queues to resume in @queue_array_ptr
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - Number of queues resumed on SUCCESS.
+ *		KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK mask
+ *		for each queue id in @queue_array_ptr array reports unsuccessful
+ *		resume reason.
+ *		KFD_DBG_QUEUE_ERROR_MASK = HW failure.
+ *		KFD_DBG_QUEUE_INVALID_MASK = queue does not exist.
+ */
+struct kfd_ioctl_dbg_trap_resume_queues_args {
+	__u64 queue_array_ptr;
+	__u32 num_queues;
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_node_address_watch_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH
+ *     Sets address watch for device.
+ *
+ *     @address	(IN)  - watch address to set
+ *     @mode    (IN)  - see kfd_dbg_trap_address_watch_mode
+ *     @mask    (IN)  - watch address mask
+ *     @gpu_id  (IN)  - target gpu to set watch point
+ *     @id      (OUT) - watch id allocated
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *		Allocated watch ID returned to @id.
+ *	      - ENODEV if gpu_id not found.
+ *	      - ENOMEM if watch IDs cannot be allocated
+ */
+struct kfd_ioctl_dbg_trap_set_node_address_watch_args {
+	__u64 address;
+	__u32 mode;
+	__u32 mask;
+	__u32 gpu_id;
+	__u32 id;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_clear_node_address_watch_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH
+ *     Clear address watch for device.
+ *
+ *     @gpu_id  (IN)  - target device to clear watch point
+ *     @id      (IN) - allocated watch id to clear
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *	      - ENODEV if gpu_id not found.
+ *	      - EINVAL if watch ID has not been allocated.
+ */
+struct kfd_ioctl_dbg_trap_clear_node_address_watch_args {
+	__u32 gpu_id;
+	__u32 id;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_flags_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_FLAGS
+ *     Sets flags for wave behaviour.
+ *
+ *     @flags (IN/OUT) - IN = flags to enable, OUT = flags previously enabled
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *	      - EACCES if any debug device does not allow flag options.
+ */
+struct kfd_ioctl_dbg_trap_set_flags_args {
+	__u32 flags;
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_query_debug_event_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT
+ *
+ *     Find one or more raised exceptions. This function can return multiple
+ *     exceptions from a single queue or a single device with one call. To find
+ *     all raised exceptions, this function must be called repeatedly until it
+ *     returns -EAGAIN. Returned exceptions can optionally be cleared by
+ *     setting the corresponding bit in the @exception_mask input parameter.
+ *     However, clearing an exception prevents retrieving further information
+ *     about it with KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO.
+ *
+ *     @exception_mask (IN/OUT) - exception to clear (IN) and raised (OUT)
+ *     @gpu_id	       (OUT)    - gpu id of exceptions raised
+ *     @queue_id       (OUT)    - queue id of exceptions raised
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on raised exception found
+ *              Raised exceptions found are returned in @exception_mask
+ *              with reported source id returned in @gpu_id or @queue_id.
+ *            - EAGAIN if no raised exception has been found
+ */
+struct kfd_ioctl_dbg_trap_query_debug_event_args {
+	__u64 exception_mask;
+	__u32 gpu_id;
+	__u32 queue_id;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_query_exception_info_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO
+ *     Get additional info on raised exception.
+ *
+ *     @info_ptr	(IN)	 - pointer to exception info buffer to copy to
+ *     @info_size	(IN/OUT) - exception info buffer size (bytes)
+ *     @source_id	(IN)     - target gpu or queue id
+ *     @exception_code	(IN)     - target exception
+ *     @clear_exception	(IN)     - clear raised @exception_code exception
+ *				   (0 = false, 1 = true)
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *              If @exception_code is EC_DEVICE_MEMORY_VIOLATION, copy @info_size(OUT)
+ *		bytes of memory exception data to @info_ptr.
+ *              If @exception_code is EC_PROCESS_RUNTIME, copy saved
+ *              kfd_runtime_info to @info_ptr.
+ *              Actual required @info_ptr size (bytes) is returned in @info_size.
+ */
+struct kfd_ioctl_dbg_trap_query_exception_info_args {
+	__u64 info_ptr;
+	__u32 info_size;
+	__u32 source_id;
+	__u32 exception_code;
+	__u32 clear_exception;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_queue_snapshot_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT
+ *     Get queue information.
+ *
+ *     @exception_mask	 (IN)	  - exceptions raised to clear
+ *     @snapshot_buf_ptr (IN)	  - queue snapshot entry buffer (see kfd_queue_snapshot_entry)
+ *     @num_queues	 (IN/OUT) - number of queue snapshot entries
+ *         The debugger specifies the size of the array allocated in @num_queues.
+ *         KFD returns the number of queues that actually existed. If this is
+ *         larger than the size specified by the debugger, KFD will not overflow
+ *         the array allocated by the debugger.
+ *
+ *     @entry_size	 (IN/OUT) - size per entry in bytes
+ *         The debugger specifies sizeof(struct kfd_queue_snapshot_entry) in
+ *         @entry_size. KFD returns the number of bytes actually populated per
+ *         entry. The debugger should use the KFD_IOCTL_MINOR_VERSION to determine,
+ *         which fields in struct kfd_queue_snapshot_entry are valid. This allows
+ *         growing the ABI in a backwards compatible manner.
+ *         Note that entry_size(IN) should still be used to stride the snapshot buffer in the
+ *         event that it's larger than actual kfd_queue_snapshot_entry.
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *              Copies @num_queues(IN) queue snapshot entries of size @entry_size(IN)
+ *              into @snapshot_buf_ptr if @num_queues(IN) > 0.
+ *              Otherwise return @num_queues(OUT) queue snapshot entries that exist.
+ */
+struct kfd_ioctl_dbg_trap_queue_snapshot_args {
+	__u64 exception_mask;
+	__u64 snapshot_buf_ptr;
+	__u32 num_queues;
+	__u32 entry_size;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_device_snapshot_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT
+ *     Get device information.
+ *
+ *     @exception_mask	 (IN)	  - exceptions raised to clear
+ *     @snapshot_buf_ptr (IN)	  - pointer to snapshot buffer (see kfd_dbg_device_info_entry)
+ *     @num_devices	 (IN/OUT) - number of debug devices to snapshot
+ *         The debugger specifies the size of the array allocated in @num_devices.
+ *         KFD returns the number of devices that actually existed. If this is
+ *         larger than the size specified by the debugger, KFD will not overflow
+ *         the array allocated by the debugger.
+ *
+ *     @entry_size	 (IN/OUT) - size per entry in bytes
+ *         The debugger specifies sizeof(struct kfd_dbg_device_info_entry) in
+ *         @entry_size. KFD returns the number of bytes actually populated. The
+ *         debugger should use KFD_IOCTL_MINOR_VERSION to determine, which fields
+ *         in struct kfd_dbg_device_info_entry are valid. This allows growing the
+ *         ABI in a backwards compatible manner.
+ *         Note that entry_size(IN) should still be used to stride the snapshot buffer in the
+ *         event that it's larger than actual kfd_dbg_device_info_entry.
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *              Copies @num_devices(IN) device snapshot entries of size @entry_size(IN)
+ *              into @snapshot_buf_ptr if @num_devices(IN) > 0.
+ *              Otherwise return @num_devices(OUT) device snapshot entries that exist.
+ */
+struct kfd_ioctl_dbg_trap_device_snapshot_args {
+	__u64 exception_mask;
+	__u64 snapshot_buf_ptr;
+	__u32 num_devices;
+	__u32 entry_size;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_args
+ *
+ * Arguments to debug target process.
+ *
+ *     @pid - target process to debug
+ *     @op  - debug operation (see kfd_dbg_trap_operations)
+ *
+ *     @op determines which union struct args to use.
+ *     Refer to kern docs for each kfd_ioctl_dbg_trap_*_args struct.
+ */
+struct kfd_ioctl_dbg_trap_args {
+	__u32 pid;
+	__u32 op;
+
+	union {
+		struct kfd_ioctl_dbg_trap_enable_args enable;
+		struct kfd_ioctl_dbg_trap_send_runtime_event_args send_runtime_event;
+		struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args set_exceptions_enabled;
+		struct kfd_ioctl_dbg_trap_set_wave_launch_override_args launch_override;
+		struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args launch_mode;
+		struct kfd_ioctl_dbg_trap_suspend_queues_args suspend_queues;
+		struct kfd_ioctl_dbg_trap_resume_queues_args resume_queues;
+		struct kfd_ioctl_dbg_trap_set_node_address_watch_args set_node_address_watch;
+		struct kfd_ioctl_dbg_trap_clear_node_address_watch_args clear_node_address_watch;
+		struct kfd_ioctl_dbg_trap_set_flags_args set_flags;
+		struct kfd_ioctl_dbg_trap_query_debug_event_args query_debug_event;
+		struct kfd_ioctl_dbg_trap_query_exception_info_args query_exception_info;
+		struct kfd_ioctl_dbg_trap_queue_snapshot_args queue_snapshot;
+		struct kfd_ioctl_dbg_trap_device_snapshot_args device_snapshot;
+	};
+};
+
 #define AMDKFD_IOCTL_BASE 'K'
 #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
 #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
@@ -877,7 +1532,13 @@ struct kfd_ioctl_set_xnack_mode_args {
 #define AMDKFD_IOC_AVAILABLE_MEMORY		\
 		AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args)
 
+#define AMDKFD_IOC_RUNTIME_ENABLE		\
+		AMDKFD_IOWR(0x24, struct kfd_ioctl_runtime_enable_args)
+
+#define AMDKFD_IOC_DBG_TRAP			\
+		AMDKFD_IOWR(0x25, struct kfd_ioctl_dbg_trap_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x24
+#define AMDKFD_COMMAND_END		0x26
 
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 02/32] drm/amdkfd: display debug capabilities
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
  2023-01-25 19:53 ` [PATCH 01/32] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-02-16 22:24   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
                   ` (29 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Expose debug capabilities in the KFD topology node's HSA capabilities and
debug properties flags.

Ensure correct capabilities are exposed based on firmware support.

Flag definitions can be referenced in uapi/linux/kfd_sysfs.h.

v2: v1 was reviewed but re-requesting review for the following.
- remove asic family code name comments in firmware support checking
- add gfx11 requirements in fw support checks and debug props and caps

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 101 ++++++++++++++++++++--
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   6 ++
 include/uapi/linux/kfd_sysfs.h            |  15 ++++
 3 files changed, 117 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3fdaba56be6f..647a14142da9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -551,6 +551,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 				      dev->gpu->mec_fw_version);
 		sysfs_show_32bit_prop(buffer, offs, "capability",
 				      dev->node_props.capability);
+		sysfs_show_64bit_prop(buffer, offs, "debug_prop",
+				      dev->node_props.debug_prop);
 		sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version",
 				      dev->gpu->sdma_fw_version);
 		sysfs_show_64bit_prop(buffer, offs, "unique_id",
@@ -1865,6 +1867,97 @@ static int kfd_topology_add_device_locked(struct kfd_dev *gpu, uint32_t gpu_id,
 	return res;
 }
 
+static void kfd_topology_set_dbg_firmware_support(struct kfd_topology_device *dev)
+{
+	bool firmware_supported = true;
+
+	if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0) &&
+			KFD_GC_VERSION(dev->gpu) < IP_VERSION(12, 0, 0)) {
+		firmware_supported =
+			(dev->gpu->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 9;
+		goto out;
+	}
+
+	/*
+	 * Note: Any unlisted devices here are assumed to support exception handling.
+	 * Add additional checks here as needed.
+	 */
+	switch (KFD_GC_VERSION(dev->gpu)) {
+	case IP_VERSION(9, 0, 1):
+		firmware_supported = dev->gpu->mec_fw_version >= 459 + 32768;
+		break;
+	case IP_VERSION(9, 1, 0):
+	case IP_VERSION(9, 2, 1):
+	case IP_VERSION(9, 2, 2):
+	case IP_VERSION(9, 3, 0):
+	case IP_VERSION(9, 4, 0):
+		firmware_supported = dev->gpu->mec_fw_version >= 459;
+		break;
+	case IP_VERSION(9, 4, 1):
+		firmware_supported = dev->gpu->mec_fw_version >= 60;
+		break;
+	case IP_VERSION(9, 4, 2):
+		firmware_supported = dev->gpu->mec_fw_version >= 51;
+		break;
+	case IP_VERSION(10, 1, 10):
+	case IP_VERSION(10, 1, 2):
+	case IP_VERSION(10, 1, 1):
+		firmware_supported = dev->gpu->mec_fw_version >= 144;
+		break;
+	case IP_VERSION(10, 3, 0):
+	case IP_VERSION(10, 3, 2):
+	case IP_VERSION(10, 3, 1):
+	case IP_VERSION(10, 3, 4):
+	case IP_VERSION(10, 3, 5):
+		firmware_supported = dev->gpu->mec_fw_version >= 89;
+		break;
+	case IP_VERSION(10, 1, 3):
+	case IP_VERSION(10, 3, 3):
+		firmware_supported = false;
+		break;
+	default:
+		break;
+	}
+
+out:
+	if (firmware_supported)
+		dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED;
+}
+
+static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
+{
+	dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
+				HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+				HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+
+	dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_SUPPORT |
+			HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED |
+			HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED;
+
+	if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) {
+		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 |
+						HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
+
+		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 4, 2))
+			dev->node_props.debug_prop |=
+				HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
+		else
+			dev->node_props.capability |=
+				HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
+	} else {
+		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 |
+					HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
+
+		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(11, 0, 0))
+			dev->node_props.debug_prop |= HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
+		else
+			dev->node_props.capability |=
+				HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
+	}
+
+	kfd_topology_set_dbg_firmware_support(dev);
+}
+
 int kfd_topology_add_device(struct kfd_dev *gpu)
 {
 	uint32_t gpu_id;
@@ -1966,13 +2059,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
 		break;
 	default:
-		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 0, 1))
-			dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
-				HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
-				HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
-		else
+		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 0, 1))
 			WARN(1, "Unexpected ASIC family %u",
 			     dev->gpu->adev->asic_type);
+		else
+			kfd_topology_set_capabilities(dev);
 	}
 
 	/*
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index fca30d00a9bb..53b9b7bf52ee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -31,6 +31,11 @@
 
 #define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 32
 
+#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9	6
+#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10	7
+#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT  \
+			(29 << HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT)
+
 struct kfd_node_properties {
 	uint64_t hive_id;
 	uint32_t cpu_cores_count;
@@ -42,6 +47,7 @@ struct kfd_node_properties {
 	uint32_t cpu_core_id_base;
 	uint32_t simd_id_base;
 	uint32_t capability;
+	uint64_t debug_prop;
 	uint32_t max_waves_per_simd;
 	uint32_t lds_size_in_kb;
 	uint32_t gds_size_in_kb;
diff --git a/include/uapi/linux/kfd_sysfs.h b/include/uapi/linux/kfd_sysfs.h
index 3e330f368917..a51b7331e0b4 100644
--- a/include/uapi/linux/kfd_sysfs.h
+++ b/include/uapi/linux/kfd_sysfs.h
@@ -43,6 +43,11 @@
 #define HSA_CAP_DOORBELL_TYPE_2_0		0x2
 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP		0x00004000
 
+#define HSA_CAP_TRAP_DEBUG_SUPPORT              0x00008000
+#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED  0x00010000
+#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED           0x00020000
+#define HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED  0x00040000
+
 /* Old buggy user mode depends on this being 0 */
 #define HSA_CAP_RESERVED_WAS_SRAM_EDCSUPPORTED	0x00080000
 
@@ -53,8 +58,18 @@
 #define HSA_CAP_SRAM_EDCSUPPORTED		0x04000000
 #define HSA_CAP_SVMAPI_SUPPORTED		0x08000000
 #define HSA_CAP_FLAGS_COHERENTHOSTACCESS	0x10000000
+#define HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED   0x20000000
 #define HSA_CAP_RESERVED			0xe00f8000
 
+/* debug_prop bits in node properties */
+#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK     0x0000000f
+#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_SHIFT    0
+#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_MASK     0x000003f0
+#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT    4
+#define HSA_DBG_DISPATCH_INFO_ALWAYS_VALID      0x00000400
+#define HSA_DBG_WATCHPOINTS_EXCLUSIVE           0x00000800
+#define HSA_DBG_RESERVED                0xfffffffffffff000ull
+
 /* Heap types in memory properties */
 #define HSA_MEM_HEAP_TYPE_SYSTEM	0
 #define HSA_MEM_HEAP_TYPE_FB_PUBLIC	1
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
  2023-01-25 19:53 ` [PATCH 01/32] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
  2023-01-25 19:53 ` [PATCH 02/32] drm/amdkfd: display debug capabilities Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-02-16 23:44   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 04/32] drm/amdgpu: add kgd hw debug mode setting interface Jonathan Kim
                   ` (28 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

The ROCm debugger will attach to a process to debug by PTRACE and will
expect the KFD to prepare a process for the target PID, whether the
target PID has opened the KFD device or not.

This patch is to explicitly handle this requirement.  Further HW mode
setting and runtime coordination requirements will be handled in
following patches.

In the case where the target process has not opened the KFD device,
a new KFD process must be created for the target PID.
The debugger as well as the target process for this case will not have
acquired any VMs so handle process restoration to correctly account for
this.

To coordinate with HSA runtime, the debugger must be aware of the target
process' runtime enablement status and will copy the runtime status
information into the debugged KFD process for later query.

On enablement, the debugger will subscribe to a set of exceptions where
each exception event will notify the debugger through a pollable FIFO
file descriptor that the debugger provides to the KFD to manage.
Some events will be synchronously raised while others are scheduled,
which is why a debug_event_workarea worker is initialized.

Finally on process termination of either the debugger or the target,
debugging must be disabled if this has not already been done.

v3: fix typo on debug trap disable and PTRACE ATTACH relax check.
remove unnecessary queue eviction counter reset when there's nothing
to evict.
change err code to EALREADY if attaching to an already attached process.
move debug disable to release worker to avoid race with disable from
ioctl call.

v2: relax debug trap disable and PTRACE ATTACH requirement.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/Makefile           |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 88 ++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 94 +++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        | 33 +++++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 22 ++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 34 ++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 63 +++++++++----
 7 files changed, 308 insertions(+), 29 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index e758c2a24cd0..747754428073 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -55,7 +55,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_int_process_v9.o \
 		$(AMDKFD_PATH)/kfd_int_process_v11.o \
 		$(AMDKFD_PATH)/kfd_smi_events.o \
-		$(AMDKFD_PATH)/kfd_crat.o
+		$(AMDKFD_PATH)/kfd_crat.o \
+		$(AMDKFD_PATH)/kfd_debug.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d3b019e64093..ee05c2e54ef6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -44,6 +44,7 @@
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
 #include "amdgpu_dma_buf.h"
+#include "kfd_debug.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file *filep)
 		return -EPERM;
 	}
 
-	process = kfd_create_process(filep);
+	process = kfd_create_process(current);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 
+	if (kfd_process_init_cwsr_apu(process, filep)) {
+		kfd_unref_process(process);
+		return -EFAULT;
+	}
+
 	if (kfd_is_locked()) {
 		dev_dbg(kfd_device, "kfd is locked!\n"
 				"process %d unreferenced", process->pasid);
@@ -2653,6 +2659,9 @@ static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, v
 static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_dbg_trap_args *args = data;
+	struct task_struct *thread = NULL;
+	struct pid *pid = NULL;
+	struct kfd_process *target = NULL;
 	int r = 0;
 
 	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
@@ -2660,9 +2669,71 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		return -EINVAL;
 	}
 
+	pid = find_get_pid(args->pid);
+	if (!pid) {
+		pr_debug("Cannot find pid info for %i\n", args->pid);
+		r = -ESRCH;
+		goto out;
+	}
+
+	thread = get_pid_task(pid, PIDTYPE_PID);
+
+	if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
+		bool create_process;
+
+		rcu_read_lock();
+		create_process = thread && thread != current && ptrace_parent(thread) == current;
+		rcu_read_unlock();
+
+		target = create_process ? kfd_create_process(thread) :
+					kfd_lookup_process_by_pid(pid);
+	} else {
+		target = kfd_lookup_process_by_pid(pid);
+	}
+
+	if (!target) {
+		pr_debug("Cannot find process PID %i to debug\n", args->pid);
+		r = -ESRCH;
+		goto out;
+	}
+
+	/* Check if target is still PTRACED. */
+	rcu_read_lock();
+	if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
+				&& ptrace_parent(target->lead_thread) != current) {
+		pr_err("PID %i is not PTRACED and cannot be debugged\n", args->pid);
+		r = -EPERM;
+	}
+	rcu_read_unlock();
+
+	if (r)
+		goto out;
+
+	mutex_lock(&target->mutex);
+
+	if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target->debug_trap_enabled) {
+		pr_err("PID %i not debug enabled for op %i\n", args->pid, args->op);
+		r = -EINVAL;
+		goto unlock_out;
+	}
+
 	switch (args->op) {
 	case KFD_IOC_DBG_TRAP_ENABLE:
+		if (target != p)
+			target->debugger_process = p;
+
+		r = kfd_dbg_trap_enable(target,
+					args->enable.dbg_fd,
+					(void __user *)args->enable.rinfo_ptr,
+					&args->enable.rinfo_size);
+		if (!r)
+			target->exception_enable_mask = args->enable.exception_mask;
+
+		pr_warn("Debug functions limited\n");
+		break;
 	case KFD_IOC_DBG_TRAP_DISABLE:
+		r = kfd_dbg_trap_disable(target);
+		break;
 	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
 	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
@@ -2676,7 +2747,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
-		pr_warn("Debugging not supported yet\n");
+		pr_warn("Debug op %i not supported yet\n", args->op);
 		r = -EACCES;
 		break;
 	default:
@@ -2684,6 +2755,19 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		r = -EINVAL;
 	}
 
+unlock_out:
+	mutex_unlock(&target->mutex);
+
+out:
+	if (thread)
+		put_task_struct(thread);
+
+	if (pid)
+		put_pid(pid);
+
+	if (target)
+		kfd_unref_process(target);
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
new file mode 100644
index 000000000000..f6ea6db266b4
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfd_debug.h"
+#include <linux/file.h>
+
+void debug_event_write_work_handler(struct work_struct *work)
+{
+	struct kfd_process *process;
+
+	static const char write_data = '.';
+	loff_t pos = 0;
+
+	process = container_of(work,
+			struct kfd_process,
+			debug_event_workarea);
+
+	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
+}
+
+int kfd_dbg_trap_disable(struct kfd_process *target)
+{
+	if (!target->debug_trap_enabled)
+		return 0;
+
+	fput(target->dbg_ev_file);
+	target->dbg_ev_file = NULL;
+
+	if (target->debugger_process) {
+		atomic_dec(&target->debugger_process->debugged_process_count);
+		target->debugger_process = NULL;
+	}
+
+	target->debug_trap_enabled = false;
+	kfd_unref_process(target);
+
+	return 0;
+}
+
+int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
+			void __user *runtime_info, uint32_t *runtime_size)
+{
+	struct file *f;
+	uint32_t copy_size;
+	int r = 0;
+
+	if (target->debug_trap_enabled)
+		return -EALREADY;
+
+	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
+
+	f = fget(fd);
+	if (!f) {
+		pr_err("Failed to get file for (%i)\n", fd);
+		return -EBADF;
+	}
+
+	target->dbg_ev_file = f;
+
+	/* We already hold the process reference but hold another one for the
+	 * debug session.
+	 */
+	kref_get(&target->ref);
+	target->debug_trap_enabled = true;
+
+	if (target->debugger_process)
+		atomic_inc(&target->debugger_process->debugged_process_count);
+
+	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
+		r = -EFAULT;
+
+	*runtime_size = sizeof(target->runtime_info);
+
+	return r;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
new file mode 100644
index 000000000000..b2217eb1399c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
+#define KFD_DEBUG_EVENTS_H_INCLUDED
+
+#include "kfd_priv.h"
+
+int kfd_dbg_trap_disable(struct kfd_process *target);
+int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
+			void __user *runtime_info,
+			uint32_t *runtime_info_size);
+void debug_event_write_work_handler(struct work_struct *work);
+#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c06ada0844ba..a2ac98d06e71 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	pdd = qpd_to_pdd(qpd);
+
+	/* The debugger creates processes that temporarily have not acquired
+	 * all VMs for all devices and has no VMs itself.
+	 * Skip queue eviction on process eviction.
+	 */
+	if (!pdd->drm_priv)
+		goto out;
+
 	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
@@ -1100,13 +1108,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 {
 	struct queue *q;
 	struct kfd_process_device *pdd;
-	uint64_t pd_base;
 	uint64_t eviction_duration;
 	int retval = 0;
 
 	pdd = qpd_to_pdd(qpd);
-	/* Retrieve PD base */
-	pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
 
 	dqm_lock(dqm);
 	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
@@ -1116,12 +1121,19 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 	}
 
+	/* The debugger creates processes that temporarily have not acquired
+	 * all VMs for all devices and has no VMs itself.
+	 * Skip queue restore on process restore.
+	 */
+	if (!pdd->drm_priv)
+		goto out;
+
 	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Update PD Base in QPD */
-	qpd->page_table_base = pd_base;
-	pr_debug("Updated PD address to 0x%llx\n", pd_base);
+	qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
+	pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
 
 	/* activate all active queues on the qpd */
 	list_for_each_entry(q, &qpd->queues_list, list) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index bfa30d12406b..62b75ba28425 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -886,19 +886,48 @@ struct kfd_process {
 	 */
 	unsigned long last_restore_timestamp;
 
+	/* Indicates device process is debug attached with reserved vmid. */
+	bool debug_trap_enabled;
+
+	/* per-process-per device debug event fd file */
+	struct file *dbg_ev_file;
+
+	/* If the process is a kfd debugger, we need to know so we can clean
+	 * up at exit time.  If a process enables debugging on itself, it does
+	 * its own clean-up, so we don't set the flag here.  We track this by
+	 * counting the number of processes this process is debugging.
+	 */
+	atomic_t debugged_process_count;
+
+	/* If the process is being debugged, this is the debugger process */
+	struct kfd_process *debugger_process;
+
 	/* Kobj for our procfs */
 	struct kobject *kobj;
 	struct kobject *kobj_queues;
 	struct attribute attr_pasid;
 
+	/* Keep track cwsr init */
+	bool has_cwsr;
+
+	/* Exception code enable mask and status */
+	uint64_t exception_enable_mask;
+
 	/* shared virtual memory registered by this process */
 	struct svm_range_list svms;
 
 	bool xnack_enabled;
 
+	/* Work area for debugger event writer worker. */
+	struct work_struct debug_event_workarea;
+
 	atomic_t poison;
 	/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
 	bool queues_paused;
+
+	/* Tracks runtime enable status */
+	struct kfd_runtime_info runtime_info;
+
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
@@ -928,7 +957,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
 
 int kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
-struct kfd_process *kfd_create_process(struct file *filep);
+struct kfd_process *kfd_create_process(struct task_struct *thread);
 struct kfd_process *kfd_get_process(const struct task_struct *task);
 struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
 struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
@@ -1055,6 +1084,9 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tba_addr,
 				  uint64_t tma_addr);
 
+/* CWSR initialization */
+int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
+
 /* CRIU */
 /*
  * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 72df6286e240..e935158ab311 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -44,6 +44,7 @@ struct mm_struct;
 #include "kfd_iommu.h"
 #include "kfd_svm.h"
 #include "kfd_smi_events.h"
+#include "kfd_debug.h"
 
 /*
  * List of struct kfd_process (field kfd_process).
@@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct task_struct *thread,
 					bool ref);
 static void kfd_process_ref_release(struct kref *ref);
 static struct kfd_process *create_process(const struct task_struct *thread);
-static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
 
 static void evict_process_worker(struct work_struct *work);
 static void restore_process_worker(struct work_struct *work);
@@ -798,18 +798,19 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
 	kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
 }
 
-struct kfd_process *kfd_create_process(struct file *filep)
+struct kfd_process *kfd_create_process(struct task_struct *thread)
 {
 	struct kfd_process *process;
-	struct task_struct *thread = current;
 	int ret;
 
-	if (!thread->mm)
+	if (!(thread->mm && mmget_not_zero(thread->mm)))
 		return ERR_PTR(-EINVAL);
 
 	/* Only the pthreads threading model is supported. */
-	if (thread->group_leader->mm != thread->mm)
+	if (thread->group_leader->mm != thread->mm) {
+		mmput(thread->mm);
 		return ERR_PTR(-EINVAL);
+	}
 
 	/*
 	 * take kfd processes mutex before starting of process creation
@@ -827,10 +828,6 @@ struct kfd_process *kfd_create_process(struct file *filep)
 		if (IS_ERR(process))
 			goto out;
 
-		ret = kfd_process_init_cwsr_apu(process, filep);
-		if (ret)
-			goto out_destroy;
-
 		if (!procfs.kobj)
 			goto out;
 
@@ -864,16 +861,9 @@ struct kfd_process *kfd_create_process(struct file *filep)
 	if (!IS_ERR(process))
 		kref_get(&process->ref);
 	mutex_unlock(&kfd_processes_mutex);
+	mmput(thread->mm);
 
 	return process;
-
-out_destroy:
-	hash_del_rcu(&process->kfd_processes);
-	mutex_unlock(&kfd_processes_mutex);
-	synchronize_srcu(&kfd_processes_srcu);
-	/* kfd_process_free_notifier will trigger the cleanup */
-	mmu_notifier_put(&process->mmu_notifier);
-	return ERR_PTR(ret);
 }
 
 struct kfd_process *kfd_get_process(const struct task_struct *thread)
@@ -1115,6 +1105,26 @@ static void kfd_process_wq_release(struct work_struct *work)
 	struct kfd_process *p = container_of(work, struct kfd_process,
 					     release_work);
 
+	kfd_dbg_trap_disable(p);
+
+	if (atomic_read(&p->debugged_process_count) > 0) {
+		struct kfd_process *target;
+		unsigned int temp;
+		int idx = srcu_read_lock(&kfd_processes_srcu);
+
+		hash_for_each_rcu(kfd_processes_table, temp, target, kfd_processes) {
+			if (target->debugger_process && target->debugger_process == p) {
+				mutex_lock(&target->mutex);
+				kfd_dbg_trap_disable(target);
+				mutex_unlock(&target->mutex);
+				if (atomic_read(&p->debugged_process_count) == 0)
+					break;
+			}
+		}
+
+		srcu_read_unlock(&kfd_processes_srcu, idx);
+	}
+
 	kfd_process_dequeue_from_all_devices(p);
 	pqm_uninit(&p->pqm);
 
@@ -1200,11 +1210,14 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
 	.free_notifier = kfd_process_free_notifier,
 };
 
-static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
+int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
 {
 	unsigned long  offset;
 	int i;
 
+	if (p->has_cwsr)
+		return 0;
+
 	for (i = 0; i < p->n_pdds; i++) {
 		struct kfd_dev *dev = p->pdds[i]->dev;
 		struct qcm_process_device *qpd = &p->pdds[i]->qpd;
@@ -1233,6 +1246,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
 			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
 	}
 
+	p->has_cwsr = true;
+
 	return 0;
 }
 
@@ -1375,6 +1390,10 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	if (err)
 		goto err_event_init;
 	process->is_32bit_user_mode = in_compat_syscall();
+	process->debug_trap_enabled = false;
+	process->debugger_process = NULL;
+	process->exception_enable_mask = 0;
+	atomic_set(&process->debugged_process_count, 0);
 
 	process->pasid = kfd_pasid_alloc();
 	if (process->pasid == 0) {
@@ -1422,6 +1441,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	kfd_unref_process(process);
 	get_task_struct(process->lead_thread);
 
+	INIT_WORK(&process->debug_event_workarea, debug_event_write_work_handler);
+
 	return process;
 
 err_register_notifier:
@@ -1908,8 +1929,10 @@ static void restore_process_worker(struct work_struct *work)
 	 */
 
 	p->last_restore_timestamp = get_jiffies_64();
-	ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
-						     &p->ef);
+	/* VMs may not have been acquired yet during debugging. */
+	if (p->kgd_process_info)
+		ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
+							     &p->ef);
 	if (ret) {
 		pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
 			 p->pasid, PROCESS_BACK_OFF_TIME_MS);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 04/32] drm/amdgpu: add kgd hw debug mode setting interface
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (2 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-01-25 19:53 ` [PATCH 05/32] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
                   ` (27 subsequent siblings)
  31 siblings, 0 replies; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Introduce the required KGD debug calls that will execute hardware debug
mode setting.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
---
 .../gpu/drm/amd/include/kgd_kfd_interface.h   | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 5cb3e8634739..15e7a5c920a0 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -289,6 +289,40 @@ struct kfd2kgd_calls {
 			uint32_t vmid, uint64_t page_table_base);
 	uint32_t (*read_vmid_from_vmfault_reg)(struct amdgpu_device *adev);
 
+	uint32_t (*enable_debug_trap)(struct amdgpu_device *adev,
+					bool restore_dbg_registers,
+					uint32_t vmid);
+	uint32_t (*disable_debug_trap)(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid);
+	int (*validate_trap_override_request)(struct amdgpu_device *adev,
+					uint32_t trap_override,
+					uint32_t *trap_mask_supported);
+	uint32_t (*set_wave_launch_trap_override)(struct amdgpu_device *adev,
+					     uint32_t vmid,
+					     uint32_t trap_override,
+					     uint32_t trap_mask_bits,
+					     uint32_t trap_mask_request,
+					     uint32_t *trap_mask_prev,
+					     uint32_t kfd_dbg_trap_cntl_prev);
+	uint32_t (*set_wave_launch_mode)(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid);
+	uint32_t (*set_address_watch)(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid);
+	uint32_t (*clear_address_watch)(struct amdgpu_device *adev,
+			uint32_t watch_id);
+	void (*get_iq_wait_times)(struct amdgpu_device *adev,
+			uint32_t *wait_times);
+	void (*build_grace_period_packet_info)(struct amdgpu_device *adev,
+			uint32_t wait_times,
+			uint32_t grace_period,
+			uint32_t *reg_offset,
+			uint32_t *reg_data);
 	void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
 			int *wave_cnt, int *max_waves_per_cu);
 	void (*program_trap_handler_settings)(struct amdgpu_device *adev,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 05/32] drm/amdgpu: setup hw debug registers on driver initialization
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (3 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 04/32] drm/amdgpu: add kgd hw debug mode setting interface Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-02-16 22:39   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
                   ` (26 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Add missing debug trap register references and initialize all debug
registers on boot by clearing the hardware exception overrides and the
wave allocation ID index.

The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
waves onto dispatch during compute context inspection.
In order to correctly set this up, set the special reserved CP bit by
default whenever the MQD is initialized.

v2: leave TRAP_EN set for multi-process debugging as per process disable
will be taken care of in later patches.
fixup typo in description.
enable ttmp setup for dispatch boundary in mqd init for gfx11.
add trap on wave start and end registers for gfx11.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 26 +++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c        |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 30 ++++++++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
 .../include/asic_reg/gc/gc_10_1_0_offset.h    | 14 ++++
 .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++++++++++++++++++
 .../include/asic_reg/gc/gc_10_3_0_offset.h    | 10 +++
 .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
 .../include/asic_reg/gc/gc_11_0_0_sh_mask.h   |  4 ++
 11 files changed, 173 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 6983acc456b2..a5faf23805b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4823,6 +4823,29 @@ static u32 gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
 
 #define DEFAULT_SH_MEM_BASES	(0x6000)
 
+static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
+				uint32_t first_vmid,
+				uint32_t last_vmid)
+{
+	uint32_t data;
+	uint32_t trap_config_vmid_mask = 0;
+	int i;
+
+	/* Build VMID_SEL mask: one bit per VMID in [first_vmid, last_vmid) */
+	for (i = first_vmid; i < last_vmid; i++)
+		trap_config_vmid_mask |= (1 << i);
+
+	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+			VMID_SEL, trap_config_vmid_mask);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+			TRAP_EN, 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
 {
 	int i;
@@ -4854,6 +4877,9 @@ static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
 		WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
 		WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
 	}
+
+	gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
+					AMDGPU_NUM_VMID);
 }
 
 static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index c621b2ad7ba3..3ca7a31fb770 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1572,6 +1572,7 @@ static void gfx_v11_0_init_compute_vmid(struct amdgpu_device *adev)
 		/* Enable trap for each kfd vmid. */
 		data = RREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL);
 		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+		WREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL, data);
 	}
 	soc21_grbm_select(adev, 0, 0, 0, 0);
 	mutex_unlock(&adev->srbm_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 8ad5c03506f2..222fe87161b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2289,6 +2289,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
 	adev->gfx.config.num_rbs = hweight32(active_rbs);
 }
 
+static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
+				uint32_t first_vmid,
+				uint32_t last_vmid)
+{
+	uint32_t data;
+	uint32_t trap_config_vmid_mask = 0;
+	int i;
+
+	/* Build VMID_SEL mask: one bit per VMID in [first_vmid, last_vmid) */
+	for (i = first_vmid; i < last_vmid; i++)
+		trap_config_vmid_mask |= (1 << i);
+
+	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+			VMID_SEL, trap_config_vmid_mask);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+			TRAP_EN, 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 #define DEFAULT_SH_MEM_BASES	(0x6000)
 static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
 {
@@ -4565,6 +4588,13 @@ static int gfx_v9_0_late_init(void *handle)
 	if (r)
 		return r;
 
+	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+		gfx_v9_4_2_debug_trap_config_init(adev,
+			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
+	else
+		gfx_v9_0_debug_trap_config_init(adev,
+			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
index d3e2b6a599a4..cb484ace17de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
@@ -117,6 +117,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
 			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
 
+	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
+	 * DISPATCH_PTR.  This is required for the kfd debugger
+	 */
+	m->cp_hqd_hq_scheduler0 = 1 << 14;
+
 	if (q->format == KFD_QUEUE_FORMAT_AQL) {
 		m->cp_hqd_aql_control =
 			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index 4f6390f3236e..ac7c8fc83c94 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -143,6 +143,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
 			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
 
+	/* Set cp_hqd_hq_status0 bit 14 to 1 to have the CP set up the
+	 * DISPATCH_PTR.  This is required for the kfd debugger
+	 */
+	m->cp_hqd_hq_status0 = 1 << 14;
+
 	if (q->format == KFD_QUEUE_FORMAT_AQL) {
 		m->cp_hqd_aql_control =
 			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 0778e587a2d6..86f1cf090246 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -164,6 +164,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
 			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
 
+	/* Set cp_hqd_hq_status0 bit 14 to 1 to have the CP set up the
+	 * DISPATCH_PTR.  This is required for the kfd debugger
+	 */
+	m->cp_hqd_hq_status0 = 1 << 14;
+
 	if (q->format == KFD_QUEUE_FORMAT_AQL) {
 		m->cp_hqd_aql_control =
 			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
index 18d34bbceebe..7d384f86bd67 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
@@ -5190,6 +5190,20 @@
 #define mmSPI_WCL_PIPE_PERCENT_CS6_BASE_IDX                                                            0
 #define mmSPI_WCL_PIPE_PERCENT_CS7                                                                     0x1f70
 #define mmSPI_WCL_PIPE_PERCENT_CS7_BASE_IDX                                                            0
+#define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
+#define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
+#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
+#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
+#define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
+#define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
+#define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
+#define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
+#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
+#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
+#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
+#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
 #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
 #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
 #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
index 4127896ffcdf..08772ba845b0 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
@@ -19646,6 +19646,75 @@
 //SPI_WCL_PIPE_PERCENT_CS7
 #define SPI_WCL_PIPE_PERCENT_CS7__VALUE__SHIFT                                                                0x0
 #define SPI_WCL_PIPE_PERCENT_CS7__VALUE_MASK                                                                  0x7FL
+//SPI_GDBG_WAVE_CNTL
+#define SPI_GDBG_WAVE_CNTL__STALL_RA__SHIFT                                                                   0x0
+#define SPI_GDBG_WAVE_CNTL__STALL_VMID__SHIFT                                                                 0x1
+#define SPI_GDBG_WAVE_CNTL__STALL_RA_MASK                                                                     0x00000001L
+#define SPI_GDBG_WAVE_CNTL__STALL_VMID_MASK                                                                   0x0001FFFEL
+//SPI_GDBG_TRAP_CONFIG
+#define SPI_GDBG_TRAP_CONFIG__ME_SEL__SHIFT                                                                   0x0
+#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL__SHIFT                                                                 0x2
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL__SHIFT                                                                0x4
+#define SPI_GDBG_TRAP_CONFIG__ME_MATCH__SHIFT                                                                 0x7
+#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH__SHIFT                                                               0x8
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH__SHIFT                                                              0x9
+#define SPI_GDBG_TRAP_CONFIG__TRAP_EN__SHIFT                                                                  0xf
+#define SPI_GDBG_TRAP_CONFIG__VMID_SEL__SHIFT                                                                 0x10
+#define SPI_GDBG_TRAP_CONFIG__ME_SEL_MASK                                                                     0x00000003L
+#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL_MASK                                                                   0x0000000CL
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL_MASK                                                                  0x00000070L
+#define SPI_GDBG_TRAP_CONFIG__ME_MATCH_MASK                                                                   0x00000080L
+#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH_MASK                                                                 0x00000100L
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH_MASK                                                                0x00000200L
+#define SPI_GDBG_TRAP_CONFIG__TRAP_EN_MASK                                                                    0x00008000L
+#define SPI_GDBG_TRAP_CONFIG__VMID_SEL_MASK                                                                   0xFFFF0000L
+//SPI_GDBG_TRAP_MASK
+#define SPI_GDBG_TRAP_MASK__EXCP_EN__SHIFT                                                                    0x0
+#define SPI_GDBG_TRAP_MASK__REPLACE__SHIFT                                                                    0x9
+#define SPI_GDBG_TRAP_MASK__EXCP_EN_MASK                                                                      0x01FFL
+#define SPI_GDBG_TRAP_MASK__REPLACE_MASK                                                                      0x0200L
+//SPI_GDBG_WAVE_CNTL2
+#define SPI_GDBG_WAVE_CNTL2__VMID_MASK__SHIFT                                                                 0x0
+#define SPI_GDBG_WAVE_CNTL2__MODE__SHIFT                                                                      0x10
+#define SPI_GDBG_WAVE_CNTL2__VMID_MASK_MASK                                                                   0x0000FFFFL
+#define SPI_GDBG_WAVE_CNTL2__MODE_MASK                                                                        0x00030000L
+//SPI_GDBG_WAVE_CNTL3
+#define SPI_GDBG_WAVE_CNTL3__STALL_PS__SHIFT                                                                  0x0
+#define SPI_GDBG_WAVE_CNTL3__STALL_VS__SHIFT                                                                  0x1
+#define SPI_GDBG_WAVE_CNTL3__STALL_GS__SHIFT                                                                  0x2
+#define SPI_GDBG_WAVE_CNTL3__STALL_HS__SHIFT                                                                  0x3
+#define SPI_GDBG_WAVE_CNTL3__STALL_CSG__SHIFT                                                                 0x4
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS0__SHIFT                                                                 0x5
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS1__SHIFT                                                                 0x6
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS2__SHIFT                                                                 0x7
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS3__SHIFT                                                                 0x8
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS4__SHIFT                                                                 0x9
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS5__SHIFT                                                                 0xa
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS6__SHIFT                                                                 0xb
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS7__SHIFT                                                                 0xc
+#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION__SHIFT                                                            0xd
+#define SPI_GDBG_WAVE_CNTL3__STALL_MULT__SHIFT                                                                0x1c
+#define SPI_GDBG_WAVE_CNTL3__STALL_PS_MASK                                                                    0x00000001L
+#define SPI_GDBG_WAVE_CNTL3__STALL_VS_MASK                                                                    0x00000002L
+#define SPI_GDBG_WAVE_CNTL3__STALL_GS_MASK                                                                    0x00000004L
+#define SPI_GDBG_WAVE_CNTL3__STALL_HS_MASK                                                                    0x00000008L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CSG_MASK                                                                   0x00000010L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS0_MASK                                                                   0x00000020L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS1_MASK                                                                   0x00000040L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS2_MASK                                                                   0x00000080L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS3_MASK                                                                   0x00000100L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS4_MASK                                                                   0x00000200L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS5_MASK                                                                   0x00000400L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS6_MASK                                                                   0x00000800L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS7_MASK                                                                   0x00001000L
+#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION_MASK                                                              0x0FFFE000L
+#define SPI_GDBG_WAVE_CNTL3__STALL_MULT_MASK                                                                  0x10000000L
+//SPI_GDBG_TRAP_DATA0
+#define SPI_GDBG_TRAP_DATA0__DATA__SHIFT                                                                      0x0
+#define SPI_GDBG_TRAP_DATA0__DATA_MASK                                                                        0xFFFFFFFFL
+//SPI_GDBG_TRAP_DATA1
+#define SPI_GDBG_TRAP_DATA1__DATA__SHIFT                                                                      0x0
+#define SPI_GDBG_TRAP_DATA1__DATA_MASK                                                                        0xFFFFFFFFL
 //SPI_COMPUTE_QUEUE_RESET
 #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                                                                 0x0
 #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                                                                   0x01L
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
index 3973110f149c..d09f1a06f4bf 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
@@ -26,6 +26,8 @@
 #define mmSQ_DEBUG_STS_GLOBAL_BASE_IDX                                                                 0
 #define mmSQ_DEBUG_STS_GLOBAL2                                                                         0x10B0
 #define mmSQ_DEBUG_STS_GLOBAL2_BASE_IDX                                                                0
+#define mmSQ_DEBUG                                                                                     0x10B1
+#define mmSQ_DEBUG_BASE_IDX                                                                            0
 
 // addressBlock: gc_sdma0_sdma0dec
 // base address: 0x4980
@@ -4849,10 +4851,18 @@
 #define mmSPI_WCL_PIPE_PERCENT_CS3_BASE_IDX                                                            0
 #define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
 #define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
+#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
+#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
 #define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
 #define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
 #define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
 #define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
+#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
+#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
+#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
+#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
 #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
 #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
 #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
index d4e8ff22ecb8..fc85aee010fe 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
@@ -47853,6 +47853,10 @@
 
 
 // addressBlock: sqind
+//SQ_DEBUG
+#define SQ_DEBUG__SINGLE_MEMOP_MASK 0x00000001L
+#define SQ_DEBUG__SINGLE_MEMOP__SHIFT 0x00000000
+
 //SQ_DEBUG_STS_GLOBAL
 #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0_MASK 0x000000ffL
 #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0__SHIFT 0x00000000
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_sh_mask.h
index 4f08f90856fc..3088a4a13cb5 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_sh_mask.h
@@ -17216,11 +17216,15 @@
 #define SPI_GDBG_PER_VMID_CNTL__TRAP_EN__SHIFT                                                                0x3
 #define SPI_GDBG_PER_VMID_CNTL__EXCP_EN__SHIFT                                                                0x4
 #define SPI_GDBG_PER_VMID_CNTL__EXCP_REPLACE__SHIFT                                                           0xd
+#define SPI_GDBG_PER_VMID_CNTL__TRAP_ON_START__SHIFT                                                          0xe
+#define SPI_GDBG_PER_VMID_CNTL__TRAP_ON_END__SHIFT                                                            0xf
 #define SPI_GDBG_PER_VMID_CNTL__STALL_VMID_MASK                                                               0x00000001L
 #define SPI_GDBG_PER_VMID_CNTL__LAUNCH_MODE_MASK                                                              0x00000006L
 #define SPI_GDBG_PER_VMID_CNTL__TRAP_EN_MASK                                                                  0x00000008L
 #define SPI_GDBG_PER_VMID_CNTL__EXCP_EN_MASK                                                                  0x00001FF0L
 #define SPI_GDBG_PER_VMID_CNTL__EXCP_REPLACE_MASK                                                             0x00002000L
+#define SPI_GDBG_PER_VMID_CNTL__TRAP_ON_START_MASK                                                            0x00004000L
+#define SPI_GDBG_PER_VMID_CNTL__TRAP_ON_END_MASK                                                              0x00008000L
 //SPI_COMPUTE_QUEUE_RESET
 #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                                                                 0x0
 #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                                                                   0x01L
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (4 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 05/32] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-01-29  5:12   ` kernel test robot
  2023-02-16 22:54   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 07/32] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
                   ` (25 subsequent siblings)
  31 siblings, 2 replies; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Implement the per-device calls to enable or disable HW debug mode for
GFX9 prior to GFX9.4.1.

GFX9.4.1 and onward will require their own enable/disable sequence in
follow-on patches.

When hardware debug mode setting is requested, waves will inherit
these settings in the Shader Processor Input's (SPI) Sequencer Global
Block (SQG). This means that the KGD must drain all waves from the SPI
into SQG (approximately 96 SPI clock cycles) prior to debug mode setting
to ensure that the order of operations that the debugger expects with
regards to debug mode setting transaction requests and wave inheritance
of that mode is upheld.

Also ensure that exception overrides are reset to their original state
prior to debug enable or disable.

v2: remove unnecessary static srbm lock renaming.
add comments to explain ignored arguments for debug trap enable and
disable.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 93 +++++++++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  9 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  3 +
 3 files changed, 105 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index e92b93557c13..94a9fd9bd984 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -646,6 +646,97 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
 	return 0;
 }
 
+/*
+ * GFX9 helper for wave launch stall requirements on debug trap setting.
+ *
+ * vmid:
+ *   Target VMID to stall/unstall.  Only used on GFX9.4.1 (STALL_VMID field);
+ *   other GFX9 revisions toggle the single STALL_RA bit instead.
+ *
+ * stall:
+ *   0-unstall wave launch (enable), 1-stall wave launch (disable).
+ *   After wavefront launch has been stalled, allocated waves must drain from
+ *   SPI in order for debug trap settings to take effect on those waves.
+ *   This is roughly a 96 clock cycle wait on SPI where a read on
+ *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
+ *   KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads required.
+ *
+ *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
+ *   because GFX9.4.1 cannot support multi-process debugging due to trap
+ *   configuration and masking being limited to global scope.  Always assume
+ *   single process conditions.
+ */
+#define KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY	3
+void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
+					uint32_t vmid,
+					bool stall)
+{
+	int i;
+	uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+
+	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1))
+		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
+							stall ? 1 << vmid : 0);
+	else
+		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA,
+							stall ? 1 : 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
+
+	if (!stall)
+		return;
+
+	for (i = 0; i < KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
+		RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+}
+
+/*
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
+				bool restore_dbg_registers,
+				uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
+/*
+ * keep_trap_enabled is ignored here but is a general interface requirement
+ * for devices that support multi-process debugging where the performance
+ * overhead from trap temporary setup needs to be bypassed when the debug
+ * session has ended.
+ */
+uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
 			uint32_t vmid, uint64_t page_table_base)
 {
@@ -871,6 +962,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info =
 			kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
+	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
+	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index c7ed3bc9053c..d39256162616 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -58,3 +58,12 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
 		int *pasid_wave_cnt, int *max_waves_per_cu);
 void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
 		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
+void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
+					uint32_t vmid,
+					bool stall);
+uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
+				      bool restore_dbg_registers,
+				      uint32_t vmid);
+uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index b2217eb1399c..8aa7a3ad4e97 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -25,6 +25,9 @@
 
 #include "kfd_priv.h"
 
+void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
+					uint32_t vmid,
+					bool stall);
 int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 07/32] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (5 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-01-29  6:34   ` kernel test robot
  2023-02-16 23:01   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 08/32] drm/amdgpu: add gfx10 " Jonathan Kim
                   ` (24 subsequent siblings)
  31 siblings, 2 replies; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

On GFX9.4.1, the implicit wait count instruction on s_barrier is
disabled by default in the driver during normal operation for
performance requirements.

There is a hardware bug in GFX9.4.1 where if the implicit wait count
instruction after an s_barrier instruction is disabled, any wave that
hits an exception may step over the s_barrier when returning from the
trap handler with the barrier logic having no ability to be
aware of this, thereby causing other waves to wait at the barrier
indefinitely resulting in a shader hang.  This bug has been corrected
for GFX9.4.2 and onward.

Since the debugger subscribes to hardware exceptions, in order to avoid
this bug, the debugger must enable implicit wait count on s_barrier
for a debug session and disable it on detach.

In order to change this setting in the device global SQ_CONFIG
register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
will either dispatch work through the compute ring buffers used for
image post processing or through the hardware scheduler by the KFD.

Have the KGD suspend and drain the compute ring buffer, then suspend the
hardware scheduler and block any future KFD process job requests before
changing the implicit wait count setting.  Once set, resume all work.

v2: remove flush on kfd suspend as that will be a general fix required
outside of this patch series.
comment on trap enable/disable ignored variables.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   3 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 118 +++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   4 +-
 3 files changed, 122 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 872450a3a164..3c03e34c194c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1041,6 +1041,9 @@ struct amdgpu_device {
 	struct pci_saved_state          *pci_state;
 	pci_channel_state_t		pci_channel_state;
 
+	/* Track auto wait count on s_barrier settings */
+	bool				barrier_has_auto_waitcnt;
+
 	struct amdgpu_reset_control     *reset_cntl;
 	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 4191af5a3f13..d5bb86ccd617 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -26,6 +26,7 @@
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
+#include "amdgpu_reset.h"
 #include "sdma0/sdma0_4_2_2_offset.h"
 #include "sdma0/sdma0_4_2_2_sh_mask.h"
 #include "sdma1/sdma1_4_2_2_offset.h"
@@ -48,6 +49,8 @@
 #include "amdgpu_amdkfd_gfx_v9.h"
 #include "gfxhub_v1_0.h"
 #include "mmhub_v9_4.h"
+#include "gc/gc_9_0_offset.h"
+#include "gc/gc_9_0_sh_mask.h"
 
 #define HQD_N_REGS 56
 #define DUMP_REG(addr) do {				\
@@ -276,6 +279,117 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
 	return 0;
 }
 
+/*
+ * Helper used to suspend/resume gfx pipe for image post process work to set
+ * barrier behaviour.
+ */
+static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool suspend)
+{
+	int i, r = 0;
+
+	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+		struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
+
+		if (!(ring && ring->sched.thread))
+			continue;
+
+		/* stop scheduler and drain ring. */
+		if (suspend) {
+			drm_sched_stop(&ring->sched, NULL);
+			r = amdgpu_fence_wait_empty(ring);
+			if (r)
+				goto out;
+		} else {
+			drm_sched_start(&ring->sched, false);
+		}
+	}
+
+out:
+	/* return on resume or failure to drain rings. */
+	if (!suspend || r)
+		return r;
+
+	return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
+}
+
+static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool enable_waitcnt)
+{
+	uint32_t data;
+
+	WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
+
+	if (!down_read_trylock(&adev->reset_domain->sem))
+		return;
+
+	amdgpu_amdkfd_suspend(adev, false);
+
+	if (suspend_resume_compute_scheduler(adev, true))
+		goto out;
+
+	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
+	data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
+						enable_waitcnt ? 0 : 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
+
+out:
+	suspend_resume_compute_scheduler(adev, false);
+
+	amdgpu_amdkfd_resume(adev, false);
+
+	up_read(&adev->reset_domain->sem);
+}
+
+/**
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device *adev,
+				bool restore_dbg_registers,
+				uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	set_barrier_auto_waitcnt(adev, true);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
+/**
+ * keep_trap_enabled is ignored here but is a general interface requirement
+ * for devices that support multi-process debugging where the performance
+ * overhead from trap temporary setup needs to be bypassed when the debug
+ * session has ended.
+ */
+static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid)
+{
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	set_barrier_auto_waitcnt(adev, false);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
 const struct kfd2kgd_calls arcturus_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -294,6 +408,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
 	.set_vm_context_page_table_base =
 				kgd_gfx_v9_set_vm_context_page_table_base,
+	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
+	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
-	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
+	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 222fe87161b7..56d25a6f1da9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2376,8 +2376,8 @@ static void gfx_v9_0_init_sq_config(struct amdgpu_device *adev)
 	switch (adev->ip_versions[GC_HWIP][0]) {
 	case IP_VERSION(9, 4, 1):
 		tmp = RREG32_SOC15(GC, 0, mmSQ_CONFIG);
-		tmp = REG_SET_FIELD(tmp, SQ_CONFIG,
-					DISABLE_BARRIER_WAITCNT, 1);
+		tmp = REG_SET_FIELD(tmp, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
+				READ_ONCE(adev->barrier_has_auto_waitcnt) ? 0 : 1);
 		WREG32_SOC15(GC, 0, mmSQ_CONFIG, tmp);
 		break;
 	default:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 08/32] drm/amdgpu: add gfx10 hw debug mode enable and disable calls
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (6 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 07/32] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-01-29  7:55   ` kernel test robot
  2023-02-16 23:11   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 09/32] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
                   ` (23 subsequent siblings)
  31 siblings, 2 replies; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Similar to GFX9 debug devices, set the hardware debug mode by draining
the SPI appropriately prior to the mode setting request.

Because GFX10 has waves allocated by the work group boundary and each
SE's SPI instances do not communicate, the SPI drain time is much longer.
This long drain time will be fixed for GFX11 onwards.

Also remove a bunch of deprecated misplaced references for GFX10.3.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  95 +++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  28 ++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  | 147 +-----------------
 3 files changed, 126 insertions(+), 144 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 9378fc79e9ea..c09b45de02d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -708,6 +708,99 @@ static void set_vm_context_page_table_base(struct amdgpu_device *adev,
 	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
 }
 
+/*
+ * GFX10 helper for wave launch stall requirements on debug trap setting.
+ *
+ * vmid:
+ *   Target VMID to stall/unstall.
+ *
+ * stall:
+ *   0-unstall wave launch (enable), 1-stall wave launch (disable).
+ *   After wavefront launch has been stalled, allocated waves must drain from
+ *   SPI in order for debug trap settings to take effect on those waves.
+ *   This is roughly a ~3500 clock cycle wait on SPI where a read on
+ *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
+ *   KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads required.
+ *
+ *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
+ *   because current GFX10 chips cannot support multi-process debugging due to
+ *   trap configuration and masking being limited to global scope.  Always
+ *   assume single process conditions.
+ *
+ */
+
+#define KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY	110
+static void kgd_gfx_v10_set_wave_launch_stall(struct amdgpu_device *adev, uint32_t vmid, bool stall)
+{
+	uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+	int i;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
+							stall ? 1 << vmid : 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
+
+	if (!stall)
+		return;
+
+	for (i = 0; i < KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
+		RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+}
+
+uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
+				bool restore_dbg_registers,
+				uint32_t vmid)
+{
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+	/* assume gfx off is disabled for the debug session if rlc restore not supported. */
+	if (restore_dbg_registers) {
+		uint32_t data = 0;
+
+		data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+				VMID_SEL, 1 << vmid);
+		data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+				TRAP_EN, 1);
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+
+		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+		mutex_unlock(&adev->grbm_idx_mutex);
+
+		return 0;
+	}
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 static void program_trap_handler_settings(struct amdgpu_device *adev,
 		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
 {
@@ -750,5 +843,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info =
 			get_atc_vmid_pasid_mapping_info,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base,
+	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
+	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
 	.program_trap_handler_settings = program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
new file mode 100644
index 000000000000..370d6c312981
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
+				      bool restore_dbg_registers,
+				      uint32_t vmid);
+uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index ba21ec6b35e0..73e3b9ae1fb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -22,6 +22,7 @@
 #include <linux/mmu_context.h>
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_amdkfd_gfx_v10.h"
 #include "gc/gc_10_3_0_offset.h"
 #include "gc/gc_10_3_0_sh_mask.h"
 #include "oss/osssys_5_0_0_offset.h"
@@ -652,142 +653,6 @@ static void program_trap_handler_settings_v10_3(struct amdgpu_device *adev,
 	unlock_srbm(adev);
 }
 
-#if 0
-uint32_t enable_debug_trap_v10_3(struct amdgpu_device *adev,
-				uint32_t trap_debug_wave_launch_mode,
-				uint32_t vmid)
-{
-	uint32_t data = 0;
-	uint32_t orig_wave_cntl_value;
-	uint32_t orig_stall_vmid;
-
-	mutex_lock(&adev->grbm_idx_mutex);
-
-	orig_wave_cntl_value = RREG32(SOC15_REG_OFFSET(GC,
-				0,
-				mmSPI_GDBG_WAVE_CNTL));
-	orig_stall_vmid = REG_GET_FIELD(orig_wave_cntl_value,
-			SPI_GDBG_WAVE_CNTL,
-			STALL_VMID);
-
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 1);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
-
-	data = 0;
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
-
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), orig_stall_vmid);
-
-	mutex_unlock(&adev->grbm_idx_mutex);
-
-	return 0;
-}
-
-uint32_t disable_debug_trap_v10_3(struct amdgpu_device *adev)
-{
-	mutex_lock(&adev->grbm_idx_mutex);
-
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
-
-	mutex_unlock(&adev->grbm_idx_mutex);
-
-	return 0;
-}
-
-uint32_t set_wave_launch_trap_override_v10_3(struct amdgpu_device *adev,
-						uint32_t trap_override,
-						uint32_t trap_mask)
-{
-	uint32_t data = 0;
-
-	mutex_lock(&adev->grbm_idx_mutex);
-
-	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 1);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
-
-	data = 0;
-	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK,
-			EXCP_EN, trap_mask);
-	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK,
-			REPLACE, trap_override);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
-
-	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 0);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
-
-	mutex_unlock(&adev->grbm_idx_mutex);
-
-	return 0;
-}
-
-uint32_t set_wave_launch_mode_v10_3(struct amdgpu_device *adev,
-					uint8_t wave_launch_mode,
-					uint32_t vmid)
-{
-	uint32_t data = 0;
-	bool is_stall_mode;
-	bool is_mode_set;
-
-	is_stall_mode = (wave_launch_mode == 4);
-	is_mode_set = (wave_launch_mode != 0 && wave_launch_mode != 4);
-
-	mutex_lock(&adev->grbm_idx_mutex);
-
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
-			VMID_MASK, is_mode_set ? 1 << vmid : 0);
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
-			MODE, is_mode_set ? wave_launch_mode : 0);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
-
-	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL,
-			STALL_VMID, is_stall_mode ? 1 << vmid : 0);
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL,
-			STALL_RA, is_stall_mode ? 1 : 0);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
-
-	mutex_unlock(&adev->grbm_idx_mutex);
-
-	return 0;
-}
-
-/* kgd_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
- * The values read are:
- *	ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
- *	atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
- *	wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
- *	gws_wait_time            -- Wait Count for Global Wave Syncs.
- *	que_sleep_wait_time      -- Wait Count for Dequeue Retry.
- *	sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
- *	sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
- *	deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
- */
-void get_iq_wait_times_v10_3(struct amdgpu_device *adev,
-					uint32_t *wait_times)
-
-{
-	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
-}
-
-void build_grace_period_packet_info_v10_3(struct amdgpu_device *adev,
-						uint32_t wait_times,
-						uint32_t grace_period,
-						uint32_t *reg_offset,
-						uint32_t *reg_data)
-{
-	*reg_data = wait_times;
-
-	*reg_data = REG_SET_FIELD(*reg_data,
-			CP_IQ_WAIT_TIME2,
-			SCH_WAVE,
-			grace_period);
-
-	*reg_offset = mmCP_IQ_WAIT_TIME2;
-}
-#endif
-
 const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.program_sh_mem_settings = program_sh_mem_settings_v10_3,
 	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v10_3,
@@ -805,12 +670,6 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info_v10_3,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3,
 	.program_trap_handler_settings = program_trap_handler_settings_v10_3,
-#if 0
-	.enable_debug_trap = enable_debug_trap_v10_3,
-	.disable_debug_trap = disable_debug_trap_v10_3,
-	.set_wave_launch_trap_override = set_wave_launch_trap_override_v10_3,
-	.set_wave_launch_mode = set_wave_launch_mode_v10_3,
-	.get_iq_wait_times = get_iq_wait_times_v10_3,
-	.build_grace_period_packet_info = build_grace_period_packet_info_v10_3,
-#endif
+	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
+	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 09/32] drm/amdgpu: add gfx9.4.2 hw debug mode enable and disable calls
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (7 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 08/32] drm/amdgpu: add gfx10 " Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-02-16 23:14   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 10/32] drm/amdgpu: add gfx11 " Jonathan Kim
                   ` (22 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

GFX9.4.2 now supports per-VMID debug mode control registers
(SPI_GDBG_PER_VMID_CNTL).

Because the KFD lets the HWS handle PASID-VMID mapping, the KFD will
forward all debug mode setting register writes to the HWS scheduler
using a new MAP_PROCESS API, so instead of writing to registers, return
the required register values that the HWS needs to write on debug enable
and disable.

v2: add commentary on unused restore_dbg_registers for debug enable.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 43 ++++++++++++++++++-
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 4485bb29bec9..89868f9927ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,44 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include "gc/gc_9_4_2_offset.h"
+#include "gc/gc_9_4_2_sh_mask.h"
+
+/**
+ * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
+ *
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+static uint32_t kgd_aldebaran_enable_debug_trap(struct amdgpu_device *adev,
+					    bool restore_dbg_registers,
+					    uint32_t vmid)
+{
+	uint32_t data = 0;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+	return data;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
+						bool keep_trap_enabled,
+						uint32_t vmid)
+{
+	uint32_t data = 0;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, keep_trap_enabled);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+	return data;
+}
 
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -41,6 +79,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info =
 				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
-	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
-	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
+	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
+	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 10/32] drm/amdgpu: add gfx11 hw debug mode enable and disable calls
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (8 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 09/32] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-02-16 23:19   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 11/32] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
                   ` (21 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Implement the per-device calls to enable or disable HW debug mode
for GFX11.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
index 7e80caa05060..34aeff692eba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
@@ -30,6 +30,7 @@
 #include "soc15d.h"
 #include "v11_structs.h"
 #include "soc21.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 enum hqd_dequeue_request_type {
 	NO_ACTION = 0,
@@ -606,6 +607,42 @@ static void set_vm_context_page_table_base_v11(struct amdgpu_device *adev,
 	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
 }
 
+/**
+ * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
+ *
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+static uint32_t kgd_gfx_v11_enable_debug_trap(struct amdgpu_device *adev,
+					    bool restore_dbg_registers,
+					    uint32_t vmid)
+{
+	uint32_t data = 0;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+	return data;
+}
+
+/* Returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_gfx_v11_disable_debug_trap(struct amdgpu_device *adev,
+						bool keep_trap_enabled,
+						uint32_t vmid)
+{
+	uint32_t data = 0;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, keep_trap_enabled);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+	return data;
+}
+
 const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
 	.program_sh_mem_settings = program_sh_mem_settings_v11,
 	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
@@ -622,4 +659,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
 	.wave_control_execute = wave_control_execute_v11,
 	.get_atc_vmid_pasid_mapping_info = NULL,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base_v11,
+	.enable_debug_trap = kgd_gfx_v11_enable_debug_trap,
+	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 11/32] drm/amdgpu: add configurable grace period for unmap queues
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (9 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 10/32] drm/amdgpu: add gfx11 " Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-20 19:19   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 12/32] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
                   ` (20 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

The HWS scheduler allows a grace period for wave completion prior to
preemption for better performance by avoiding CWSR on waves that can
potentially complete quickly. The debugger, on the other hand, will
want to inspect wave status immediately after it actively triggers
preemption (a suspend function to be provided).

To minimize latency between preemption and debugger wave inspection, allow
immediate preemption by setting the grace period to 0.

Note that setting the preemption grace period to 0 will result in an
infinite grace period being set due to a CP FW bug so set it to 1 for now.

v2: clarify purpose in the description of this patch

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 43 ++++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  6 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 43 ++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  9 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 61 ++++++++++++-----
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 +
 .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 32 +++++++++
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c    | 39 +++++++++++
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   | 65 +++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 ++
 13 files changed, 291 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 89868f9927ae..a64a53f9efe6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -81,5 +81,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
 	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index d5bb86ccd617..ef8befc31fc6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -410,6 +410,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 				kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
 	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
+	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index c09b45de02d0..2491402afd58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -801,6 +801,47 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
 	return 0;
 }
 
+/* kgd_gfx_v10_get_iq_wait_times: Reads mmCP_IQ_WAIT_TIME2 into *wait_times.
+ * Only WAIT_TIME2 is read here; the wait counts named for the WAIT_TIME1/2 pair are:
+ *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
+ *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
+ *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
+ *     gws_wait_time            -- Wait Count for Global Wave Syncs.
+ *     que_sleep_wait_time      -- Wait Count for Dequeue Retry.
+ *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
+ *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
+ *     deq_retry_wait_time      -- Wait Count for Global Wave Syncs (same text as gws; TODO confirm).
+ */
+void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev,
+					uint32_t *wait_times)
+
+{
+	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
+}
+
+/*
+ * kgd_gfx_v10_build_grace_period_packet_info: Produces the register value
+ * and offset used to program a grace period. *reg_data receives wait_times
+ * with its CP_IQ_WAIT_TIME2.SCH_WAVE field replaced by grace_period, and
+ * *reg_offset receives the offset of mmCP_IQ_WAIT_TIME2.
+ */
+void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
+						uint32_t wait_times,
+						uint32_t grace_period,
+						uint32_t *reg_offset,
+						uint32_t *reg_data)
+{
+	/*
+	 * The CP cannot handle a grace period of 0 (it would result in an
+	 * infinite grace period being set), so 1 is substituted for 0.
+	 */
+	uint32_t sch_wave = grace_period ? grace_period : 1;
+
+	*reg_data = REG_SET_FIELD(wait_times, CP_IQ_WAIT_TIME2, SCH_WAVE,
+				  sch_wave);
+	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
+}
+
 static void program_trap_handler_settings(struct amdgpu_device *adev,
 		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
 {
@@ -845,5 +886,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.set_vm_context_page_table_base = set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
+	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.program_trap_handler_settings = program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
index 370d6c312981..0abc1e805180 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -26,3 +26,9 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
 					bool keep_trap_enabled,
 					uint32_t vmid);
+void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
+void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
+					       uint32_t wait_times,
+					       uint32_t grace_period,
+					       uint32_t *reg_offset,
+					       uint32_t *reg_data);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index 73e3b9ae1fb0..c57f2a6b6e23 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -670,6 +670,8 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info_v10_3,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3,
 	.program_trap_handler_settings = program_trap_handler_settings_v10_3,
+	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 94a9fd9bd984..4a8bd266d3f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -737,6 +737,24 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 	return 0;
 }
 
+/* kgd_gfx_v9_get_iq_wait_times: Reads mmCP_IQ_WAIT_TIME2 into *wait_times.
+ * Only WAIT_TIME2 is read here; the wait counts named for the WAIT_TIME1/2 pair are:
+ *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
+ *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
+ *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
+ *     gws_wait_time            -- Wait Count for Global Wave Syncs.
+ *     que_sleep_wait_time      -- Wait Count for Dequeue Retry.
+ *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
+ *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
+ *     deq_retry_wait_time      -- Wait Count for Global Wave Syncs (same text as gws; TODO confirm).
+ */
+void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev,
+					uint32_t *wait_times)
+
+{
+	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
+}
+
 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
 			uint32_t vmid, uint64_t page_table_base)
 {
@@ -921,6 +939,29 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
 				adev->gfx.cu_info.max_waves_per_simd;
 }
 
+/*
+ * kgd_gfx_v9_build_grace_period_packet_info: Produces the register value
+ * and offset used to program a grace period. *reg_data receives wait_times
+ * with its CP_IQ_WAIT_TIME2.SCH_WAVE field replaced by grace_period, and
+ * *reg_offset receives the offset of mmCP_IQ_WAIT_TIME2.
+ */
+void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
+		uint32_t wait_times,
+		uint32_t grace_period,
+		uint32_t *reg_offset,
+		uint32_t *reg_data)
+{
+	/*
+	 * The CP cannot handle a grace period of 0 (it would result in an
+	 * infinite grace period being set), so 1 is substituted for 0.
+	 */
+	uint32_t sch_wave = grace_period ? grace_period : 1;
+
+	*reg_data = REG_SET_FIELD(wait_times, CP_IQ_WAIT_TIME2, SCH_WAVE,
+				  sch_wave);
+	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
+}
+
 void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
                         uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
 {
@@ -964,6 +1005,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
+	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index d39256162616..c0866497cb5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -20,8 +20,6 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-
-
 void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid,
 		uint32_t sh_mem_config,
 		uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
@@ -51,7 +49,6 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
 					uint32_t sq_cmd);
 bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
 					uint8_t vmid, uint16_t *p_pasid);
-
 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
 			uint32_t vmid, uint64_t page_table_base);
 void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
@@ -67,3 +64,9 @@ uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 					bool keep_trap_enabled,
 					uint32_t vmid);
+void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
+void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
+					       uint32_t wait_times,
+					       uint32_t grace_period,
+					       uint32_t *reg_offset,
+					       uint32_t *reg_data);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a2ac98d06e71..7556f80d41e4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -46,10 +46,13 @@ static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
 
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param);
+				uint32_t filter_param,
+				uint32_t grace_period);
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param, bool reset);
+				uint32_t filter_param,
+				uint32_t grace_period,
+				bool reset);
 
 static int map_queues_cpsch(struct device_queue_manager *dqm);
 
@@ -839,7 +842,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
 	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
 		if (!dqm->dev->shared_resources.enable_mes)
 			retval = unmap_queues_cpsch(dqm,
-						    KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false);
+						    KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
 		else if (prev_active)
 			retval = remove_queue_mes(dqm, q, &pdd->qpd);
 
@@ -1015,7 +1018,8 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		retval = execute_queues_cpsch(dqm,
 					      qpd->is_debug ?
 					      KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
-					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
+					      USE_DEFAULT_GRACE_PERIOD);
 
 out:
 	dqm_unlock(dqm);
@@ -1155,7 +1159,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 	}
 	if (!dqm->dev->shared_resources.enable_mes)
 		retval = execute_queues_cpsch(dqm,
-					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 	qpd->evicted = 0;
 	eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
 	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
@@ -1490,6 +1494,9 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 
 	init_sdma_bitmaps(dqm);
 
+	if (dqm->dev->kfd2kgd->get_iq_wait_times)
+		dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev,
+					&dqm->wait_times);
 	return 0;
 }
 
@@ -1529,7 +1536,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm->is_resetting = false;
 	dqm->sched_running = true;
 	if (!dqm->dev->shared_resources.enable_mes)
-		execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+		execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 	dqm_unlock(dqm);
 
 	return 0;
@@ -1554,7 +1561,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 
 	if (!dqm->is_hws_hang) {
 		if (!dqm->dev->shared_resources.enable_mes)
-			unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
+			unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
 		else
 			remove_all_queues_mes(dqm);
 	}
@@ -1596,7 +1603,8 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	list_add(&kq->list, &qpd->priv_queue_list);
 	increment_queue_count(dqm, qpd, kq->queue);
 	qpd->is_debug = true;
-	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD);
 	dqm_unlock(dqm);
 
 	return 0;
@@ -1610,7 +1618,8 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	list_del(&kq->list);
 	decrement_queue_count(dqm, qpd, kq->queue);
 	qpd->is_debug = false;
-	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD);
 	/*
 	 * Unconditionally decrement this counter, regardless of the queue's
 	 * type.
@@ -1687,7 +1696,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
 		if (!dqm->dev->shared_resources.enable_mes)
 			retval = execute_queues_cpsch(dqm,
-					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 		else
 			retval = add_queue_mes(dqm, q, qpd);
 		if (retval)
@@ -1776,7 +1785,9 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 /* dqm->lock mutex has to be locked before calling this function */
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param, bool reset)
+				uint32_t filter_param,
+				uint32_t grace_period,
+				bool reset)
 {
 	int retval = 0;
 	struct mqd_manager *mqd_mgr;
@@ -1788,6 +1799,12 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	if (!dqm->active_runlist)
 		return retval;
 
+	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
+		retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
+		if (retval)
+			return retval;
+	}
+
 	retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset);
 	if (retval)
 		return retval;
@@ -1820,6 +1837,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 		return -ETIME;
 	}
 
+	/* We need to reset the grace period value for this device */
+	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
+		if (pm_update_grace_period(&dqm->packet_mgr,
+					USE_DEFAULT_GRACE_PERIOD))
+			pr_err("Failed to reset grace period\n");
+	}
+
 	pm_release_ib(&dqm->packet_mgr);
 	dqm->active_runlist = false;
 
@@ -1835,7 +1859,7 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
 	dqm_lock(dqm);
 
 	retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
-			pasid, true);
+			pasid, USE_DEFAULT_GRACE_PERIOD, true);
 
 	dqm_unlock(dqm);
 	return retval;
@@ -1844,13 +1868,14 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
 /* dqm->lock mutex has to be locked before calling this function */
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param)
+				uint32_t filter_param,
+				uint32_t grace_period)
 {
 	int retval;
 
 	if (dqm->is_hws_hang)
 		return -EIO;
-	retval = unmap_queues_cpsch(dqm, filter, filter_param, false);
+	retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false);
 	if (retval)
 		return retval;
 
@@ -1908,7 +1933,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 		if (!dqm->dev->shared_resources.enable_mes) {
 			decrement_queue_count(dqm, qpd, q);
 			retval = execute_queues_cpsch(dqm,
-						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
+						      USE_DEFAULT_GRACE_PERIOD);
 			if (retval == -ETIME)
 				qpd->reset_wavefronts = true;
 		} else {
@@ -2193,7 +2219,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	}
 
 	if (!dqm->dev->shared_resources.enable_mes)
-		retval = execute_queues_cpsch(dqm, filter, 0);
+		retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD);
 
 	if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
 		pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
@@ -2537,7 +2563,8 @@ int dqm_debugfs_hang_hws(struct device_queue_manager *dqm)
 		return r;
 	}
 	dqm->active_runlist = true;
-	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
+				0, USE_DEFAULT_GRACE_PERIOD);
 	dqm_unlock(dqm);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a537b9ef3e16..fb48b124161f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -37,6 +37,7 @@
 
 #define KFD_MES_PROCESS_QUANTUM		100000
 #define KFD_MES_GANG_QUANTUM		10000
+#define USE_DEFAULT_GRACE_PERIOD 0xffffffff
 
 struct device_process_node {
 	struct qcm_process_device *qpd;
@@ -256,6 +257,7 @@ struct device_queue_manager {
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
 	bool			sched_running;
+	uint32_t		wait_times;
 };
 
 void device_queue_manager_init_cik(
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index ed02b6d8bf63..c57f9a46dfcc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -369,6 +369,38 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 	return retval;
 }
 
+/*
+ * pm_update_grace_period: Builds a set_grace_period packet on pm's kernel
+ * queue and submits it; the packet is rolled back if the callback fails.
+ * Returns 0 (also if set_grace_period_size is 0), -ENOMEM or the callback's error.
+ */
+int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period)
+{
+	uint32_t size = pm->pmf->set_grace_period_size;
+	uint32_t *buffer;
+	int retval = 0;
+
+	mutex_lock(&pm->lock);
+	if (!size)
+		goto out;
+	kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t),
+				 (unsigned int **)&buffer);
+	if (!buffer) {
+		pr_err("Failed to allocate buffer on kernel queue\n");
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	retval = pm->pmf->set_grace_period(pm, buffer, grace_period);
+	if (retval)
+		kq_rollback_packet(pm->priv_queue);
+	else
+		kq_submit_packet(pm->priv_queue);
+out:
+	mutex_unlock(&pm->lock);
+	return retval;
+}
+
 int pm_send_unmap_queue(struct packet_manager *pm,
 			enum kfd_unmap_queues_filter filter,
 			uint32_t filter_param, bool reset)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index 18250845a989..f0cdc8695b8c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -251,6 +251,41 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 	return 0;
 }
 
+/*
+ * pm_set_grace_period_v9: Fills buffer with a WRITE_DATA packet that writes
+ * the register offset/value pair produced by the kfd2kgd
+ * build_grace_period_packet_info callback. For USE_DEFAULT_GRACE_PERIOD the
+ * unmodified dqm->wait_times value is written instead. Always returns 0.
+ */
+static int pm_set_grace_period_v9(struct packet_manager *pm,
+		uint32_t *buffer,
+		uint32_t grace_period)
+{
+	struct pm4_mec_write_data_mmio *packet =
+			(struct pm4_mec_write_data_mmio *)buffer;
+	struct device_queue_manager *dqm = pm->dqm;
+	uint32_t reg_offset = 0, reg_data = 0;
+
+	dqm->dev->kfd2kgd->build_grace_period_packet_info(dqm->dev->adev,
+			dqm->wait_times, grace_period, &reg_offset, &reg_data);
+
+	/* A default request writes back the saved wait times untouched. */
+	if (grace_period == USE_DEFAULT_GRACE_PERIOD)
+		reg_data = dqm->wait_times;
+
+	memset(packet, 0, sizeof(*packet));
+	packet->header.u32All = pm_build_pm4_header(IT_WRITE_DATA,
+						    sizeof(*packet));
+	/* Destination is a memory-mapped register; address is not incremented. */
+	packet->bitfields2.dst_sel = dst_sel___write_data__mem_mapped_register;
+	packet->bitfields2.addr_incr =
+			addr_incr___write_data__do_not_increment_address;
+	packet->bitfields3.dst_mmreg_addr = reg_offset;
+	packet->data = reg_data;
+
+	return 0;
+}
+
 static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 			enum kfd_unmap_queues_filter filter,
 			uint32_t filter_param, bool reset)
@@ -333,6 +368,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
 	.set_resources		= pm_set_resources_v9,
 	.map_queues		= pm_map_queues_v9,
 	.unmap_queues		= pm_unmap_queues_v9,
+	.set_grace_period       = pm_set_grace_period_v9,
 	.query_status		= pm_query_status_v9,
 	.release_mem		= NULL,
 	.map_process_size	= sizeof(struct pm4_mes_map_process),
@@ -340,6 +376,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
 	.set_resources_size	= sizeof(struct pm4_mes_set_resources),
 	.map_queues_size	= sizeof(struct pm4_mes_map_queues),
 	.unmap_queues_size	= sizeof(struct pm4_mes_unmap_queues),
+	.set_grace_period_size  = sizeof(struct pm4_mec_write_data_mmio),
 	.query_status_size	= sizeof(struct pm4_mes_query_status),
 	.release_mem_size	= 0,
 };
@@ -350,6 +387,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
 	.set_resources		= pm_set_resources_v9,
 	.map_queues		= pm_map_queues_v9,
 	.unmap_queues		= pm_unmap_queues_v9,
+	.set_grace_period       = pm_set_grace_period_v9,
 	.query_status		= pm_query_status_v9,
 	.release_mem		= NULL,
 	.map_process_size	= sizeof(struct pm4_mes_map_process_aldebaran),
@@ -357,6 +395,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
 	.set_resources_size	= sizeof(struct pm4_mes_set_resources),
 	.map_queues_size	= sizeof(struct pm4_mes_map_queues),
 	.unmap_queues_size	= sizeof(struct pm4_mes_unmap_queues),
+	.set_grace_period_size  = sizeof(struct pm4_mec_write_data_mmio),
 	.query_status_size	= sizeof(struct pm4_mes_query_status),
 	.release_mem_size	= 0,
 };
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
index a666710ed403..795001c947e1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -583,6 +583,71 @@ struct pm4_mec_release_mem {
 
 #endif
 
+#ifndef PM4_MEC_WRITE_DATA_DEFINED
+#define PM4_MEC_WRITE_DATA_DEFINED
+
+enum WRITE_DATA_dst_sel_enum {	/* values for bitfields2.dst_sel */
+	dst_sel___write_data__mem_mapped_register = 0,
+	dst_sel___write_data__tc_l2 = 2,
+	dst_sel___write_data__gds = 3,
+	dst_sel___write_data__memory = 5,
+	dst_sel___write_data__memory_mapped_adc_persistent_state = 6,
+};
+
+enum WRITE_DATA_addr_incr_enum {	/* values for bitfields2.addr_incr */
+	addr_incr___write_data__increment_address = 0,
+	addr_incr___write_data__do_not_increment_address = 1
+};
+
+enum WRITE_DATA_wr_confirm_enum {	/* presumably for bitfields2.wr_confirm -- TODO confirm */
+	wr_confirm___write_data__do_not_wait_for_write_confirmation = 0,
+	wr_confirm___write_data__wait_for_write_confirmation = 1
+};
+
+enum WRITE_DATA_cache_policy_enum {	/* presumably for bitfields2.cache_policy -- TODO confirm */
+	cache_policy___write_data__lru = 0,
+	cache_policy___write_data__stream = 1
+};
+
+/* WRITE_DATA packet layout with a register destination and one data word. */
+struct pm4_mec_write_data_mmio {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;     /* header */
+		unsigned int ordinal1;
+	};
+
+	union {
+		struct {
+			unsigned int reserved1:8;
+			unsigned int dst_sel:4;		/* destination selector */
+			unsigned int reserved2:4;
+			unsigned int addr_incr:1;	/* address increment mode */
+			unsigned int reserved3:2;
+			unsigned int resume_vf:1;
+			unsigned int wr_confirm:1;
+			unsigned int reserved4:4;
+			unsigned int cache_policy:2;
+			unsigned int reserved5:5;
+		} bitfields2;
+		unsigned int ordinal2;
+	};
+
+	union {
+		struct {
+			unsigned int dst_mmreg_addr:18;	/* destination register offset */
+			unsigned int reserved6:14;
+		} bitfields3;
+		unsigned int ordinal3;
+	};
+
+	uint32_t reserved7;
+
+	uint32_t data;	/* value to write to the destination */
+
+};
+
+#endif
+
 enum {
 	CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
 };
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 62b75ba28425..d557a7ae756c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1300,6 +1300,8 @@ struct packet_manager_funcs {
 	int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer,
 			enum kfd_unmap_queues_filter mode,
 			uint32_t filter_param, bool reset);
+	int (*set_grace_period)(struct packet_manager *pm, uint32_t *buffer,
+			uint32_t grace_period);
 	int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
 			uint64_t fence_address,	uint64_t fence_value);
 	int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
@@ -1310,6 +1312,7 @@ struct packet_manager_funcs {
 	int set_resources_size;
 	int map_queues_size;
 	int unmap_queues_size;
+	int set_grace_period_size;
 	int query_status_size;
 	int release_mem_size;
 };
@@ -1332,6 +1335,8 @@ int pm_send_unmap_queue(struct packet_manager *pm,
 
 void pm_release_ib(struct packet_manager *pm);
 
+int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period);
+
 /* Following PM funcs can be shared among VI and AI */
 unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 12/32] drm/amdkfd: prepare map process for single process debug devices
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (10 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 11/32] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-20 20:06   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 13/32] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
                   ` (19 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Older HW only supports debugging on a single process because the
SPI debug mode setting registers are device global.

The HWS has supplied a single pinned VMID (0xf) for MAP_PROCESS
for debug purposes. To pin the VMID, the KFD will remove the VMID from
the HWS dynamic VMID allocation via SET_RESOURCES so that a debugged
process will never migrate away from its pinned VMID.

The KFD is responsible for reserving and releasing this pinned VMID
accordingly whenever the debugger attaches and detaches respectively.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 101 +++++++++++++++++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |   5 +
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c    |   9 ++
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   |   5 +-
 4 files changed, 114 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 7556f80d41e4..0cd3a5e9ff25 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1490,7 +1490,9 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 	dqm->active_cp_queue_count = 0;
 	dqm->gws_queue_count = 0;
 	dqm->active_runlist = false;
 	INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
+	/* 0 means no VMID is currently reserved for the trap debugger */
+	dqm->trap_debug_vmid = 0;
 
 	init_sdma_bitmaps(dqm);
 
@@ -1933,8 +1933,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 		if (!dqm->dev->shared_resources.enable_mes) {
 			decrement_queue_count(dqm, qpd, q);
 			retval = execute_queues_cpsch(dqm,
-						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
-						      USE_DEFAULT_GRACE_PERIOD);
+						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 			if (retval == -ETIME)
 				qpd->reset_wavefronts = true;
 		} else {
@@ -2463,6 +2462,98 @@ static void kfd_process_hw_exception(struct work_struct *work)
 	amdgpu_amdkfd_gpu_reset(dqm->dev->adev);
 }
 
+/*
+ * Reserves vm_info.last_vmid_kfd for the trap debugger; qpd is not used here.
+ * Returns 0, -EINVAL (no HWS), -EBUSY (already reserved) or a failed step's error.
+ */
+int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
+				struct qcm_process_device *qpd)
+{
+	int r;
+	int updated_vmid_mask;
+
+	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		return -EINVAL;
+	}
+
+	dqm_lock(dqm);
+	if (dqm->trap_debug_vmid != 0) {
+		pr_err("Trap debug id already reserved\n");
+		r = -EBUSY;
+		goto out_unlock;
+	}
+	/* Unmap all queues before the compute VMID bitmap is changed */
+	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD, false);
+	if (r)
+		goto out_unlock;
+	/* Clear last_vmid_kfd in the compute VMID bitmap and record it as reserved */
+	updated_vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
+	updated_vmid_mask &= ~(1 << dqm->dev->vm_info.last_vmid_kfd);
+	dqm->dev->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
+	dqm->trap_debug_vmid = dqm->dev->vm_info.last_vmid_kfd;
+	r = set_sched_resources(dqm);
+	if (r)
+		goto out_unlock;
+
+	r = map_queues_cpsch(dqm);
+	if (r)
+		goto out_unlock;
+	pr_debug("Reserved VMID for trap debug: %i\n", dqm->trap_debug_vmid);
+out_unlock:
+	dqm_unlock(dqm);
+	return r;
+}
+
+/*
+ * Releases vmid for the trap debugger; qpd is not used here.
+ * Returns 0, -EINVAL (no HWS or nothing reserved) or a failed step's error.
+ */
+int release_debug_trap_vmid(struct device_queue_manager *dqm,
+			struct qcm_process_device *qpd)
+{
+	int r;
+	int updated_vmid_mask;
+	uint32_t trap_debug_vmid;
+
+	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		return -EINVAL;
+	}
+
+	dqm_lock(dqm);
+	trap_debug_vmid = dqm->trap_debug_vmid;
+	if (dqm->trap_debug_vmid == 0) {
+		pr_err("Trap debug id is not reserved\n");
+		r = -EINVAL;
+		goto out_unlock;
+	}
+	/* Unmap all queues before the compute VMID bitmap is changed */
+	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD, false);
+	if (r)
+		goto out_unlock;
+	/* Set last_vmid_kfd in the compute VMID bitmap again and clear the reservation */
+	updated_vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
+	updated_vmid_mask |= (1 << dqm->dev->vm_info.last_vmid_kfd);
+	dqm->dev->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
+	dqm->trap_debug_vmid = 0;
+	r = set_sched_resources(dqm);
+	if (r)
+		goto out_unlock;
+
+	r = map_queues_cpsch(dqm);
+	if (r)
+		goto out_unlock;
+	/* Report the saved value; dqm->trap_debug_vmid was cleared above */
+	pr_debug("Released VMID for trap debug: %i\n", trap_debug_vmid);
+
+out_unlock:
+	dqm_unlock(dqm);
+	return r;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
@@ -2563,8 +2654,8 @@ int dqm_debugfs_hang_hws(struct device_queue_manager *dqm)
 		return r;
 	}
 	dqm->active_runlist = true;
-	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
-				0, USE_DEFAULT_GRACE_PERIOD);
+	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD);
 	dqm_unlock(dqm);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index fb48b124161f..0cb1504d24cf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -250,6 +250,7 @@ struct device_queue_manager {
 	struct kfd_mem_obj	*fence_mem;
 	bool			active_runlist;
 	int			sched_policy;
+	uint32_t		trap_debug_vmid;
 
 	/* hw exception  */
 	bool			is_hws_hang;
@@ -281,6 +282,10 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
 unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
 unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
+int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
+			struct qcm_process_device *qpd);
+int release_debug_trap_vmid(struct device_queue_manager *dqm,
+			struct qcm_process_device *qpd);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index f0cdc8695b8c..363cf8e005cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -34,6 +34,9 @@ static int pm_map_process_v9(struct packet_manager *pm,
 {
 	struct pm4_mes_map_process *packet;
 	uint64_t vm_page_table_base_addr = qpd->page_table_base;
+	struct kfd_dev *kfd = pm->dqm->dev;
+	struct kfd_process_device *pdd =
+			container_of(qpd, struct kfd_process_device, qpd);
 
 	packet = (struct pm4_mes_map_process *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_mes_map_process));
@@ -49,6 +52,12 @@ static int pm_map_process_v9(struct packet_manager *pm,
 	packet->bitfields14.sdma_enable = 1;
 	packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
 
+	if (kfd->dqm->trap_debug_vmid && pdd->process->debug_trap_enabled &&
+			pdd->process->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
+		packet->bitfields2.debug_vmid = kfd->dqm->trap_debug_vmid;
+		packet->bitfields2.new_debug = 1;
+	}
+
 	packet->sh_mem_config = qpd->sh_mem_config;
 	packet->sh_mem_bases = qpd->sh_mem_bases;
 	if (qpd->tba_addr) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
index 795001c947e1..bb6edbc27de7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -146,7 +146,10 @@ struct pm4_mes_map_process {
 	union {
 		struct {
 			uint32_t pasid:16;
-			uint32_t reserved1:8;
+			uint32_t reserved1:2;
+			uint32_t debug_vmid:4;
+			uint32_t new_debug:1;
+			uint32_t reserved2:1;
 			uint32_t diq_enable:1;
 			uint32_t process_quantum:7;
 		} bitfields2;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 13/32] drm/amdgpu: prepare map process for multi-process debug devices
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (11 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 12/32] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-20 20:16   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 14/32] drm/amdgpu: expose debug api for mes Jonathan Kim
                   ` (18 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Unlike single process debug devices, multi-process debug devices allow
debug mode setting per-VMID (non-device-global).

Because the HWS manages PASID-VMID mapping, the new MAP_PROCESS API allows
the KFD to forward the required SPI debug register write requests.

To request a new debug mode setting change, the KFD must be able to
preempt all queues then remap all queues with these new setting
requests for MAP_PROCESS to take effect.

Note that by default, trap enablement in non-debug mode must be disabled
for performance reasons for multi-process debug devices due to setup
overhead in FW.

v2: remove asic family code name comment in per vmid support check

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  7 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 50 +++++++++++++++++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c    | 15 ++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  9 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  5 ++
 6 files changed, 89 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 8aa7a3ad4e97..53c5a3e55bd2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -32,5 +32,12 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
 			uint32_t *runtime_info_size);
+
+static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
+{
+	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
+}
+
 void debug_event_write_work_handler(struct work_struct *work);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 0cd3a5e9ff25..2517716d7cbc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2554,6 +2554,56 @@ int release_debug_trap_vmid(struct device_queue_manager *dqm,
 	return r;
 }
 
+int debug_lock_and_unmap(struct device_queue_manager *dqm)
+{
+	int r;
+
+	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		return -EINVAL;
+	}
+
+	if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
+		return 0;
+
+	dqm_lock(dqm);
+
+	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, 0, false);
+	if (r)
+		dqm_unlock(dqm);
+
+	return r;
+}
+
+int debug_map_and_unlock(struct device_queue_manager *dqm)
+{
+	int r;
+
+	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		return -EINVAL;
+	}
+
+	if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
+		return 0;
+
+	r = map_queues_cpsch(dqm);
+
+	dqm_unlock(dqm);
+
+	return r;
+}
+
+int debug_refresh_runlist(struct device_queue_manager *dqm)
+{
+	int r = debug_lock_and_unmap(dqm);
+
+	if (r)
+		return r;
+
+	return debug_map_and_unlock(dqm);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 0cb1504d24cf..bef3be84c5cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -286,6 +286,9 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
 int release_debug_trap_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
+int debug_lock_and_unmap(struct device_queue_manager *dqm);
+int debug_map_and_unlock(struct device_queue_manager *dqm);
+int debug_refresh_runlist(struct device_queue_manager *dqm);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index 363cf8e005cc..f19c506da23d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -88,6 +88,10 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
 {
 	struct pm4_mes_map_process_aldebaran *packet;
 	uint64_t vm_page_table_base_addr = qpd->page_table_base;
+	struct kfd_dev *kfd = pm->dqm->dev;
+	struct kfd_process_device *pdd =
+			container_of(qpd, struct kfd_process_device, qpd);
+	int i;
 
 	packet = (struct pm4_mes_map_process_aldebaran *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran));
@@ -102,6 +106,17 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
 	packet->bitfields14.num_oac = qpd->num_oac;
 	packet->bitfields14.sdma_enable = 1;
 	packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+	/* TRAP_EN is set on boot so keep it set in non-debug mode. */
+	packet->spi_gdbg_per_vmid_cntl = pdd->spi_dbg_override |
+						pdd->spi_dbg_launch_mode;
+
+	if (pdd->process->debug_trap_enabled) {
+		for (i = 0; i < kfd->device_info.num_of_watch_points; i++)
+			packet->tcp_watch_cntl[i] = pdd->watch_points[i];
+
+		packet->bitfields2.single_memops =
+				!!(pdd->process->dbg_flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP);
+	}
 
 	packet->sh_mem_config = qpd->sh_mem_config;
 	packet->sh_mem_bases = qpd->sh_mem_bases;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d557a7ae756c..8f1e2f9023db 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -782,6 +782,12 @@ struct kfd_process_device {
 	uint64_t faults;
 	uint64_t page_in;
 	uint64_t page_out;
+
+	/* Tracks debug per-vmid request settings */
+	uint32_t spi_dbg_override;
+	uint32_t spi_dbg_launch_mode;
+	uint32_t watch_points[4];
+
 	/*
 	 * If this process has been checkpointed before, then the user
 	 * application will use the original gpu_id on the
@@ -918,6 +924,9 @@ struct kfd_process {
 
 	bool xnack_enabled;
 
+	/* Tracks debug per-vmid request for debug flags */
+	bool dbg_flags;
+
 	/* Work area for debugger event writer worker. */
 	struct work_struct debug_event_workarea;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index e935158ab311..94c6545a58b4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1556,6 +1556,11 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	}
 
 	p->pdds[p->n_pdds++] = pdd;
+	if (kfd_dbg_is_per_vmid_supported(pdd->dev))
+		pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap(
+							pdd->dev->adev,
+							false,
+							0);
 
 	/* Init idr used for memory handle translation */
 	idr_init(&pdd->alloc_idr);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 14/32] drm/amdgpu: expose debug api for mes
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (12 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 13/32] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-20 20:47   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11 Jonathan Kim
                   ` (17 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Similar to the F32 HWS, the RS64 HWS for GFX11 now supports a multi-process
debug API.

The skip_process_ctx_clear ADD_QUEUE requirement is to prevent the MES
from clearing the process context when the first queue is added to the
scheduler in order to maintain debug mode settings during queue preemption
and restore.  The MES clears the process context in this case due to an
unresolved FW caching bug during normal mode operations.
During debug mode, the KFD will hold a reference to the target process
so the process context should never go stale and MES can afford to skip
this requirement.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       | 32 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       | 20 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        | 12 +++++++
 drivers/gpu/drm/amd/include/mes_v11_api_def.h | 21 +++++++++++-
 4 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 82e27bd4f038..4916e0b0156f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -924,6 +924,38 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg,
 	return r;
 }
 
+int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
+				uint64_t process_context_addr,
+				uint32_t spi_gdbg_per_vmid_cntl,
+				const uint32_t *tcp_watch_cntl,
+				uint32_t flags)
+{
+	struct mes_misc_op_input op_input = {0};
+	int r;
+
+	if (!adev->mes.funcs->misc_op) {
+		DRM_ERROR("mes set shader debugger is not supported!\n");
+		return -EINVAL;
+	}
+
+	op_input.op = MES_MISC_OP_SET_SHADER_DEBUGGER;
+	op_input.set_shader_debugger.process_context_addr = process_context_addr;
+	op_input.set_shader_debugger.flags.u32all = flags;
+	op_input.set_shader_debugger.spi_gdbg_per_vmid_cntl = spi_gdbg_per_vmid_cntl;
+	memcpy(op_input.set_shader_debugger.tcp_watch_cntl, tcp_watch_cntl,
+			sizeof(op_input.set_shader_debugger.tcp_watch_cntl));
+
+	amdgpu_mes_lock(&adev->mes);
+
+	r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
+	if (r)
+		DRM_ERROR("failed to set_shader_debugger\n");
+
+	amdgpu_mes_unlock(&adev->mes);
+
+	return r;
+}
+
 static void
 amdgpu_mes_ring_to_queue_props(struct amdgpu_device *adev,
 			       struct amdgpu_ring *ring,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 547ec35691fa..d20df0cf0d88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -256,6 +256,7 @@ enum mes_misc_opcode {
 	MES_MISC_OP_READ_REG,
 	MES_MISC_OP_WRM_REG_WAIT,
 	MES_MISC_OP_WRM_REG_WR_WAIT,
+	MES_MISC_OP_SET_SHADER_DEBUGGER,
 };
 
 struct mes_misc_op_input {
@@ -278,6 +279,20 @@ struct mes_misc_op_input {
 			uint32_t                   reg0;
 			uint32_t                   reg1;
 		} wrm_reg;
+
+		struct {
+			uint64_t process_context_addr;
+			union {
+				struct {
+					uint64_t single_memop : 1;
+					uint64_t single_alu_op : 1;
+					uint64_t reserved: 30;
+				};
+				uint32_t u32all;
+			} flags;
+			uint32_t spi_gdbg_per_vmid_cntl;
+			uint32_t tcp_watch_cntl[4];
+		} set_shader_debugger;
 	};
 };
 
@@ -340,6 +355,11 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg,
 int amdgpu_mes_reg_write_reg_wait(struct amdgpu_device *adev,
 				  uint32_t reg0, uint32_t reg1,
 				  uint32_t ref, uint32_t mask);
+int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
+				uint64_t process_context_addr,
+				uint32_t spi_gdbg_per_vmid_cntl,
+				const uint32_t *tcp_watch_cntl,
+				uint32_t flags);
 
 int amdgpu_mes_add_ring(struct amdgpu_device *adev, int gang_id,
 			int queue_type, int idx,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 62cdd2113135..fbacdc42efac 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -334,6 +334,18 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes,
 		misc_pkt.wait_reg_mem.reg_offset1 = input->wrm_reg.reg0;
 		misc_pkt.wait_reg_mem.reg_offset2 = input->wrm_reg.reg1;
 		break;
+	case MES_MISC_OP_SET_SHADER_DEBUGGER:
+		misc_pkt.opcode = MESAPI_MISC__SET_SHADER_DEBUGGER;
+		misc_pkt.set_shader_debugger.process_context_addr =
+				input->set_shader_debugger.process_context_addr;
+		misc_pkt.set_shader_debugger.flags.u32all =
+				input->set_shader_debugger.flags.u32all;
+		misc_pkt.set_shader_debugger.spi_gdbg_per_vmid_cntl =
+				input->set_shader_debugger.spi_gdbg_per_vmid_cntl;
+		memcpy(misc_pkt.set_shader_debugger.tcp_watch_cntl,
+				input->set_shader_debugger.tcp_watch_cntl,
+				sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl));
+		break;
 	default:
 		DRM_ERROR("unsupported misc op (%d) \n", input->op);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
index dc694cb246d9..f3c15f18ddb5 100644
--- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
+++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
@@ -274,7 +274,8 @@ union MESAPI__ADD_QUEUE {
 			uint32_t is_kfd_process		: 1;
 			uint32_t trap_en		: 1;
 			uint32_t is_aql_queue		: 1;
-			uint32_t reserved		: 20;
+			uint32_t skip_process_ctx_clear : 1;
+			uint32_t reserved		: 19;
 		};
 		struct MES_API_STATUS		api_status;
 		uint64_t                        tma_addr;
@@ -523,6 +524,7 @@ enum MESAPI_MISC_OPCODE {
 	MESAPI_MISC__QUERY_STATUS,
 	MESAPI_MISC__READ_REG,
 	MESAPI_MISC__WAIT_REG_MEM,
+	MESAPI_MISC__SET_SHADER_DEBUGGER,
 	MESAPI_MISC__MAX,
 };
 
@@ -561,6 +563,20 @@ struct QUERY_STATUS {
 	uint32_t context_id;
 };
 
+struct SET_SHADER_DEBUGGER {
+	uint64_t process_context_addr;
+	union {
+		struct {
+			uint32_t single_memop : 1;  /* SQ_DEBUG.single_memop */
+			uint32_t single_alu_op : 1; /* SQ_DEBUG.single_alu_op */
+			uint32_t reserved : 30;
+		};
+		uint32_t u32all;
+	} flags;
+	uint32_t spi_gdbg_per_vmid_cntl;
+	uint32_t tcp_watch_cntl[4]; /* TCP_WATCHx_CNTL */
+};
+
 union MESAPI__MISC {
 	struct {
 		union MES_API_HEADER	header;
@@ -573,6 +589,9 @@ union MESAPI__MISC {
 			struct		QUERY_STATUS query_status;
 			struct		READ_REG read_reg;
 			struct          WAIT_REG_MEM wait_reg_mem;
+			struct		SET_SHADER_DEBUGGER set_shader_debugger;
+			enum MES_AMD_PRIORITY_LEVEL queue_sch_level;
+
 			uint32_t	data[MISC_DATA_MAX_SIZE_IN_DWORDS];
 		};
 	};
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (13 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 14/32] drm/amdgpu: expose debug api for mes Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-20 21:49   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 16/32] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
                   ` (16 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Due to a HW bug, waves in only half the shader arrays can enter trap.

When starting a debug session, relocate all waves to the first shader
array of each shader engine and mask off the 2nd shader array as
unavailable.

When ending a debug session, re-enable the 2nd shader array per
shader engine.

User CU masking per queue cannot be guaranteed to remain functional
if requested during debugging (e.g. user cu mask requests only 2nd shader
array as an available resource leading to zero HW resources available)
nor can runtime be alerted of any of these changes during execution.

Make user CU masking and debugging mutually exclusive with respect to
availability.

If the debugger tries to attach to a process with a user cu masked
queue, return the runtime status as enabled but busy.

If the debugger tries to attach and fails to reallocate queue waves to
the first shader array of each shader engine, return the runtime status
as enabled but with an error.

In addition, like any other multi-process debug supported device,
disable trap temporary setup per-process to avoid performance impact from
setup overhead.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |  2 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        |  7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  2 -
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 64 +++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  3 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  7 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  | 42 ++++++++----
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +-
 .../amd/amdkfd/kfd_process_queue_manager.c    |  9 ++-
 13 files changed, 124 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index d20df0cf0d88..b5f5eed2b5ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -219,6 +219,8 @@ struct mes_add_queue_input {
 	uint32_t        gws_size;
 	uint64_t	tba_addr;
 	uint64_t	tma_addr;
+	uint32_t	trap_en;
+	uint32_t	skip_process_ctx_clear;
 	uint32_t	is_kfd_process;
 	uint32_t	is_aql_queue;
 	uint32_t	queue_size;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index fbacdc42efac..38c7a0cbf264 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -197,17 +197,14 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
 	mes_add_queue_pkt.gws_size = input->gws_size;
 	mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
 	mes_add_queue_pkt.tma_addr = input->tma_addr;
+	mes_add_queue_pkt.trap_en = input->trap_en;
+	mes_add_queue_pkt.skip_process_ctx_clear = input->skip_process_ctx_clear;
 	mes_add_queue_pkt.is_kfd_process = input->is_kfd_process;
 
 	/* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */
 	mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
 	mes_add_queue_pkt.gds_size = input->queue_size;
 
-	if (!(((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 4) &&
-		  (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) &&
-		  (adev->ip_versions[GC_HWIP][0] <= IP_VERSION(11, 0, 3))))
-		mes_add_queue_pkt.trap_en = 1;
-
 	/* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */
 	mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
 	mes_add_queue_pkt.gds_size = input->queue_size;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index ee05c2e54ef6..f5f639de28f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -530,8 +530,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
 		goto out;
 	}
 
-	minfo.update_flag = UPDATE_FLAG_CU_MASK;
-
 	mutex_lock(&p->mutex);
 
 	retval = pqm_update_mqd(&p->pqm, args->queue_id, &minfo);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index f6ea6db266b4..6e99a0160275 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -37,6 +37,70 @@ void debug_event_write_work_handler(struct work_struct *work)
 	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
 }
 
+static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
+{
+	struct mqd_update_info minfo = {0};
+	int err;
+
+	if (!q || (!q->properties.is_dbg_wa && !enable))
+		return 0;
+
+	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
+			KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
+		return 0;
+
+	if (enable && q->properties.is_user_cu_masked)
+		return -EBUSY;
+
+	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
+
+	q->properties.is_dbg_wa = enable;
+	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
+	if (err)
+		q->properties.is_dbg_wa = false;
+
+	return err;
+}
+
+static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
+{
+	struct process_queue_manager *pqm = &target->pqm;
+	struct process_queue_node *pqn;
+	int r = 0;
+
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
+		if (enable && r)
+			goto unwind;
+	}
+
+	return 0;
+
+unwind:
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
+		kfd_dbg_set_queue_workaround(pqn->q, false);
+
+	if (enable) {
+		target->runtime_info.runtime_state = r == -EBUSY ?
+				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
+				DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+	}
+
+	return r;
+}
+
+static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
+{
+	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
+	uint32_t flags = pdd->process->dbg_flags;
+
+	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+		return 0;
+
+	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
+						pdd->watch_points, flags);
+}
+
 int kfd_dbg_trap_disable(struct kfd_process *target)
 {
 	if (!target->debug_trap_enabled)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 53c5a3e55bd2..0c09f1729325 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -35,7 +35,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 
 static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 {
-	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
+	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
+	       KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0);
 }
 
 void debug_event_write_work_handler(struct work_struct *work);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 2517716d7cbc..be1985b87ea7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -214,6 +214,10 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	queue_input.paging = false;
 	queue_input.tba_addr = qpd->tba_addr;
 	queue_input.tma_addr = qpd->tma_addr;
+	queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
+			      KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0) ||
+			      q->properties.is_dbg_wa;
+	queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled;
 
 	queue_type = convert_to_mes_queue_type(q->properties.type);
 	if (queue_type < 0) {
@@ -1679,6 +1683,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	 * updates the is_evicted flag but is a no-op otherwise.
 	 */
 	q->properties.is_evicted = !!qpd->evicted;
+	q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled &&
+			KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) &&
+			KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0);
 
 	if (qd)
 		mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index 4889865c725c..c2a7226fc588 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
 	struct cik_mqd *m;
 	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
 
-	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
-	    !minfo->cu_mask.ptr)
+	if (!minfo || !minfo->cu_mask.ptr)
 		return;
 
 	mqd_symmetrically_map_cu_mask(mm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
index cb484ace17de..8248e77751e7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
@@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
 	struct v10_compute_mqd *m;
 	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
 
-	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
-	    !minfo->cu_mask.ptr)
+	if (!minfo || !minfo->cu_mask.ptr)
 		return;
 
 	mqd_symmetrically_map_cu_mask(mm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index ac7c8fc83c94..18ab613e787c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -46,15 +46,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
 {
 	struct v11_compute_mqd *m;
 	uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
+	bool has_wa_flag = minfo && (minfo->update_flag & (UPDATE_FLAG_DBG_WA_ENABLE |
+			UPDATE_FLAG_DBG_WA_DISABLE));
 
-	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
-	    !minfo->cu_mask.ptr)
+	if (!minfo || !(has_wa_flag || minfo->cu_mask.ptr))
 		return;
 
+	m = get_mqd(mqd);
+
+	if (has_wa_flag) {
+		uint32_t wa_mask = minfo->update_flag == UPDATE_FLAG_DBG_WA_ENABLE ?
+						0xffff : 0xffffffff;
+
+		m->compute_static_thread_mgmt_se0 = wa_mask;
+		m->compute_static_thread_mgmt_se1 = wa_mask;
+		m->compute_static_thread_mgmt_se2 = wa_mask;
+		m->compute_static_thread_mgmt_se3 = wa_mask;
+		m->compute_static_thread_mgmt_se4 = wa_mask;
+		m->compute_static_thread_mgmt_se5 = wa_mask;
+		m->compute_static_thread_mgmt_se6 = wa_mask;
+		m->compute_static_thread_mgmt_se7 = wa_mask;
+
+		return;
+	}
+
 	mqd_symmetrically_map_cu_mask(mm,
 		minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask);
 
-	m = get_mqd(mqd);
 	m->compute_static_thread_mgmt_se0 = se_mask[0];
 	m->compute_static_thread_mgmt_se1 = se_mask[1];
 	m->compute_static_thread_mgmt_se2 = se_mask[2];
@@ -109,6 +127,7 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 	uint64_t addr;
 	struct v11_compute_mqd *m;
 	int size;
+	uint32_t wa_mask = q->is_dbg_wa ? 0xffff : 0xffffffff;
 
 	m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr;
 	addr = mqd_mem_obj->gpu_addr;
@@ -122,14 +141,15 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 
 	m->header = 0xC0310800;
 	m->compute_pipelinestat_enable = 1;
-	m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
-	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
-	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
-	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
-	m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
-	m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
-	m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
-	m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;
+
+	m->compute_static_thread_mgmt_se0 = wa_mask;
+	m->compute_static_thread_mgmt_se1 = wa_mask;
+	m->compute_static_thread_mgmt_se2 = wa_mask;
+	m->compute_static_thread_mgmt_se3 = wa_mask;
+	m->compute_static_thread_mgmt_se4 = wa_mask;
+	m->compute_static_thread_mgmt_se5 = wa_mask;
+	m->compute_static_thread_mgmt_se6 = wa_mask;
+	m->compute_static_thread_mgmt_se7 = wa_mask;
 
 	m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
 			0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 86f1cf090246..50da16dd4c96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -49,8 +49,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
 	struct v9_mqd *m;
 	uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
 
-	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
-	    !minfo->cu_mask.ptr)
+	if (!minfo || !minfo->cu_mask.ptr)
 		return;
 
 	mqd_symmetrically_map_cu_mask(mm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index 530ba6f5b57e..58b40bff3e0c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -51,8 +51,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
 	struct vi_mqd *m;
 	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
 
-	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
-	    !minfo->cu_mask.ptr)
+	if (!minfo || !minfo->cu_mask.ptr)
 		return;
 
 	mqd_symmetrically_map_cu_mask(mm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 8f1e2f9023db..75521d96e937 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -479,6 +479,8 @@ struct queue_properties {
 	bool is_evicted;
 	bool is_active;
 	bool is_gws;
+	bool is_dbg_wa;
+	bool is_user_cu_masked;
 	/* Not relevant for user mode queues in cp scheduling */
 	unsigned int vmid;
 	/* Relevant only for sdma queues*/
@@ -501,7 +503,8 @@ struct queue_properties {
 			    !(q).is_evicted)
 
 enum mqd_update_flag {
-	UPDATE_FLAG_CU_MASK = 0,
+	UPDATE_FLAG_DBG_WA_ENABLE = 1,
+	UPDATE_FLAG_DBG_WA_DISABLE = 2,
 };
 
 struct mqd_update_info {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 5137476ec18e..d8f032214481 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -498,8 +498,12 @@ int pqm_update_mqd(struct process_queue_manager *pqm,
 		return -EFAULT;
 	}
 
+	/* CUs are masked for debugger requirements so deny user mask  */
+	if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr)
+		return -EBUSY;
+
 	/* ASICs that have WGPs must enforce pairwise enabled mask checks. */
-	if (minfo && minfo->update_flag == UPDATE_FLAG_CU_MASK && minfo->cu_mask.ptr &&
+	if (minfo && minfo->cu_mask.ptr &&
 			KFD_GC_VERSION(pqn->q->device) >= IP_VERSION(10, 0, 0)) {
 		int i;
 
@@ -518,6 +522,9 @@ int pqm_update_mqd(struct process_queue_manager *pqm,
 	if (retval != 0)
 		return retval;
 
+	if (minfo && minfo->cu_mask.ptr)
+		pqn->q->properties.is_user_cu_masked = true;
+
 	return 0;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 16/32] drm/amdkfd: add per process hw trap enable and disable functions
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (14 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11 Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-20 23:06   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 17/32] drm/amdkfd: add raise exception event function Jonathan Kim
                   ` (15 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

To enable HW debug mode per process, all devices must be debug enabled
successfully.  If a failure occurs, rewind the enablement of debug mode
on the enabled devices.

A power management scenario that needs to be considered is HW
debug mode setting during GFXOFF.  During GFXOFF, these registers
will be unreachable so we have to transiently disable GFXOFF when
setting.  Also, some devices don't support the RLC save restore
function for these debug registers so we have to disable GFXOFF
completely during a debug session.

Cooperative launch also has debugging restrictions based on HW/FW bugs.
If such bugs exist, the debugger cannot attach to a process that uses GWS
resources, nor can GWS resources be requested if a process is being
debugged.

Multi-process debug devices can only enable trap temporaries based
on certain runtime scenarios, which will be explained when the
runtime enable functions are implemented in a follow up patch.

v2: add gfx11 support. fix fw checks. remove asic family name comments.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |   5 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 148 +++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  29 ++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   9 ++
 5 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f5f639de28f0..628178126d3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1453,6 +1453,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
 		goto out_unlock;
 	}
 
+	if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+		retval = -EBUSY;
+		goto out_unlock;
+	}
+
 	retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
 	mutex_unlock(&p->mutex);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 6e99a0160275..659dfc7411fe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -21,6 +21,7 @@
  */
 
 #include "kfd_debug.h"
+#include "kfd_device_queue_manager.h"
 #include <linux/file.h>
 
 void debug_event_write_work_handler(struct work_struct *work)
@@ -101,11 +102,68 @@ static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
 						pdd->watch_points, flags);
 }
 
+/* kfd_dbg_trap_deactivate:
+ *	target: target process
+ *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
+ *	unwind_count:
+ *		If unwind == true, how far down the pdd list we need
+ *				to unwind
+ *		else: ignored
+ */
+static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
+{
+	int i, count = 0;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		/* If this is an unwind, and we have unwound the required
+		 * enable calls on the pdd list, we need to stop now
+		 * otherwise we may mess up another debugger session.
+		 */
+		if (unwind && count == unwind_count)
+			break;
+
+		/* GFX off is already disabled by debug activate if not RLC restore supported. */
+		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+		pdd->spi_dbg_override =
+				pdd->dev->kfd2kgd->disable_debug_trap(
+				pdd->dev->adev,
+				target->runtime_info.ttmp_setup,
+				pdd->dev->vm_info.last_vmid_kfd);
+		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
+				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
+			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
+
+		if (!pdd->dev->shared_resources.enable_mes)
+			debug_refresh_runlist(pdd->dev->dqm);
+		else
+			kfd_dbg_set_mes_debug_mode(pdd);
+
+		count++;
+	}
+
+	kfd_dbg_set_workaround(target, false);
+}
+
 int kfd_dbg_trap_disable(struct kfd_process *target)
 {
 	if (!target->debug_trap_enabled)
 		return 0;
 
+	/*
+	 * Defer deactivation to runtime if runtime not enabled otherwise reset
+	 * attached running target runtime state to enable for re-attach.
+	 */
+	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+		kfd_dbg_trap_deactivate(target, false, 0);
+	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
+		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
+
 	fput(target->dbg_ev_file);
 	target->dbg_ev_file = NULL;
 
@@ -120,16 +178,96 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
 	return 0;
 }
 
+static int kfd_dbg_trap_activate(struct kfd_process *target)
+{
+	int i, r = 0, unwind_count = 0;
+
+	r = kfd_dbg_set_workaround(target, true);
+	if (r)
+		return r;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
+
+			if (r) {
+				target->runtime_info.runtime_state = (r == -EBUSY) ?
+							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
+							DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+
+				goto unwind_err;
+			}
+		}
+
+		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
+		 * If RLC restore of debug registers is not supported and runtime enable
+		 * hasn't done so already on ttmp setup request, restore the trap config registers.
+		 *
+		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
+		 * the debug session.
+		 */
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
+						target->runtime_info.ttmp_setup))
+			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
+								pdd->dev->vm_info.last_vmid_kfd);
+
+		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
+					pdd->dev->adev,
+					false,
+					pdd->dev->vm_info.last_vmid_kfd);
+
+		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+		if (!pdd->dev->shared_resources.enable_mes)
+			r = debug_refresh_runlist(pdd->dev->dqm);
+		else
+			r = kfd_dbg_set_mes_debug_mode(pdd);
+
+		if (r) {
+			target->runtime_info.runtime_state =
+					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+			goto unwind_err;
+		}
+
+		/* Increment unwind_count as the last step */
+		unwind_count++;
+	}
+
+	return 0;
+
+unwind_err:
+	/* Enabling debug failed, we need to disable on
+	 * all GPUs so the enable is all or nothing.
+	 */
+	kfd_dbg_trap_deactivate(target, true, unwind_count);
+	return r;
+}
+
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info, uint32_t *runtime_size)
 {
 	struct file *f;
 	uint32_t copy_size;
-	int r = 0;
+	int i, r = 0;
 
 	if (target->debug_trap_enabled)
 		return -EALREADY;
 
+	/* Enable pre-checks */
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		if (!KFD_IS_SOC15(pdd->dev))
+			return -ENODEV;
+
+		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
+			return -EBUSY;
+	}
+
 	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
 
 	f = fget(fd);
@@ -140,6 +278,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 
 	target->dbg_ev_file = f;
 
+	/* defer activation to runtime if not runtime enabled */
+	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+		kfd_dbg_trap_activate(target);
+
 	/* We already hold the process reference but hold another one for the
 	 * debug session.
 	 */
@@ -149,8 +291,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 	if (target->debugger_process)
 		atomic_inc(&target->debugger_process->debugged_process_count);
 
-	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
+	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
+		kfd_dbg_trap_deactivate(target, false, 0);
 		r = -EFAULT;
+	}
 
 	*runtime_size = sizeof(target->runtime_info);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 0c09f1729325..f199698d8d60 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -41,4 +41,33 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 
 void debug_event_write_work_handler(struct work_struct *work);
 
+/*
+ * If GFX off is enabled, chips that do not support RLC restore for the debug
+ * registers will disable GFX off temporarily for the entire debug session.
+ * See disable_on_trap_action_entry and enable_on_trap_action_exit for details.
+ */
+static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_dev *dev)
+{
+	return !(KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 10) ||
+		 KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1));
+}
+
+static inline bool kfd_dbg_has_gws_support(struct kfd_dev *dev)
+{
+	if ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
+			&& dev->mec2_fw_version < 0x81b6) ||
+		(KFD_GC_VERSION(dev) >= IP_VERSION(9, 1, 0)
+			&& KFD_GC_VERSION(dev) <= IP_VERSION(9, 2, 2)
+			&& dev->mec2_fw_version < 0x1b6) ||
+		(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0)
+			&& dev->mec2_fw_version < 0x1b6) ||
+		(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1)
+			&& dev->mec2_fw_version < 0x30) ||
+		(KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) &&
+			KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0)))
+		return false;
+
+	/* Assume debugging and cooperative launch supported otherwise. */
+	return true;
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index be1985b87ea7..3b747e51684e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -36,6 +36,7 @@
 #include "kfd_kernel_queue.h"
 #include "amdgpu_amdkfd.h"
 #include "mes_api_def.h"
+#include "kfd_debug.h"
 
 /* Size of the per-pipe EOP queue */
 #define CIK_HPD_EOP_BYTES_LOG2 11
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 94c6545a58b4..0ef2d00af8b1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1181,6 +1181,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 					struct mm_struct *mm)
 {
 	struct kfd_process *p;
+	int i;
 
 	/*
 	 * The kfd_process structure can not be free because the
@@ -1198,6 +1199,14 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 	cancel_delayed_work_sync(&p->eviction_work);
 	cancel_delayed_work_sync(&p->restore_work);
 
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+
+		/* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */
+		if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup)
+			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+	}
+
 	/* Indicate to other users that MM is no longer valid */
 	p->mm = NULL;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 17/32] drm/amdkfd: add raise exception event function
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (15 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 16/32] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-20 23:18   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 18/32] drm/amdkfd: add send exception operation Jonathan Kim
                   ` (14 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Exception events can be generated from interrupts or queue activity.

The raise event function will save exception status of a queue, device
or process then notify the debugger of the status change by writing to
a debugger polled file descriptor that the debugger provides during
debug attach.

For memory violation exceptions, extra exception data will be saved.

The debugger will be able to query the saved exception states through a
query operation that will be provided by follow-up patches.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 91 +++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  7 ++
 3 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 659dfc7411fe..fcd064b13f6a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -38,6 +38,93 @@ void debug_event_write_work_handler(struct work_struct *work)
 	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
 }
 
+/* update process/device/queue exception status, write to descriptor
+ * only if exception_status is enabled.
+ */
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+			struct kfd_process *process, struct kfd_dev *dev,
+			unsigned int source_id, bool use_worker,
+			void *exception_data, size_t exception_data_size)
+{
+	struct process_queue_manager *pqm;
+	struct process_queue_node *pqn;
+	int i;
+	static const char write_data = '.';
+	loff_t pos = 0;
+	bool is_subscribed = true;
+
+	if (!(process && process->debug_trap_enabled))
+		return false;
+
+	mutex_lock(&process->event_mutex);
+
+	if (event_mask & KFD_EC_MASK_DEVICE) {
+		for (i = 0; i < process->n_pdds; i++) {
+			struct kfd_process_device *pdd = process->pdds[i];
+
+			if (pdd->dev != dev)
+				continue;
+
+			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
+
+			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+				if (!pdd->vm_fault_exc_data) {
+					pdd->vm_fault_exc_data = kmemdup(
+							exception_data,
+							exception_data_size,
+							GFP_KERNEL);
+					if (!pdd->vm_fault_exc_data)
+						pr_debug("Failed to allocate exception data memory");
+				} else {
+					pr_debug("Debugger exception data not saved\n");
+					print_hex_dump_bytes("exception data: ",
+							DUMP_PREFIX_OFFSET,
+							exception_data,
+							exception_data_size);
+				}
+			}
+			break;
+		}
+	} else if (event_mask & KFD_EC_MASK_PROCESS) {
+		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
+	} else {
+		pqm = &process->pqm;
+		list_for_each_entry(pqn, &pqm->queues,
+				process_queue_list) {
+			int target_id;
+
+			if (!pqn->q)
+				continue;
+
+			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
+					pqn->q->properties.queue_id :
+							pqn->q->doorbell_id;
+
+			if (pqn->q->device != dev || target_id != source_id)
+				continue;
+
+			pqn->q->properties.exception_status |= event_mask;
+			break;
+		}
+	}
+
+	if (process->exception_enable_mask & event_mask) {
+		if (use_worker)
+			schedule_work(&process->debug_event_workarea);
+		else
+			kernel_write(process->dbg_ev_file,
+					&write_data,
+					1,
+					&pos);
+	} else {
+		is_subscribed = false;
+	}
+
+	mutex_unlock(&process->event_mutex);
+
+	return is_subscribed;
+}
+
 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
 {
 	struct mqd_update_info minfo = {0};
@@ -88,7 +175,6 @@ static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
 	}
 
 	return r;
-}
 
 static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
 {
@@ -114,6 +200,9 @@ static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int
 {
 	int i, count = 0;
 
+	if (!unwind)
+		cancel_work_sync(&target->debug_event_workarea);
+
 	for (i = 0; i < target->n_pdds; i++) {
 		struct kfd_process_device *pdd = target->pdds[i];
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index f199698d8d60..2d5bc102f6b4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -28,6 +28,11 @@
 void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
 					uint32_t vmid,
 					bool stall);
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+			struct kfd_process *process, struct kfd_dev *dev,
+			unsigned int source_id, bool use_worker,
+			void *exception_data,
+			size_t exception_data_size);
 int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 75521d96e937..e503bd94dda6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -495,6 +495,7 @@ struct queue_properties {
 	uint32_t ctl_stack_size;
 	uint64_t tba_addr;
 	uint64_t tma_addr;
+	uint64_t exception_status;
 };
 
 #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
@@ -786,6 +787,11 @@ struct kfd_process_device {
 	uint64_t page_in;
 	uint64_t page_out;
 
+	/* Exception code status*/
+	uint64_t exception_status;
+	void *vm_fault_exc_data;
+	size_t vm_fault_exc_data_size;
+
 	/* Tracks debug per-vmid request settings */
 	uint32_t spi_dbg_override;
 	uint32_t spi_dbg_launch_mode;
@@ -921,6 +927,7 @@ struct kfd_process {
 
 	/* Exception code enable mask and status */
 	uint64_t exception_enable_mask;
+	uint64_t exception_status;
 
 	/* shared virtual memory registered by this process */
 	struct svm_range_list svms;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 18/32] drm/amdkfd: add send exception operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (16 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 17/32] drm/amdkfd: add raise exception event function Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-20 23:26   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 19/32] drm/amdkfd: add runtime enable operation Jonathan Kim
                   ` (13 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Add a debug operation that allows the debugger to send an exception
directly to runtime through a payload address.

For memory violations, the normal vmfault signal is used to notify the
runtime instead, passing along the exception data that was saved when the
memory violation was raised to the debugger.

For runtime exceptions, this will unblock the runtime enable
function which will be explained and implemented in a follow up
patch.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../gpu/drm/amd/amdkfd/cik_event_interrupt.c  |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 44 ++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c       |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 71 ++++++++++++++++++-
 8 files changed, 135 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 5c8023cba196..62a38cd820fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -118,9 +118,9 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 			return;
 
 		if (info.vmid == vmid)
-			kfd_signal_vm_fault_event(dev, pasid, &info);
+			kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
 		else
-			kfd_signal_vm_fault_event(dev, pasid, NULL);
+			kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 628178126d3b..09fe8576dc8c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2738,6 +2738,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		r = kfd_dbg_trap_disable(target);
 		break;
 	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+		r = kfd_dbg_send_exception_to_runtime(target,
+				args->send_runtime_event.gpu_id,
+				args->send_runtime_event.queue_id,
+				args->send_runtime_event.exception_mask);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index fcd064b13f6a..4174b479ea6f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,49 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
 	return is_subscribed;
 }
 
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+					unsigned int dev_id,
+					unsigned int queue_id,
+					uint64_t error_reason)
+{
+	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+		struct kfd_process_device *pdd = NULL;
+		struct kfd_hsa_memory_exception_data *data;
+		int i;
+
+		for (i = 0; i < p->n_pdds; i++) {
+			if (p->pdds[i]->dev->id == dev_id) {
+				pdd = p->pdds[i];
+				break;
+			}
+		}
+
+		if (!pdd)
+			return -ENODEV;
+
+		data = (struct kfd_hsa_memory_exception_data *)
+						pdd->vm_fault_exc_data;
+
+		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
+		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
+		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
+	}
+
+	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
+		/*
+		 * block should only happen after the debugger receives runtime
+		 * enable notice.
+		 */
+		up(&p->runtime_enable_sema);
+		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
+	}
+
+	if (error_reason)
+		return kfd_send_exception_to_runtime(p, queue_id, error_reason);
+
+	return 0;
+}
+
 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
 {
 	struct mqd_update_info minfo = {0};
@@ -175,6 +218,7 @@ static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
 	}
 
 	return r;
+}
 
 static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 2d5bc102f6b4..fefb9dc5cf69 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -38,6 +38,11 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
 			uint32_t *runtime_info_size);
 
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+					unsigned int dev_id,
+					unsigned int queue_id,
+					uint64_t error_reason);
+
 static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 {
 	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 729d26d648af..0efd447762d6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1225,7 +1225,8 @@ void kfd_signal_hw_exception_event(u32 pasid)
 }
 
 void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
-				struct kfd_vm_fault_info *info)
+				struct kfd_vm_fault_info *info,
+				struct kfd_hsa_memory_exception_data *data)
 {
 	struct kfd_event *ev;
 	uint32_t id;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 0b75a37b689b..e092563f22de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -362,7 +362,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 
 		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_dqm_evict_pasid(dev->dqm, pasid);
-		kfd_signal_vm_fault_event(dev, pasid, &info);
+		kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e503bd94dda6..4cb433a21e3d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -945,6 +945,7 @@ struct kfd_process {
 	bool queues_paused;
 
 	/* Tracks runtime enable status */
+	struct semaphore runtime_enable_sema;
 	struct kfd_runtime_info runtime_info;
 
 };
@@ -1394,7 +1395,8 @@ int kfd_get_num_events(struct kfd_process *p);
 int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
 
 void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
-				struct kfd_vm_fault_info *info);
+				struct kfd_vm_fault_info *info,
+				struct kfd_hsa_memory_exception_data *data);
 
 void kfd_signal_reset_event(struct kfd_dev *dev);
 
@@ -1410,6 +1412,9 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
 	       KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0);
 }
 
+int kfd_send_exception_to_runtime(struct kfd_process *p,
+				unsigned int queue_id,
+				uint64_t error_reason);
 bool kfd_is_locked(void);
 
 /* Compute profile */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 0ef2d00af8b1..8519604f7249 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1403,6 +1403,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	process->debugger_process = NULL;
 	process->exception_enable_mask = 0;
 	atomic_set(&process->debugged_process_count, 0);
+	sema_init(&process->runtime_enable_sema, 0);
 
 	process->pasid = kfd_pasid_alloc();
 	if (process->pasid == 0) {
@@ -2058,6 +2059,75 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
 	}
 }
 
+struct send_exception_work_handler_workarea {
+	struct work_struct work;
+	struct kfd_process *p;
+	unsigned int queue_id;
+	uint64_t error_reason;
+};
+
+static void send_exception_work_handler(struct work_struct *work)
+{
+	struct send_exception_work_handler_workarea *workarea;
+	struct kfd_process *p;
+	struct queue *q;
+	struct mm_struct *mm;
+	struct kfd_context_save_area_header __user *csa_header;
+	uint64_t __user *err_payload_ptr;
+	uint64_t cur_err;
+	uint32_t ev_id;
+
+	workarea = container_of(work,
+				struct send_exception_work_handler_workarea,
+				work);
+	p = workarea->p;
+
+	mm = get_task_mm(p->lead_thread);
+
+	if (!mm)
+		return;
+
+	kthread_use_mm(mm);
+
+	q = pqm_get_user_queue(&p->pqm, workarea->queue_id);
+
+	if (!q)
+		goto out;
+
+	csa_header = (void __user *)q->properties.ctx_save_restore_area_address;
+
+	get_user(err_payload_ptr, (uint64_t __user **)&csa_header->err_payload_addr);
+	get_user(cur_err, err_payload_ptr);
+	cur_err |= workarea->error_reason;
+	put_user(cur_err, err_payload_ptr);
+	get_user(ev_id, &csa_header->err_event_id);
+
+	kfd_set_event(p, ev_id);
+
+out:
+	kthread_unuse_mm(mm);
+	mmput(mm);
+}
+
+int kfd_send_exception_to_runtime(struct kfd_process *p,
+			unsigned int queue_id,
+			uint64_t error_reason)
+{
+	struct send_exception_work_handler_workarea worker;
+
+	INIT_WORK_ONSTACK(&worker.work, send_exception_work_handler);
+
+	worker.p = p;
+	worker.queue_id = queue_id;
+	worker.error_reason = error_reason;
+
+	schedule_work(&worker.work);
+	flush_work(&worker.work);
+	destroy_work_on_stack(&worker.work);
+
+	return 0;
+}
+
 struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id)
 {
 	int i;
@@ -2117,4 +2187,3 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
 }
 
 #endif
-
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 19/32] drm/amdkfd: add runtime enable operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (17 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 18/32] drm/amdkfd: add send exception operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-21  0:31   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 20/32] drm/amdkfd: add debug trap enabled flag to tma Jonathan Kim
                   ` (12 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

The debugger can attach to a process prior to HSA enablement (i.e.
inferior is spawned by the debugger and attached to immediately before
target process has been enabled for HSA dispatches) or it
can attach to a running target that is already HSA enabled.  Either
way, the debugger needs to know the enablement status to know when
it can inspect queues.

For the scenario where the debugger spawns the target process,
it will have to wait for ROCr's runtime enable request from the target.
The runtime enable request will be able to see that its process has been
debug attached.  ROCr raises an EC_PROCESS_RUNTIME signal to the
debugger, then blocks the target process while waiting for the debugger's
response. Once the debugger has received the runtime signal, it will
unblock the target process.

For the scenario where the debugger attaches to a running target
process, ROCr will set the target process' runtime status as enabled so
that on an attach request, the debugger will be able to see this
status and will continue with debug enablement as normal.

A secondary requirement is to conditionally enable the trap temporaries only
if the user requests it (env var HSA_ENABLE_DEBUG=1) or if the debugger
attaches with HSA runtime enabled.  This is because setting up the trap
temporaries incurs a performance overhead that is unacceptable for
microbench performance in normal mode for certain customers.

In the scenario where the debugger spawns the target process, when ROCr
detects that the debugger has attached during the runtime enable
request, it will enable the trap temporaries before it blocks the target
process while waiting for the debugger to respond.

In the scenario where the debugger attaches to a running target process,
it will enable the trap temporaries itself.

Finally, there is an additional restriction that is required to be
enforced with runtime enable and HW debug mode setting. The debugger must
first ensure that HW debug mode has been enabled before permitting HW debug
mode operations.

With single process debug devices, allowing the debugger to set debug
HW modes prior to trap activation means that debug HW mode setting can
occur before the KFD has reserved the debug VMID (0xf) from the hardware
scheduler's VMID allocation resource pool.  This can result in the
hardware scheduler assigning VMID 0xf to a non-debugged process and
having that process inherit debug HW mode settings intended for the
debugged target process instead, which is both incorrect and potentially
fatal for normal mode operation.

With multi process debug devices, allowing the debugger to set debug
HW modes prior to trap activation means that non-debugged processes
migrating to a new VMID could inherit unintended debug settings.

All debug operations that touch HW settings must require trap activation
where trap activation is triggered by both debug attach and runtime
enablement (target has KFD opened and is ready to dispatch work).

v2: fix up hierarchy of semantics in description.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 150 ++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |   6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   1 +
 4 files changed, 157 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 09fe8576dc8c..46f9d453dc5e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2654,11 +2654,147 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
 	return ret;
 }
 
-static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
+static int runtime_enable(struct kfd_process *p, uint64_t r_debug,
+			bool enable_ttmp_setup)
 {
+	int i = 0, ret = 0;
+
+	if (p->is_runtime_retry)
+		goto retry;
+
+	if (p->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
+		return -EBUSY;
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+
+		if (pdd->qpd.queue_count)
+			return -EEXIST;
+	}
+
+	p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
+	p->runtime_info.r_debug = r_debug;
+	p->runtime_info.ttmp_setup = enable_ttmp_setup;
+
+	if (p->runtime_info.ttmp_setup) {
+		for (i = 0; i < p->n_pdds; i++) {
+			struct kfd_process_device *pdd = p->pdds[i];
+
+			if (!kfd_dbg_is_rlc_restore_supported(pdd->dev)) {
+				amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+				pdd->dev->kfd2kgd->enable_debug_trap(
+						pdd->dev->adev,
+						true,
+						pdd->dev->vm_info.last_vmid_kfd);
+			}
+
+			if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+				pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
+						pdd->dev->adev,
+						false,
+						pdd->dev->vm_info.last_vmid_kfd);
+
+				if (!pdd->dev->shared_resources.enable_mes)
+					debug_refresh_runlist(pdd->dev->dqm);
+				else
+					kfd_dbg_set_mes_debug_mode(pdd);
+			}
+		}
+	}
+
+retry:
+	if (p->debug_trap_enabled) {
+		if (!p->is_runtime_retry) {
+			kfd_dbg_trap_activate(p);
+			kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
+					p, NULL, 0, false, NULL, 0);
+		}
+
+		mutex_unlock(&p->mutex);
+		ret = down_interruptible(&p->runtime_enable_sema);
+		mutex_lock(&p->mutex);
+
+		p->is_runtime_retry = !!ret;
+	}
+
+	return ret;
+}
+
+static int runtime_disable(struct kfd_process *p)
+{
+	int i = 0, ret;
+	bool was_enabled = p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED;
+
+	p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_DISABLED;
+	p->runtime_info.r_debug = 0;
+
+	if (p->debug_trap_enabled) {
+		if (was_enabled)
+			kfd_dbg_trap_deactivate(p, false, 0);
+
+		if (!p->is_runtime_retry)
+			kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
+					p, NULL, 0, false, NULL, 0);
+
+		mutex_unlock(&p->mutex);
+		ret = down_interruptible(&p->runtime_enable_sema);
+		mutex_lock(&p->mutex);
+
+		p->is_runtime_retry = !!ret;
+		if (ret)
+			return ret;
+	}
+
+	if (was_enabled && p->runtime_info.ttmp_setup) {
+		for (i = 0; i < p->n_pdds; i++) {
+			struct kfd_process_device *pdd = p->pdds[i];
+
+			if (!kfd_dbg_is_rlc_restore_supported(pdd->dev))
+				amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+		}
+	}
+
+	p->runtime_info.ttmp_setup = false;
+
+	/* disable DISPATCH_PTR save */
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+
+		if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+			pdd->spi_dbg_override =
+					pdd->dev->kfd2kgd->disable_debug_trap(
+					pdd->dev->adev,
+					false,
+					pdd->dev->vm_info.last_vmid_kfd);
+
+			if (!pdd->dev->shared_resources.enable_mes)
+				debug_refresh_runlist(pdd->dev->dqm);
+			else
+				kfd_dbg_set_mes_debug_mode(pdd);
+		}
+	}
+
 	return 0;
 }
 
+static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_runtime_enable_args *args = data;
+	int r;
+
+	mutex_lock(&p->mutex);
+
+	if (args->mode_mask & KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK)
+		r = runtime_enable(p, args->r_debug,
+				!!(args->mode_mask & KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK));
+	else
+		r = runtime_disable(p);
+
+	mutex_unlock(&p->mutex);
+
+	return r;
+}
+
 static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_dbg_trap_args *args = data;
@@ -2720,6 +2856,18 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		goto unlock_out;
 	}
 
+	if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_ENABLED &&
+			(args->op == KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE ||
+			 args->op == KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE ||
+			 args->op == KFD_IOC_DBG_TRAP_SUSPEND_QUEUES ||
+			 args->op == KFD_IOC_DBG_TRAP_RESUME_QUEUES ||
+			 args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
+			 args->op == KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH ||
+			 args->op == KFD_IOC_DBG_TRAP_SET_FLAGS)) {
+		r = -EPERM;
+		goto unlock_out;
+	}
+
 	switch (args->op) {
 	case KFD_IOC_DBG_TRAP_ENABLE:
 		if (target != p)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 4174b479ea6f..47f8425a0db3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -220,7 +220,7 @@ static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
 	return r;
 }
 
-static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
+int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
 {
 	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
 	uint32_t flags = pdd->process->dbg_flags;
@@ -240,7 +240,7 @@ static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
  *				to unwind
  *		else: ignored
  */
-static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
+void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
 {
 	int i, count = 0;
 
@@ -311,7 +311,7 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
 	return 0;
 }
 
-static int kfd_dbg_trap_activate(struct kfd_process *target)
+int kfd_dbg_trap_activate(struct kfd_process *target)
 {
 	int i, r = 0, unwind_count = 0;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index fefb9dc5cf69..22707f7a2368 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -28,6 +28,8 @@
 void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
 					uint32_t vmid,
 					bool stall);
+void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
+int kfd_dbg_trap_activate(struct kfd_process *target);
 bool kfd_dbg_ev_raise(uint64_t event_mask,
 			struct kfd_process *process, struct kfd_dev *dev,
 			unsigned int source_id, bool use_worker,
@@ -80,4 +82,6 @@ static inline bool kfd_dbg_has_gws_support(struct kfd_dev *dev)
 	/* Assume debugging and cooperative launch supported otherwise. */
 	return true;
 }
+
+int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd);
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 4cb433a21e3d..63c59ad2a4ca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -946,6 +946,7 @@ struct kfd_process {
 
 	/* Tracks runtime enable status */
 	struct semaphore runtime_enable_sema;
+	bool is_runtime_retry;
 	struct kfd_runtime_info runtime_info;
 
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 20/32] drm/amdkfd: add debug trap enabled flag to tma
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (18 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 19/32] drm/amdkfd: add runtime enable operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-01-25 19:53 ` [PATCH 21/32] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
                   ` (11 subsequent siblings)
  31 siblings, 0 replies; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

From: Jay Cornwall <jay.cornwall@amd.com>

Trap handler behavior will differ when a debugger is attached.

Make the debug trap flag available in the trap handler TMA.
Update it when the debug trap ioctl is invoked.

v4: fix up comments to clarify flagging implementation.

v3: Rebase for upstream

v2:
Add missing debug flag setup on APUs

Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 11 +++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 15 +++++++++++++++
 3 files changed, 28 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 47f8425a0db3..16acf3d416eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -257,6 +257,8 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 		if (unwind && count == unwind_count)
 			break;
 
+		kfd_process_set_trap_debug_flag(&pdd->qpd, false);
+
 		/* GFX off is already disabled by debug activate if not RLC restore supported. */
 		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
 			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
@@ -355,6 +357,15 @@ int kfd_dbg_trap_activate(struct kfd_process *target)
 		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
 			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 
+		/**
+		 * Setting the debug flag in the trap handler requires that the TMA has been
+		 * allocated, which occurs during CWSR initialization.
+		 * In the event that CWSR has not been initialized at this point, setting the
+		 * flag will be called again during CWSR initialization if the target process
+		 * is still debug enabled.
+		 */
+		kfd_process_set_trap_debug_flag(&pdd->qpd, true);
+
 		if (!pdd->dev->shared_resources.enable_mes)
 			r = debug_refresh_runlist(pdd->dev->dqm);
 		else
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 63c59ad2a4ca..d7f00181ae6b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1104,6 +1104,8 @@ int kfd_init_apertures(struct kfd_process *process);
 void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tba_addr,
 				  uint64_t tma_addr);
+void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
+				     bool enabled);
 
 /* CWSR initialization */
 int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 8519604f7249..5da1edd36bd2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1250,6 +1250,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
 
 		memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
 
+		kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
+
 		qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
 		pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
 			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
@@ -1286,6 +1288,9 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
 
 	memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
 
+	kfd_process_set_trap_debug_flag(&pdd->qpd,
+					pdd->process->debug_trap_enabled);
+
 	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
 	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
 		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
@@ -1372,6 +1377,16 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
 	return true;
 }
 
+void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
+				     bool enabled)
+{
+	if (qpd->cwsr_kaddr) {
+		uint64_t *tma =
+			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
+		tma[2] = enabled;
+	}
+}
+
 /*
  * On return the kfd_process is fully operational and will be freed when the
  * mm is released
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 21/32] drm/amdkfd: update process interrupt handling for debug events
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (19 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 20/32] drm/amdkfd: add debug trap enabled flag to tma Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-21 21:07   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 22/32] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
                   ` (10 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

The debugger must be notified of any debugger-subscribed exception
that comes from hardware interrupts.

If a debugger session exits, any exceptions it subscribed to may still
have interrupts in the interrupt ring buffer or KGD/KFD pipeline.
To prevent a new session from inheriting stale interrupts, when a new
queue is created, open an interrupt drain and allow the IH ring to drain
from a timestamped checkpoint.  Then inject a custom IV so that once
the custom IV is picked up by the KFD, it's safe to close the drain
and proceed with queue creation.

The drain must also occur on debug disable, as SW interrupts may still
be processed.  Drain at this time and clear all the exception status.

The debugger may also not be attached or subscribed to certain
exceptions, so forward them directly to the runtime.

GFX10 also requires its own IV processing, hence the creation of
kfd_int_process_v10.c.  This is because the IVs from SQ interrupts are
packed into a new contiguous format unlike GFX9. To make this clear,
a separate interrupt handling code file was created.

v3: enable gfx11 interrupts
v2: fix interrupt drain on debug disable.
fix interrupt drain on queue create during -ERESTARTSYS.
fix up macros naming for ECODE parsing.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  16 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |   2 +
 drivers/gpu/drm/amd/amdkfd/Makefile           |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        |  85 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |   6 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |   4 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  | 405 ++++++++++++++++++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  21 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  98 ++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  12 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  47 ++
 .../amd/amdkfd/kfd_process_queue_manager.c    |   4 +
 12 files changed, 681 insertions(+), 20 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 8816853e50c0..60c3b0449d86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -763,6 +763,22 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
 	amdgpu_umc_poison_handler(adev, reset);
 }
 
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+					uint32_t *payload)
+{
+	int ret;
+
+	/* Device or IH ring is not ready so bail. */
+	ret = amdgpu_ih_wait_on_checkpoint_process_ts(adev, &adev->irq.ih);
+	if (ret)
+		return ret;
+
+	/* Send payload to fence KFD interrupts */
+	amdgpu_amdkfd_interrupt(adev, payload);
+
+	return 0;
+}
+
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
 {
 	if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 333780491867..df782274a4c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -241,6 +241,8 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
 					    struct amdgpu_device *src,
 					    bool is_min);
 int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_min);
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+					uint32_t *payload);
 
 /* Read user wptr from a specified user address space with page fault
  * disabled. The memory must be pinned and mapped to the hardware when
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 747754428073..2ec8f27c5366 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_events.o \
 		$(AMDKFD_PATH)/cik_event_interrupt.o \
 		$(AMDKFD_PATH)/kfd_int_process_v9.o \
+		$(AMDKFD_PATH)/kfd_int_process_v10.o \
 		$(AMDKFD_PATH)/kfd_int_process_v11.o \
 		$(AMDKFD_PATH)/kfd_smi_events.o \
 		$(AMDKFD_PATH)/kfd_crat.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 16acf3d416eb..0c876172db4b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,65 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
 	return is_subscribed;
 }
 
+/* set pending event queue entry from ring entry  */
+bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
+				   unsigned int pasid,
+				   uint32_t doorbell_id,
+				   uint64_t trap_mask,
+				   void *exception_data,
+				   size_t exception_data_size)
+{
+	struct kfd_process *p;
+	bool signaled_to_debugger_or_runtime = false;
+
+	p = kfd_lookup_process_by_pasid(pasid);
+
+	if (!p)
+		return false;
+
+	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
+					exception_data, exception_data_size)) {
+		struct process_queue_manager *pqm;
+		struct process_queue_node *pqn;
+
+		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
+				p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
+			mutex_lock(&p->mutex);
+
+			pqm = &p->pqm;
+			list_for_each_entry(pqn, &pqm->queues,
+							process_queue_list) {
+
+				if (!(pqn->q && pqn->q->device == dev &&
+						pqn->q->doorbell_id == doorbell_id))
+					continue;
+
+				kfd_send_exception_to_runtime(p,
+						pqn->q->properties.queue_id,
+						trap_mask);
+
+				signaled_to_debugger_or_runtime = true;
+
+				break;
+			}
+
+			mutex_unlock(&p->mutex);
+		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
+			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
+							exception_data);
+
+			signaled_to_debugger_or_runtime = true;
+		}
+	} else {
+		signaled_to_debugger_or_runtime = true;
+	}
+
+	kfd_unref_process(p);
+
+	return signaled_to_debugger_or_runtime;
+}
+
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
 					unsigned int queue_id,
@@ -285,6 +344,31 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 	kfd_dbg_set_workaround(target, false);
 }
 
+static void kfd_dbg_clean_exception_status(struct kfd_process *target)
+{
+	struct process_queue_manager *pqm;
+	struct process_queue_node *pqn;
+	int i;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		kfd_process_drain_interrupts(pdd);
+
+		pdd->exception_status = 0;
+	}
+
+	pqm = &target->pqm;
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		if (!pqn->q)
+			continue;
+
+		pqn->q->properties.exception_status = 0;
+	}
+
+	target->exception_status = 0;
+}
+
 int kfd_dbg_trap_disable(struct kfd_process *target)
 {
 	if (!target->debug_trap_enabled)
@@ -308,6 +392,7 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
 	}
 
 	target->debug_trap_enabled = false;
+	kfd_dbg_clean_exception_status(target);
 	kfd_unref_process(target);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 22707f7a2368..43284243b2c4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -30,6 +30,12 @@ void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
 					bool stall);
 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
 int kfd_dbg_trap_activate(struct kfd_process *target);
+bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
+				   unsigned int pasid,
+				   uint32_t doorbell_id,
+				   uint64_t trap_mask,
+				   void *exception_data,
+				   size_t exception_data_size);
 bool kfd_dbg_ev_raise(uint64_t event_mask,
 			struct kfd_process *process, struct kfd_dev *dev,
 			unsigned int source_id, bool use_worker,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 521dfa88aad8..6e25238d18f9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -135,6 +135,8 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
 	case IP_VERSION(9, 4, 0): /* VEGA20 */
 	case IP_VERSION(9, 4, 1): /* ARCTURUS */
 	case IP_VERSION(9, 4, 2): /* ALDEBARAN */
+		kfd->device_info.event_interrupt_class = &event_interrupt_class_v9;
+		break;
 	case IP_VERSION(10, 3, 1): /* VANGOGH */
 	case IP_VERSION(10, 3, 3): /* YELLOW_CARP */
 	case IP_VERSION(10, 3, 6): /* GC 10.3.6 */
@@ -148,7 +150,7 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
 	case IP_VERSION(10, 3, 2): /* NAVY_FLOUNDER */
 	case IP_VERSION(10, 3, 4): /* DIMGREY_CAVEFISH */
 	case IP_VERSION(10, 3, 5): /* BEIGE_GOBY */
-		kfd->device_info.event_interrupt_class = &event_interrupt_class_v9;
+		kfd->device_info.event_interrupt_class = &event_interrupt_class_v10;
 		break;
 	case IP_VERSION(11, 0, 0):
 	case IP_VERSION(11, 0, 1):
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
new file mode 100644
index 000000000000..e1c0bf313237
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfd_events.h"
+#include "kfd_debug.h"
+#include "soc15_int.h"
+#include "kfd_device_queue_manager.h"
+
+/*
+ * GFX10 SQ Interrupts
+ *
+ * There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit
+ * packet to the Interrupt Handler:
+ * Auto - Generated by the SQG (various cmd overflows, timestamps etc)
+ * Wave - Generated by S_SENDMSG through a shader program
+ * Error - HW generated errors (Illegal instructions, Memviols, EDC etc)
+ *
+ * The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus
+ * 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such:
+ *
+ * - context_id1[7:6]
+ * Encoding type (0 = Auto, 1 = Wave, 2 = Error)
+ *
+ * - context_id0[24]
+ * PRIV bit indicates that Wave S_SEND or error occurred within trap
+ *
+ * - context_id0[22:0]
+ * 23-bit data with the following layout per encoding type:
+ * Auto - only context_id0[8:0] is used, which reports various interrupts
+ * generated by SQG.  The rest is 0.
+ * Wave - user data sent from m0 via S_SENDMSG
+ * Error - Error type (context_id0[22:19]), Error Details (rest of bits)
+ *
+ * The other context_id bits show coordinates (SE/SH/CU/SIMD/WGP) for wave
+ * S_SENDMSG and Errors.  These are 0 for Auto.
+ */
+
+enum SQ_INTERRUPT_WORD_ENCODING {
+	SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
+	SQ_INTERRUPT_WORD_ENCODING_INST,
+	SQ_INTERRUPT_WORD_ENCODING_ERROR,
+};
+
+enum SQ_INTERRUPT_ERROR_TYPE {
+	SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0,
+	SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST,
+	SQ_INTERRUPT_ERROR_TYPE_MEMVIOL,
+	SQ_INTERRUPT_ERROR_TYPE_EDC_FED,
+};
+
+/* SQ_INTERRUPT_WORD_AUTO_CTXID */
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE__SHIFT 0
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT__SHIFT 1
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL__SHIFT 2
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL__SHIFT 3
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR__SHIFT 7
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__SE_ID__SHIFT 4
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING__SHIFT 6
+
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_MASK 0x00000001
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT_MASK 0x00000002
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL_MASK 0x00000004
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL_MASK 0x00000008
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR_MASK 0x00000080
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__SE_ID_MASK 0x030
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING_MASK 0x0c0
+
+/* SQ_INTERRUPT_WORD_WAVE_CTXID */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA__SHIFT 0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID__SHIFT 23
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV__SHIFT 24
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID__SHIFT 25
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SIMD_ID__SHIFT 30
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID__SHIFT 0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SE_ID__SHIFT 4
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING__SHIFT 6
+
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA_MASK 0x000007fffff
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID_MASK 0x0000800000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK 0x00001000000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID_MASK 0x0003e000000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SIMD_ID_MASK 0x000c0000000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID_MASK 0x00f
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SE_ID_MASK 0x030
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING_MASK 0x0c0
+
+#define KFD_CTXID0__ERR_TYPE_MASK 0x780000
+#define KFD_CTXID0__ERR_TYPE__SHIFT 19
+
+/* GFX10 SQ interrupt ENC type bit (context_id1[7:6]) for wave s_sendmsg */
+#define KFD_CONTEXT_ID1_ENC_TYPE_WAVE_MASK	0x40
+/* GFX10 SQ interrupt PRIV bit (context_id0[24]) for s_sendmsg inside trap */
+#define KFD_CONTEXT_ID0_PRIV_MASK		0x1000000
+/*
+ * The debugger will send user data(m0) with PRIV=1 to indicate it requires
+ * notification from the KFD with the following queue id (DOORBELL_ID) and
+ * trap code (TRAP_CODE).
+ */
+#define KFD_CONTEXT_ID0_DEBUG_DOORBELL_MASK	0x0003ff
+#define KFD_CONTEXT_ID0_DEBUG_TRAP_CODE_SHIFT	10
+#define KFD_CONTEXT_ID0_DEBUG_TRAP_CODE_MASK	0x07fc00
+#define KFD_DEBUG_DOORBELL_ID(ctxid0)	((ctxid0) &	\
+				KFD_CONTEXT_ID0_DEBUG_DOORBELL_MASK)
+#define KFD_DEBUG_TRAP_CODE(ctxid0)	(((ctxid0) &	\
+				KFD_CONTEXT_ID0_DEBUG_TRAP_CODE_MASK)	\
+				>> KFD_CONTEXT_ID0_DEBUG_TRAP_CODE_SHIFT)
+#define KFD_DEBUG_CP_BAD_OP_ECODE_MASK		0x3fffc00
+#define KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT		10
+#define KFD_DEBUG_CP_BAD_OP_ECODE(ctxid0) (((ctxid0) &			\
+				KFD_DEBUG_CP_BAD_OP_ECODE_MASK)		\
+				>> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT)
+
+static void event_interrupt_poison_consumption(struct kfd_dev *dev,
+				uint16_t pasid, uint16_t client_id)
+{
+	int old_poison, ret = -EINVAL;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+	if (!p)
+		return;
+
+	/* all queues of a process will be unmapped in one time */
+	old_poison = atomic_cmpxchg(&p->poison, 0, 1);
+	kfd_unref_process(p);
+	if (old_poison)
+		return;
+
+	switch (client_id) {
+	case SOC15_IH_CLIENTID_SE0SH:
+	case SOC15_IH_CLIENTID_SE1SH:
+	case SOC15_IH_CLIENTID_SE2SH:
+	case SOC15_IH_CLIENTID_SE3SH:
+	case SOC15_IH_CLIENTID_UTCL2:
+		ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+		break;
+	case SOC15_IH_CLIENTID_SDMA0:
+	case SOC15_IH_CLIENTID_SDMA1:
+	case SOC15_IH_CLIENTID_SDMA2:
+	case SOC15_IH_CLIENTID_SDMA3:
+	case SOC15_IH_CLIENTID_SDMA4:
+		break;
+	default:
+		break;
+	}
+
+	kfd_signal_poison_consumed_event(dev, pasid);
+
+	/* resetting queue passes, do page retirement without gpu reset
+	 * resetting queue fails, fallback to gpu reset solution
+	 */
+	if (!ret) {
+		dev_warn(dev->adev->dev,
+			"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
+			client_id);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+	} else {
+		dev_warn(dev->adev->dev,
+			"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
+			client_id);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+	}
+}
+
+static bool event_interrupt_isr_v10(struct kfd_dev *dev,
+					const uint32_t *ih_ring_entry,
+					uint32_t *patched_ihre,
+					bool *patched_flag)
+{
+	uint16_t source_id, client_id, pasid, vmid;
+	const uint32_t *data = ih_ring_entry;
+
+	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+
+	/* Only handle interrupts from KFD VMIDs */
+	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+	if (!KFD_IRQ_IS_FENCE(client_id, source_id) &&
+	   (vmid < dev->vm_info.first_vmid_kfd ||
+	    vmid > dev->vm_info.last_vmid_kfd))
+		return false;
+
+	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+
+	/* Only handle clients we care about */
+	if (client_id != SOC15_IH_CLIENTID_GRBM_CP &&
+	    client_id != SOC15_IH_CLIENTID_SDMA0 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA1 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA2 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA3 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA4 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA5 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA6 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA7 &&
+	    client_id != SOC15_IH_CLIENTID_VMC &&
+	    client_id != SOC15_IH_CLIENTID_VMC1 &&
+	    client_id != SOC15_IH_CLIENTID_UTCL2 &&
+	    client_id != SOC15_IH_CLIENTID_SE0SH &&
+	    client_id != SOC15_IH_CLIENTID_SE1SH &&
+	    client_id != SOC15_IH_CLIENTID_SE2SH &&
+	    client_id != SOC15_IH_CLIENTID_SE3SH)
+		return false;
+
+	pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
+		 client_id, source_id, vmid, pasid);
+	pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+		 data[0], data[1], data[2], data[3],
+		 data[4], data[5], data[6], data[7]);
+
+	/* If there is no valid PASID, it's likely a bug */
+	if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
+		return 0;
+
+	/* Interrupt types we care about: various signals and faults.
+	 * They will be forwarded to a work queue (see below).
+	 */
+	return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
+		source_id == SOC15_INTSRC_SDMA_TRAP ||
+		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
+		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+		client_id == SOC15_IH_CLIENTID_VMC ||
+		client_id == SOC15_IH_CLIENTID_VMC1 ||
+		client_id == SOC15_IH_CLIENTID_UTCL2 ||
+		KFD_IRQ_IS_FENCE(client_id, source_id);
+}
+
+static void event_interrupt_wq_v10(struct kfd_dev *dev,
+					const uint32_t *ih_ring_entry)
+{
+	uint16_t source_id, client_id, pasid, vmid;
+	uint32_t context_id0, context_id1;
+	uint32_t encoding, sq_intr_err_type;
+
+	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+	context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+	context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry);
+
+	if (client_id == SOC15_IH_CLIENTID_GRBM_CP ||
+	    client_id == SOC15_IH_CLIENTID_SE0SH ||
+	    client_id == SOC15_IH_CLIENTID_SE1SH ||
+	    client_id == SOC15_IH_CLIENTID_SE2SH ||
+	    client_id == SOC15_IH_CLIENTID_SE3SH) {
+		if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
+			kfd_signal_event_interrupt(pasid, context_id0, 32);
+		else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) {
+			encoding = REG_GET_FIELD(context_id1,
+						SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING);
+			switch (encoding) {
+			case SQ_INTERRUPT_WORD_ENCODING_AUTO:
+				pr_debug(
+					"sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf0_full %d, ttrac_buf1_full %d, ttrace_utc_err %d\n",
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_AUTO_CTXID1,
+							SE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							THREAD_TRACE),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							WLT),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							THREAD_TRACE_BUF0_FULL),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							THREAD_TRACE_BUF1_FULL),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							THREAD_TRACE_UTC_ERROR));
+				break;
+			case SQ_INTERRUPT_WORD_ENCODING_INST:
+				pr_debug("sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+							SE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							DATA),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							SA_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							PRIV),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							WAVE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							SIMD_ID),
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+							WGP_ID));
+				if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK) {
+					if (kfd_set_dbg_ev_from_interrupt(dev, pasid,
+							KFD_DEBUG_DOORBELL_ID(context_id0),
+							KFD_DEBUG_TRAP_CODE(context_id0),
+							NULL, 0))
+						return;
+				}
+				break;
+			case SQ_INTERRUPT_WORD_ENCODING_ERROR:
+				sq_intr_err_type = REG_GET_FIELD(context_id0, KFD_CTXID0,
+								ERR_TYPE);
+				pr_warn("sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n",
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+							SE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							DATA),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							SA_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							PRIV),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							WAVE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							SIMD_ID),
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+							WGP_ID),
+					sq_intr_err_type);
+				if (sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
+					sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
+					event_interrupt_poison_consumption(dev, pasid, source_id);
+					return;
+				}
+				break;
+			default:
+				break;
+			}
+			kfd_signal_event_interrupt(pasid, context_id0 & 0x7fffff, 23);
+		} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+			kfd_set_dbg_ev_from_interrupt(dev, pasid,
+				KFD_DEBUG_DOORBELL_ID(context_id0),
+				KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
+				NULL,
+				0);
+		}
+	} else if (client_id == SOC15_IH_CLIENTID_SDMA0 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA1 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA2 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA3 ||
+		   (client_id == SOC15_IH_CLIENTID_SDMA3_Sienna_Cichlid &&
+		    KFD_GC_VERSION(dev) == IP_VERSION(10, 3, 0)) ||
+		   client_id == SOC15_IH_CLIENTID_SDMA4 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA5 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA6 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA7) {
+		if (source_id == SOC15_INTSRC_SDMA_TRAP) {
+			kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
+		} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
+			event_interrupt_poison_consumption(dev, pasid, source_id);
+			return;
+		}
+	} else if (client_id == SOC15_IH_CLIENTID_VMC ||
+		   client_id == SOC15_IH_CLIENTID_VMC1 ||
+		   client_id == SOC15_IH_CLIENTID_UTCL2) {
+		struct kfd_vm_fault_info info = {0};
+		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+		struct kfd_hsa_memory_exception_data exception_data;
+
+		if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+				amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+			event_interrupt_poison_consumption(dev, pasid, client_id);
+			return;
+		}
+
+		info.vmid = vmid;
+		info.mc_id = client_id;
+		info.page_addr = ih_ring_entry[4] |
+			(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
+		info.prot_valid = ring_id & 0x08;
+		info.prot_read  = ring_id & 0x10;
+		info.prot_write = ring_id & 0x20;
+
+		memset(&exception_data, 0, sizeof(exception_data));
+		exception_data.gpu_id = dev->id;
+		exception_data.va = (info.page_addr) << PAGE_SHIFT;
+		exception_data.failure.NotPresent = info.prot_valid ? 1 : 0;
+		exception_data.failure.NoExecute = info.prot_exec ? 1 : 0;
+		exception_data.failure.ReadOnly = info.prot_write ? 1 : 0;
+		exception_data.failure.imprecise = 0;
+
+		kfd_set_dbg_ev_from_interrupt(dev,
+						pasid,
+						-1,
+						KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
+						&exception_data,
+						sizeof(exception_data));
+	} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
+		kfd_process_close_interrupt_drain(pasid);
+	}
+}
+
+const struct kfd_event_interrupt_class event_interrupt_class_v10 = {
+	.interrupt_isr = event_interrupt_isr_v10,
+	.interrupt_wq = event_interrupt_wq_v10,
+};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 0d53f6067422..9a32c95500c1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -26,6 +26,7 @@
 #include "kfd_device_queue_manager.h"
 #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
 #include "kfd_smi_events.h"
+#include "kfd_debug.h"
 
 /*
  * GFX11 SQ Interrupts
@@ -238,7 +239,7 @@ static bool event_interrupt_isr_v11(struct kfd_dev *dev,
 	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
 	/* Only handle interrupts from KFD VMIDs */
 	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
-	if (/*!KFD_IRQ_IS_FENCE(client_id, source_id) &&*/
+	if (!KFD_IRQ_IS_FENCE(client_id, source_id) &&
 	    (vmid < dev->vm_info.first_vmid_kfd ||
 	    vmid > dev->vm_info.last_vmid_kfd))
 		return false;
@@ -267,7 +268,7 @@ static bool event_interrupt_isr_v11(struct kfd_dev *dev,
 		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
 		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
 		source_id == SOC21_INTSRC_SDMA_TRAP ||
-		/* KFD_IRQ_IS_FENCE(client_id, source_id) || */
+		KFD_IRQ_IS_FENCE(client_id, source_id) ||
 		(((client_id == SOC21_IH_CLIENTID_VMC) ||
 		 ((client_id == SOC21_IH_CLIENTID_GFX) &&
 		  (source_id == UTCL2_1_0__SRCID__FAULT))) &&
@@ -312,9 +313,9 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev,
 		exception_data.failure.ReadOnly = info.prot_write ? 1 : 0;
 		exception_data.failure.imprecise = 0;
 
-		/*kfd_set_dbg_ev_from_interrupt(dev, pasid, -1,
+		kfd_set_dbg_ev_from_interrupt(dev, pasid, -1,
 					      KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
-					      &exception_data, sizeof(exception_data));*/
+					      &exception_data, sizeof(exception_data));
 		kfd_smi_event_update_vmfault(dev, pasid);
 
 	/* GRBM, SDMA, SE, PMM */
@@ -324,11 +325,11 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev,
 		/* CP */
 		if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
 			kfd_signal_event_interrupt(pasid, context_id0, 32);
-		/*else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
+		else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
 			kfd_set_dbg_ev_from_interrupt(dev, pasid,
 				KFD_CTXID0_DOORBELL_ID(context_id0),
 				KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
-				NULL, 0);*/
+				NULL, 0);
 
 		/* SDMA */
 		else if (source_id == SOC21_INTSRC_SDMA_TRAP)
@@ -350,11 +351,11 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev,
 				print_sq_intr_info_inst(context_id0, context_id1);
 				sq_int_priv = REG_GET_FIELD(context_id0,
 						SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV);
-				/*if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid,
+				if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid,
 						KFD_CTXID0_DOORBELL_ID(context_id0),
 						KFD_CTXID0_TRAP_CODE(context_id0),
 						NULL, 0)))
-					return;*/
+					return;
 				break;
 			case SQ_INTERRUPT_WORD_ENCODING_ERROR:
 				print_sq_intr_info_error(context_id0, context_id1);
@@ -373,8 +374,8 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev,
 			kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24);
 		}
 
-	/*} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
-		kfd_process_close_interrupt_drain(pasid);*/
+	} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
+		kfd_process_close_interrupt_drain(pasid);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e092563f22de..c68611857629 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -23,10 +23,40 @@
 
 #include "kfd_priv.h"
 #include "kfd_events.h"
+#include "kfd_debug.h"
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_smi_events.h"
 
+/*
+ * GFX9 SQ Interrupts
+ *
+ * There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit
+ * packet to the Interrupt Handler:
+ * Auto - Generated by the SQG (various cmd overflows, timestamps etc)
+ * Wave - Generated by S_SENDMSG through a shader program
+ * Error - HW generated errors (Illegal instructions, Memviols, EDC etc)
+ *
+ * The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus
+ * 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such:
+ *
+ * - context_id0[27:26]
+ * Encoding type (0 = Auto, 1 = Wave, 2 = Error)
+ *
+ * - context_id0[13]
+ * PRIV bit indicates that Wave S_SENDMSG or error occurred within trap
+ *
+ * - {context_id1[7:0],context_id0[31:28],context_id0[11:0]}
+ * 24-bit data with the following layout per encoding type:
+ * Auto - only context_id0[8:0] is used, which reports various interrupts
+ * generated by SQG.  The rest is 0.
+ * Wave - user data sent from m0 via S_SENDMSG
+ * Error - Error type (context_id1[7:4]), Error Details (rest of bits)
+ *
+ * The other context_id bits show coordinates (SE/SH/CU/SIMD/WAVE) for wave
+ * S_SENDMSG and Errors.  These are 0 for Auto.
+ */
+
 enum SQ_INTERRUPT_WORD_ENCODING {
 	SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
 	SQ_INTERRUPT_WORD_ENCODING_INST,
@@ -84,12 +114,32 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 #define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK 0x03000000
 #define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK 0x0c000000
 
+/* GFX9 SQ interrupt 24-bit data from context_id<0,1> */
 #define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1)                             \
 	((ctx0 & 0xfff) | ((ctx0 >> 16) & 0xf000) | ((ctx1 << 16) & 0xff0000))
 
 #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
 
+/*
+ * The debugger will send user data(m0) with PRIV=1 to indicate it requires
+ * notification from the KFD with the following queue id (DOORBELL_ID) and
+ * trap code (TRAP_CODE).
+ */
+#define KFD_INT_DATA_DEBUG_DOORBELL_MASK	0x0003ff
+#define KFD_INT_DATA_DEBUG_TRAP_CODE_SHIFT	10
+#define KFD_INT_DATA_DEBUG_TRAP_CODE_MASK	0x07fc00
+#define KFD_DEBUG_DOORBELL_ID(sq_int_data)	((sq_int_data) &	\
+				KFD_INT_DATA_DEBUG_DOORBELL_MASK)
+#define KFD_DEBUG_TRAP_CODE(sq_int_data)	(((sq_int_data) &	\
+				KFD_INT_DATA_DEBUG_TRAP_CODE_MASK)	\
+				>> KFD_INT_DATA_DEBUG_TRAP_CODE_SHIFT)
+#define KFD_DEBUG_CP_BAD_OP_ECODE_MASK		0x3fffc00
+#define KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT		10
+#define KFD_DEBUG_CP_BAD_OP_ECODE(ctxid0)	(((ctxid0) &		\
+				KFD_DEBUG_CP_BAD_OP_ECODE_MASK)		\
+				>> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT)
+
 static void event_interrupt_poison_consumption_v9(struct kfd_dev *dev,
 				uint16_t pasid, uint16_t client_id)
 {
@@ -168,14 +218,16 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 	uint16_t source_id, client_id, pasid, vmid;
 	const uint32_t *data = ih_ring_entry;
 
+	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+
 	/* Only handle interrupts from KFD VMIDs */
 	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
-	if (vmid < dev->vm_info.first_vmid_kfd ||
-	    vmid > dev->vm_info.last_vmid_kfd)
+	if (!KFD_IRQ_IS_FENCE(client_id, source_id) &&
+	   (vmid < dev->vm_info.first_vmid_kfd ||
+	    vmid > dev->vm_info.last_vmid_kfd))
 		return false;
 
-	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
-	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
 	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
 
 	/* Only handle clients we care about */
@@ -194,7 +246,8 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 	    client_id != SOC15_IH_CLIENTID_SE0SH &&
 	    client_id != SOC15_IH_CLIENTID_SE1SH &&
 	    client_id != SOC15_IH_CLIENTID_SE2SH &&
-	    client_id != SOC15_IH_CLIENTID_SE3SH)
+	    client_id != SOC15_IH_CLIENTID_SE3SH &&
+	    !KFD_IRQ_IS_FENCE(client_id, source_id))
 		return false;
 
 	/* This is a known issue for gfx9. Under non HWS, pasid is not set
@@ -247,6 +300,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 		source_id == SOC15_INTSRC_SDMA_ECC ||
 		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
 		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+		KFD_IRQ_IS_FENCE(client_id, source_id) ||
 		((client_id == SOC15_IH_CLIENTID_VMC ||
 		client_id == SOC15_IH_CLIENTID_VMC1 ||
 		client_id == SOC15_IH_CLIENTID_UTCL2) &&
@@ -302,6 +356,13 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID),
 					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID),
 					sq_int_data);
+				if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK) {
+					if (kfd_set_dbg_ev_from_interrupt(dev, pasid,
+							KFD_DEBUG_DOORBELL_ID(sq_int_data),
+							KFD_DEBUG_TRAP_CODE(sq_int_data),
+							NULL, 0))
+						return;
+				}
 				break;
 			case SQ_INTERRUPT_WORD_ENCODING_ERROR:
 				sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE);
@@ -324,8 +385,12 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 				break;
 			}
 			kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24);
-		} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
-			kfd_signal_hw_exception_event(pasid);
+		} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+			kfd_set_dbg_ev_from_interrupt(dev, pasid,
+				KFD_DEBUG_DOORBELL_ID(context_id0),
+				KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
+				NULL, 0);
+		}
 	} else if (client_id == SOC15_IH_CLIENTID_SDMA0 ||
 		   client_id == SOC15_IH_CLIENTID_SDMA1 ||
 		   client_id == SOC15_IH_CLIENTID_SDMA2 ||
@@ -345,6 +410,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		   client_id == SOC15_IH_CLIENTID_UTCL2) {
 		struct kfd_vm_fault_info info = {0};
 		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+		struct kfd_hsa_memory_exception_data exception_data;
 
 		if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
 		    amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
@@ -360,9 +426,23 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		info.prot_read  = ring_id & 0x10;
 		info.prot_write = ring_id & 0x20;
 
+		memset(&exception_data, 0, sizeof(exception_data));
+		exception_data.gpu_id = dev->id;
+		exception_data.va = (info.page_addr) << PAGE_SHIFT;
+		exception_data.failure.NotPresent = info.prot_valid ? 1 : 0;
+		exception_data.failure.NoExecute = info.prot_exec ? 1 : 0;
+		exception_data.failure.ReadOnly = info.prot_write ? 1 : 0;
+		exception_data.failure.imprecise = 0;
+
+		kfd_set_dbg_ev_from_interrupt(dev,
+						pasid,
+						-1,
+						KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
+						&exception_data,
+						sizeof(exception_data));
 		kfd_smi_event_update_vmfault(dev, pasid);
-		kfd_dqm_evict_pasid(dev->dqm, pasid);
-		kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
+	} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
+		kfd_process_close_interrupt_drain(pasid);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d7f00181ae6b..6f7dc23af104 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -929,6 +929,10 @@ struct kfd_process {
 	uint64_t exception_enable_mask;
 	uint64_t exception_status;
 
+	/* Used to drain stale interrupts */
+	wait_queue_head_t wait_irq_drain;
+	bool irq_drain_is_open;
+
 	/* shared virtual memory registered by this process */
 	struct svm_range_list svms;
 
@@ -1091,12 +1095,19 @@ int kfd_numa_node_to_apic_id(int numa_node_id);
 void kfd_double_confirm_iommu_support(struct kfd_dev *gpu);
 
 /* Interrupts */
+#define	KFD_IRQ_FENCE_CLIENTID	0xff
+#define	KFD_IRQ_FENCE_SOURCEID	0xff
+#define	KFD_IRQ_IS_FENCE(client, source)				\
+				((client) == KFD_IRQ_FENCE_CLIENTID &&	\
+				(source) == KFD_IRQ_FENCE_SOURCEID)
 int kfd_interrupt_init(struct kfd_dev *dev);
 void kfd_interrupt_exit(struct kfd_dev *dev);
 bool enqueue_ih_ring_entry(struct kfd_dev *kfd,	const void *ih_ring_entry);
 bool interrupt_is_wanted(struct kfd_dev *dev,
 				const uint32_t *ih_ring_entry,
 				uint32_t *patched_ihre, bool *flag);
+int kfd_process_drain_interrupts(struct kfd_process_device *pdd);
+void kfd_process_close_interrupt_drain(unsigned int pasid);
 
 /* amdkfd Apertures */
 int kfd_init_apertures(struct kfd_process *process);
@@ -1368,6 +1379,7 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
 /* Events */
 extern const struct kfd_event_interrupt_class event_interrupt_class_cik;
 extern const struct kfd_event_interrupt_class event_interrupt_class_v9;
+extern const struct kfd_event_interrupt_class event_interrupt_class_v10;
 extern const struct kfd_event_interrupt_class event_interrupt_class_v11;
 
 extern const struct kfd_device_global_init_class device_global_init_class_cik;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 5da1edd36bd2..df2097a89afb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -856,6 +856,8 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
 		kfd_procfs_add_sysfs_stats(process);
 		kfd_procfs_add_sysfs_files(process);
 		kfd_procfs_add_sysfs_counters(process);
+
+		init_waitqueue_head(&process->wait_irq_drain);
 	}
 out:
 	if (!IS_ERR(process))
@@ -2074,6 +2076,51 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
 	}
 }
 
+/* assumes caller holds process lock. */
+int kfd_process_drain_interrupts(struct kfd_process_device *pdd)
+{
+	uint32_t irq_drain_fence[8];
+	int r = 0;
+
+	if (!KFD_IS_SOC15(pdd->dev))
+		return 0;
+
+	pdd->process->irq_drain_is_open = true;
+
+	memset(irq_drain_fence, 0, sizeof(irq_drain_fence));
+	irq_drain_fence[0] = (KFD_IRQ_FENCE_SOURCEID << 8) |
+							KFD_IRQ_FENCE_CLIENTID;
+	irq_drain_fence[3] = pdd->process->pasid;
+
+	/* ensure stale irqs are scheduled to KFD interrupts and send drain fence. */
+	if (amdgpu_amdkfd_send_close_event_drain_irq(pdd->dev->adev,
+							irq_drain_fence)) {
+		pdd->process->irq_drain_is_open = false;
+		return 0;
+	}
+
+	r = wait_event_interruptible(pdd->process->wait_irq_drain,
+				!READ_ONCE(pdd->process->irq_drain_is_open));
+	if (r)
+		pdd->process->irq_drain_is_open = false;
+
+	return r;
+}
+
+void kfd_process_close_interrupt_drain(unsigned int pasid)
+{
+	struct kfd_process *p;
+
+	p = kfd_lookup_process_by_pasid(pasid);
+
+	if (!p)
+		return;
+
+	WRITE_ONCE(p->irq_drain_is_open, false);
+	wake_up_all(&p->wait_irq_drain);
+	kfd_unref_process(p);
+}
+
 struct send_exception_work_handler_workarea {
 	struct work_struct work;
 	struct kfd_process *p;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index d8f032214481..0ae6026c7d69 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -330,6 +330,10 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 		kq->queue->properties.queue_id = *qid;
 		pqn->kq = kq;
 		pqn->q = NULL;
+		retval = kfd_process_drain_interrupts(pdd);
+		if (retval)
+			break;
+
 		retval = dev->dqm->ops.create_kernel_queue(dev->dqm,
 							kq, &pdd->qpd);
 		break;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 22/32] drm/amdkfd: add debug set exceptions enabled operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (20 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 21/32] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-01-25 19:53 ` [PATCH 23/32] drm/amdkfd: add debug wave launch override operation Jonathan Kim
                   ` (9 subsequent siblings)
  31 siblings, 0 replies; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

The debugger subscribes to notification of requested exceptions on attach.
Allow the debugger to change its subscription later on.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 36 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  2 ++
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 46f9d453dc5e..9b87ba351eff 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2892,6 +2892,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				args->send_runtime_event.exception_mask);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
+		kfd_dbg_set_enabled_debug_exception_mask(target,
+				args->set_exceptions_enabled.exception_mask);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
 	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 0c876172db4b..3ea53aaa776b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -529,3 +529,39 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 
 	return r;
 }
+
+/* Set the enabled exception mask; signal dbg_ev_file if one is already raised. */
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+					uint64_t exception_set_mask)
+{
+	uint64_t found_mask = 0;
+	struct process_queue_manager *pqm;
+	struct process_queue_node *pqn;
+	static const char write_data = '.';
+	loff_t pos = 0;
+	int i;
+
+	mutex_lock(&target->event_mutex);
+
+	found_mask |= target->exception_status;
+
+	pqm = &target->pqm;
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		/* Kernel queue nodes only set pqn->kq and leave pqn->q NULL. */
+		if (pqn->q)
+			found_mask |= pqn->q->properties.exception_status;
+	}
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		found_mask |= pdd->exception_status;
+	}
+
+	if (exception_set_mask & found_mask)
+		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);
+
+	target->exception_enable_mask = exception_set_mask;
+
+	mutex_unlock(&target->event_mutex);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 43284243b2c4..81557579ab04 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -59,6 +59,8 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 
 void debug_event_write_work_handler(struct work_struct *work);
 
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+					uint64_t exception_set_mask);
 /*
  * If GFX off is enabled, chips that do not support RLC restore for the debug
  * registers will disable GFX off temporarily for the entire debug session.
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 23/32] drm/amdkfd: add debug wave launch override operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (21 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 22/32] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-21 21:37   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 24/32] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
                   ` (8 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

This operation allows the debugger to override the enabled HW
exceptions on the device.

On debug devices that only support the debugging of a single process,
the HW exceptions are global and set through the SPI_GDBG_TRAP_MASK
register.
Because they are global, only address watch exceptions are allowed to
be enabled.  In other words, the debugger must preserve all non-address
watch exception states in normal mode operation by barring a full
replacement override or a non-address watch override request.

For multi-process debugging, all HW exception overrides are per-VMID so
all exceptions can be overridden or fully replaced.

In order for the debugger to know what is permissible, return the
supported override mask back to the debugger along with the previously
enabled overrides.

v3: v2 was reviewed but requesting re-review for added GFX11 support.

v2: switch unsupported override mode return from EPERM to EINVAL to
support unique EPERM on PTRACE failure.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 47 ++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 55 ++++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    | 10 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  5 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    | 86 ++++++++++++++++++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 55 ++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 10 +++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 69 +++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  6 ++
 11 files changed, 350 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index a64a53f9efe6..84a9d9391ea4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -25,6 +25,7 @@
 #include "amdgpu_amdkfd_gfx_v9.h"
 #include "gc/gc_9_4_2_offset.h"
 #include "gc/gc_9_4_2_sh_mask.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 /**
  * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
@@ -62,6 +63,50 @@ static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
 	return data;
 }
 
+/* Clamp mask to supported traps; bad mode is -EINVAL (-EPERM is for ptrace). */
+static int kgd_aldebaran_validate_trap_override_request(struct amdgpu_device *adev,
+							uint32_t trap_override,
+							uint32_t *trap_mask_supported)
+{
+	*trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
+				KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
+				KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
+				KFD_DBG_TRAP_MASK_FP_OVERFLOW |
+				KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
+				KFD_DBG_TRAP_MASK_FP_INEXACT |
+				KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
+				KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
+				KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;
+
+	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
+			trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
+		return -EINVAL;
+	return 0;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					uint32_t vmid,
+					uint32_t trap_override,
+					uint32_t trap_mask_bits,
+					uint32_t trap_mask_request,
+					uint32_t *trap_mask_prev,
+					uint32_t kfd_dbg_trap_cntl_prev)
+
+{
+	uint32_t data = 0;
+
+	*trap_mask_prev = REG_GET_FIELD(kfd_dbg_trap_cntl_prev, SPI_GDBG_PER_VMID_CNTL, EXCP_EN);
+	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
+		(*trap_mask_prev & ~trap_mask_request);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, trap_mask_bits);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, trap_override);
+
+	return data;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -81,6 +126,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
 	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index ef8befc31fc6..0405725e95e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -410,6 +410,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 				kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
 	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
+	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 2491402afd58..32a6e5fbeacd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -31,6 +31,7 @@
 #include "v10_structs.h"
 #include "nv.h"
 #include "nvd.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 enum hqd_dequeue_request_type {
 	NO_ACTION = 0,
@@ -801,6 +802,58 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
 	return 0;
 }
 
+int kgd_gfx_v10_validate_trap_override_request(struct amdgpu_device *adev,
+					      uint32_t trap_override,
+					      uint32_t *trap_mask_supported)
+{
+	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;
+
+	/* The SPI_GDBG_TRAP_MASK register is global and affects all
+	 * processes. Only allow OR-ing the address-watch bit, since
+	 * this only affects processes under the debugger. Other bits
+	 * should stay 0 to avoid the debugger interfering with other
+	 * processes.
+	 */
+	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
+		return -EINVAL;
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					      uint32_t vmid,
+					      uint32_t trap_override,
+					      uint32_t trap_mask_bits,
+					      uint32_t trap_mask_request,
+					      uint32_t *trap_mask_prev,
+					      uint32_t kfd_dbg_trap_cntl_prev)
+{
+	uint32_t data, wave_cntl_prev;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
+	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);
+
+	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
+		(*trap_mask_prev & ~trap_mask_request);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
+
+	/* We need to preserve wave launch mode stall settings. */
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -886,6 +939,8 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.set_vm_context_page_table_base = set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
+	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
 	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.program_trap_handler_settings = program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
index 0abc1e805180..85c929fc2926 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -26,6 +26,16 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
 					bool keep_trap_enabled,
 					uint32_t vmid);
+int kgd_gfx_v10_validate_trap_override_request(struct amdgpu_device *adev,
+					     uint32_t trap_override,
+					     uint32_t *trap_mask_supported);
+uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					     uint32_t vmid,
+					     uint32_t trap_override,
+					     uint32_t trap_mask_bits,
+					     uint32_t trap_mask_request,
+					     uint32_t *trap_mask_prev,
+					     uint32_t kfd_dbg_trap_cntl_prev);
 void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index c57f2a6b6e23..ae3ead207df4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -673,5 +673,8 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
-	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
+	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
+	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override
+
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
index 34aeff692eba..3fb81e6e9422 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
@@ -643,6 +643,88 @@ static uint32_t kgd_gfx_v11_disable_debug_trap(struct amdgpu_device *adev,
 	return data;
 }
 
+static int kgd_gfx_v11_validate_trap_override_request(struct amdgpu_device *adev,
+							uint32_t trap_override,
+							uint32_t *trap_mask_supported)
+{
+	*trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
+				KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
+				KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
+				KFD_DBG_TRAP_MASK_FP_OVERFLOW |
+				KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
+				KFD_DBG_TRAP_MASK_FP_INEXACT |
+				KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
+				KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
+				KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;
+
+	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 4))
+		*trap_mask_supported |= KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START |
+					KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END;
+
+	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
+			trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
+		return -EPERM;
+
+	return 0;
+}
+
+static uint32_t trap_mask_map_sw_to_hw(uint32_t mask)
+{
+	uint32_t trap_on_start = (mask & KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START) ? 1 : 0;
+	uint32_t trap_on_end = (mask & KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END) ? 1 : 0;
+	uint32_t excp_en = mask & (KFD_DBG_TRAP_MASK_FP_INVALID |
+			KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
+			KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
+			KFD_DBG_TRAP_MASK_FP_OVERFLOW |
+			KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
+			KFD_DBG_TRAP_MASK_FP_INEXACT |
+			KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
+			KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
+			KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION);
+	uint32_t ret;
+
+	ret = REG_SET_FIELD(0, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, excp_en);
+	ret = REG_SET_FIELD(ret, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_START, trap_on_start);
+	ret = REG_SET_FIELD(ret, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_END, trap_on_end);
+
+	return ret;
+}
+
+static uint32_t trap_mask_map_hw_to_sw(uint32_t mask)
+{
+	uint32_t ret = REG_GET_FIELD(mask, SPI_GDBG_PER_VMID_CNTL, EXCP_EN);
+
+	if (REG_GET_FIELD(mask, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_START))
+		ret |= KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START;
+
+	if (REG_GET_FIELD(mask, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_END))
+		ret |= KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END;
+
+	return ret;
+}
+
+/* Returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_gfx_v11_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					uint32_t vmid,
+					uint32_t trap_override,
+					uint32_t trap_mask_bits,
+					uint32_t trap_mask_request,
+					uint32_t *trap_mask_prev,
+					uint32_t kfd_dbg_trap_cntl_prev)
+{
+	uint32_t data = 0;
+
+	*trap_mask_prev = trap_mask_map_hw_to_sw(kfd_dbg_trap_cntl_prev);
+
+	data = (trap_mask_bits & trap_mask_request) | (*trap_mask_prev & ~trap_mask_request);
+	data = trap_mask_map_sw_to_hw(data);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, trap_override);
+
+	return data;
+}
+
 const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
 	.program_sh_mem_settings = program_sh_mem_settings_v11,
 	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
@@ -660,5 +742,7 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info = NULL,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base_v11,
 	.enable_debug_trap = kgd_gfx_v11_enable_debug_trap,
-	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap
+	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap,
+	.validate_trap_override_request = kgd_gfx_v11_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 4a8bd266d3f6..81643385512a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -38,6 +38,7 @@
 #include "soc15d.h"
 #include "gfx_v9_0.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 enum hqd_dequeue_request_type {
 	NO_ACTION = 0,
@@ -737,6 +738,58 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 	return 0;
 }
 
+int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
+					uint32_t trap_override,
+					uint32_t *trap_mask_supported)
+{
+	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;
+
+	/* The SPI_GDBG_TRAP_MASK register is global and affects all
+	 * processes. Only allow OR-ing the address-watch bit, since
+	 * this only affects processes under the debugger. Other bits
+	 * should stay 0 to avoid the debugger interfering with other
+	 * processes.
+	 */
+	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
+		return -EINVAL;
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					     uint32_t vmid,
+					     uint32_t trap_override,
+					     uint32_t trap_mask_bits,
+					     uint32_t trap_mask_request,
+					     uint32_t *trap_mask_prev,
+					     uint32_t kfd_dbg_cntl_prev)
+{
+	uint32_t data, wave_cntl_prev;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
+	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);
+
+	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
+		(*trap_mask_prev & ~trap_mask_request);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
+
+	/* We need to preserve wave launch mode stall settings. */
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -1005,6 +1058,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
+	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index c0866497cb5c..47cff392b434 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -64,6 +64,16 @@ uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 					bool keep_trap_enabled,
 					uint32_t vmid);
+int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
+					     uint32_t trap_override,
+					     uint32_t *trap_mask_supported);
+uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					     uint32_t vmid,
+					     uint32_t trap_override,
+					     uint32_t trap_mask_bits,
+					     uint32_t trap_mask_request,
+					     uint32_t *trap_mask_prev,
+					     uint32_t kfd_dbg_trap_cntl_prev);
 void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 9b87ba351eff..28b9db5806f4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2896,6 +2896,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				args->set_exceptions_enabled.exception_mask);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
+		r = kfd_dbg_trap_set_wave_launch_override(target,
+				args->launch_override.override_mode,
+				args->launch_override.enable_mask,
+				args->launch_override.support_request_mask,
+				&args->launch_override.enable_mask,
+				&args->launch_override.support_request_mask);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
 	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
 	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 3ea53aaa776b..a9b52f114ac6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -530,6 +530,75 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 	return r;
 }
 
+static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
+						uint32_t trap_override,
+						uint32_t trap_mask_request,
+						uint32_t *trap_mask_supported)
+{
+	int i = 0;
+
+	*trap_mask_supported = 0xffffffff;
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
+								pdd->dev->adev,
+								trap_override,
+								trap_mask_supported);
+
+		if (err)
+			return err;
+	}
+
+	if (trap_mask_request & ~*trap_mask_supported)
+		return -EACCES;
+
+	return 0;
+}
+
+int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
+					uint32_t trap_override,
+					uint32_t trap_mask_bits,
+					uint32_t trap_mask_request,
+					uint32_t *trap_mask_prev,
+					uint32_t *trap_mask_supported)
+{
+	int r = 0, i;
+
+	r = kfd_dbg_validate_trap_override_request(target,
+						trap_override,
+						trap_mask_request,
+						trap_mask_supported);
+
+	if (r)
+		return r;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
+				pdd->dev->adev,
+				pdd->dev->vm_info.last_vmid_kfd,
+				trap_override,
+				trap_mask_bits,
+				trap_mask_request,
+				trap_mask_prev,
+				pdd->spi_dbg_override);
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+		if (!pdd->dev->shared_resources.enable_mes)
+			r = debug_refresh_runlist(pdd->dev->dqm);
+		else
+			r = kfd_dbg_set_mes_debug_mode(pdd);
+
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 81557579ab04..864eb01f8973 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -45,6 +45,12 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
 			uint32_t *runtime_info_size);
+int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
+					uint32_t trap_override,
+					uint32_t trap_mask_bits,
+					uint32_t trap_mask_request,
+					uint32_t *trap_mask_prev,
+					uint32_t *trap_mask_supported);
 
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 24/32] drm/amdkfd: add debug wave launch mode operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (22 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 23/32] drm/amdkfd: add debug wave launch override operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-21 21:42   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 25/32] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
                   ` (7 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Allow the debugger to set the wave launch behaviour to one of: operate
normally, halt at launch, trap on every instruction, terminate
immediately, or stall on allocation.

v2: add gfx11 support and remove deprecated launch mode options

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 12 +++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  1 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 25 +++++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  3 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  3 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    | 14 +++++++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 25 +++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 36 ++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  5 ++-
 11 files changed, 124 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 84a9d9391ea4..4de2066215b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -107,6 +107,17 @@ static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device
 	return data;
 }
 
+static uint32_t kgd_aldebaran_set_wave_launch_mode(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid)
+{
+	uint32_t data = 0;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE, wave_launch_mode);
+
+	return data;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -128,6 +139,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
 	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 0405725e95e3..500013540356 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -412,6 +412,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 32a6e5fbeacd..7591145bc69f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -854,6 +854,30 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
 	return 0;
 }
 
+uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid)
+{
+	uint32_t data = 0;
+	bool is_mode_set = !!wave_launch_mode;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+			VMID_MASK, is_mode_set ? 1 << vmid : 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+			MODE, is_mode_set ? wave_launch_mode : 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -941,6 +965,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
 	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.program_trap_handler_settings = program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
index 85c929fc2926..34c04a2bb83b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -36,6 +36,9 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
 					     uint32_t trap_mask_request,
 					     uint32_t *trap_mask_prev,
 					     uint32_t kfd_dbg_trap_cntl_prev);
+uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
+					 uint8_t wave_launch_mode,
+					 uint32_t vmid);
 void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index ae3ead207df4..8627c5458973 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -675,6 +675,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
-	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override
+	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode
 
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
index 3fb81e6e9422..4fdc25222dcd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
@@ -725,6 +725,17 @@ static uint32_t kgd_gfx_v11_set_wave_launch_trap_override(struct amdgpu_device *
 	return data;
 }
 
+static uint32_t kgd_gfx_v11_set_wave_launch_mode(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid)
+{
+	uint32_t data = 0;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE, wave_launch_mode);
+
+	return data;
+}
+
 const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
 	.program_sh_mem_settings = program_sh_mem_settings_v11,
 	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
@@ -744,5 +755,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
 	.enable_debug_trap = kgd_gfx_v11_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v11_validate_trap_override_request,
-	.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override
+	.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 81643385512a..a3c8f5578788 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -790,6 +790,30 @@ uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
 	return 0;
 }
 
+uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid)
+{
+	uint32_t data = 0;
+	bool is_mode_set = !!wave_launch_mode;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+		VMID_MASK, is_mode_set ? 1 << vmid : 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+		MODE, is_mode_set ? wave_launch_mode : 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -1060,6 +1084,7 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index 47cff392b434..2a2ab42037e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -67,6 +67,9 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
 					     uint32_t trap_override,
 					     uint32_t *trap_mask_supported);
+uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid);
 uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
 					     uint32_t vmid,
 					     uint32_t trap_override,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 28b9db5806f4..205a487d91d2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2904,6 +2904,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				&args->launch_override.support_request_mask);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
+		r = kfd_dbg_trap_set_wave_launch_mode(target,
+				args->launch_mode.launch_mode);
+		break;
 	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
 	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
 	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index a9b52f114ac6..b630633609b0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -303,8 +303,10 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 {
 	int i, count = 0;
 
-	if (!unwind)
+	if (!unwind) {
 		cancel_work_sync(&target->debug_event_workarea);
+		kfd_dbg_trap_set_wave_launch_mode(target, 0);
+	}
 
 	for (i = 0; i < target->n_pdds; i++) {
 		struct kfd_process_device *pdd = target->pdds[i];
@@ -599,6 +601,38 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
 	return r;
 }
 
+int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
+					uint8_t wave_launch_mode)
+{
+	int r = 0, i;
+
+	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
+			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
+			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
+		return -EINVAL;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
+				pdd->dev->adev,
+				wave_launch_mode,
+				pdd->dev->vm_info.last_vmid_kfd);
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+		if (!pdd->dev->shared_resources.enable_mes)
+			r = debug_refresh_runlist(pdd->dev->dqm);
+		else
+			r = kfd_dbg_set_mes_debug_mode(pdd);
+
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 864eb01f8973..0d70f162d6d8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -25,9 +25,6 @@
 
 #include "kfd_priv.h"
 
-void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
-					uint32_t vmid,
-					bool stall);
 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
 int kfd_dbg_trap_activate(struct kfd_process *target);
 bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
@@ -51,6 +48,8 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
 					uint32_t trap_mask_request,
 					uint32_t *trap_mask_prev,
 					uint32_t *trap_mask_supported);
+int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
+					uint8_t wave_launch_mode);
 
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 25/32] drm/amdkfd: add debug suspend and resume process queues operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (23 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 24/32] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-21 22:16   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 26/32] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
                   ` (6 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

In order to inspect waves from the saved context at any point during a
debug session, the debugger must be able to preempt queues to trigger
context save by suspending them.

On queue suspend, the KFD will copy the context save header information
so that the debugger can correctly crawl the appropriate size of the saved
context. The debugger must then also be allowed to resume suspended queues.

A newly created queue cannot be suspended: queue ids are recycled after
destruction, so the debugger first needs to know that a new queue now
owns the id.  Query functions that clear a given queue of its new queue
status will be added later.

A queue cannot be destroyed while it is suspended, so that its saved
context is preserved during debugger inspection.  Have queue destruction
block while a queue is suspended and unblock when it is resumed.
Likewise, if a queue is about to be destroyed, it cannot be suspended.

Return the number of queues successfully suspended or resumed along with
a per queue status array where the upper bits per queue status show that
the request was invalid (new/destroyed queue suspend request, missing
queue) or an error occurred (HWS in a fatal state so it can't suspend or
resume queues).

v2: add gfx11/mes support.
prevent header copy on suspend from overwriting user fields.
simplify resume_queues function.
address other nit-picks

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  11 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        |   7 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 446 +++++++++++++++++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  10 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  14 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  11 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  18 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   5 +-
 10 files changed, 518 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 60c3b0449d86..d50415fe0475 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -758,6 +758,11 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
 	return adev->have_atomics_support;
 }
 
+/* Memory fence helper used by the debugger queue-suspend path: flushes the
+ * HDP by calling amdgpu_device_flush_hdp() with a NULL second argument.
+ */
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
+{
+	amdgpu_device_flush_hdp(adev, NULL);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
 {
 	amdgpu_umc_poison_handler(adev, reset);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index df782274a4c8..9d1c6ab14331 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -310,6 +310,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
 				      uint64_t va, void *drm_priv,
 				      struct kgd_mem **mem, uint64_t *size,
 				      uint64_t *mmap_offset);
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
 				struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 205a487d91d2..b62e93b35a44 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 	pr_debug("Write ptr address   == 0x%016llX\n",
 			args->write_pointer_address);
 
+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, NULL, 0);
 	return 0;
 
 err_create_queue:
@@ -2908,7 +2909,17 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				args->launch_mode.launch_mode);
 		break;
 	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+		r = suspend_queues(target,
+				args->suspend_queues.num_queues,
+				args->suspend_queues.grace_period,
+				args->suspend_queues.exception_mask,
+				(uint32_t *)args->suspend_queues.queue_array_ptr);
+
+		break;
 	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+		r = resume_queues(target, args->resume_queues.num_queues,
+				(uint32_t *)args->resume_queues.queue_array_ptr);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
 	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
 	case KFD_IOC_DBG_TRAP_SET_FLAGS:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index b630633609b0..730e53584113 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -344,6 +344,13 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 	}
 
 	kfd_dbg_set_workaround(target, false);
+
+	if (!unwind) {
+		int resume_count = resume_queues(target, 0, NULL);
+
+		if (resume_count)
+			pr_debug("Resumed %d queues\n", resume_count);
+	}
 }
 
 static void kfd_dbg_clean_exception_status(struct kfd_process *target)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 3b747e51684e..7792fe9491c5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -925,6 +925,92 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
 	return retval;
 }
 
+/* Mark a single queue as suspended on behalf of the debugger.
+ *
+ * Locking: the caller must hold the dqm lock around this call.  Unlike
+ * evict_process_queues_cpsch or evict_process_queues_nocpsch, the lock is
+ * intentionally not taken here, because this helper may be invoked for many
+ * queues in a loop and the caller can then lock once for the whole batch.
+ *
+ * Returns 0 when the queue is (or already was) suspended, -EBUSY when the
+ * queue is new or is being destroyed, or the error from remove_queue_mes().
+ */
+static int suspend_single_queue(struct device_queue_manager *dqm,
+				      struct kfd_process_device *pdd,
+				      struct queue *q)
+{
+	struct queue_properties *props = &q->properties;
+	bool new_queue;
+	int err;
+
+	if (props->is_suspended)
+		return 0;
+
+	pr_debug("Suspending PASID %u queue [%i]\n",
+			pdd->process->pasid,
+			props->queue_id);
+
+	new_queue = props->exception_status & KFD_EC_MASK(EC_QUEUE_NEW);
+
+	if (new_queue || props->is_being_destroyed) {
+		pr_debug("Suspend: skip %s queue id %i\n",
+				new_queue ? "new" : "destroyed",
+				props->queue_id);
+		return -EBUSY;
+	}
+
+	props->is_suspended = true;
+
+	/* an inactive queue only needs the flag */
+	if (!props->is_active)
+		return 0;
+
+	if (dqm->dev->shared_resources.enable_mes) {
+		err = remove_queue_mes(dqm, q, &pdd->qpd);
+		if (err)
+			return err;
+	}
+
+	decrement_queue_count(dqm, &pdd->qpd, q);
+	props->is_active = false;
+
+	return 0;
+}
+
+/* Clear the suspended state of a single queue and reactivate it if it
+ * qualifies as active again.
+ *
+ * Locking: the caller must hold the dqm lock around this call.  Unlike
+ * restore_process_queues_cpsch or restore_process_queues_nocpsch, the lock
+ * is intentionally not taken here, because this helper may be invoked for
+ * many queues in a loop and the caller can then lock once for the batch.
+ *
+ * Returns 0 on success (including when the queue was not suspended), or
+ * the error from add_queue_mes().
+ */
+static int resume_single_queue(struct device_queue_manager *dqm,
+				      struct qcm_process_device *qpd,
+				      struct queue *q)
+{
+	struct queue_properties *props = &q->properties;
+	struct kfd_process_device *pdd;
+	int err;
+
+	if (!props->is_suspended)
+		return 0;
+
+	pdd = qpd_to_pdd(qpd);
+
+	pr_debug("Restoring from suspend PASID %u queue [%i]\n",
+			    pdd->process->pasid,
+			    props->queue_id);
+
+	props->is_suspended = false;
+
+	/* e.g. still evicted: leave it inactive */
+	if (!QUEUE_IS_ACTIVE(*props))
+		return 0;
+
+	if (dqm->dev->shared_resources.enable_mes) {
+		err = add_queue_mes(dqm, q, &pdd->qpd);
+		if (err)
+			return err;
+	}
+
+	props->is_active = true;
+	increment_queue_count(dqm, qpd, q);
+
+	return 0;
+}
+
 static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
@@ -1890,6 +1976,31 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 	return map_queues_cpsch(dqm);
 }
 
+/* Flag a queue as being destroyed and, if the debugger currently has it
+ * suspended, sleep until it is resumed.
+ *
+ * Called with the dqm lock held.  The dqm lock and q->process->mutex are
+ * both released for the duration of the wait and re-taken in the opposite
+ * order before returning (presumably the caller also holds the process
+ * mutex -- confirm against the destroy path).
+ *
+ * Returns 0, or the non-zero result of wait_event_interruptible() if the
+ * wait was interrupted.
+ */
+static int wait_on_destroy_queue(struct device_queue_manager *dqm,
+				 struct queue *q)
+{
+	struct kfd_process_device *pdd;
+	int err;
+
+	pdd = kfd_get_process_device_data(q->device, q->process);
+	if (pdd->qpd.is_debug)
+		return 0;
+
+	q->properties.is_being_destroyed = true;
+
+	/* nothing to wait for unless a debugger has the queue suspended */
+	if (!pdd->process->debug_trap_enabled || !q->properties.is_suspended)
+		return 0;
+
+	dqm_unlock(dqm);
+	mutex_unlock(&q->process->mutex);
+
+	err = wait_event_interruptible(dqm->destroy_wait,
+					!q->properties.is_suspended);
+
+	mutex_lock(&q->process->mutex);
+	dqm_lock(dqm);
+
+	return err;
+}
+
 static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
@@ -1909,11 +2020,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 				q->properties.queue_id);
 	}
 
-	retval = 0;
-
 	/* remove queue from list to prevent rescheduling after preemption */
 	dqm_lock(dqm);
 
+	retval = wait_on_destroy_queue(dqm, q);
+
+	if (retval) {
+		dqm_unlock(dqm);
+		return retval;
+	}
+
 	if (qpd->is_debug) {
 		/*
 		 * error, currently we do not allow to destroy a queue
@@ -1959,7 +2075,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	dqm_unlock(dqm);
 
-	/* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */
+	/*
+	 * Do free_mqd and raise delete event after dqm_unlock(dqm) to avoid
+	 * circular locking
+	 */
+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE),
+				qpd->pqm->process, q->device,
+				-1, false, NULL, 0);
+
 	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 
 	return retval;
@@ -2423,8 +2546,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		goto out_free;
 	}
 
-	if (!dqm->ops.initialize(dqm))
+	if (!dqm->ops.initialize(dqm)) {
+		init_waitqueue_head(&dqm->destroy_wait);
 		return dqm;
+	}
 
 out_free:
 	kfree(dqm);
@@ -2562,6 +2687,319 @@ int release_debug_trap_vmid(struct device_queue_manager *dqm,
 	return r;
 }
 
+#define QUEUE_NOT_FOUND		-1
+/* Flag every entry of queue_ids[0..num_queues) with
+ * KFD_DBG_QUEUE_INVALID_MASK.
+ */
+static void q_array_invalidate(uint32_t num_queues, uint32_t *queue_ids)
+{
+	int idx = 0;
+
+	while (idx < num_queues) {
+		queue_ids[idx] |= KFD_DBG_QUEUE_INVALID_MASK;
+		idx++;
+	}
+}
+
+/* Return the position of queue_id within queue_ids[0..num_queues), comparing
+ * with KFD_DBG_QUEUE_INVALID_MASK stripped from each stored entry, or
+ * QUEUE_NOT_FOUND when no entry matches.
+ */
+static int q_array_get_index(unsigned int queue_id,
+		uint32_t num_queues,
+		uint32_t *queue_ids)
+{
+	int pos;
+
+	for (pos = 0; pos < num_queues; pos++) {
+		if ((queue_ids[pos] & ~KFD_DBG_QUEUE_INVALID_MASK) != queue_id)
+			continue;
+
+		return pos;
+	}
+
+	return QUEUE_NOT_FOUND;
+}
+
+/* Argument bundle for copy_context_work_handler(). */
+struct copy_context_work_handler_workarea {
+	struct work_struct copy_context_work;	/* handler recovers the workarea from this via container_of() */
+	struct kfd_process *p;			/* process whose per-device queue lists the handler walks */
+};
+
+/* Work handler that calls get_wave_state() on every queue of a process so
+ * that the saved-context information is copied to the queue's user-mode
+ * context save/restore area.
+ *
+ * The handler adopts the mm of the process' lead thread with
+ * kthread_use_mm() for the duration of the copies and drops it again
+ * afterwards.  If the mm is already gone the handler does nothing.
+ *
+ * @work: the copy_context_work member of a
+ *        struct copy_context_work_handler_workarea
+ */
+static void copy_context_work_handler(struct work_struct *work)
+{
+	struct copy_context_work_handler_workarea *workarea;
+	struct mm_struct *mm;
+	struct kfd_process *p;
+	uint32_t tmp_ctl_stack_used_size, tmp_save_area_used_size;
+	int i;
+
+	workarea = container_of(work,
+			struct copy_context_work_handler_workarea,
+			copy_context_work);
+
+	p = workarea->p;
+	mm = get_task_mm(p->lead_thread);
+
+	if (!mm)
+		return;
+
+	kthread_use_mm(mm);
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		struct device_queue_manager *dqm = pdd->dev->dqm;
+		struct qcm_process_device *qpd = &pdd->qpd;
+		/* same manager for every queue of this device: look it up once */
+		struct mqd_manager *mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP];
+		struct queue *q;
+
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			/* The return value from get_wave_state is ignored on
+			 * purpose: it can report -EFAULT when the copy to
+			 * the user save area fails, but there is nothing to
+			 * undo here and we would continue to the next queue
+			 * anyway.
+			 */
+			mqd_mgr->get_wave_state(mqd_mgr,
+					q->mqd,
+					(void __user *)	q->properties.ctx_save_restore_area_address,
+					&tmp_ctl_stack_used_size,
+					&tmp_save_area_used_size);
+		}
+	}
+	kthread_unuse_mm(mm);
+	mmput(mm);
+}
+
+/* Copy a user-space array of queue ids into a freshly allocated kernel
+ * buffer.
+ *
+ * @num_queues: number of uint32_t entries in the user array
+ * @usr_queue_id_array: user pointer to the id array, may be NULL
+ *
+ * Returns NULL when usr_queue_id_array is NULL, ERR_PTR(-ENOMEM) when the
+ * allocation fails, ERR_PTR(-EFAULT) when the copy from user space fails,
+ * and otherwise a buffer holding num_queues entries that the caller must
+ * release with kfree().
+ */
+static uint32_t *get_queue_ids(uint32_t num_queues, uint32_t *usr_queue_id_array)
+{
+	size_t array_size = num_queues * sizeof(uint32_t);
+	uint32_t *queue_ids = NULL;
+
+	if (!usr_queue_id_array)
+		return NULL;
+
+	queue_ids = kzalloc(array_size, GFP_KERNEL);
+	if (!queue_ids)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(queue_ids, usr_queue_id_array, array_size)) {
+		/* callers only see the ERR_PTR, so free the buffer here */
+		kfree(queue_ids);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return queue_ids;
+}
+
+/* Resume suspended queues of process p.
+ *
+ * When usr_queue_id_array is non-NULL only the queues whose ids appear in
+ * that user array (num_queues entries) are considered, and a per-queue
+ * status is written back into the array: every entry starts out flagged
+ * with KFD_DBG_QUEUE_INVALID_MASK, the flag is cleared for queues that were
+ * found and resumed (or were not suspended to begin with), and
+ * KFD_DBG_QUEUE_ERROR_MASK is set for queues whose resume failed.  When
+ * usr_queue_id_array is NULL every queue of the process is resumed and no
+ * status is reported.
+ *
+ * Returns the number of queues counted as resumed, or a negative errno if
+ * the id array could not be allocated or copied from user space.
+ */
+int resume_queues(struct kfd_process *p,
+		uint32_t num_queues,
+		uint32_t *usr_queue_id_array)
+{
+	uint32_t *queue_ids = NULL;
+	int total_resumed = 0;
+	int i;
+
+	if (usr_queue_id_array) {
+		queue_ids = get_queue_ids(num_queues, usr_queue_id_array);
+
+		if (IS_ERR(queue_ids))
+			return PTR_ERR(queue_ids);
+
+		/* mask all queues as invalid.  unmask per successful request */
+		q_array_invalidate(num_queues, queue_ids);
+	}
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		struct device_queue_manager *dqm = pdd->dev->dqm;
+		struct qcm_process_device *qpd = &pdd->qpd;
+		struct queue *q;
+		int r, per_device_resumed = 0;
+
+		dqm_lock(dqm);
+
+		/* unmask queues that resume or already resumed as valid */
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			int q_idx = QUEUE_NOT_FOUND;
+
+			if (queue_ids)
+				q_idx = q_array_get_index(
+						q->properties.queue_id,
+						num_queues,
+						queue_ids);
+
+			/* no id array means "all queues"; otherwise only listed ones */
+			if (!queue_ids || q_idx != QUEUE_NOT_FOUND) {
+				int err = resume_single_queue(dqm, &pdd->qpd, q);
+
+				if (queue_ids) {
+					if (!err) {
+						queue_ids[q_idx] &=
+							~KFD_DBG_QUEUE_INVALID_MASK;
+					} else {
+						/* stop walking this device on the first failure */
+						queue_ids[q_idx] |=
+							KFD_DBG_QUEUE_ERROR_MASK;
+						break;
+					}
+				}
+
+				/* With MES, resume_single_queue() has already
+				 * re-added the queue, so wake destroy waiters
+				 * and count it now.  Otherwise counting is
+				 * deferred until execute_queues_cpsch() below
+				 * has succeeded.
+				 */
+				if (dqm->dev->shared_resources.enable_mes) {
+					wake_up_all(&dqm->destroy_wait);
+					if (!err)
+						total_resumed++;
+				} else {
+					per_device_resumed++;
+				}
+			}
+		}
+
+		/* nothing deferred for this device, so no runlist update needed */
+		if (!per_device_resumed) {
+			dqm_unlock(dqm);
+			continue;
+		}
+
+		r = execute_queues_cpsch(dqm,
+					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
+					0,
+					USE_DEFAULT_GRACE_PERIOD);
+		if (r) {
+			pr_err("Failed to resume process queues\n");
+			if (queue_ids) {
+				list_for_each_entry(q, &qpd->queues_list, list) {
+					int q_idx = q_array_get_index(
+							q->properties.queue_id,
+							num_queues,
+							queue_ids);
+
+					/* mask queue as error on resume fail */
+					if (q_idx != QUEUE_NOT_FOUND)
+						queue_ids[q_idx] |=
+							KFD_DBG_QUEUE_ERROR_MASK;
+				}
+			}
+		} else {
+			/* queues are mapped again: release blocked destroyers */
+			wake_up_all(&dqm->destroy_wait);
+			total_resumed += per_device_resumed;
+		}
+
+		dqm_unlock(dqm);
+	}
+
+	if (queue_ids) {
+		/* report per-queue status; a failed copy is only logged */
+		if (copy_to_user((void __user *)usr_queue_id_array, queue_ids,
+				num_queues * sizeof(uint32_t)))
+			pr_err("copy_to_user failed on queue resume\n");
+
+		kfree(queue_ids);
+	}
+
+	return total_resumed;
+}
+
+/* Suspend the queues of process p whose ids are listed in the user array.
+ *
+ * @p: target process
+ * @num_queues: number of entries in usr_queue_id_array
+ * @grace_period: passed to execute_queues_cpsch() when the runlist of a
+ *                non-MES device is refreshed
+ * @exception_clear_mask: exception bits cleared from the exception_status
+ *                        of every queue that was suspended successfully
+ * @usr_queue_id_array: user array of queue ids; also receives the status
+ *
+ * Per-queue status is written back into the user array: every entry starts
+ * out flagged with KFD_DBG_QUEUE_INVALID_MASK, the flag is cleared for
+ * queues that were suspended (or already were), it stays set for ids that
+ * were not found or whose queue is new or being destroyed, and
+ * KFD_DBG_QUEUE_ERROR_MASK is set when suspending failed.
+ *
+ * When at least one queue was suspended, copy_context_work_handler() is run
+ * from a work item and waited for before the status array is copied back.
+ *
+ * Returns the number of queues suspended, -EINVAL when queues were
+ * requested without an id array, or a negative errno if the id array could
+ * not be allocated or copied from user space.
+ */
+int suspend_queues(struct kfd_process *p,
+			uint32_t num_queues,
+			uint32_t grace_period,
+			uint64_t exception_clear_mask,
+			uint32_t *usr_queue_id_array)
+{
+	uint32_t *queue_ids = get_queue_ids(num_queues, usr_queue_id_array);
+	int total_suspended = 0;
+	int i;
+
+	if (IS_ERR(queue_ids))
+		return PTR_ERR(queue_ids);
+
+	/* get_queue_ids() returns NULL for a NULL user array.  The array
+	 * helpers below index it for every requested queue, so refuse a
+	 * non-empty request that comes without an array.
+	 */
+	if (!queue_ids && num_queues)
+		return -EINVAL;
+
+	/* mask all queues as invalid.  unmask on successful request */
+	q_array_invalidate(num_queues, queue_ids);
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		struct device_queue_manager *dqm = pdd->dev->dqm;
+		struct qcm_process_device *qpd = &pdd->qpd;
+		struct queue *q;
+		int r, per_device_suspended = 0;
+
+		mutex_lock(&p->event_mutex);
+		dqm_lock(dqm);
+
+		/* unmask queues that suspend or already suspended */
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			int q_idx = q_array_get_index(q->properties.queue_id,
+							num_queues,
+							queue_ids);
+
+			if (q_idx != QUEUE_NOT_FOUND) {
+				int err = suspend_single_queue(dqm, pdd, q);
+				bool is_mes = dqm->dev->shared_resources.enable_mes;
+
+				if (!err) {
+					queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK;
+					if (exception_clear_mask && is_mes)
+						q->properties.exception_status &=
+							~exception_clear_mask;
+
+					/* With MES the queue is already
+					 * removed; otherwise count it only
+					 * after the runlist update succeeds.
+					 */
+					if (is_mes)
+						total_suspended++;
+					else
+						per_device_suspended++;
+				} else if (err != -EBUSY) {
+					/* -EBUSY (new/destroyed queue) just
+					 * leaves the entry flagged invalid
+					 */
+					r = err;
+					queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
+					break;
+				}
+			}
+		}
+
+		/* nothing deferred for this device, so no runlist update needed */
+		if (!per_device_suspended) {
+			dqm_unlock(dqm);
+			mutex_unlock(&p->event_mutex);
+			if (total_suspended)
+				amdgpu_amdkfd_debug_mem_fence(dqm->dev->adev);
+			continue;
+		}
+
+		r = execute_queues_cpsch(dqm,
+			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
+			grace_period);
+
+		if (r)
+			pr_err("Failed to suspend process queues.\n");
+		else
+			total_suspended += per_device_suspended;
+
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			int q_idx = q_array_get_index(q->properties.queue_id,
+						num_queues, queue_ids);
+
+			if (q_idx == QUEUE_NOT_FOUND)
+				continue;
+
+			/* mask queue as error on suspend fail */
+			if (r)
+				queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
+			else if (exception_clear_mask)
+				q->properties.exception_status &=
+							~exception_clear_mask;
+		}
+
+		dqm_unlock(dqm);
+		mutex_unlock(&p->event_mutex);
+		/* same fence helper as the early-continue path above */
+		amdgpu_amdkfd_debug_mem_fence(dqm->dev->adev);
+	}
+
+	if (total_suspended) {
+		struct copy_context_work_handler_workarea copy_context_worker;
+
+		INIT_WORK_ONSTACK(
+				&copy_context_worker.copy_context_work,
+				copy_context_work_handler);
+
+		copy_context_worker.p = p;
+
+		schedule_work(&copy_context_worker.copy_context_work);
+
+		/* the work item lives on this stack: wait before leaving */
+		flush_work(&copy_context_worker.copy_context_work);
+		destroy_work_on_stack(&copy_context_worker.copy_context_work);
+	}
+
+	/* report per-queue status; a failed copy is only logged */
+	if (copy_to_user((void __user *)usr_queue_id_array, queue_ids,
+			num_queues * sizeof(uint32_t)))
+		pr_err("copy_to_user failed on queue suspend\n");
+
+	kfree(queue_ids);
+
+	return total_suspended;
+}
+
 int debug_lock_and_unmap(struct device_queue_manager *dqm)
 {
 	int r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index bef3be84c5cc..7ccf8d0d1867 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -259,6 +259,8 @@ struct device_queue_manager {
 	struct kfd_mem_obj	hiq_sdma_mqd;
 	bool			sched_running;
 	uint32_t		wait_times;
+
+	wait_queue_head_t	destroy_wait;
 };
 
 void device_queue_manager_init_cik(
@@ -286,6 +288,14 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
 int release_debug_trap_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
+int suspend_queues(struct kfd_process *p,
+			uint32_t num_queues,
+			uint32_t grace_period,
+			uint64_t exception_clear_mask,
+			uint32_t *usr_queue_id_array);
+int resume_queues(struct kfd_process *p,
+		uint32_t num_queues,
+		uint32_t *usr_queue_id_array);
 int debug_lock_and_unmap(struct device_queue_manager *dqm);
 int debug_map_and_unlock(struct device_queue_manager *dqm);
 int debug_refresh_runlist(struct device_queue_manager *dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
index 8248e77751e7..ceae8bff4906 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
@@ -236,6 +236,11 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 			  u32 *save_area_used_size)
 {
 	struct v10_compute_mqd *m;
+	struct kfd_context_save_area_header header;
+	size_t header_copy_size = sizeof(header.control_stack_size) +
+			sizeof(header.wave_state_size) +
+			sizeof(header.wave_state_offset) +
+			sizeof(header.control_stack_offset);
 
 	m = get_mqd(mqd);
 
@@ -254,6 +259,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 	 * accessible to user mode
 	 */
 
+	header.control_stack_size = *ctl_stack_used_size;
+	header.wave_state_size = *save_area_used_size;
+
+	header.wave_state_offset = m->cp_hqd_wg_state_offset;
+	header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
+
+	if (copy_to_user(ctl_stack, &header, header_copy_size))
+		return -EFAULT;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index 18ab613e787c..266b8d9cc9b2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -283,7 +283,11 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 			  u32 *save_area_used_size)
 {
 	struct v11_compute_mqd *m;
-	/*struct mqd_user_context_save_area_header header;*/
+	struct kfd_context_save_area_header header;
+	size_t header_copy_size = sizeof(header.control_stack_size) +
+			sizeof(header.wave_state_size) +
+			sizeof(header.wave_state_offset) +
+			sizeof(header.control_stack_offset);
 
 	m = get_mqd(mqd);
 
@@ -301,16 +305,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 	 * it's part of the context save area that is already
 	 * accessible to user mode
 	 */
-/*
 	header.control_stack_size = *ctl_stack_used_size;
 	header.wave_state_size = *save_area_used_size;
 
 	header.wave_state_offset = m->cp_hqd_wg_state_offset;
 	header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
 
-	if (copy_to_user(ctl_stack, &header, sizeof(header)))
+	if (copy_to_user(ctl_stack, &header, header_copy_size))
 		return -EFAULT;
-*/
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 50da16dd4c96..047c43418a1a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -288,6 +288,11 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 			  u32 *save_area_used_size)
 {
 	struct v9_mqd *m;
+	struct kfd_context_save_area_header header;
+	size_t header_copy_size = sizeof(header.control_stack_size) +
+		sizeof(header.wave_state_size) +
+		sizeof(header.wave_state_offset) +
+		sizeof(header.control_stack_offset);
 
 	/* Control stack is located one page after MQD. */
 	void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
@@ -299,7 +304,18 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 	*save_area_used_size = m->cp_hqd_wg_state_offset -
 		m->cp_hqd_cntl_stack_size;
 
-	if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
+	header.control_stack_size = *ctl_stack_used_size;
+	header.wave_state_size = *save_area_used_size;
+
+	header.wave_state_offset = m->cp_hqd_wg_state_offset;
+	header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
+
+	if (copy_to_user(ctl_stack, &header, header_copy_size))
+		return -EFAULT;
+
+	if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset,
+				mqd_ctl_stack + m->cp_hqd_cntl_stack_offset,
+				*ctl_stack_used_size))
 		return -EFAULT;
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6f7dc23af104..8dc7cc1e18a5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -477,6 +477,8 @@ struct queue_properties {
 	uint32_t doorbell_off;
 	bool is_interop;
 	bool is_evicted;
+	bool is_suspended;
+	bool is_being_destroyed;
 	bool is_active;
 	bool is_gws;
 	bool is_dbg_wa;
@@ -501,7 +503,8 @@ struct queue_properties {
 #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
 			    (q).queue_address != 0 &&	\
 			    (q).queue_percent > 0 &&	\
-			    !(q).is_evicted)
+			    !(q).is_evicted &&		\
+			    !(q).is_suspended)
 
 enum mqd_update_flag {
 	UPDATE_FLAG_DBG_WA_ENABLE = 1,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 26/32] drm/amdkfd: add debug set and clear address watch points operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (24 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 25/32] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-22 21:38   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 27/32] drm/amdkfd: add debug set flags operation Jonathan Kim
                   ` (5 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Shader read, write and atomic memory operations can be reported to the
debugger as an address watch exception.

Allow the debugger to pass in a watch point to a particular memory
address per device.

Note that only 4 watch points exist per device to date, so have the KFD
keep track of which watch points are allocated.

v3: add gfx11 support.
cleanup gfx9 kgd calls to set and clear address watch.
use per device spinlock to set watch points.
fixup runlist refresh calls on set/clear address watch.

v2: change dev_id arg to gpu_id for consistency

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  51 +++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  78 ++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |   8 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   5 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    |  52 ++++++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  77 ++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   8 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  24 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 136 ++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |   8 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   6 +-
 13 files changed, 451 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 4de2066215b4..18baf1cd8c01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -118,6 +118,55 @@ static uint32_t kgd_aldebaran_set_wave_launch_mode(struct amdgpu_device *adev,
 	return data;
 }
 
+#define TCP_WATCH_STRIDE (regTCP_WATCH1_ADDR_H - regTCP_WATCH0_ADDR_H)
+/* Program the address registers of watch point watch_id and build the
+ * matching TCP_WATCH0_CNTL value.
+ *
+ * The low 32 bits and bits 32-47 of watch_address are written to the
+ * ADDR_L/ADDR_H registers of the selected watch point.  The control word
+ * (MODE = watch_mode, MASK = watch_address_mask >> 6, VALID = 1) is not
+ * written to hardware here; it is returned to the caller.  debug_vmid is
+ * not used by this variant.
+ */
+static uint32_t kgd_gfx_aldebaran_set_address_watch(
+					struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid)
+{
+	const uint32_t addr_lo = lower_32_bits(watch_address);
+	const uint32_t addr_hi = upper_32_bits(watch_address) & 0xffff;
+	const uint32_t reg_shift = watch_id * TCP_WATCH_STRIDE;
+	uint32_t cntl = 0;
+
+	cntl = REG_SET_FIELD(cntl, TCP_WATCH0_CNTL, MODE, watch_mode);
+	cntl = REG_SET_FIELD(cntl, TCP_WATCH0_CNTL, MASK,
+			watch_address_mask >> 6);
+	cntl = REG_SET_FIELD(cntl, TCP_WATCH0_CNTL, VALID, 1);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_H) + reg_shift),
+			addr_hi);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_L) + reg_shift),
+			addr_lo);
+
+	return cntl;
+}
+
+/* Aldebaran clear-address-watch hook: touches no registers and returns 0.
+ *
+ * NOTE(review): made static to match kgd_gfx_aldebaran_set_address_watch.
+ * The only visible user is the aldebaran_kfd2kgd table in this file and no
+ * header in this patch declares it -- confirm nothing else links against it.
+ */
+static uint32_t kgd_gfx_aldebaran_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id)
+{
+	return 0;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -140,6 +189,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
 	.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_aldebaran_set_address_watch,
+	.clear_address_watch = kgd_gfx_aldebaran_clear_address_watch,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 500013540356..a7fb5ef13166 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -413,6 +413,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
 	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v9_set_address_watch,
+	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 7591145bc69f..c9246370984c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -878,6 +878,82 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
 	return 0;
 }
 
+#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
+/* Program watch point watch_id directly in the TCP_WATCH registers.
+ *
+ * The control register is first written with VALID = 0, then the high
+ * (bits 32-47) and low halves of watch_address are written, and finally the
+ * control register is written again with VALID = 1.  The control word also
+ * carries VMID = debug_vmid, MODE = watch_mode and
+ * MASK = watch_address_mask >> 7.
+ *
+ * Always returns 0.
+ */
+uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid)
+{
+	const uint32_t addr_lo = lower_32_bits(watch_address);
+	const uint32_t addr_hi = upper_32_bits(watch_address) & 0xffff;
+	const uint32_t reg_shift = watch_id * TCP_WATCH_STRIDE;
+	uint32_t cntl = 0;
+
+	cntl = REG_SET_FIELD(cntl, TCP_WATCH0_CNTL, VMID, debug_vmid);
+	cntl = REG_SET_FIELD(cntl, TCP_WATCH0_CNTL, MODE, watch_mode);
+	cntl = REG_SET_FIELD(cntl, TCP_WATCH0_CNTL, MASK,
+			watch_address_mask >> 7);
+
+	/* Keep the watch point off until all of its registers are set */
+	cntl = REG_SET_FIELD(cntl, TCP_WATCH0_CNTL, VALID, 0);
+
+	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) + reg_shift),
+			cntl);
+
+	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) + reg_shift),
+			addr_hi);
+
+	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) + reg_shift),
+			addr_lo);
+
+	/* Now switch the watch point on */
+	cntl = REG_SET_FIELD(cntl, TCP_WATCH0_CNTL, VALID, 1);
+
+	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) + reg_shift),
+			cntl);
+
+	return 0;
+}
+
+/* Disable watch point watch_id by writing 0 to its TCP_WATCH control
+ * register.  Always returns 0.
+ */
+uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id)
+{
+	const uint32_t cntl_reg = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE);
+
+	WREG32(cntl_reg, 0);
+
+	return 0;
+}
+
+
 /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -966,6 +1042,8 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
 	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v10_set_address_watch,
+	.clear_address_watch = kgd_gfx_v10_clear_address_watch,
 	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.program_trap_handler_settings = program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
index 34c04a2bb83b..334ff16e25db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -39,6 +39,14 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
 					 uint8_t wave_launch_mode,
 					 uint32_t vmid);
+uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid);
+uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id);
 void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index 8627c5458973..ee36ba045dcf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -676,6 +676,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
-	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode
-
+	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v10_set_address_watch,
+	.clear_address_watch = kgd_gfx_v10_clear_address_watch
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
index 4fdc25222dcd..358c219fb704 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
@@ -736,6 +736,54 @@ static uint32_t kgd_gfx_v11_set_wave_launch_mode(struct amdgpu_device *adev,
 	return data;
 }
 
+#define TCP_WATCH_STRIDE (regTCP_WATCH1_ADDR_H - regTCP_WATCH0_ADDR_H)
+static uint32_t kgd_gfx_v11_set_address_watch(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid)
+{
+	uint32_t watch_address_high;
+	uint32_t watch_address_low;
+	uint32_t watch_address_cntl;
+
+	watch_address_cntl = 0;
+	watch_address_low = lower_32_bits(watch_address);
+	watch_address_high = upper_32_bits(watch_address) & 0xffff;
+
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MODE,
+			watch_mode);
+
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MASK,
+			watch_address_mask >> 7);
+
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VALID,
+			1);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_H) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_high);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_L) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_low);
+
+	return watch_address_cntl;
+}
+
+static uint32_t kgd_gfx_v11_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id)
+{
+	return 0; /* no register write here; 0 is handed back to the caller */
+}
+
 const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
 	.program_sh_mem_settings = program_sh_mem_settings_v11,
 	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
@@ -756,5 +804,7 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
 	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v11_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override,
-	.set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode
+	.set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v11_set_address_watch,
+	.clear_address_watch = kgd_gfx_v11_clear_address_watch
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index a3c8f5578788..43296b78d888 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -814,6 +814,81 @@ uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
 	return 0;
 }
 
+#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
+uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid)
+{
+	uint32_t watch_address_high;
+	uint32_t watch_address_low;
+	uint32_t watch_address_cntl;
+
+	watch_address_cntl = 0;
+
+	watch_address_low = lower_32_bits(watch_address);
+	watch_address_high = upper_32_bits(watch_address) & 0xffff;
+
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VMID,
+			debug_vmid);
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MODE,
+			watch_mode);
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MASK,
+			watch_address_mask >> 6);
+
+	/* Turning off this watch point until we set all the registers */
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VALID,
+			0);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_cntl);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_high);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_low);
+
+	/* Enable the watch point */
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VALID,
+			1);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_cntl);
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id)
+{
+	uint32_t watch_address_cntl;
+
+	watch_address_cntl = 0;
+	/* zero the whole CNTL register of this watch point, VALID field included */
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_cntl);
+
+	return 0; /* always 0; the ADDR_H/ADDR_L registers are not touched */
+}
+
 /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -1085,6 +1160,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
 	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v9_set_address_watch,
+	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index 2a2ab42037e4..ba52b61b68c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -77,6 +77,14 @@ uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
 					     uint32_t trap_mask_request,
 					     uint32_t *trap_mask_prev,
 					     uint32_t kfd_dbg_trap_cntl_prev);
+uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid);
+uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id);
 void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index b62e93b35a44..8f2ede781863 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2802,6 +2802,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 	struct task_struct *thread = NULL;
 	struct pid *pid = NULL;
 	struct kfd_process *target = NULL;
+	struct kfd_process_device *pdd = NULL;
 	int r = 0;
 
 	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
@@ -2869,6 +2870,20 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		goto unlock_out;
 	}
 
+	if (args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
+			args->op == KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH) {
+		int user_gpu_id = kfd_process_get_user_gpu_id(target,
+				args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ?
+					args->set_node_address_watch.gpu_id :
+					args->clear_node_address_watch.gpu_id);
+
+		pdd = kfd_process_device_data_by_id(target, user_gpu_id);
+		if (user_gpu_id == -EINVAL || !pdd) {
+			r = -ENODEV;
+			goto unlock_out;
+		}
+	}
+
 	switch (args->op) {
 	case KFD_IOC_DBG_TRAP_ENABLE:
 		if (target != p)
@@ -2921,7 +2936,16 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				(uint32_t *)args->resume_queues.queue_array_ptr);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
+		r = kfd_dbg_trap_set_dev_address_watch(pdd,
+				args->set_node_address_watch.address,
+				args->set_node_address_watch.mask,
+				&args->set_node_address_watch.id,
+				args->set_node_address_watch.mode);
+		break;
 	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
+		r = kfd_dbg_trap_clear_dev_address_watch(pdd,
+				args->clear_node_address_watch.id);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_FLAGS:
 	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 730e53584113..8d2e1adb442d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -24,6 +24,8 @@
 #include "kfd_device_queue_manager.h"
 #include <linux/file.h>
 
+#define MAX_WATCH_ADDRESSES	4
+
 void debug_event_write_work_handler(struct work_struct *work)
 {
 	struct kfd_process *process;
@@ -291,6 +293,139 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
 						pdd->watch_points, flags);
 }
 
+#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
+static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
+{
+	int i;
+
+	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
+	/* slot search and both bitmask updates happen under the device's watch_points_lock */
+	spin_lock(&pdd->dev->watch_points_lock);
+
+	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
+		/* slot already allocated on this device (by any process) so skip */
+		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
+			continue;
+		/* first free slot: mark it in the per-process and per-device masks */
+		pdd->alloc_watch_ids |= 0x1 << i;
+		pdd->dev->alloc_watch_ids |= 0x1 << i;
+		*watch_id = i;
+		spin_unlock(&pdd->dev->watch_points_lock);
+		return 0;
+	}
+
+	spin_unlock(&pdd->dev->watch_points_lock);
+	/* no free slot left; *watch_id stays KFD_DEBUGGER_INVALID_WATCH_POINT_ID */
+	return -ENOMEM;
+}
+
+static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
+{
+	spin_lock(&pdd->dev->watch_points_lock);
+
+	/* only a slot owned by this process is freed; watch_id is not range checked here */
+	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
+		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
+		pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
+	}
+
+	spin_unlock(&pdd->dev->watch_points_lock);
+}
+
+static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
+{
+	bool owns_watch_id = false;
+
+	spin_lock(&pdd->dev->watch_points_lock);
+	/* id arrives as a user-supplied u32: a negative int must never reach the shift */
+	owns_watch_id = watch_id >= 0 && watch_id < MAX_WATCH_ADDRESSES &&
+			((pdd->alloc_watch_ids >> watch_id) & 0x1);
+	spin_unlock(&pdd->dev->watch_points_lock);
+
+	return owns_watch_id;
+}
+
+int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
+					uint32_t watch_id)
+{
+	int r;
+
+	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
+		return -EINVAL;
+	/* without MES the update is bracketed by debug_lock_and_unmap()/debug_map_and_unlock() */
+	if (!pdd->dev->shared_resources.enable_mes) {
+		r = debug_lock_and_unmap(pdd->dev->dqm);
+		if (r)
+			return r;
+	}
+	/* the value returned by the per-ASIC hook becomes the cached watch_points[] entry */
+	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
+							pdd->dev->adev,
+							watch_id);
+	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+	if (!pdd->dev->shared_resources.enable_mes)
+		r = debug_map_and_unlock(pdd->dev->dqm);
+	else
+		r = kfd_dbg_set_mes_debug_mode(pdd);
+	/* the slot is released even when the remap/MES update above failed */
+	kfd_dbg_clear_dev_watch_id(pdd, watch_id);
+
+	return r;
+}
+
+int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t *watch_id,
+					uint32_t watch_mode)
+{
+	int r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
+
+	if (r)
+		return r;
+	/* a slot is now reserved in *watch_id; every failure path below releases it again */
+	if (!pdd->dev->shared_resources.enable_mes) {
+		r = debug_lock_and_unmap(pdd->dev->dqm);
+		if (r) {
+			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
+			return r;
+		}
+	}
+	/* the value returned by the per-ASIC hook becomes the cached watch_points[] entry */
+	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+	pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
+				pdd->dev->adev,
+				watch_address,
+				watch_address_mask,
+				*watch_id,
+				watch_mode,
+				pdd->dev->vm_info.last_vmid_kfd);
+	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+	if (!pdd->dev->shared_resources.enable_mes)
+		r = debug_map_and_unlock(pdd->dev->dqm);
+	else
+		r = kfd_dbg_set_mes_debug_mode(pdd);
+
+	/* HWS is broken so no point in HW rollback: release the watch point and report it */
+	if (r)
+		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
+
+	return r; /* a released slot must not be reported to the debugger as success */
+}
+
+static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
+{
+	int i, j; /* i: index into target->pdds[], j: watch point slot */
+
+	for (i = 0; i < target->n_pdds; i++)
+		for (j = 0; j < MAX_WATCH_ADDRESSES; j++) /* result ignored, e.g. -EINVAL if unowned */
+			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
+}
+
+
 /* kfd_dbg_trap_deactivate:
  *	target: target process
  *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
@@ -305,6 +440,7 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 
 	if (!unwind) {
 		cancel_work_sync(&target->debug_event_workarea);
+		kfd_dbg_clear_process_address_watch(target);
 		kfd_dbg_trap_set_wave_launch_mode(target, 0);
 	}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 0d70f162d6d8..63c716ce5ab9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -50,7 +50,13 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
 					uint32_t *trap_mask_supported);
 int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
 					uint8_t wave_launch_mode);
-
+int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
+					uint32_t watch_id);
+int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t *watch_id,
+					uint32_t watch_mode);
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
 					unsigned int queue_id,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 6e25238d18f9..ca849cd051d5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -641,6 +641,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	}
 
 	kfd_smi_init(kfd);
+	spin_lock_init(&kfd->watch_points_lock);
 
 	kfd->init_complete = true;
 	dev_info(kfd_device, "added device %x:%x\n", kfd->adev->pdev->vendor,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 8dc7cc1e18a5..cfc50d1690c7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -348,6 +348,10 @@ struct kfd_dev {
 
 	/* HMM page migration MEMORY_DEVICE_PRIVATE mapping */
 	struct dev_pagemap pgmap;
+
+	/* Track per device allocated watch points */
+	uint32_t alloc_watch_ids;
+	spinlock_t watch_points_lock;
 };
 
 enum kfd_mempool {
@@ -799,6 +803,7 @@ struct kfd_process_device {
 	uint32_t spi_dbg_override;
 	uint32_t spi_dbg_launch_mode;
 	uint32_t watch_points[4];
+	uint32_t alloc_watch_ids;
 
 	/*
 	 * If this process has been checkpointed before, then the user
@@ -955,7 +960,6 @@ struct kfd_process {
 	struct semaphore runtime_enable_sema;
 	bool is_runtime_retry;
 	struct kfd_runtime_info runtime_info;
-
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 27/32] drm/amdkfd: add debug set flags operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (25 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 26/32] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-22 21:47   ` Felix Kuehling
  2023-01-25 19:53 ` [PATCH 28/32] drm/amdkfd: add debug query event operation Jonathan Kim
                   ` (4 subsequent siblings)
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Allow the debugger to set single memory and single ALU operations.

Some exceptions are imprecise (memory violations, address watch) in the
sense that a trap occurs only when the exception interrupt occurs and
not at the non-halting faulty instruction.  Trap temporaries 0 & 1 save
the program counter address, which means that these values will not point
to the faulty instruction address but to whenever the interrupt was
raised.

Setting the Single Memory Operations flag will inject an automatic wait
on every memory operation instruction forcing imprecise memory exceptions
to become precise at the cost of performance.  This setting is not
permitted on debug devices that support only a global setting of this
option.

Return the previous set flags to the debugger as well.

v3: make precise mem op the only available flag for now.

v2: add gfx11 support.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 38 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  1 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 8f2ede781863..c34caa14b84e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2947,6 +2947,8 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				args->clear_node_address_watch.id);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_FLAGS:
+		r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
+		break;
 	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 8d2e1adb442d..77ba7da2bb9d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -23,6 +23,7 @@
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
 #include <linux/file.h>
+#include <uapi/linux/kfd_ioctl.h>
 
 #define MAX_WATCH_ADDRESSES	4
 
@@ -425,6 +426,40 @@ static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
 			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
 }
 
+int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
+{
+	uint32_t prev_flags = target->dbg_flags;
+	int i, r = 0;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
+			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
+			*flags = prev_flags;
+			return -EACCES;
+		}
+	}
+
+	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
+	*flags = prev_flags;
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+			continue;
+
+		if (!pdd->dev->shared_resources.enable_mes)
+			r = debug_refresh_runlist(pdd->dev->dqm);
+		else
+			r = kfd_dbg_set_mes_debug_mode(pdd);
+
+		if (r) {
+			target->dbg_flags = prev_flags;
+			break;
+		}
+	}
+
+	return r;
+}
 
 /* kfd_dbg_trap_deactivate:
  *	target: target process
@@ -439,9 +474,12 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 	int i, count = 0;
 
 	if (!unwind) {
+		uint32_t flags = 0;
 		cancel_work_sync(&target->debug_event_workarea);
 		kfd_dbg_clear_process_address_watch(target);
 		kfd_dbg_trap_set_wave_launch_mode(target, 0);
+
+		kfd_dbg_trap_set_flags(target, &flags);
 	}
 
 	for (i = 0; i < target->n_pdds; i++) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 63c716ce5ab9..782362d82890 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -57,6 +57,7 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
 					uint32_t watch_address_mask,
 					uint32_t *watch_id,
 					uint32_t watch_mode);
+int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
 					unsigned int queue_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 28/32] drm/amdkfd: add debug query event operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (26 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 27/32] drm/amdkfd: add debug set flags operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-01-25 19:53 ` [PATCH 29/32] drm/amdkfd: add debug query exception info operation Jonathan Kim
                   ` (3 subsequent siblings)
  31 siblings, 0 replies; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Allow the debugger to query a single queue, device or process
exception.
The KFD should also return the GPU or Queue id of the exception.
The debugger also has the option of clearing exceptions after
being queried.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  6 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 64 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 75 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index c34caa14b84e..0ae1237fa193 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2950,6 +2950,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
 		break;
 	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
+		r = kfd_dbg_ev_query_debug_event(target,
+				&args->query_debug_event.queue_id,
+				&args->query_debug_event.gpu_id,
+				args->query_debug_event.exception_mask,
+				&args->query_debug_event.exception_mask);
+		break;
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 77ba7da2bb9d..032207efef15 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -27,6 +27,70 @@
 
 #define MAX_WATCH_ADDRESSES	4
 
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+		      unsigned int *queue_id,
+		      unsigned int *gpu_id,
+		      uint64_t exception_clear_mask,
+		      uint64_t *event_status)
+{
+	struct process_queue_manager *pqm;
+	struct process_queue_node *pqn;
+	int i;
+
+	if (!(process && process->debug_trap_enabled))
+		return -ENODATA;
+
+	mutex_lock(&process->event_mutex);
+	*event_status = 0;
+	*queue_id = 0;
+	*gpu_id = 0;
+
+	/* find and report queue events */
+	pqm = &process->pqm;
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		uint64_t tmp = process->exception_enable_mask;
+
+		if (!pqn->q)
+			continue;
+
+		tmp &= pqn->q->properties.exception_status;
+
+		if (!tmp)
+			continue;
+
+		*event_status = pqn->q->properties.exception_status;
+		*queue_id = pqn->q->properties.queue_id;
+		*gpu_id = pqn->q->device->id;
+		pqn->q->properties.exception_status &= ~exception_clear_mask;
+		goto out;
+	}
+
+	/* find and report device events */
+	for (i = 0; i < process->n_pdds; i++) {
+		struct kfd_process_device *pdd = process->pdds[i];
+		uint64_t tmp = process->exception_enable_mask
+						& pdd->exception_status;
+
+		if (!tmp)
+			continue;
+
+		*event_status = pdd->exception_status;
+		*gpu_id = pdd->dev->id;
+		pdd->exception_status &= ~exception_clear_mask;
+		goto out;
+	}
+
+	/* report process events */
+	if (process->exception_enable_mask & process->exception_status) {
+		*event_status = process->exception_status;
+		process->exception_status &= ~exception_clear_mask;
+	}
+
+out:
+	mutex_unlock(&process->event_mutex);
+	return *event_status ? 0 : -EAGAIN;
+}
+
 void debug_event_write_work_handler(struct work_struct *work)
 {
 	struct kfd_process *process;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 782362d82890..4f2195d57ff0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -27,6 +27,11 @@
 
 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
 int kfd_dbg_trap_activate(struct kfd_process *target);
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+			unsigned int *queue_id,
+			unsigned int *gpu_id,
+			uint64_t exception_clear_mask,
+			uint64_t *event_status);
 bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
 				   unsigned int pasid,
 				   uint32_t doorbell_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 29/32] drm/amdkfd: add debug query exception info operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (27 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 28/32] drm/amdkfd: add debug query event operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-01-25 19:53 ` [PATCH 30/32] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
                   ` (2 subsequent siblings)
  31 siblings, 0 replies; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Allow the debugger to query additional info based on an exception code.
For device exceptions, it's currently only memory violation information.
For process exceptions, it's currently only runtime information.
Queue exceptions only report the queue exception status.

The debugger has the option of clearing the target exception on query.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 120 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   6 ++
 3 files changed, 133 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 0ae1237fa193..d3d2026b6e65 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2957,6 +2957,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				&args->query_debug_event.exception_mask);
 		break;
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
+		r = kfd_dbg_trap_query_exception_info(target,
+				args->query_exception_info.source_id,
+				args->query_exception_info.exception_code,
+				args->query_exception_info.clear_exception,
+				(void __user *)args->query_exception_info.info_ptr,
+				&args->query_exception_info.info_size);
+		break;
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
 		pr_warn("Debug op %i not supported yet\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 032207efef15..db316f0625f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -878,6 +878,126 @@ int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
 	return r;
 }
 
+int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
+		uint32_t source_id,
+		uint32_t exception_code,
+		bool clear_exception,
+		void __user *info,
+		uint32_t *info_size)
+{
+	bool found = false;
+	int r = 0;
+	uint32_t copy_size, actual_info_size = 0;
+	uint64_t *exception_status_ptr = NULL;
+
+	if (!target)
+		return -EINVAL;
+
+	if (!info || !info_size)
+		return -EINVAL;
+
+	mutex_lock(&target->event_mutex);
+
+	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
+		/* Per queue exceptions */
+		struct queue *queue = NULL;
+		int i;
+
+		for (i = 0; i < target->n_pdds; i++) {
+			struct kfd_process_device *pdd = target->pdds[i];
+			struct qcm_process_device *qpd = &pdd->qpd;
+
+			list_for_each_entry(queue, &qpd->queues_list, list) {
+				if (!found && queue->properties.queue_id == source_id) {
+					found = true;
+					break;
+				}
+			}
+			if (found)
+				break;
+		}
+
+		if (!found) {
+			r = -EINVAL;
+			goto out;
+		}
+
+		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
+			r = -ENODATA;
+			goto out;
+		}
+		exception_status_ptr = &queue->properties.exception_status;
+	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
+		/* Per device exceptions */
+		struct kfd_process_device *pdd = NULL;
+		int i;
+
+		for (i = 0; i < target->n_pdds; i++) {
+			pdd = target->pdds[i];
+			if (pdd->dev->id == source_id) {
+				found = true;
+				break;
+			}
+		}
+
+		if (!found) {
+			r = -EINVAL;
+			goto out;
+		}
+
+		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
+			r = -ENODATA;
+			goto out;
+		}
+
+		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
+			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);
+
+			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
+				r = -EFAULT;
+				goto out;
+			}
+			actual_info_size = pdd->vm_fault_exc_data_size;
+			if (clear_exception) {
+				kfree(pdd->vm_fault_exc_data);
+				pdd->vm_fault_exc_data = NULL;
+				pdd->vm_fault_exc_data_size = 0;
+			}
+		}
+		exception_status_ptr = &pdd->exception_status;
+	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
+		/* Per process exceptions */
+		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
+			r = -ENODATA;
+			goto out;
+		}
+
+		if (exception_code == EC_PROCESS_RUNTIME) {
+			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));
+
+			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
+				r = -EFAULT;
+				goto out;
+			}
+
+			actual_info_size = sizeof(target->runtime_info);
+		}
+
+		exception_status_ptr = &target->exception_status;
+	} else {
+		pr_debug("Bad exception type [%i]\n", exception_code);
+		r = -EINVAL;
+		goto out;
+	}
+
+	*info_size = actual_info_size;
+	if (clear_exception)
+		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
+out:
+	mutex_unlock(&target->event_mutex);
+	return r;
+}
+
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 4f2195d57ff0..ee12de5f7adf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -63,6 +63,12 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
 					uint32_t *watch_id,
 					uint32_t watch_mode);
 int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
+int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
+		uint32_t source_id,
+		uint32_t exception_code,
+		bool clear_exception,
+		void __user *info,
+		uint32_t *info_size);
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
 					unsigned int queue_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 30/32] drm/amdkfd: add debug queue snapshot operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (28 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 29/32] drm/amdkfd: add debug query exception info operation Jonathan Kim
@ 2023-01-25 19:53 ` Jonathan Kim
  2023-03-22 21:52   ` Felix Kuehling
  2023-01-25 19:54 ` [PATCH 31/32] drm/amdkfd: add debug device " Jonathan Kim
  2023-01-25 19:54 ` [PATCH 32/32] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:53 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Allow the debugger to get a snapshot of a specified number of queues
containing various queue property information that is copied to the
debugger.

Since the debugger doesn't know how many queues exist at any given time,
allow the debugger to pass the requested number of snapshots as 0 to get
the actual number of potential snapshots to use for a subsequent snapshot
request for actual information.

To prevent future ABI breakage, pass in the requested entry_size.
The KFD will return its own entry_size in case the debugger still wants to
log the information in a core dump on sizing failure.

Also allow the debugger to clear exceptions when doing a snapshot.

v3: fix uninitialized return and change queue snapshot to type void for
proper increment on buffer copy.
use memset 0 to init snapshot entry to clear struct padding.

v2: change buf_size arg to num_queues for clarity.
fix minimum entry size calculation.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 36 ++++++++++++++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +++
 .../amd/amdkfd/kfd_process_queue_manager.c    | 41 +++++++++++++++++++
 5 files changed, 91 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d3d2026b6e65..93b288233577 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2965,6 +2965,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				&args->query_exception_info.info_size);
 		break;
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
+		r = pqm_get_queue_snapshot(&target->pqm,
+				args->queue_snapshot.exception_mask,
+				(void __user *)args->queue_snapshot.snapshot_buf_ptr,
+				&args->queue_snapshot.num_queues,
+				&args->queue_snapshot.entry_size);
+		break;
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
 		pr_warn("Debug op %i not supported yet\n", args->op);
 		r = -EACCES;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 7792fe9491c5..5ae504a512f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3000,6 +3000,42 @@ int suspend_queues(struct kfd_process *p,
 	return total_suspended;
 }
 
+static uint32_t set_queue_type_for_user(struct queue_properties *q_props)
+{
+	switch (q_props->type) {
+	case KFD_QUEUE_TYPE_COMPUTE:
+		return q_props->format == KFD_QUEUE_FORMAT_PM4
+					? KFD_IOC_QUEUE_TYPE_COMPUTE
+					: KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
+	case KFD_QUEUE_TYPE_SDMA:
+		return KFD_IOC_QUEUE_TYPE_SDMA;
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
+		return KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
+	default:
+		WARN_ONCE(true, "queue type not recognized!");
+		return 0xffffffff;
+	};
+}
+
+void set_queue_snapshot_entry(struct queue *q,
+			      uint64_t exception_clear_mask,
+			      struct kfd_queue_snapshot_entry *qss_entry)
+{
+	qss_entry->ring_base_address = q->properties.queue_address;
+	qss_entry->write_pointer_address = (uint64_t)q->properties.write_ptr;
+	qss_entry->read_pointer_address = (uint64_t)q->properties.read_ptr;
+	qss_entry->ctx_save_restore_address =
+				q->properties.ctx_save_restore_area_address;
+	qss_entry->ctx_save_restore_area_size =
+				q->properties.ctx_save_restore_area_size;
+	qss_entry->exception_status = q->properties.exception_status;
+	qss_entry->queue_id = q->properties.queue_id;
+	qss_entry->gpu_id = q->device->id;
+	qss_entry->ring_size = (uint32_t)q->properties.queue_size;
+	qss_entry->queue_type = set_queue_type_for_user(&q->properties);
+	q->properties.exception_status &= ~exception_clear_mask;
+}
+
 int debug_lock_and_unmap(struct device_queue_manager *dqm)
 {
 	int r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 7ccf8d0d1867..89d4a5b293a5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -296,6 +296,9 @@ int suspend_queues(struct kfd_process *p,
 int resume_queues(struct kfd_process *p,
 		uint32_t num_queues,
 		uint32_t *usr_queue_id_array);
+void set_queue_snapshot_entry(struct queue *q,
+			      uint64_t exception_clear_mask,
+			      struct kfd_queue_snapshot_entry *qss_entry);
 int debug_lock_and_unmap(struct device_queue_manager *dqm);
 int debug_map_and_unlock(struct device_queue_manager *dqm);
 int debug_refresh_runlist(struct device_queue_manager *dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index cfc50d1690c7..cc7816db60eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1302,6 +1302,11 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
 		       void __user *ctl_stack,
 		       u32 *ctl_stack_used_size,
 		       u32 *save_area_used_size);
+int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
+			   uint64_t exception_clear_mask,
+			   void __user *buf,
+			   int *num_qss_entries,
+			   uint32_t *entry_size);
 
 int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
 			      uint64_t fence_value,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 0ae6026c7d69..221cd4b03f1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -576,6 +576,47 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
 						       save_area_used_size);
 }
 
+int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
+			   uint64_t exception_clear_mask,
+			   void __user *buf,
+			   int *num_qss_entries,
+			   uint32_t *entry_size)
+{
+	struct process_queue_node *pqn;
+	uint32_t tmp_entry_size = *entry_size, tmp_qss_entries = *num_qss_entries;
+	int r = 0;
+
+	*num_qss_entries = 0;
+	if (!(*entry_size))
+		return -EINVAL;
+
+	*entry_size = min_t(size_t, *entry_size, sizeof(struct kfd_queue_snapshot_entry));
+	mutex_lock(&pqm->process->event_mutex);
+
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		if (!pqn->q)
+			continue;
+
+		if (*num_qss_entries < tmp_qss_entries) {
+			struct kfd_queue_snapshot_entry src;
+
+			memset(&src, 0, sizeof(src));
+
+			set_queue_snapshot_entry(pqn->q, exception_clear_mask, &src);
+
+			if (copy_to_user(buf, &src, *entry_size)) {
+				r = -EFAULT;
+				break;
+			}
+			buf += tmp_entry_size;
+		}
+		*num_qss_entries += 1;
+	}
+
+	mutex_unlock(&pqm->process->event_mutex);
+	return r;
+}
+
 static int get_queue_data_sizes(struct kfd_process_device *pdd,
 				struct queue *q,
 				uint32_t *mqd_size,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 31/32] drm/amdkfd: add debug device snapshot operation
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (29 preceding siblings ...)
  2023-01-25 19:53 ` [PATCH 30/32] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
@ 2023-01-25 19:54 ` Jonathan Kim
  2023-03-22 21:54   ` Felix Kuehling
  2023-01-25 19:54 ` [PATCH 32/32] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:54 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Similar to queue snapshot, return an array of device information using
an entry_size check and return.
Unlike queue snapshots, the debugger needs to pass the correct number of
devices that exist.  If it fails to do so, the KFD will return the
number of actual devices so that the debugger can make a subsequent
successful call.

v3: was reviewed but re-requesting review with new revision and
subvendor information.
memset 0 device info entry to clear padding.

v2: change buf_size arg to num_devices for more clarity.
expand device entry new members on copy.
fix minimum entry size calculation for queue and device snapshot.
change device snapshot implementation to match queue snapshot
implementation.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  7 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 72 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 93b288233577..da74a6ef4d9b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2972,8 +2972,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				&args->queue_snapshot.entry_size);
 		break;
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
-		pr_warn("Debug op %i not supported yet\n", args->op);
-		r = -EACCES;
+		r = kfd_dbg_trap_device_snapshot(target,
+				args->device_snapshot.exception_mask,
+				(void __user *)args->device_snapshot.snapshot_buf_ptr,
+				&args->device_snapshot.num_devices,
+				&args->device_snapshot.entry_size);
 		break;
 	default:
 		pr_err("Invalid option: %i\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index db316f0625f8..d1c4eb9652fd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -22,6 +22,7 @@
 
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_topology.h"
 #include <linux/file.h>
 #include <uapi/linux/kfd_ioctl.h>
 
@@ -998,6 +999,77 @@ int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
 	return r;
 }
 
+int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
+		uint64_t exception_clear_mask,
+		void __user *user_info,
+		uint32_t *number_of_device_infos,
+		uint32_t *entry_size)
+{
+	struct kfd_dbg_device_info_entry device_info;
+	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
+	int i, r = 0;
+
+	if (!(target && user_info && number_of_device_infos && entry_size))
+		return -EINVAL;
+
+	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
+	*number_of_device_infos = target->n_pdds;
+	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
+
+	if (!tmp_num_devices)
+		return 0;
+
+	memset(&device_info, 0, sizeof(device_info));
+
+	mutex_lock(&target->event_mutex);
+
+	/* Run over all pdd of the process */
+	for (i = 0; i < tmp_num_devices; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
+
+		device_info.gpu_id = pdd->dev->id;
+		device_info.exception_status = pdd->exception_status;
+		device_info.lds_base = pdd->lds_base;
+		device_info.lds_limit = pdd->lds_limit;
+		device_info.scratch_base = pdd->scratch_base;
+		device_info.scratch_limit = pdd->scratch_limit;
+		device_info.gpuvm_base = pdd->gpuvm_base;
+		device_info.gpuvm_limit = pdd->gpuvm_limit;
+		device_info.location_id = topo_dev->node_props.location_id;
+		device_info.vendor_id = topo_dev->node_props.vendor_id;
+		device_info.device_id = topo_dev->node_props.device_id;
+		device_info.revision_id = pdd->dev->adev->pdev->revision;
+		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
+		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
+		device_info.fw_version = pdd->dev->mec_fw_version;
+		device_info.gfx_target_version =
+			topo_dev->node_props.gfx_target_version;
+		device_info.simd_count = topo_dev->node_props.simd_count;
+		device_info.max_waves_per_simd =
+			topo_dev->node_props.max_waves_per_simd;
+		device_info.array_count = topo_dev->node_props.array_count;
+		device_info.simd_arrays_per_engine =
+			topo_dev->node_props.simd_arrays_per_engine;
+		device_info.capability = topo_dev->node_props.capability;
+		device_info.debug_prop = topo_dev->node_props.debug_prop;
+
+		if (exception_clear_mask)
+			pdd->exception_status &= ~exception_clear_mask;
+
+		if (copy_to_user(user_info, &device_info, *entry_size)) {
+			r = -EFAULT;
+			break;
+		}
+
+		user_info += tmp_entry_size;
+	}
+
+	mutex_unlock(&target->event_mutex);
+
+	return r;
+}
+
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index ee12de5f7adf..b31e453704fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -81,6 +81,11 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 }
 
 void debug_event_write_work_handler(struct work_struct *work);
+int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
+		uint64_t exception_clear_mask,
+		void __user *user_info,
+		uint32_t *number_of_device_infos,
+		uint32_t *entry_size);
 
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH 32/32] drm/amdkfd: bump kfd ioctl minor version for debug api availability
  2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
                   ` (30 preceding siblings ...)
  2023-01-25 19:54 ` [PATCH 31/32] drm/amdkfd: add debug device " Jonathan Kim
@ 2023-01-25 19:54 ` Jonathan Kim
  2023-03-22 21:56   ` Felix Kuehling
  31 siblings, 1 reply; 68+ messages in thread
From: Jonathan Kim @ 2023-01-25 19:54 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Felix.Kuehling, Jonathan.Kim

Bump the minor version to declare debugging capability is now
available.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 -
 include/uapi/linux/kfd_ioctl.h           | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index da74a6ef4d9b..c28d4b2dd0ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2896,7 +2896,6 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		if (!r)
 			target->exception_enable_mask = args->enable.exception_mask;
 
-		pr_warn("Debug functions limited\n");
 		break;
 	case KFD_IOC_DBG_TRAP_DISABLE:
 		r = kfd_dbg_trap_disable(target);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 9ef4eed45c19..a0efe1ccdbd6 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -37,9 +37,10 @@
  * - 1.9 - Add available memory ioctl
  * - 1.10 - Add SMI profiler event log
  * - 1.11 - Add unified memory for ctx save/restore area
+ * - 1.12 - Add debugger API
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 11
+#define KFD_IOCTL_MINOR_VERSION 12
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* Re: [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls
  2023-01-25 19:53 ` [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
@ 2023-01-29  5:12   ` kernel test robot
  2023-02-16 22:54   ` Felix Kuehling
  1 sibling, 0 replies; 68+ messages in thread
From: kernel test robot @ 2023-01-29  5:12 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel
  Cc: Felix.Kuehling, Jonathan.Kim, oe-kbuild-all

Hi Jonathan,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-tip/drm-tip]
[also build test WARNING on linus/master v6.2-rc5]
[cannot apply to drm-misc/drm-misc-next drm/drm-next drm-exynos/exynos-drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Jonathan-Kim/drm-amdkfd-add-debug-and-runtime-enable-interface/20230128-092952
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
patch link:    https://lore.kernel.org/r/20230125195401.4183544-7-jonathan.kim%40amd.com
patch subject: [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20230129/202301291257.PRqg0VpG-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/11bb8b2034cd92b687a2d5461298cc72d720d5c9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Jonathan-Kim/drm-amdkfd-add-debug-and-runtime-enable-interface/20230128-092952
        git checkout 11bb8b2034cd92b687a2d5461298cc72d720d5c9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 olddefconfig
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/gpu/drm/amd/amdgpu/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c:694: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * restore_dbg_reisters is ignored here but is a general interface requirement
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c:718: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * keep_trap_enabled is ignored here but is a general interface requirement


vim +694 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c

   692	
   693	/**
 > 694	 * restore_dbg_reisters is ignored here but is a general interface requirement
   695	 * for devices that support GFXOFF and where the RLC save/restore list
   696	 * does not support hw registers for debugging i.e. the driver has to manually
   697	 * initialize the debug mode registers after it has disabled GFX off during the
   698	 * debug session.
   699	 */
   700	uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
   701					bool restore_dbg_registers,
   702					uint32_t vmid)
   703	{
   704		mutex_lock(&adev->grbm_idx_mutex);
   705	
   706		kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
   707	
   708		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
   709	
   710		kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
   711	
   712		mutex_unlock(&adev->grbm_idx_mutex);
   713	
   714		return 0;
   715	}
   716	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 07/32] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
  2023-01-25 19:53 ` [PATCH 07/32] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
@ 2023-01-29  6:34   ` kernel test robot
  2023-02-16 23:01   ` Felix Kuehling
  1 sibling, 0 replies; 68+ messages in thread
From: kernel test robot @ 2023-01-29  6:34 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel
  Cc: Felix.Kuehling, Jonathan.Kim, oe-kbuild-all

Hi Jonathan,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-tip/drm-tip]
[also build test WARNING on linus/master v6.2-rc5]
[cannot apply to drm-misc/drm-misc-next drm/drm-next drm-exynos/exynos-drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Jonathan-Kim/drm-amdkfd-add-debug-and-runtime-enable-interface/20230128-092952
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
patch link:    https://lore.kernel.org/r/20230125195401.4183544-8-jonathan.kim%40amd.com
patch subject: [PATCH 07/32] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20230129/202301291457.nbbgmBEG-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/3983dc0f35ebf17452f97c096b866e38c98318db
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Jonathan-Kim/drm-amdkfd-add-debug-and-runtime-enable-interface/20230128-092952
        git checkout 3983dc0f35ebf17452f97c096b866e38c98318db
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 olddefconfig
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/gpu/drm/amd/amdgpu/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c: In function 'suspend_resume_compute_scheduler':
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c:312:53: warning: implicit conversion from 'enum amd_hw_ip_block_type' to 'enum amd_ip_block_type' [-Wenum-conversion]
     312 |         return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
         |                                                     ^~~~~~~
   In file included from drivers/gpu/drm/amd/amdgpu/../display/dc/dc_types.h:36,
                    from drivers/gpu/drm/amd/amdgpu/../display/dc/dm_services_types.h:30,
                    from drivers/gpu/drm/amd/amdgpu/../include/dm_pp_interface.h:26,
                    from drivers/gpu/drm/amd/amdgpu/amdgpu.h:64,
                    from drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c:26:
   At top level:
   drivers/gpu/drm/amd/amdgpu/../display/dc/dc_hdmi_types.h:53:22: warning: 'dp_hdmi_dongle_signature_str' defined but not used [-Wunused-const-variable=]
      53 | static const uint8_t dp_hdmi_dongle_signature_str[] = "DP-HDMI ADAPTOR";
         |                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c:343: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * restore_dbg_reisters is ignored here but is a general interface requirement
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c:369: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * keep_trap_enabled is ignored here but is a general interface requirement


vim +312 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c

   281	
   282	/*
   283	 * Helper used to suspend/resume gfx pipe for image post process work to set
   284	 * barrier behaviour.
   285	 */
   286	static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool suspend)
   287	{
   288		int i, r = 0;
   289	
   290		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
   291			struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
   292	
   293			if (!(ring && ring->sched.thread))
   294				continue;
   295	
   296			/* stop secheduler and drain ring. */
   297			if (suspend) {
   298				drm_sched_stop(&ring->sched, NULL);
   299				r = amdgpu_fence_wait_empty(ring);
   300				if (r)
   301					goto out;
   302			} else {
   303				drm_sched_start(&ring->sched, false);
   304			}
   305		}
   306	
   307	out:
   308		/* return on resume or failure to drain rings. */
   309		if (!suspend || r)
   310			return r;
   311	
 > 312		return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
   313	}
   314	
   315	static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool enable_waitcnt)
   316	{
   317		uint32_t data;
   318	
   319		WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
   320	
   321		if (!down_read_trylock(&adev->reset_domain->sem))
   322			return;
   323	
   324		amdgpu_amdkfd_suspend(adev, false);
   325	
   326		if (suspend_resume_compute_scheduler(adev, true))
   327			goto out;
   328	
   329		data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
   330		data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
   331							enable_waitcnt ? 0 : 1);
   332		WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
   333	
   334	out:
   335		suspend_resume_compute_scheduler(adev, false);
   336	
   337		amdgpu_amdkfd_resume(adev, false);
   338	
   339		up_read(&adev->reset_domain->sem);
   340	}
   341	
   342	/**
 > 343	 * restore_dbg_reisters is ignored here but is a general interface requirement
   344	 * for devices that support GFXOFF and where the RLC save/restore list
   345	 * does not support hw registers for debugging i.e. the driver has to manually
   346	 * initialize the debug mode registers after it has disabled GFX off during the
   347	 * debug session.
   348	 */
   349	static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device *adev,
   350					bool restore_dbg_registers,
   351					uint32_t vmid)
   352	{
   353		mutex_lock(&adev->grbm_idx_mutex);
   354	
   355		kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
   356	
   357		set_barrier_auto_waitcnt(adev, true);
   358	
   359		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
   360	
   361		kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
   362	
   363		mutex_unlock(&adev->grbm_idx_mutex);
   364	
   365		return 0;
   366	}
   367	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 08/32] drm/amdgpu: add gfx10 hw debug mode enable and disable calls
  2023-01-25 19:53 ` [PATCH 08/32] drm/amdgpu: add gfx10 " Jonathan Kim
@ 2023-01-29  7:55   ` kernel test robot
  2023-02-16 23:11   ` Felix Kuehling
  1 sibling, 0 replies; 68+ messages in thread
From: kernel test robot @ 2023-01-29  7:55 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel
  Cc: Felix.Kuehling, Jonathan.Kim, oe-kbuild-all

Hi Jonathan,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-tip/drm-tip]
[also build test WARNING on linus/master v6.2-rc5]
[cannot apply to drm-misc/drm-misc-next drm/drm-next drm-exynos/exynos-drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Jonathan-Kim/drm-amdkfd-add-debug-and-runtime-enable-interface/20230128-092952
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
patch link:    https://lore.kernel.org/r/20230125195401.4183544-9-jonathan.kim%40amd.com
patch subject: [PATCH 08/32] drm/amdgpu: add gfx10 hw debug mode enable and disable calls
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20230129/202301291502.el73Nchv-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/ca722ec7f4d749b61a30b4654fabf05f03d8d2cf
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Jonathan-Kim/drm-amdkfd-add-debug-and-runtime-enable-interface/20230128-092952
        git checkout ca722ec7f4d749b61a30b4654fabf05f03d8d2cf
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 olddefconfig
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/gpu/drm/amd/amdgpu/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c:750:10: warning: no previous prototype for 'kgd_gfx_v10_enable_debug_trap' [-Wmissing-prototypes]
     750 | uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c:787:10: warning: no previous prototype for 'kgd_gfx_v10_disable_debug_trap' [-Wmissing-prototypes]
     787 | uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
         |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   In file included from drivers/gpu/drm/amd/amdgpu/../display/dc/dc_types.h:36,
                    from drivers/gpu/drm/amd/amdgpu/../display/dc/dm_services_types.h:30,
                    from drivers/gpu/drm/amd/amdgpu/../include/dm_pp_interface.h:26,
                    from drivers/gpu/drm/amd/amdgpu/amdgpu.h:64,
                    from drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c:22:
   drivers/gpu/drm/amd/amdgpu/../display/dc/dc_hdmi_types.h:53:22: warning: 'dp_hdmi_dongle_signature_str' defined but not used [-Wunused-const-variable=]
      53 | static const uint8_t dp_hdmi_dongle_signature_str[] = "DP-HDMI ADAPTOR";
         |                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +/kgd_gfx_v10_enable_debug_trap +750 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c

   749	
 > 750	uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
   751					bool restore_dbg_registers,
   752					uint32_t vmid)
   753	{
   754	
   755		mutex_lock(&adev->grbm_idx_mutex);
   756	
   757		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
   758	
   759		/* assume gfx off is disabled for the debug session if rlc restore not supported. */
   760		if (restore_dbg_registers) {
   761			uint32_t data = 0;
   762	
   763			data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
   764					VMID_SEL, 1 << vmid);
   765			data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
   766					TRAP_EN, 1);
   767			WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
   768			WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
   769			WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
   770	
   771			kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
   772	
   773			mutex_unlock(&adev->grbm_idx_mutex);
   774	
   775			return 0;
   776		}
   777	
   778		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
   779	
   780		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
   781	
   782		mutex_unlock(&adev->grbm_idx_mutex);
   783	
   784		return 0;
   785	}
   786	
 > 787	uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
   788						bool keep_trap_enabled,
   789						uint32_t vmid)
   790	{
   791		mutex_lock(&adev->grbm_idx_mutex);
   792	
   793		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
   794	
   795		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
   796	
   797		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
   798	
   799		mutex_unlock(&adev->grbm_idx_mutex);
   800	
   801		return 0;
   802	}
   803	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 01/32] drm/amdkfd: add debug and runtime enable interface
  2023-01-25 19:53 ` [PATCH 01/32] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
@ 2023-02-16 22:16   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-02-16 22:16 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Introduce the GPU debug operations interface.
>
> For ROCm-GDB to extend the GNU Debugger's ability to inspect the AMD GPU
> instruction set, provide the necessary interface to allow the debugger
> to HW debug-mode set and query exceptions per HSA queue, process or
> device.
>
> The runtime_enable interface coordinates exception handling with the
> HSA runtime.
>
> Usage is available in the kern docs at uapi/linux/kfd_ioctl.h.
>
> v2: was previously reviewed but removed deprecated wave launch modes
> (kill and disable).
> Also remove non-needed dbg flag option.
> Add revision and subvendor info to debug device snapshot entry.
> Add trap on wave start and end override option.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  48 ++
>   include/uapi/linux/kfd_ioctl.h           | 663 ++++++++++++++++++++++-
>   2 files changed, 710 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f79b8e964140..d3b019e64093 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2645,6 +2645,48 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
>   	return ret;
>   }
>   
> +static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
> +{
> +	return 0;
> +}
> +
> +static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
> +{
> +	struct kfd_ioctl_dbg_trap_args *args = data;
> +	int r = 0;
> +
> +	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> +		pr_err("Debugging does not support sched_policy %i", sched_policy);
> +		return -EINVAL;
> +	}
> +
> +	switch (args->op) {
> +	case KFD_IOC_DBG_TRAP_ENABLE:
> +	case KFD_IOC_DBG_TRAP_DISABLE:
> +	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
> +	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
> +	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> +	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
> +	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
> +	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
> +	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
> +	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
> +	case KFD_IOC_DBG_TRAP_SET_FLAGS:
> +	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
> +	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> +	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> +	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> +		pr_warn("Debugging not supported yet\n");
> +		r = -EACCES;
> +		break;
> +	default:
> +		pr_err("Invalid option: %i\n", args->op);
> +		r = -EINVAL;
> +	}
> +
> +	return r;
> +}
> +
>   #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
>   	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
>   			    .cmd_drv = 0, .name = #ioctl}
> @@ -2754,6 +2796,12 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>   
>   	AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
>   			kfd_ioctl_get_available_memory, 0),
> +
> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_RUNTIME_ENABLE,
> +			kfd_ioctl_runtime_enable, 0),
> +
> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
> +			kfd_ioctl_set_debug_trap, 0),
>   };
>   
>   #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 42b60198b6c5..9ef4eed45c19 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -109,6 +109,32 @@ struct kfd_ioctl_get_available_memory_args {
>   	__u32 pad;
>   };
>   
> +struct kfd_dbg_device_info_entry {
> +	__u64 exception_status;
> +	__u64 lds_base;
> +	__u64 lds_limit;
> +	__u64 scratch_base;
> +	__u64 scratch_limit;
> +	__u64 gpuvm_base;
> +	__u64 gpuvm_limit;
> +	__u32 gpu_id;
> +	__u32 location_id;
> +	__u32 vendor_id;
> +	__u32 device_id;
> +	__u32 revision_id;
> +	__u32 subsystem_vendor_id;
> +	__u32 subsystem_device_id;
> +	__u32 fw_version;
> +	__u32 gfx_target_version;
> +	__u32 simd_count;
> +	__u32 max_waves_per_simd;
> +	__u32 array_count;
> +	__u32 simd_arrays_per_engine;
> +	__u32 capability;
> +	__u32 debug_prop;
> +	__u32 pad;
> +};
> +
>   /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
>   #define KFD_IOC_CACHE_POLICY_COHERENT 0
>   #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
> @@ -766,6 +792,635 @@ struct kfd_ioctl_set_xnack_mode_args {
>   	__s32 xnack_enabled;
>   };
>   
> +/* Wave launch override modes */
> +enum kfd_dbg_trap_override_mode {
> +	KFD_DBG_TRAP_OVERRIDE_OR = 0,
> +	KFD_DBG_TRAP_OVERRIDE_REPLACE = 1
> +};
> +
> +/* Wave launch overrides */
> +enum kfd_dbg_trap_mask {
> +	KFD_DBG_TRAP_MASK_FP_INVALID = 1,
> +	KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL = 2,
> +	KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO = 4,
> +	KFD_DBG_TRAP_MASK_FP_OVERFLOW = 8,
> +	KFD_DBG_TRAP_MASK_FP_UNDERFLOW = 16,
> +	KFD_DBG_TRAP_MASK_FP_INEXACT = 32,
> +	KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO = 64,
> +	KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH = 128,
> +	KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION = 256,
> +	KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START = (1 << 30),
> +	KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END = (1 << 31)
> +};
> +
> +/* Wave launch modes */
> +enum kfd_dbg_trap_wave_launch_mode {
> +	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL = 0,
> +	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT = 1,
> +	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG = 3
> +};
> +
> +/* Address watch modes */
> +enum kfd_dbg_trap_address_watch_mode {
> +	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_READ = 0,
> +	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_NONREAD = 1,
> +	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ATOMIC = 2,
> +	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ALL = 3
> +};
> +
> +/* Additional wave settings */
> +enum kfd_dbg_trap_flags {
> +	KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1,
> +};
> +
> +/* Trap exceptions */
> +enum kfd_dbg_trap_exception_code {
> +	EC_NONE = 0,
> +	/* per queue */
> +	EC_QUEUE_WAVE_ABORT = 1,
> +	EC_QUEUE_WAVE_TRAP = 2,
> +	EC_QUEUE_WAVE_MATH_ERROR = 3,
> +	EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION = 4,
> +	EC_QUEUE_WAVE_MEMORY_VIOLATION = 5,
> +	EC_QUEUE_WAVE_APERTURE_VIOLATION = 6,
> +	EC_QUEUE_PACKET_DISPATCH_DIM_INVALID = 16,
> +	EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID = 17,
> +	EC_QUEUE_PACKET_DISPATCH_CODE_INVALID = 18,
> +	EC_QUEUE_PACKET_RESERVED = 19,
> +	EC_QUEUE_PACKET_UNSUPPORTED = 20,
> +	EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
> +	EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
> +	EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
> +	EC_QUEUE_PREEMPTION_ERROR = 30,
> +	EC_QUEUE_NEW = 31,
> +	/* per device */
> +	EC_DEVICE_QUEUE_DELETE = 32,
> +	EC_DEVICE_MEMORY_VIOLATION = 33,
> +	EC_DEVICE_RAS_ERROR = 34,
> +	EC_DEVICE_FATAL_HALT = 35,
> +	EC_DEVICE_NEW = 36,
> +	/* per process */
> +	EC_PROCESS_RUNTIME = 48,
> +	EC_PROCESS_DEVICE_REMOVE = 49,
> +	EC_MAX
> +};
> +
> +/* Mask generated by ecode in kfd_dbg_trap_exception_code */
> +#define KFD_EC_MASK(ecode)	(1ULL << (ecode - 1))
> +
> +/* Masks for exception code type checks below */
> +#define KFD_EC_MASK_QUEUE	(KFD_EC_MASK(EC_QUEUE_WAVE_ABORT) |	\
> +				 KFD_EC_MASK(EC_QUEUE_WAVE_TRAP) |	\
> +				 KFD_EC_MASK(EC_QUEUE_WAVE_MATH_ERROR) |	\
> +				 KFD_EC_MASK(EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION) |	\
> +				 KFD_EC_MASK(EC_QUEUE_WAVE_MEMORY_VIOLATION) |	\
> +				 KFD_EC_MASK(EC_QUEUE_WAVE_APERTURE_VIOLATION) |	\
> +				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) |	\
> +				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) |	\
> +				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) |	\
> +				 KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) |	\
> +				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |	\
> +				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |	\
> +				 KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED)	|	\
> +				 KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR)	|	\
> +				 KFD_EC_MASK(EC_QUEUE_NEW))
> +#define KFD_EC_MASK_DEVICE	(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |		\
> +				 KFD_EC_MASK(EC_DEVICE_RAS_ERROR) |		\
> +				 KFD_EC_MASK(EC_DEVICE_FATAL_HALT) |		\
> +				 KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION) |	\
> +				 KFD_EC_MASK(EC_DEVICE_NEW))
> +#define KFD_EC_MASK_PROCESS	(KFD_EC_MASK(EC_PROCESS_RUNTIME) |	\
> +				 KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE))
> +
> +/* Checks for exception code types for KFD search */
> +#define KFD_DBG_EC_TYPE_IS_QUEUE(ecode)					\
> +			(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_QUEUE))
> +#define KFD_DBG_EC_TYPE_IS_DEVICE(ecode)				\
> +			(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_DEVICE))
> +#define KFD_DBG_EC_TYPE_IS_PROCESS(ecode)				\
> +			(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_PROCESS))
> +
> +
> +/* Runtime enable states */
> +enum kfd_dbg_runtime_state {
> +	DEBUG_RUNTIME_STATE_DISABLED = 0,
> +	DEBUG_RUNTIME_STATE_ENABLED = 1,
> +	DEBUG_RUNTIME_STATE_ENABLED_BUSY = 2,
> +	DEBUG_RUNTIME_STATE_ENABLED_ERROR = 3
> +};
> +
> +/* Runtime enable status */
> +struct kfd_runtime_info {
> +	__u64 r_debug;
> +	__u32 runtime_state;
> +	__u32 ttmp_setup;
> +};
> +
> +/* Enable modes for runtime enable */
> +#define KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK	1
> +#define KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK	2
> +
> +/**
> + * kfd_ioctl_runtime_enable_args - Arguments for runtime enable
> + *
> + * Coordinates debug exception signalling and debug device enablement with runtime.
> + *
> + * @r_debug - pointer to user struct for sharing information between ROCr and the debugger
> + * @mode_mask - mask to set mode
> + *	KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK - enable runtime for debugging, otherwise disable
> + *	KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK - enable trap temporary setup (ignore on disable)
> + *
> + * Return - 0 on SUCCESS.
> + *	  - EBUSY if runtime enable call already pending.
> + *	  - EEXIST if user queues already active prior to call.
> + *	    If process is debug enabled, runtime enable will enable debug devices and
> + *	    wait for debugger process to send runtime exception EC_PROCESS_RUNTIME
> + *	    to unblock - see kfd_ioctl_dbg_trap_args.
> + *
> + */
> +struct kfd_ioctl_runtime_enable_args {
> +	__u64 r_debug;
> +	__u32 mode_mask;

As discussed offline, this will get a capabilities_mask field to be 
returned to ROCr to help with core dumps. Other than that, this patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> +};
> +
> +/* Queue information */
> +struct kfd_queue_snapshot_entry {
> +	__u64 exception_status;
> +	__u64 ring_base_address;
> +	__u64 write_pointer_address;
> +	__u64 read_pointer_address;
> +	__u64 ctx_save_restore_address;
> +	__u32 queue_id;
> +	__u32 gpu_id;
> +	__u32 ring_size;
> +	__u32 queue_type;
> +	__u32 ctx_save_restore_area_size;
> +	__u32 reserved;
> +};
> +
> +/* Queue status return for suspend/resume */
> +#define KFD_DBG_QUEUE_ERROR_BIT		30
> +#define KFD_DBG_QUEUE_INVALID_BIT	31
> +#define KFD_DBG_QUEUE_ERROR_MASK	(1 << KFD_DBG_QUEUE_ERROR_BIT)
> +#define KFD_DBG_QUEUE_INVALID_MASK	(1 << KFD_DBG_QUEUE_INVALID_BIT)
> +
> +/* Context save area header information */
> +struct kfd_context_save_area_header {
> +	__u32 control_stack_offset;
> +	__u32 control_stack_size;
> +	__u32 wave_state_offset;
> +	__u32 wave_state_size;
> +	__u32 debug_offset;
> +	__u32 debug_size;
> +	__u64 err_payload_addr;
> +	__u32 err_event_id;
> +	__u32 reserved1;
> +};
> +
> +/*
> + * Debug operations
> + *
> + * For specifics on usage and return values, see documentation per operation
> + * below.  Otherwise, generic error returns apply:
> + *	- ESRCH if the process to debug does not exist.
> + *
> + *	- EINVAL (with KFD_IOC_DBG_TRAP_ENABLE exempt) if operation
> + *		 KFD_IOC_DBG_TRAP_ENABLE has not succeeded prior.
> + *		 Also returns this error if GPU hardware scheduling is not supported.
> + *
> + *	- EPERM (with KFD_IOC_DBG_TRAP_DISABLE exempt) if target process is not
> + *		 PTRACE_ATTACHED.  KFD_IOC_DBG_TRAP_DISABLE is exempt to allow
> + *		 clean up of debug mode as long as process is debug enabled.
> + *
> + *	- EACCES if any DBG_HW_OP (debug hardware operation) is requested when
> + *		 AMDKFD_IOC_RUNTIME_ENABLE has not succeeded prior.
> + *
> + *	- ENODEV if any GPU does not support debugging on a DBG_HW_OP call.
> + *
> + *	- Other errors may be returned when a DBG_HW_OP occurs while the GPU
> + *	  is in a fatal state.
> + *
> + */
> +enum kfd_dbg_trap_operations {
> +	KFD_IOC_DBG_TRAP_ENABLE = 0,
> +	KFD_IOC_DBG_TRAP_DISABLE = 1,
> +	KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT = 2,
> +	KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED = 3,
> +	KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE = 4,  /* DBG_HW_OP */
> +	KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE = 5,      /* DBG_HW_OP */
> +	KFD_IOC_DBG_TRAP_SUSPEND_QUEUES = 6,		/* DBG_HW_OP */
> +	KFD_IOC_DBG_TRAP_RESUME_QUEUES = 7,		/* DBG_HW_OP */
> +	KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH = 8,	/* DBG_HW_OP */
> +	KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH = 9,	/* DBG_HW_OP */
> +	KFD_IOC_DBG_TRAP_SET_FLAGS = 10,
> +	KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT = 11,
> +	KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO = 12,
> +	KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT = 13,
> +	KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT = 14
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_enable_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_ENABLE.
> + *
> + *     Enables debug session for target process. Call @op KFD_IOC_DBG_TRAP_DISABLE in
> + *     kfd_ioctl_dbg_trap_args to disable debug session.
> + *
> + *     @exception_mask (IN)	- exceptions to raise to the debugger
> + *     @rinfo_ptr      (IN)	- pointer to runtime info buffer (see kfd_runtime_info)
> + *     @rinfo_size     (IN/OUT)	- size of runtime info buffer in bytes
> + *     @dbg_fd	       (IN)	- fd the KFD will notify the debugger with of raised
> + *				  exceptions set in exception_mask.
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + *		Copies KFD saved kfd_runtime_info to @rinfo_ptr on enable.
> + *		Size of kfd_runtime saved by the KFD returned to @rinfo_size.
> + *            - EBADF if KFD cannot get a reference to dbg_fd.
> + *            - EFAULT if KFD cannot copy runtime info to rinfo_ptr.
> + *            - EINVAL if target process is already debug enabled.
> + *
> + */
> +struct kfd_ioctl_dbg_trap_enable_args {
> +	__u64 exception_mask;
> +	__u64 rinfo_ptr;
> +	__u32 rinfo_size;
> +	__u32 dbg_fd;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_send_runtime_event_args
> + *
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT.
> + *     Raises exceptions to runtime.
> + *
> + *     @exception_mask (IN) - exceptions to raise to runtime
> + *     @gpu_id	       (IN) - target device id
> + *     @queue_id       (IN) - target queue id
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + *	      - ENODEV if gpu_id not found.
> + *		If exception_mask contains EC_PROCESS_RUNTIME, unblocks pending
> + *		AMDKFD_IOC_RUNTIME_ENABLE call - see kfd_ioctl_runtime_enable_args.
> + *		All other exceptions are raised to runtime through err_payload_addr.
> + *		See kfd_context_save_area_header.
> + */
> +struct kfd_ioctl_dbg_trap_send_runtime_event_args {
> +	__u64 exception_mask;
> +	__u32 gpu_id;
> +	__u32 queue_id;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_exceptions_enabled_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED
> + *     Set new exceptions to be raised to the debugger.
> + *
> + *     @exception_mask (IN) - new exceptions to raise to the debugger
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + */
> +struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args {
> +	__u64 exception_mask;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_wave_launch_override_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE
> + *     Enable HW exceptions to raise trap.
> + *
> + *     @override_mode	     (IN)     - see kfd_dbg_trap_override_mode
> + *     @enable_mask	     (IN/OUT) - reference kfd_dbg_trap_mask.
> + *					IN is the override modes requested to be enabled.
> + *					OUT is referenced in Return below.
> + *     @support_request_mask (IN/OUT) - reference kfd_dbg_trap_mask.
> + *					IN is the override modes requested for support check.
> + *					OUT is referenced in Return below.
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + *		Previous enablement is returned in @enable_mask.
> + *		Actual override support is returned in @support_request_mask.
> + *	      - EINVAL if override mode is not supported.
> + *	      - EACCES if trap support requested is not actually supported.
> + *		i.e. enable_mask (IN) is not a subset of support_request_mask (OUT).
> + *		Otherwise it is considered a generic error (see kfd_dbg_trap_operations).
> + */
> +struct kfd_ioctl_dbg_trap_set_wave_launch_override_args {
> +	__u32 override_mode;
> +	__u32 enable_mask;
> +	__u32 support_request_mask;
> +	__u32 pad;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_wave_launch_mode_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE
> + *     Set wave launch mode.
> + *
> + *     @launch_mode (IN) - see kfd_dbg_trap_wave_launch_mode
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + */
> +struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args {
> +	__u32 launch_mode;
> +	__u32 pad;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_suspend_queues_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_SUSPEND_QUEUES
> + *     Suspend queues.
> + *
> + *     @exception_mask	(IN) - raised exceptions to clear
> + *     @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
> + *			       to suspend
> + *     @num_queues	(IN) - number of queues to suspend in @queue_array_ptr
> + *     @grace_period	(IN) - wave time allowance before preemption
> + *			       per 1K GPU clock cycle unit
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Destruction of a suspended queue is blocked until the queue is
> + *     resumed.  This allows the debugger to access queue information and
> + *     its context save area without running into a race condition on
> + *     queue destruction.
> + *     Automatically copies per queue context save area header information
> + *     into the save area base
> + *     (see kfd_queue_snapshot_entry and kfd_context_save_area_header).
> + *
> + *     Return - Number of queues suspended on SUCCESS.
> + *		KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK masked
> + *		for each queue id in @queue_array_ptr array reports unsuccessful
> + *		suspend reason.
> + *		KFD_DBG_QUEUE_ERROR_MASK = HW failure.
> + *		KFD_DBG_QUEUE_INVALID_MASK = queue does not exist, is new or
> + *		is being destroyed.
> + */
> +struct kfd_ioctl_dbg_trap_suspend_queues_args {
> +	__u64 exception_mask;
> +	__u64 queue_array_ptr;
> +	__u32 num_queues;
> +	__u32 grace_period;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_resume_queues_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_RESUME_QUEUES
> + *     Resume queues.
> + *
> + *     @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
> + *			       to resume
> + *     @num_queues	(IN) - number of queues to resume in @queue_array_ptr
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - Number of queues resumed on SUCCESS.
> + *		KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK mask
> + *		for each queue id in @queue_array_ptr array reports unsuccessful
> + *		resume reason.
> + *		KFD_DBG_QUEUE_ERROR_MASK = HW failure.
> + *		KFD_DBG_QUEUE_INVALID_MASK = queue does not exist.
> + */
> +struct kfd_ioctl_dbg_trap_resume_queues_args {
> +	__u64 queue_array_ptr;
> +	__u32 num_queues;
> +	__u32 pad;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_node_address_watch_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH
> + *     Sets address watch for device.
> + *
> + *     @address	(IN)  - watch address to set
> + *     @mode    (IN)  - see kfd_dbg_trap_address_watch_mode
> + *     @mask    (IN)  - watch address mask
> + *     @gpu_id  (IN)  - target gpu to set watch point
> + *     @id      (OUT) - watch id allocated
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + *		Allocated watch ID returned to @id.
> + *	      - ENODEV if gpu_id not found.
> + *	      - ENOMEM if watch IDs cannot be allocated
> + */
> +struct kfd_ioctl_dbg_trap_set_node_address_watch_args {
> +	__u64 address;
> +	__u32 mode;
> +	__u32 mask;
> +	__u32 gpu_id;
> +	__u32 id;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_clear_node_address_watch_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH
> + *     Clear address watch for device.
> + *
> + *     @gpu_id  (IN)  - target device to clear watch point
> + *     @id      (IN) - allocated watch id to clear
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + *	      - ENODEV if gpu_id not found.
> + *	      - EINVAL if watch ID has not been allocated.
> + */
> +struct kfd_ioctl_dbg_trap_clear_node_address_watch_args {
> +	__u32 gpu_id;
> +	__u32 id;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_flags_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_SET_FLAGS
> + *     Sets flags for wave behaviour.
> + *
> + *     @flags (IN/OUT) - IN = flags to enable, OUT = flags previously enabled
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + *	      - EACCES if any debug device does not allow flag options.
> + */
> +struct kfd_ioctl_dbg_trap_set_flags_args {
> +	__u32 flags;
> +	__u32 pad;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_query_debug_event_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT
> + *
> + *     Find one or more raised exceptions. This function can return multiple
> + *     exceptions from a single queue or a single device with one call. To find
> + *     all raised exceptions, this function must be called repeatedly until it
> + *     returns -EAGAIN. Returned exceptions can optionally be cleared by
> + *     setting the corresponding bit in the @exception_mask input parameter.
> + *     However, clearing an exception prevents retrieving further information
> + *     about it with KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO.
> + *
> + *     @exception_mask (IN/OUT) - exception to clear (IN) and raised (OUT)
> + *     @gpu_id	       (OUT)    - gpu id of exceptions raised
> + *     @queue_id       (OUT)    - queue id of exceptions raised
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on raised exception found
> + *              Raised exceptions found are returned in @exception_mask
> + *              with reported source id returned in @gpu_id or @queue_id.
> + *            - EAGAIN if no raised exception has been found
> + */
> +struct kfd_ioctl_dbg_trap_query_debug_event_args {
> +	__u64 exception_mask;
> +	__u32 gpu_id;
> +	__u32 queue_id;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_query_exception_info_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO
> + *     Get additional info on raised exception.
> + *
> + *     @info_ptr	(IN)	 - pointer to exception info buffer to copy to
> + *     @info_size	(IN/OUT) - exception info buffer size (bytes)
> + *     @source_id	(IN)     - target gpu or queue id
> + *     @exception_code	(IN)     - target exception
> + *     @clear_exception	(IN)     - clear raised @exception_code exception
> + *				   (0 = false, 1 = true)
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + *              If @exception_code is EC_DEVICE_MEMORY_VIOLATION, copy @info_size(OUT)
> + *		bytes of memory exception data to @info_ptr.
> + *              If @exception_code is EC_PROCESS_RUNTIME, copy saved
> + *              kfd_runtime_info to @info_ptr.
> + *              Actual required @info_ptr size (bytes) is returned in @info_size.
> + */
> +struct kfd_ioctl_dbg_trap_query_exception_info_args {
> +	__u64 info_ptr;
> +	__u32 info_size;
> +	__u32 source_id;
> +	__u32 exception_code;
> +	__u32 clear_exception;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_queue_snapshot_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT
> + *     Get queue information.
> + *
> + *     @exception_mask	 (IN)	  - exceptions raised to clear
> + *     @snapshot_buf_ptr (IN)	  - queue snapshot entry buffer (see kfd_queue_snapshot_entry)
> + *     @num_queues	 (IN/OUT) - number of queue snapshot entries
> + *         The debugger specifies the size of the array allocated in @num_queues.
> + *         KFD returns the number of queues that actually existed. If this is
> + *         larger than the size specified by the debugger, KFD will not overflow
> + *         the array allocated by the debugger.
> + *
> + *     @entry_size	 (IN/OUT) - size per entry in bytes
> + *         The debugger specifies sizeof(struct kfd_queue_snapshot_entry) in
> + *         @entry_size. KFD returns the number of bytes actually populated per
> + *         entry. The debugger should use the KFD_IOCTL_MINOR_VERSION to determine,
> + *         which fields in struct kfd_queue_snapshot_entry are valid. This allows
> + *         growing the ABI in a backwards compatible manner.
> + *         Note that entry_size(IN) should still be used to stride the snapshot buffer in the
> + *         event that it's larger than actual kfd_queue_snapshot_entry.
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + *              Copies @num_queues(IN) queue snapshot entries of size @entry_size(IN)
> + *              into @snapshot_buf_ptr if @num_queues(IN) > 0.
> + *              Otherwise return @num_queues(OUT) queue snapshot entries that exist.
> + */
> +struct kfd_ioctl_dbg_trap_queue_snapshot_args {
> +	__u64 exception_mask;
> +	__u64 snapshot_buf_ptr;
> +	__u32 num_queues;
> +	__u32 entry_size;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_device_snapshot_args
> + *
> + *     Arguments for KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT
> + *     Get device information.
> + *
> + *     @exception_mask	 (IN)	  - exceptions raised to clear
> + *     @snapshot_buf_ptr (IN)	  - pointer to snapshot buffer (see kfd_dbg_device_info_entry)
> + *     @num_devices	 (IN/OUT) - number of debug devices to snapshot
> + *         The debugger specifies the size of the array allocated in @num_devices.
> + *         KFD returns the number of devices that actually existed. If this is
> + *         larger than the size specified by the debugger, KFD will not overflow
> + *         the array allocated by the debugger.
> + *
> + *     @entry_size	 (IN/OUT) - size per entry in bytes
> + *         The debugger specifies sizeof(struct kfd_dbg_device_info_entry) in
> + *         @entry_size. KFD returns the number of bytes actually populated. The
> + *         debugger should use KFD_IOCTL_MINOR_VERSION to determine, which fields
> + *         in struct kfd_dbg_device_info_entry are valid. This allows growing the
> + *         ABI in a backwards compatible manner.
> + *         Note that entry_size(IN) should still be used to stride the snapshot buffer in the
> + *         event that it's larger than actual kfd_dbg_device_info_entry.
> + *
> + *     Generic errors apply (see kfd_dbg_trap_operations).
> + *     Return - 0 on SUCCESS.
> + *              Copies @num_devices(IN) device snapshot entries of size @entry_size(IN)
> + *              into @snapshot_buf_ptr if @num_devices(IN) > 0.
> + *              Otherwise return @num_devices(OUT) device snapshot entries that exist.
> + */
> +struct kfd_ioctl_dbg_trap_device_snapshot_args {
> +	__u64 exception_mask;
> +	__u64 snapshot_buf_ptr;
> +	__u32 num_devices;
> +	__u32 entry_size;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_args
> + *
> + * Arguments to debug target process.
> + *
> + *     @pid - target process to debug
> + *     @op  - debug operation (see kfd_dbg_trap_operations)
> + *
> + *     @op determines which union struct args to use.
> + *     Refer to kern docs for each kfd_ioctl_dbg_trap_*_args struct.
> + */
> +struct kfd_ioctl_dbg_trap_args {
> +	__u32 pid;
> +	__u32 op;
> +
> +	union {
> +		struct kfd_ioctl_dbg_trap_enable_args enable;
> +		struct kfd_ioctl_dbg_trap_send_runtime_event_args send_runtime_event;
> +		struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args set_exceptions_enabled;
> +		struct kfd_ioctl_dbg_trap_set_wave_launch_override_args launch_override;
> +		struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args launch_mode;
> +		struct kfd_ioctl_dbg_trap_suspend_queues_args suspend_queues;
> +		struct kfd_ioctl_dbg_trap_resume_queues_args resume_queues;
> +		struct kfd_ioctl_dbg_trap_set_node_address_watch_args set_node_address_watch;
> +		struct kfd_ioctl_dbg_trap_clear_node_address_watch_args clear_node_address_watch;
> +		struct kfd_ioctl_dbg_trap_set_flags_args set_flags;
> +		struct kfd_ioctl_dbg_trap_query_debug_event_args query_debug_event;
> +		struct kfd_ioctl_dbg_trap_query_exception_info_args query_exception_info;
> +		struct kfd_ioctl_dbg_trap_queue_snapshot_args queue_snapshot;
> +		struct kfd_ioctl_dbg_trap_device_snapshot_args device_snapshot;
> +	};
> +};
> +
>   #define AMDKFD_IOCTL_BASE 'K'
>   #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
>   #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
> @@ -877,7 +1532,13 @@ struct kfd_ioctl_set_xnack_mode_args {
>   #define AMDKFD_IOC_AVAILABLE_MEMORY		\
>   		AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args)
>   
> +#define AMDKFD_IOC_RUNTIME_ENABLE		\
> +		AMDKFD_IOWR(0x24, struct kfd_ioctl_runtime_enable_args)
> +
> +#define AMDKFD_IOC_DBG_TRAP			\
> +		AMDKFD_IOWR(0x25, struct kfd_ioctl_dbg_trap_args)
> +
>   #define AMDKFD_COMMAND_START		0x01
> -#define AMDKFD_COMMAND_END		0x24
> +#define AMDKFD_COMMAND_END		0x26
>   
>   #endif

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 02/32] drm/amdkfd: display debug capabilities
  2023-01-25 19:53 ` [PATCH 02/32] drm/amdkfd: display debug capabilities Jonathan Kim
@ 2023-02-16 22:24   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-02-16 22:24 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Expose debug capabilities in the KFD topology node's HSA capabilities and
> debug properties flags.
>
> Ensure correct capabilities are exposed based on firmware support.
>
> Flag definitions can be referenced in uapi/linux/kfd_sysfs.h.
>
> v2: v1 was reviewed but re-requesting review for the following.
> - remove asic family code name comments in firmware support checking
> - add gfx11 requirements in fw support checks and debug props and caps
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 101 ++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   6 ++
>   include/uapi/linux/kfd_sysfs.h            |  15 ++++
>   3 files changed, 117 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 3fdaba56be6f..647a14142da9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -551,6 +551,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
>   				      dev->gpu->mec_fw_version);
>   		sysfs_show_32bit_prop(buffer, offs, "capability",
>   				      dev->node_props.capability);
> +		sysfs_show_64bit_prop(buffer, offs, "debug_prop",
> +				      dev->node_props.debug_prop);
>   		sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version",
>   				      dev->gpu->sdma_fw_version);
>   		sysfs_show_64bit_prop(buffer, offs, "unique_id",
> @@ -1865,6 +1867,97 @@ static int kfd_topology_add_device_locked(struct kfd_dev *gpu, uint32_t gpu_id,
>   	return res;
>   }
>   
> +static void kfd_topology_set_dbg_firmware_support(struct kfd_topology_device *dev)
> +{
> +	bool firmware_supported = true;
> +
> +	if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0) &&
> +			KFD_GC_VERSION(dev->gpu) < IP_VERSION(12, 0, 0)) {
> +		firmware_supported =
> +			(dev->gpu->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 9;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Note: Any unlisted devices here are assumed to support exception handling.
> +	 * Add additional checks here as needed.
> +	 */
> +	switch (KFD_GC_VERSION(dev->gpu)) {
> +	case IP_VERSION(9, 0, 1):
> +		firmware_supported = dev->gpu->mec_fw_version >= 459 + 32768;
> +		break;
> +	case IP_VERSION(9, 1, 0):
> +	case IP_VERSION(9, 2, 1):
> +	case IP_VERSION(9, 2, 2):
> +	case IP_VERSION(9, 3, 0):
> +	case IP_VERSION(9, 4, 0):
> +		firmware_supported = dev->gpu->mec_fw_version >= 459;
> +		break;
> +	case IP_VERSION(9, 4, 1):
> +		firmware_supported = dev->gpu->mec_fw_version >= 60;
> +		break;
> +	case IP_VERSION(9, 4, 2):
> +		firmware_supported = dev->gpu->mec_fw_version >= 51;
> +		break;
> +	case IP_VERSION(10, 1, 10):
> +	case IP_VERSION(10, 1, 2):
> +	case IP_VERSION(10, 1, 1):
> +		firmware_supported = dev->gpu->mec_fw_version >= 144;
> +		break;
> +	case IP_VERSION(10, 3, 0):
> +	case IP_VERSION(10, 3, 2):
> +	case IP_VERSION(10, 3, 1):
> +	case IP_VERSION(10, 3, 4):
> +	case IP_VERSION(10, 3, 5):
> +		firmware_supported = dev->gpu->mec_fw_version >= 89;
> +		break;
> +	case IP_VERSION(10, 1, 3):
> +	case IP_VERSION(10, 3, 3):
> +		firmware_supported = false;
> +		break;
> +	default:
> +		break;
> +	}
> +
> +out:
> +	if (firmware_supported)
> +		dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED;
> +}
> +
> +static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
> +{
> +	dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
> +				HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
> +				HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
> +
> +	dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_SUPPORT |
> +			HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED |
> +			HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED;
> +
> +	if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) {
> +		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 |
> +						HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
> +
> +		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 4, 2))
> +			dev->node_props.debug_prop |=
> +				HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
> +		else
> +			dev->node_props.capability |=
> +				HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
> +	} else {
> +		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 |
> +					HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
> +
> +		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(11, 0, 0))
> +			dev->node_props.debug_prop |= HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
> +		else
> +			dev->node_props.capability |=
> +				HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
> +	}
> +
> +	kfd_topology_set_dbg_firmware_support(dev);
> +}
> +
>   int kfd_topology_add_device(struct kfd_dev *gpu)
>   {
>   	uint32_t gpu_id;
> @@ -1966,13 +2059,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>   			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
>   		break;
>   	default:
> -		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 0, 1))
> -			dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
> -				HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
> -				HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
> -		else
> +		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 0, 1))
>   			WARN(1, "Unexpected ASIC family %u",
>   			     dev->gpu->adev->asic_type);
> +		else
> +			kfd_topology_set_capabilities(dev);
>   	}
>   
>   	/*
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> index fca30d00a9bb..53b9b7bf52ee 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> @@ -31,6 +31,11 @@
>   
>   #define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 32
>   
> +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9	6
> +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10	7
> +#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT  \
> +			(29 << HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT)
> +
>   struct kfd_node_properties {
>   	uint64_t hive_id;
>   	uint32_t cpu_cores_count;
> @@ -42,6 +47,7 @@ struct kfd_node_properties {
>   	uint32_t cpu_core_id_base;
>   	uint32_t simd_id_base;
>   	uint32_t capability;
> +	uint64_t debug_prop;
>   	uint32_t max_waves_per_simd;
>   	uint32_t lds_size_in_kb;
>   	uint32_t gds_size_in_kb;
> diff --git a/include/uapi/linux/kfd_sysfs.h b/include/uapi/linux/kfd_sysfs.h
> index 3e330f368917..a51b7331e0b4 100644
> --- a/include/uapi/linux/kfd_sysfs.h
> +++ b/include/uapi/linux/kfd_sysfs.h
> @@ -43,6 +43,11 @@
>   #define HSA_CAP_DOORBELL_TYPE_2_0		0x2
>   #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP		0x00004000
>   
> +#define HSA_CAP_TRAP_DEBUG_SUPPORT              0x00008000
> +#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED  0x00010000
> +#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED           0x00020000
> +#define HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED  0x00040000
> +
>   /* Old buggy user mode depends on this being 0 */
>   #define HSA_CAP_RESERVED_WAS_SRAM_EDCSUPPORTED	0x00080000
>   
> @@ -53,8 +58,18 @@
>   #define HSA_CAP_SRAM_EDCSUPPORTED		0x04000000
>   #define HSA_CAP_SVMAPI_SUPPORTED		0x08000000
>   #define HSA_CAP_FLAGS_COHERENTHOSTACCESS	0x10000000
> +#define HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED   0x20000000
>   #define HSA_CAP_RESERVED			0xe00f8000
>   
> +/* debug_prop bits in node properties */
> +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK     0x0000000f
> +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_SHIFT    0
> +#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_MASK     0x000003f0
> +#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT    4
> +#define HSA_DBG_DISPATCH_INFO_ALWAYS_VALID      0x00000400
> +#define HSA_DBG_WATCHPOINTS_EXCLUSIVE           0x00000800
> +#define HSA_DBG_RESERVED                0xfffffffffffff000ull
> +
>   /* Heap types in memory properties */
>   #define HSA_MEM_HEAP_TYPE_SYSTEM	0
>   #define HSA_MEM_HEAP_TYPE_FB_PUBLIC	1

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 05/32] drm/amdgpu: setup hw debug registers on driver initialization
  2023-01-25 19:53 ` [PATCH 05/32] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
@ 2023-02-16 22:39   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-02-16 22:39 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel

On 2023-01-25 14:53, Jonathan Kim wrote:
> Add missing debug trap register references and initialize all debug
> registers on boot by clearing the hardware exception overrides and the
> wave allocation ID index.
>
> The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
> waves onto dispatch during compute context inspection.
> In order to correctly set this up, set the special reserved CP bit by
> default whenever the MQD is initialized.
>
> v2: leave TRAP_EN set for multi-process debugging as per-process disable
> will be taken care of in later patches.
> fixup typo in description.
> enable ttmp setup for dispatch boundary in mqd init for gfx11.
> add trap on wave start and end registers for gfx11.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 26 +++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c        |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 30 ++++++++
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  5 ++
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
>   .../include/asic_reg/gc/gc_10_1_0_offset.h    | 14 ++++
>   .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++++++++++++++++++
>   .../include/asic_reg/gc/gc_10_3_0_offset.h    | 10 +++
>   .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
>   .../include/asic_reg/gc/gc_11_0_0_sh_mask.h   |  4 ++
>   11 files changed, 173 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 6983acc456b2..a5faf23805b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4823,6 +4823,29 @@ static u32 gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
>   
>   #define DEFAULT_SH_MEM_BASES	(0x6000)
>   
> +static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
> +				uint32_t first_vmid,
> +				uint32_t last_vmid)
> +{
> +	uint32_t data;
> +	uint32_t trap_config_vmid_mask = 0;
> +	int i;
> +
> +	/* Calculate trap config vmid mask */
> +	for (i = first_vmid; i < last_vmid; i++)
> +		trap_config_vmid_mask |= (1 << i);
> +
> +	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> +			VMID_SEL, trap_config_vmid_mask);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> +			TRAP_EN, 1);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> +}
> +
>   static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>   {
>   	int i;
> @@ -4854,6 +4877,9 @@ static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>   		WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
>   		WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
>   	}
> +
> +	gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
> +					AMDGPU_NUM_VMID);
>   }
>   
>   static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index c621b2ad7ba3..3ca7a31fb770 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1572,6 +1572,7 @@ static void gfx_v11_0_init_compute_vmid(struct amdgpu_device *adev)
>   		/* Enable trap for each kfd vmid. */
>   		data = RREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL);
>   		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
> +		WREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL, data);
>   	}
>   	soc21_grbm_select(adev, 0, 0, 0, 0);
>   	mutex_unlock(&adev->srbm_mutex);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 8ad5c03506f2..222fe87161b7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2289,6 +2289,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
>   	adev->gfx.config.num_rbs = hweight32(active_rbs);
>   }
>   
> +static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
> +				uint32_t first_vmid,
> +				uint32_t last_vmid)
> +{
> +	uint32_t data;
> +	uint32_t trap_config_vmid_mask = 0;
> +	int i;
> +
> +	/* Calculate trap config vmid mask */
> +	for (i = first_vmid; i < last_vmid; i++)
> +		trap_config_vmid_mask |= (1 << i);
> +
> +	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> +			VMID_SEL, trap_config_vmid_mask);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> +			TRAP_EN, 1);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> +}
> +
>   #define DEFAULT_SH_MEM_BASES	(0x6000)
>   static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
>   {
> @@ -4565,6 +4588,13 @@ static int gfx_v9_0_late_init(void *handle)
>   	if (r)
>   		return r;
>   
> +	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
> +		gfx_v9_4_2_debug_trap_config_init(adev,
> +			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
> +	else
> +		gfx_v9_0_debug_trap_config_init(adev,
> +			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> index d3e2b6a599a4..cb484ace17de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> @@ -117,6 +117,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
>   			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
>   			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
>   
> +	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> +	 * DISPATCH_PTR.  This is required for the kfd debugger
> +	 */
> +	m->cp_hqd_hq_scheduler0 = 1 << 14;
> +
>   	if (q->format == KFD_QUEUE_FORMAT_AQL) {
>   		m->cp_hqd_aql_control =
>   			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> index 4f6390f3236e..ac7c8fc83c94 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> @@ -143,6 +143,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
>   			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
>   			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
>   
> +	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> +	 * DISPATCH_PTR.  This is required for the kfd debugger
> +	 */
> +	m->cp_hqd_hq_status0 = 1 << 14;
> +
>   	if (q->format == KFD_QUEUE_FORMAT_AQL) {
>   		m->cp_hqd_aql_control =
>   			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> index 0778e587a2d6..86f1cf090246 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> @@ -164,6 +164,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
>   			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
>   			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
>   
> +	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> +	 * DISPATCH_PTR.  This is required for the kfd debugger
> +	 */
> +	m->cp_hqd_hq_status0 = 1 << 14;
> +
>   	if (q->format == KFD_QUEUE_FORMAT_AQL) {
>   		m->cp_hqd_aql_control =
>   			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> index 18d34bbceebe..7d384f86bd67 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> @@ -5190,6 +5190,20 @@
>   #define mmSPI_WCL_PIPE_PERCENT_CS6_BASE_IDX                                                            0
>   #define mmSPI_WCL_PIPE_PERCENT_CS7                                                                     0x1f70
>   #define mmSPI_WCL_PIPE_PERCENT_CS7_BASE_IDX                                                            0
> +#define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
> +#define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
> +#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
> +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
> +#define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
> +#define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
> +#define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
> +#define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
> +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
> +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
> +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
>   #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
>   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
>   #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> index 4127896ffcdf..08772ba845b0 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> @@ -19646,6 +19646,75 @@
>   //SPI_WCL_PIPE_PERCENT_CS7
>   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE__SHIFT                                                                0x0
>   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE_MASK                                                                  0x7FL
> +//SPI_GDBG_WAVE_CNTL
> +#define SPI_GDBG_WAVE_CNTL__STALL_RA__SHIFT                                                                   0x0
> +#define SPI_GDBG_WAVE_CNTL__STALL_VMID__SHIFT                                                                 0x1
> +#define SPI_GDBG_WAVE_CNTL__STALL_RA_MASK                                                                     0x00000001L
> +#define SPI_GDBG_WAVE_CNTL__STALL_VMID_MASK                                                                   0x0001FFFEL
> +//SPI_GDBG_TRAP_CONFIG
> +#define SPI_GDBG_TRAP_CONFIG__ME_SEL__SHIFT                                                                   0x0
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL__SHIFT                                                                 0x2
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL__SHIFT                                                                0x4
> +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH__SHIFT                                                                 0x7
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH__SHIFT                                                               0x8
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH__SHIFT                                                              0x9
> +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN__SHIFT                                                                  0xf
> +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL__SHIFT                                                                 0x10
> +#define SPI_GDBG_TRAP_CONFIG__ME_SEL_MASK                                                                     0x00000003L
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL_MASK                                                                   0x0000000CL
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL_MASK                                                                  0x00000070L
> +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH_MASK                                                                   0x00000080L
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH_MASK                                                                 0x00000100L
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH_MASK                                                                0x00000200L
> +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN_MASK                                                                    0x00008000L
> +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL_MASK                                                                   0xFFFF0000L
> +//SPI_GDBG_TRAP_MASK
> +#define SPI_GDBG_TRAP_MASK__EXCP_EN__SHIFT                                                                    0x0
> +#define SPI_GDBG_TRAP_MASK__REPLACE__SHIFT                                                                    0x9
> +#define SPI_GDBG_TRAP_MASK__EXCP_EN_MASK                                                                      0x01FFL
> +#define SPI_GDBG_TRAP_MASK__REPLACE_MASK                                                                      0x0200L
> +//SPI_GDBG_WAVE_CNTL2
> +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK__SHIFT                                                                 0x0
> +#define SPI_GDBG_WAVE_CNTL2__MODE__SHIFT                                                                      0x10
> +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK_MASK                                                                   0x0000FFFFL
> +#define SPI_GDBG_WAVE_CNTL2__MODE_MASK                                                                        0x00030000L
> +//SPI_GDBG_WAVE_CNTL3
> +#define SPI_GDBG_WAVE_CNTL3__STALL_PS__SHIFT                                                                  0x0
> +#define SPI_GDBG_WAVE_CNTL3__STALL_VS__SHIFT                                                                  0x1
> +#define SPI_GDBG_WAVE_CNTL3__STALL_GS__SHIFT                                                                  0x2
> +#define SPI_GDBG_WAVE_CNTL3__STALL_HS__SHIFT                                                                  0x3
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG__SHIFT                                                                 0x4
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0__SHIFT                                                                 0x5
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1__SHIFT                                                                 0x6
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2__SHIFT                                                                 0x7
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3__SHIFT                                                                 0x8
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4__SHIFT                                                                 0x9
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5__SHIFT                                                                 0xa
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6__SHIFT                                                                 0xb
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7__SHIFT                                                                 0xc
> +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION__SHIFT                                                            0xd
> +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT__SHIFT                                                                0x1c
> +#define SPI_GDBG_WAVE_CNTL3__STALL_PS_MASK                                                                    0x00000001L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_VS_MASK                                                                    0x00000002L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_GS_MASK                                                                    0x00000004L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_HS_MASK                                                                    0x00000008L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG_MASK                                                                   0x00000010L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0_MASK                                                                   0x00000020L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1_MASK                                                                   0x00000040L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2_MASK                                                                   0x00000080L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3_MASK                                                                   0x00000100L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4_MASK                                                                   0x00000200L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5_MASK                                                                   0x00000400L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6_MASK                                                                   0x00000800L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7_MASK                                                                   0x00001000L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION_MASK                                                              0x0FFFE000L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT_MASK                                                                  0x10000000L
> +//SPI_GDBG_TRAP_DATA0
> +#define SPI_GDBG_TRAP_DATA0__DATA__SHIFT                                                                      0x0
> +#define SPI_GDBG_TRAP_DATA0__DATA_MASK                                                                        0xFFFFFFFFL
> +//SPI_GDBG_TRAP_DATA1
> +#define SPI_GDBG_TRAP_DATA1__DATA__SHIFT                                                                      0x0
> +#define SPI_GDBG_TRAP_DATA1__DATA_MASK                                                                        0xFFFFFFFFL
>   //SPI_COMPUTE_QUEUE_RESET
>   #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                                                                 0x0
>   #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                                                                   0x01L
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> index 3973110f149c..d09f1a06f4bf 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> @@ -26,6 +26,8 @@
>   #define mmSQ_DEBUG_STS_GLOBAL_BASE_IDX                                                                 0
>   #define mmSQ_DEBUG_STS_GLOBAL2                                                                         0x10B0
>   #define mmSQ_DEBUG_STS_GLOBAL2_BASE_IDX                                                                0
> +#define mmSQ_DEBUG                                                                                     0x10B1
> +#define mmSQ_DEBUG_BASE_IDX                                                                            0
>   
>   // addressBlock: gc_sdma0_sdma0dec
>   // base address: 0x4980
> @@ -4849,10 +4851,18 @@
>   #define mmSPI_WCL_PIPE_PERCENT_CS3_BASE_IDX                                                            0
>   #define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
>   #define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
> +#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
> +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
>   #define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
>   #define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
>   #define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
>   #define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
> +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
> +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
> +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
>   #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
>   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
>   #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> index d4e8ff22ecb8..fc85aee010fe 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> @@ -47853,6 +47853,10 @@
>   
>   
>   // addressBlock: sqind
> +//SQ_DEBUG
> +#define SQ_DEBUG__SINGLE_MEMOP_MASK 0x00000001L
> +#define SQ_DEBUG__SINGLE_MEMOP__SHIFT 0x00000000
> +
>   //SQ_DEBUG_STS_GLOBAL
>   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0_MASK 0x000000ffL
>   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0__SHIFT 0x00000000
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_sh_mask.h
> index 4f08f90856fc..3088a4a13cb5 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_sh_mask.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_sh_mask.h
> @@ -17216,11 +17216,15 @@
>   #define SPI_GDBG_PER_VMID_CNTL__TRAP_EN__SHIFT                                                                0x3
>   #define SPI_GDBG_PER_VMID_CNTL__EXCP_EN__SHIFT                                                                0x4
>   #define SPI_GDBG_PER_VMID_CNTL__EXCP_REPLACE__SHIFT                                                           0xd
> +#define SPI_GDBG_PER_VMID_CNTL__TRAP_ON_START__SHIFT                                                          0xe
> +#define SPI_GDBG_PER_VMID_CNTL__TRAP_ON_END__SHIFT                                                            0xf
>   #define SPI_GDBG_PER_VMID_CNTL__STALL_VMID_MASK                                                               0x00000001L
>   #define SPI_GDBG_PER_VMID_CNTL__LAUNCH_MODE_MASK                                                              0x00000006L
>   #define SPI_GDBG_PER_VMID_CNTL__TRAP_EN_MASK                                                                  0x00000008L
>   #define SPI_GDBG_PER_VMID_CNTL__EXCP_EN_MASK                                                                  0x00001FF0L
>   #define SPI_GDBG_PER_VMID_CNTL__EXCP_REPLACE_MASK                                                             0x00002000L
> +#define SPI_GDBG_PER_VMID_CNTL__TRAP_ON_START_MASK                                                            0x00004000L
> +#define SPI_GDBG_PER_VMID_CNTL__TRAP_ON_END_MASK                                                              0x00008000L
>   //SPI_COMPUTE_QUEUE_RESET
>   #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                                                                 0x0
>   #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                                                                   0x01L

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls
  2023-01-25 19:53 ` [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
  2023-01-29  5:12   ` kernel test robot
@ 2023-02-16 22:54   ` Felix Kuehling
  1 sibling, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-02-16 22:54 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Implement the per-device calls to enable or disable HW debug mode for
> GFX9 prior to GFX9.4.1.
>
> GFX9.4.1 and onward will require their own enable/disable sequence as
> follow on patches.
>
> When hardware debug mode setting is requested, waves will inherit
> these settings in the Shader Processor Input's (SPI) Sequencer Global
> Block (SQG). This means that the KGD must drain all waves from the SPI
> into SQG (approximately 96 SPI clock cycles) prior to debug mode setting
> to ensure that the order of operations that the debugger expects with
> regards to debug mode setting transaction requests and wave inheritance
> of that mode is upheld.
>
> Also ensure that exception overrides are reset to their original state
> prior to debug enable or disable.
>
> v2: remove unnecessary static srbm lock renaming.
> add comments to explain ignored arguments for debug trap enable and
> disable.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 93 +++++++++++++++++++
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 9 ++
> drivers/gpu/drm/amd/amdkfd/kfd_debug.h | 3 +
> 3 files changed, 105 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index e92b93557c13..94a9fd9bd984 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -646,6 +646,97 @@ int kgd_gfx_v9_wave_control_execute(struct 
> amdgpu_device *adev,
> return 0;
> }
> +/*
> + * GFX9 helper for wave launch stall requirements on debug trap setting.
> + *
> + * vmid:
> + * Target VMID to stall/unstall.
> + *
> + * stall:
> + * 0-unstall wave launch (enable), 1-stall wave launch (disable).
> + * After wavefront launch has been stalled, allocated waves must 
> drain from
> + * SPI in order for debug trap settings to take effect on those waves.
> + * This is roughly a ~96 clock cycle wait on SPI where a read on
> + * SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
> + * KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of 
> reads required.
> + *
> + * NOTE: We can afford to clear the entire STALL_VMID field on unstall
> + * because GFX9.4.1 cannot support multi-process debugging due to trap
> + * configuration and masking being limited to global scope. Always assume
> + * single process conditions.
> +
> + */
> +#define KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY 3
> +void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> + uint32_t vmid,
> + bool stall)
> +{
> + int i;
> + uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +
> + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1))
> + data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
> + stall ? 1 << vmid : 0);
> + else
> + data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA,
> + stall ? 1 : 0);
> +
> + WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
> +
> + if (!stall)
> + return;
> +
> + for (i = 0; i < KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
> + RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +}
> +
> +/**

This was flagged by the kernel test robot. Should just be /* because 
it's not a formal doc comment.


> + * restore_dbg_reisters is ignored here but is a general interface 
> requirement

Typo: reisters -> registers


> + * for devices that support GFXOFF and where the RLC save/restore list
> + * does not support hw registers for debugging i.e. the driver has to 
> manually
> + * initialize the debug mode registers after it has disabled GFX off 
> during the
> + * debug session.
> + */
> +uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
> + bool restore_dbg_registers,
> + uint32_t vmid)
> +{
> + mutex_lock(&adev->grbm_idx_mutex);
> +
> + kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> + WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> + kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> + mutex_unlock(&adev->grbm_idx_mutex);
> +
> + return 0;
> +}
> +
> +/**

Same as above. With those fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> + * keep_trap_enabled is ignored here but is a general interface 
> requirement
> + * for devices that support multi-process debugging where the performance
> + * overhead from trap temporary setup needs to be bypassed when the debug
> + * session has ended.
> + */
> +uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
> + bool keep_trap_enabled,
> + uint32_t vmid)
> +{
> + mutex_lock(&adev->grbm_idx_mutex);
> +
> + kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> + WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> + kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> + mutex_unlock(&adev->grbm_idx_mutex);
> +
> + return 0;
> +}
> +
> void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
> uint32_t vmid, uint64_t page_table_base)
> {
> @@ -871,6 +962,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
> .get_atc_vmid_pasid_mapping_info =
> kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
> .set_vm_context_page_table_base = 
> kgd_gfx_v9_set_vm_context_page_table_base,
> + .enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
> + .disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
> .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index c7ed3bc9053c..d39256162616 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -58,3 +58,12 @@ void kgd_gfx_v9_get_cu_occupancy(struct 
> amdgpu_device *adev, int pasid,
> int *pasid_wave_cnt, int *max_waves_per_cu);
> void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
> uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
> +void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> + uint32_t vmid,
> + bool stall);
> +uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
> + bool restore_dbg_registers,
> + uint32_t vmid);
> +uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
> + bool keep_trap_enabled,
> + uint32_t vmid);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index b2217eb1399c..8aa7a3ad4e97 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -25,6 +25,9 @@
> #include "kfd_priv.h"
> +void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> + uint32_t vmid,
> + bool stall);
> int kfd_dbg_trap_disable(struct kfd_process *target);
> int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> void __user *runtime_info,

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 07/32] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
  2023-01-25 19:53 ` [PATCH 07/32] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
  2023-01-29  6:34   ` kernel test robot
@ 2023-02-16 23:01   ` Felix Kuehling
  1 sibling, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-02-16 23:01 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> On GFX9.4.1, the implicit wait count instruction on s_barrier is
> disabled by default in the driver during normal operation for
> performance requirements.
>
> There is a hardware bug in GFX9.4.1 where if the implicit wait count
> instruction after an s_barrier instruction is disabled, any wave that
> hits an exception may step over the s_barrier when returning from the
> trap handler with the barrier logic having no ability to be
> aware of this, thereby causing other waves to wait at the barrier
> indefinitely resulting in a shader hang.  This bug has been corrected
> for GFX9.4.2 and onward.
>
> Since the debugger subscribes to hardware exceptions, in order to avoid
> this bug, the debugger must enable implicit wait count on s_barrier
> for a debug session and disable it on detach.
>
> In order to change this setting in the device global SQ_CONFIG
> register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
> will either dispatch work through the compute ring buffers used for
> image post processing or through the hardware scheduler by the KFD.
>
> Have the KGD suspend and drain the compute ring buffer, then suspend the
> hardware scheduler and block any future KFD process job requests before
> changing the implicit wait count setting.  Once set, resume all work.
>
> v2: remove flush on kfd suspend as that will be a general fix required
> outside of this patch series.
> comment on trap enable/disable ignored variables.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   3 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 118 +++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   4 +-
>   3 files changed, 122 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 872450a3a164..3c03e34c194c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1041,6 +1041,9 @@ struct amdgpu_device {
>   	struct pci_saved_state          *pci_state;
>   	pci_channel_state_t		pci_channel_state;
>   
> +	/* Track auto wait count on s_barrier settings */
> +	bool				barrier_has_auto_waitcnt;
> +
>   	struct amdgpu_reset_control     *reset_cntl;
>   	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 4191af5a3f13..d5bb86ccd617 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -26,6 +26,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu_amdkfd_arcturus.h"
> +#include "amdgpu_reset.h"
>   #include "sdma0/sdma0_4_2_2_offset.h"
>   #include "sdma0/sdma0_4_2_2_sh_mask.h"
>   #include "sdma1/sdma1_4_2_2_offset.h"
> @@ -48,6 +49,8 @@
>   #include "amdgpu_amdkfd_gfx_v9.h"
>   #include "gfxhub_v1_0.h"
>   #include "mmhub_v9_4.h"
> +#include "gc/gc_9_0_offset.h"
> +#include "gc/gc_9_0_sh_mask.h"
>   
>   #define HQD_N_REGS 56
>   #define DUMP_REG(addr) do {				\
> @@ -276,6 +279,117 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
>   	return 0;
>   }
>   
> +/*
> + * Helper used to suspend/resume gfx pipe for image post process work to set
> + * barrier behaviour.
> + */
> +static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool suspend)
> +{
> +	int i, r = 0;
> +
> +	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> +		struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
> +
> +		if (!(ring && ring->sched.thread))
> +			continue;
> +
> +		/* stop secheduler and drain ring. */
> +		if (suspend) {
> +			drm_sched_stop(&ring->sched, NULL);
> +			r = amdgpu_fence_wait_empty(ring);
> +			if (r)
> +				goto out;
> +		} else {
> +			drm_sched_start(&ring->sched, false);
> +		}
> +	}
> +
> +out:
> +	/* return on resume or failure to drain rings. */
> +	if (!suspend || r)
> +		return r;
> +
> +	return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
> +}
> +
> +static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool enable_waitcnt)
> +{
> +	uint32_t data;
> +
> +	WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
> +
> +	if (!down_read_trylock(&adev->reset_domain->sem))
> +		return;
> +
> +	amdgpu_amdkfd_suspend(adev, false);
> +
> +	if (suspend_resume_compute_scheduler(adev, true))
> +		goto out;
> +
> +	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
> +	data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
> +						enable_waitcnt ? 0 : 1);

This could be ..., !enable_waitcnt);


> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
> +
> +out:
> +	suspend_resume_compute_scheduler(adev, false);
> +
> +	amdgpu_amdkfd_resume(adev, false);
> +
> +	up_read(&adev->reset_domain->sem);
> +}
> +
> +/**

Use /* here, since this is not a doc comment.


> + * restore_dbg_reisters is ignored here but is a general interface requirement

Typo: registers


> + * for devices that support GFXOFF and where the RLC save/restore list
> + * does not support hw registers for debugging i.e. the driver has to manually
> + * initialize the debug mode registers after it has disabled GFX off during the
> + * debug session.
> + */
> +static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device *adev,
> +				bool restore_dbg_registers,
> +				uint32_t vmid)
> +{
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	set_barrier_auto_waitcnt(adev, true);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
> +/**

/*


> + * keep_trap_enabled is ignored here but is a general interface requirement
> + * for devices that support multi-process debugging where the performance
> + * overhead from trap temporary setup needs to be bypassed when the debug
> + * session has ended.
> + */
> +static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device *adev,
> +					bool keep_trap_enabled,
> +					uint32_t vmid)
> +{
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	set_barrier_auto_waitcnt(adev, false);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
>   const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>   	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -294,6 +408,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
>   	.set_vm_context_page_table_base =
>   				kgd_gfx_v9_set_vm_context_page_table_base,
> +	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
> +	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> -	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
> +	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 222fe87161b7..56d25a6f1da9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2376,8 +2376,8 @@ static void gfx_v9_0_init_sq_config(struct amdgpu_device *adev)
>   	switch (adev->ip_versions[GC_HWIP][0]) {
>   	case IP_VERSION(9, 4, 1):
>   		tmp = RREG32_SOC15(GC, 0, mmSQ_CONFIG);
> -		tmp = REG_SET_FIELD(tmp, SQ_CONFIG,
> -					DISABLE_BARRIER_WAITCNT, 1);
> +		tmp = REG_SET_FIELD(tmp, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
> +				READ_ONCE(adev->barrier_has_auto_waitcnt) ? 0 : 1);

This could be ..., !READ_ONCE(adev->barrier_has_auto_waitcnt));

With those nit-picks fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


>   		WREG32_SOC15(GC, 0, mmSQ_CONFIG, tmp);
>   		break;
>   	default:

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 08/32] drm/amdgpu: add gfx10 hw debug mode enable and disable calls
  2023-01-25 19:53 ` [PATCH 08/32] drm/amdgpu: add gfx10 " Jonathan Kim
  2023-01-29  7:55   ` kernel test robot
@ 2023-02-16 23:11   ` Felix Kuehling
  1 sibling, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-02-16 23:11 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Similar to GFX9 debug devices, set the hardware debug mode by draining
> the SPI appropriately prior to the mode setting request.
>
> Because GFX10 has waves allocated by the work group boundaray and each

Typo: boundary?


> SE's SPI instances do not communicate, the SPI drain time is much longer.
> This long drain time will be fixed for GFX11 onwards.
>
> Also remove a bunch of deprecated misplaced references for GFX10.3.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  95 +++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  28 ++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  | 147 +-----------------
>   3 files changed, 126 insertions(+), 144 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 9378fc79e9ea..c09b45de02d0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -708,6 +708,99 @@ static void set_vm_context_page_table_base(struct amdgpu_device *adev,
>   	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
>   }
>   
> +/*
> + * GFX10 helper for wave launch stall requirements on debug trap setting.
> + *
> + * vmid:
> + *   Target VMID to stall/unstall.
> + *
> + * stall:
> + *   0-unstall wave launch (enable), 1-stall wave launch (disable).
> + *   After wavefront launch has been stalled, allocated waves must drain from
> + *   SPI in order for debug trap settings to take effect on those waves.
> + *   This is roughly a ~3500 clock cycle wait on SPI where a read on
> + *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
> + *   KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads required.
> + *
> + *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
> + *   because current GFX10 chips cannot support multi-process debugging due to
> + *   trap configuration and masking being limited to global scope.  Always
> + *   assume single process conditions.
> + *
> + */
> +
> +#define KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY	110
> +static void kgd_gfx_v10_set_wave_launch_stall(struct amdgpu_device *adev, uint32_t vmid, bool stall)
> +{
> +	uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +	int i;
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
> +							stall ? 1 << vmid : 0);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
> +
> +	if (!stall)
> +		return;
> +
> +	for (i = 0; i < KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
> +		RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +}
> +
> +uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,

The kernel test robot flagged a missing prototype for this function. You 
probably need to #include amdgpu_amdkfd_gfx_v10.h to fix this.


> +				bool restore_dbg_registers,
> +				uint32_t vmid)
> +{
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
> +
> +	/* assume gfx off is disabled for the debug session if rlc restore not supported. */
> +	if (restore_dbg_registers) {
> +		uint32_t data = 0;
> +
> +		data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> +				VMID_SEL, 1 << vmid);
> +		data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> +				TRAP_EN, 1);
> +		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> +		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> +		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> +
> +		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
> +
> +		mutex_unlock(&adev->grbm_idx_mutex);
> +
> +		return 0;
> +	}
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,

Same as above.

With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> +					bool keep_trap_enabled,
> +					uint32_t vmid)
> +{
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   static void program_trap_handler_settings(struct amdgpu_device *adev,
>   		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
>   {
> @@ -750,5 +843,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.get_atc_vmid_pasid_mapping_info =
>   			get_atc_vmid_pasid_mapping_info,
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base,
> +	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
> +	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
>   	.program_trap_handler_settings = program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> new file mode 100644
> index 000000000000..370d6c312981
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -0,0 +1,28 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
> +				      bool restore_dbg_registers,
> +				      uint32_t vmid);
> +uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
> +					bool keep_trap_enabled,
> +					uint32_t vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index ba21ec6b35e0..73e3b9ae1fb0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -22,6 +22,7 @@
>   #include <linux/mmu_context.h>
>   #include "amdgpu.h"
>   #include "amdgpu_amdkfd.h"
> +#include "amdgpu_amdkfd_gfx_v10.h"
>   #include "gc/gc_10_3_0_offset.h"
>   #include "gc/gc_10_3_0_sh_mask.h"
>   #include "oss/osssys_5_0_0_offset.h"
> @@ -652,142 +653,6 @@ static void program_trap_handler_settings_v10_3(struct amdgpu_device *adev,
>   	unlock_srbm(adev);
>   }
>   
> -#if 0
> -uint32_t enable_debug_trap_v10_3(struct amdgpu_device *adev,
> -				uint32_t trap_debug_wave_launch_mode,
> -				uint32_t vmid)
> -{
> -	uint32_t data = 0;
> -	uint32_t orig_wave_cntl_value;
> -	uint32_t orig_stall_vmid;
> -
> -	mutex_lock(&adev->grbm_idx_mutex);
> -
> -	orig_wave_cntl_value = RREG32(SOC15_REG_OFFSET(GC,
> -				0,
> -				mmSPI_GDBG_WAVE_CNTL));
> -	orig_stall_vmid = REG_GET_FIELD(orig_wave_cntl_value,
> -			SPI_GDBG_WAVE_CNTL,
> -			STALL_VMID);
> -
> -	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 1);
> -	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
> -
> -	data = 0;
> -	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
> -
> -	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), orig_stall_vmid);
> -
> -	mutex_unlock(&adev->grbm_idx_mutex);
> -
> -	return 0;
> -}
> -
> -uint32_t disable_debug_trap_v10_3(struct amdgpu_device *adev)
> -{
> -	mutex_lock(&adev->grbm_idx_mutex);
> -
> -	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> -
> -	mutex_unlock(&adev->grbm_idx_mutex);
> -
> -	return 0;
> -}
> -
> -uint32_t set_wave_launch_trap_override_v10_3(struct amdgpu_device *adev,
> -						uint32_t trap_override,
> -						uint32_t trap_mask)
> -{
> -	uint32_t data = 0;
> -
> -	mutex_lock(&adev->grbm_idx_mutex);
> -
> -	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> -	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 1);
> -	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
> -
> -	data = 0;
> -	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK,
> -			EXCP_EN, trap_mask);
> -	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK,
> -			REPLACE, trap_override);
> -	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
> -
> -	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> -	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 0);
> -	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
> -
> -	mutex_unlock(&adev->grbm_idx_mutex);
> -
> -	return 0;
> -}
> -
> -uint32_t set_wave_launch_mode_v10_3(struct amdgpu_device *adev,
> -					uint8_t wave_launch_mode,
> -					uint32_t vmid)
> -{
> -	uint32_t data = 0;
> -	bool is_stall_mode;
> -	bool is_mode_set;
> -
> -	is_stall_mode = (wave_launch_mode == 4);
> -	is_mode_set = (wave_launch_mode != 0 && wave_launch_mode != 4);
> -
> -	mutex_lock(&adev->grbm_idx_mutex);
> -
> -	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> -			VMID_MASK, is_mode_set ? 1 << vmid : 0);
> -	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> -			MODE, is_mode_set ? wave_launch_mode : 0);
> -	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
> -
> -	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> -	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL,
> -			STALL_VMID, is_stall_mode ? 1 << vmid : 0);
> -	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL,
> -			STALL_RA, is_stall_mode ? 1 : 0);
> -	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
> -
> -	mutex_unlock(&adev->grbm_idx_mutex);
> -
> -	return 0;
> -}
> -
> -/* kgd_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
> - * The values read are:
> - *	ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> - *	atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
> - *	wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
> - *	gws_wait_time            -- Wait Count for Global Wave Syncs.
> - *	que_sleep_wait_time      -- Wait Count for Dequeue Retry.
> - *	sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
> - *	sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
> - *	deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
> - */
> -void get_iq_wait_times_v10_3(struct amdgpu_device *adev,
> -					uint32_t *wait_times)
> -
> -{
> -	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
> -}
> -
> -void build_grace_period_packet_info_v10_3(struct amdgpu_device *adev,
> -						uint32_t wait_times,
> -						uint32_t grace_period,
> -						uint32_t *reg_offset,
> -						uint32_t *reg_data)
> -{
> -	*reg_data = wait_times;
> -
> -	*reg_data = REG_SET_FIELD(*reg_data,
> -			CP_IQ_WAIT_TIME2,
> -			SCH_WAVE,
> -			grace_period);
> -
> -	*reg_offset = mmCP_IQ_WAIT_TIME2;
> -}
> -#endif
> -
>   const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.program_sh_mem_settings = program_sh_mem_settings_v10_3,
>   	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v10_3,
> @@ -805,12 +670,6 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info_v10_3,
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3,
>   	.program_trap_handler_settings = program_trap_handler_settings_v10_3,
> -#if 0
> -	.enable_debug_trap = enable_debug_trap_v10_3,
> -	.disable_debug_trap = disable_debug_trap_v10_3,
> -	.set_wave_launch_trap_override = set_wave_launch_trap_override_v10_3,
> -	.set_wave_launch_mode = set_wave_launch_mode_v10_3,
> -	.get_iq_wait_times = get_iq_wait_times_v10_3,
> -	.build_grace_period_packet_info = build_grace_period_packet_info_v10_3,
> -#endif
> +	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
> +	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
>   };

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 09/32] drm/amdgpu: add gfx9.4.2 hw debug mode enable and disable calls
  2023-01-25 19:53 ` [PATCH 09/32] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
@ 2023-02-16 23:14   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-02-16 23:14 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> GFX9.4.2 now supports per-VMID debug mode controls registers
> (SPI_GDBG_PER_VMID_CNTL).
>
> Because the KFD lets the HWS handle PASID-VMID mapping, the KFD will
> forward all debug mode setting register writes to the HWS scheduler
> using a new MAP_PROCESS API, so instead of writing to registers, return
> the required register values that the HWS needs to write on debug enable
> and disable.
>
> v2: add commentary on unused restore_dbg_registers for debug enable.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 43 ++++++++++++++++++-
>   1 file changed, 41 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 4485bb29bec9..89868f9927ae 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -23,6 +23,44 @@
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu_amdkfd_arcturus.h"
>   #include "amdgpu_amdkfd_gfx_v9.h"
> +#include "gc/gc_9_4_2_offset.h"
> +#include "gc/gc_9_4_2_sh_mask.h"
> +
> +/**

Use /* here.


> + * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
> + *
> + * restore_dbg_reisters is ignored here but is a general interface requirement

Typo: registers


> + * for devices that support GFXOFF and where the RLC save/restore list
> + * does not support hw registers for debugging i.e. the driver has to manually
> + * initialize the debug mode registers after it has disabled GFX off during the
> + * debug session.
> + */
> +static uint32_t kgd_aldebaran_enable_debug_trap(struct amdgpu_device *adev,
> +					    bool restore_dbg_registers,
> +					    uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
> +
> +	return data;
> +}
> +
> +/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
> +static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
> +						bool keep_trap_enabled,
> +						uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, keep_trap_enabled);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
> +
> +	return data;
> +}
>   
>   const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
> @@ -41,6 +79,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.get_atc_vmid_pasid_mapping_info =
>   				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
> -	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,

I think you're removing get_cu_occupancy accidentally here?

With those issues fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> -	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
> +	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
> +	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
> +	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 10/32] drm/amdgpu: add gfx11 hw debug mode enable and disable calls
  2023-01-25 19:53 ` [PATCH 10/32] drm/amdgpu: add gfx11 " Jonathan Kim
@ 2023-02-16 23:19   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-02-16 23:19 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Implement the per-device calls to enable or disable HW debug mode
> for GFX11.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    | 39 +++++++++++++++++++
>   1 file changed, 39 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> index 7e80caa05060..34aeff692eba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> @@ -30,6 +30,7 @@
>   #include "soc15d.h"
>   #include "v11_structs.h"
>   #include "soc21.h"
> +#include <uapi/linux/kfd_ioctl.h>

What is this needed for? Maybe for a later patch?


>   
>   enum hqd_dequeue_request_type {
>   	NO_ACTION = 0,
> @@ -606,6 +607,42 @@ static void set_vm_context_page_table_base_v11(struct amdgpu_device *adev,
>   	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
>   }
>   
> +/**
Use /* here.
> + * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
> + *
> + * restore_dbg_reisters is ignored here but is a general interface requirement

Typo: registers

With those fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> + * for devices that support GFXOFF and where the RLC save/restore list
> + * does not support hw registers for debugging i.e. the driver has to manually
> + * initialize the debug mode registers after it has disabled GFX off during the
> + * debug session.
> + */
> +static uint32_t kgd_gfx_v11_enable_debug_trap(struct amdgpu_device *adev,
> +					    bool restore_dbg_registers,
> +					    uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
> +
> +	return data;
> +}
> +
> +/* Returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
> +static uint32_t kgd_gfx_v11_disable_debug_trap(struct amdgpu_device *adev,
> +						bool keep_trap_enabled,
> +						uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, keep_trap_enabled);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
> +
> +	return data;
> +}
> +
>   const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
>   	.program_sh_mem_settings = program_sh_mem_settings_v11,
>   	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
> @@ -622,4 +659,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
>   	.wave_control_execute = wave_control_execute_v11,
>   	.get_atc_vmid_pasid_mapping_info = NULL,
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base_v11,
> +	.enable_debug_trap = kgd_gfx_v11_enable_debug_trap,
> +	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap
>   };

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable
  2023-01-25 19:53 ` [PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
@ 2023-02-16 23:44   ` Felix Kuehling
  2023-03-23 19:12     ` Kim, Jonathan
  0 siblings, 1 reply; 68+ messages in thread
From: Felix Kuehling @ 2023-02-16 23:44 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> The ROCm debugger will attach to a process to debug by PTRACE and will
> expect the KFD to prepare a process for the target PID, whether the
> target PID has opened the KFD device or not.
>
> This patch is to explicitly handle this requirement.  Further HW mode
> setting and runtime coordination requirements will be handled in
> following patches.
>
> In the case where the target process has not opened the KFD device,
> a new KFD process must be created for the target PID.
> The debugger as well as the target process for this case will not have
> acquired any VMs, so handle process restoration to correctly account for
> this.
>
> To coordinate with HSA runtime, the debugger must be aware of the target
> process' runtime enablement status and will copy the runtime status
> information into the debugged KFD process for later query.
>
> On enablement, the debugger will subscribe to a set of exceptions where
> each exception event will notify the debugger through a pollable FIFO
> file descriptor that the debugger provides to the KFD to manage.
> Some events will be synchronously raised while others are scheduled,
> which is why a debug_event_workarea worker is initialized.
>
> Finally, on process termination of either the debugger or the target,
> debugging must be disabled if that has not already been done.
>
> v3: fix typo on debug trap disable and PTRACE ATTACH relax check.
> remove unnecessary queue eviction counter reset when there's nothing
> to evict.
> change err code to EALREADY if attaching to an already attached process.
> move debug disable to release worker to avoid race with disable from
> ioctl call.
>
> v2: relax debug trap disable and PTRACE ATTACH requirement.
>
> Signed-off-by: Jonathan Kim<jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/Makefile           |  3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 88 ++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 94 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        | 33 +++++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 22 ++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 34 ++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 63 +++++++++----
>   7 files changed, 308 insertions(+), 29 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index e758c2a24cd0..747754428073 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -55,7 +55,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>   		$(AMDKFD_PATH)/kfd_int_process_v9.o \
>   		$(AMDKFD_PATH)/kfd_int_process_v11.o \
>   		$(AMDKFD_PATH)/kfd_smi_events.o \
> -		$(AMDKFD_PATH)/kfd_crat.o
> +		$(AMDKFD_PATH)/kfd_crat.o \
> +		$(AMDKFD_PATH)/kfd_debug.o
>   
>   ifneq ($(CONFIG_AMD_IOMMU_V2),)
>   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index d3b019e64093..ee05c2e54ef6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -44,6 +44,7 @@
>   #include "amdgpu_amdkfd.h"
>   #include "kfd_smi_events.h"
>   #include "amdgpu_dma_buf.h"
> +#include "kfd_debug.h"
>   
>   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>   static int kfd_open(struct inode *, struct file *);
> @@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file *filep)
>   		return -EPERM;
>   	}
>   
> -	process = kfd_create_process(filep);
> +	process = kfd_create_process(current);
>   	if (IS_ERR(process))
>   		return PTR_ERR(process);
>   
> +	if (kfd_process_init_cwsr_apu(process, filep)) {
> +		kfd_unref_process(process);
> +		return -EFAULT;
> +	}
> +
>   	if (kfd_is_locked()) {
>   		dev_dbg(kfd_device, "kfd is locked!\n"
>   				"process %d unreferenced", process->pasid);
> @@ -2653,6 +2659,9 @@ static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, v
>   static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
>   {
>   	struct kfd_ioctl_dbg_trap_args *args = data;
> +	struct task_struct *thread = NULL;
> +	struct pid *pid = NULL;
> +	struct kfd_process *target = NULL;
>   	int r = 0;
>   
>   	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> @@ -2660,9 +2669,71 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		return -EINVAL;
>   	}
>   
> +	pid = find_get_pid(args->pid);
> +	if (!pid) {
> +		pr_debug("Cannot find pid info for %i\n", args->pid);
> +		r = -ESRCH;
> +		goto out;
> +	}
> +
> +	thread = get_pid_task(pid, PIDTYPE_PID);
> +
> +	if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
> +		bool create_process;
> +
> +		rcu_read_lock();
> +		create_process = thread && thread != current && ptrace_parent(thread) == current;
> +		rcu_read_unlock();
> +
> +		target = create_process ? kfd_create_process(thread) :
> +					kfd_lookup_process_by_pid(pid);
> +	} else {
> +		target = kfd_lookup_process_by_pid(pid);
> +	}
> +
> +	if (!target) {
> +		pr_debug("Cannot find process PID %i to debug\n", args->pid);
> +		r = -ESRCH;
> +		goto out;
> +	}
> +
> +	/* Check if target is still PTRACED. */
> +	rcu_read_lock();
> +	if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
> +				&& ptrace_parent(target->lead_thread) != current) {
> +		pr_err("PID %i is not PTRACED and cannot be debugged\n", args->pid);
> +		r = -EPERM;
> +	}
> +	rcu_read_unlock();
> +
> +	if (r)
> +		goto out;
> +
> +	mutex_lock(&target->mutex);
> +
> +	if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target->debug_trap_enabled) {
> +		pr_err("PID %i not debug enabled for op %i\n", args->pid, args->op);
> +		r = -EINVAL;
> +		goto unlock_out;
> +	}
> +
>   	switch (args->op) {
>   	case KFD_IOC_DBG_TRAP_ENABLE:
> +		if (target != p)
> +			target->debugger_process = p;
> +
> +		r = kfd_dbg_trap_enable(target,
> +					args->enable.dbg_fd,
> +					(void __user *)args->enable.rinfo_ptr,
> +					&args->enable.rinfo_size);
> +		if (!r)
> +			target->exception_enable_mask = args->enable.exception_mask;
> +
> +		pr_warn("Debug functions limited\n");
> +		break;
>   	case KFD_IOC_DBG_TRAP_DISABLE:
> +		r = kfd_dbg_trap_disable(target);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
>   	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> @@ -2676,7 +2747,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> -		pr_warn("Debugging not supported yet\n");
> +		pr_warn("Debug op %i not supported yet\n", args->op);
>   		r = -EACCES;
>   		break;
>   	default:
> @@ -2684,6 +2755,19 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		r = -EINVAL;
>   	}
>   
> +unlock_out:
> +	mutex_unlock(&target->mutex);
> +
> +out:
> +	if (thread)
> +		put_task_struct(thread);
> +
> +	if (pid)
> +		put_pid(pid);
> +
> +	if (target)
> +		kfd_unref_process(target);
> +
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> new file mode 100644
> index 000000000000..f6ea6db266b4
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -0,0 +1,94 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include "kfd_debug.h"
> +#include <linux/file.h>
> +
> +void debug_event_write_work_handler(struct work_struct *work)
> +{
> +	struct kfd_process *process;
> +
> +	static const char write_data = '.';
> +	loff_t pos = 0;
> +
> +	process = container_of(work,
> +			struct kfd_process,
> +			debug_event_workarea);
> +
> +	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
> +}
> +
> +int kfd_dbg_trap_disable(struct kfd_process *target)
> +{
> +	if (!target->debug_trap_enabled)
> +		return 0;
> +
> +	fput(target->dbg_ev_file);
> +	target->dbg_ev_file = NULL;
> +
> +	if (target->debugger_process) {
> +		atomic_dec(&target->debugger_process->debugged_process_count);
> +		target->debugger_process = NULL;
> +	}
> +
> +	target->debug_trap_enabled = false;
> +	kfd_unref_process(target);
> +
> +	return 0;
> +}
> +
> +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> +			void __user *runtime_info, uint32_t *runtime_size)
> +{
> +	struct file *f;
> +	uint32_t copy_size;
> +	int r = 0;
> +
> +	if (target->debug_trap_enabled)
> +		return -EALREADY;
> +
> +	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
> +
> +	f = fget(fd);
> +	if (!f) {
> +		pr_err("Failed to get file for (%i)\n", fd);
> +		return -EBADF;
> +	}
> +
> +	target->dbg_ev_file = f;
> +
> +	/* We already hold the process reference but hold another one for the
> +	 * debug session.
> +	 */
> +	kref_get(&target->ref);
> +	target->debug_trap_enabled = true;
> +
> +	if (target->debugger_process)
> +		atomic_inc(&target->debugger_process->debugged_process_count);
> +
> +	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
> +		r = -EFAULT;
> +
> +	*runtime_size = sizeof(target->runtime_info);
> +
> +	return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> new file mode 100644
> index 000000000000..b2217eb1399c
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -0,0 +1,33 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
> +#define KFD_DEBUG_EVENTS_H_INCLUDED
> +
> +#include "kfd_priv.h"
> +
> +int kfd_dbg_trap_disable(struct kfd_process *target);
> +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> +			void __user *runtime_info,
> +			uint32_t *runtime_info_size);
> +void debug_event_write_work_handler(struct work_struct *work);
> +#endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c06ada0844ba..a2ac98d06e71 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   		goto out;
>   
>   	pdd = qpd_to_pdd(qpd);
> +
> +	/* The debugger creates processes that temporarily have not acquired
> +	 * all VMs for all devices and has no VMs itself.
> +	 * Skip queue eviction on process eviction.
> +	 */
> +	if (!pdd->drm_priv)
> +		goto out;
> +
This should be before qpd->
>   	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
>   			    pdd->process->pasid);
>   
> @@ -1100,13 +1108,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   {
>   	struct queue *q;
>   	struct kfd_process_device *pdd;
> -	uint64_t pd_base;
>   	uint64_t eviction_duration;
>   	int retval = 0;
>   
>   	pdd = qpd_to_pdd(qpd);
> -	/* Retrieve PD base */
> -	pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
>   
>   	dqm_lock(dqm);
>   	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
> @@ -1116,12 +1121,19 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   		goto out;
>   	}
>   
> +	/* The debugger creates processes that temporarily have not acquired
> +	 * all VMs for all devices and has no VMs itself.
> +	 * Skip queue restore on process restore.
> +	 */
> +	if (!pdd->drm_priv)
> +		goto out;
> +

I had a comment here that "qpd->evicted = 0;" was duplicated. It is 
still needed in this case. Otherwise the process will end up being 
created with all queues in an evicted state and no way to execute 
anything on the GPU.

You only need one instance of "qpd->evicted = 0;", but it needs to be in 
the right place (after the vm_not_acquired label you had in v1 of this 
patch).

Regards,
   Felix


>   	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
>   			    pdd->process->pasid);
>   
>   	/* Update PD Base in QPD */
> -	qpd->page_table_base = pd_base;
> -	pr_debug("Updated PD address to 0x%llx\n", pd_base);
> +	qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
> +	pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
>   
>   	/* activate all active queues on the qpd */
>   	list_for_each_entry(q, &qpd->queues_list, list) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index bfa30d12406b..62b75ba28425 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -886,19 +886,48 @@ struct kfd_process {
>   	 */
>   	unsigned long last_restore_timestamp;
>   
> +	/* Indicates device process is debug attached with reserved vmid. */
> +	bool debug_trap_enabled;
> +
> +	/* per-process-per device debug event fd file */
> +	struct file *dbg_ev_file;
> +
> +	/* If the process is a kfd debugger, we need to know so we can clean
> +	 * up at exit time.  If a process enables debugging on itself, it does
> +	 * its own clean-up, so we don't set the flag here.  We track this by
> +	 * counting the number of processes this process is debugging.
> +	 */
> +	atomic_t debugged_process_count;
> +
> +	/* If the process is being debugged, this is the debugger process */
> +	struct kfd_process *debugger_process;
> +
>   	/* Kobj for our procfs */
>   	struct kobject *kobj;
>   	struct kobject *kobj_queues;
>   	struct attribute attr_pasid;
>   
> +	/* Keep track cwsr init */
> +	bool has_cwsr;
> +
> +	/* Exception code enable mask and status */
> +	uint64_t exception_enable_mask;
> +
>   	/* shared virtual memory registered by this process */
>   	struct svm_range_list svms;
>   
>   	bool xnack_enabled;
>   
> +	/* Work area for debugger event writer worker. */
> +	struct work_struct debug_event_workarea;
> +
>   	atomic_t poison;
>   	/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
>   	bool queues_paused;
> +
> +	/* Tracks runtime enable status */
> +	struct kfd_runtime_info runtime_info;
> +
>   };
>   
>   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> @@ -928,7 +957,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
>   
>   int kfd_process_create_wq(void);
>   void kfd_process_destroy_wq(void);
> -struct kfd_process *kfd_create_process(struct file *filep);
> +struct kfd_process *kfd_create_process(struct task_struct *thread);
>   struct kfd_process *kfd_get_process(const struct task_struct *task);
>   struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
>   struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
> @@ -1055,6 +1084,9 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>   				  uint64_t tba_addr,
>   				  uint64_t tma_addr);
>   
> +/* CWSR initialization */
> +int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
> +
>   /* CRIU */
>   /*
>    * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 72df6286e240..e935158ab311 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -44,6 +44,7 @@ struct mm_struct;
>   #include "kfd_iommu.h"
>   #include "kfd_svm.h"
>   #include "kfd_smi_events.h"
> +#include "kfd_debug.h"
>   
>   /*
>    * List of struct kfd_process (field kfd_process).
> @@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct task_struct *thread,
>   					bool ref);
>   static void kfd_process_ref_release(struct kref *ref);
>   static struct kfd_process *create_process(const struct task_struct *thread);
> -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
>   
>   static void evict_process_worker(struct work_struct *work);
>   static void restore_process_worker(struct work_struct *work);
> @@ -798,18 +798,19 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
>   	kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
>   }
>   
> -struct kfd_process *kfd_create_process(struct file *filep)
> +struct kfd_process *kfd_create_process(struct task_struct *thread)
>   {
>   	struct kfd_process *process;
> -	struct task_struct *thread = current;
>   	int ret;
>   
> -	if (!thread->mm)
> +	if (!(thread->mm && mmget_not_zero(thread->mm)))
>   		return ERR_PTR(-EINVAL);
>   
>   	/* Only the pthreads threading model is supported. */
> -	if (thread->group_leader->mm != thread->mm)
> +	if (thread->group_leader->mm != thread->mm) {
> +		mmput(thread->mm);
>   		return ERR_PTR(-EINVAL);
> +	}
>   
>   	/*
>   	 * take kfd processes mutex before starting of process creation
> @@ -827,10 +828,6 @@ struct kfd_process *kfd_create_process(struct file *filep)
>   		if (IS_ERR(process))
>   			goto out;
>   
> -		ret = kfd_process_init_cwsr_apu(process, filep);
> -		if (ret)
> -			goto out_destroy;
> -
>   		if (!procfs.kobj)
>   			goto out;
>   
> @@ -864,16 +861,9 @@ struct kfd_process *kfd_create_process(struct file *filep)
>   	if (!IS_ERR(process))
>   		kref_get(&process->ref);
>   	mutex_unlock(&kfd_processes_mutex);
> +	mmput(thread->mm);
>   
>   	return process;
> -
> -out_destroy:
> -	hash_del_rcu(&process->kfd_processes);
> -	mutex_unlock(&kfd_processes_mutex);
> -	synchronize_srcu(&kfd_processes_srcu);
> -	/* kfd_process_free_notifier will trigger the cleanup */
> -	mmu_notifier_put(&process->mmu_notifier);
> -	return ERR_PTR(ret);
>   }
>   
>   struct kfd_process *kfd_get_process(const struct task_struct *thread)
> @@ -1115,6 +1105,26 @@ static void kfd_process_wq_release(struct work_struct *work)
>   	struct kfd_process *p = container_of(work, struct kfd_process,
>   					     release_work);
>   
> +	kfd_dbg_trap_disable(p);
> +
> +	if (atomic_read(&p->debugged_process_count) > 0) {
> +		struct kfd_process *target;
> +		unsigned int temp;
> +		int idx = srcu_read_lock(&kfd_processes_srcu);
> +
> +		hash_for_each_rcu(kfd_processes_table, temp, target, kfd_processes) {
> +			if (target->debugger_process && target->debugger_process == p) {
> +				mutex_lock(&target->mutex);
> +				kfd_dbg_trap_disable(target);
> +				mutex_unlock(&target->mutex);
> +				if (atomic_read(&p->debugged_process_count) == 0)
> +					break;
> +			}
> +		}
> +
> +		srcu_read_unlock(&kfd_processes_srcu, idx);
> +	}
> +
>   	kfd_process_dequeue_from_all_devices(p);
>   	pqm_uninit(&p->pqm);
>   
> @@ -1200,11 +1210,14 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
>   	.free_notifier = kfd_process_free_notifier,
>   };
>   
> -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> +int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
>   {
>   	unsigned long  offset;
>   	int i;
>   
> +	if (p->has_cwsr)
> +		return 0;
> +
>   	for (i = 0; i < p->n_pdds; i++) {
>   		struct kfd_dev *dev = p->pdds[i]->dev;
>   		struct qcm_process_device *qpd = &p->pdds[i]->qpd;
> @@ -1233,6 +1246,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
>   			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
>   	}
>   
> +	p->has_cwsr = true;
> +
>   	return 0;
>   }
>   
> @@ -1375,6 +1390,10 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>   	if (err)
>   		goto err_event_init;
>   	process->is_32bit_user_mode = in_compat_syscall();
> +	process->debug_trap_enabled = false;
> +	process->debugger_process = NULL;
> +	process->exception_enable_mask = 0;
> +	atomic_set(&process->debugged_process_count, 0);
>   
>   	process->pasid = kfd_pasid_alloc();
>   	if (process->pasid == 0) {
> @@ -1422,6 +1441,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>   	kfd_unref_process(process);
>   	get_task_struct(process->lead_thread);
>   
> +	INIT_WORK(&process->debug_event_workarea, debug_event_write_work_handler);
> +
>   	return process;
>   
>   err_register_notifier:
> @@ -1908,8 +1929,10 @@ static void restore_process_worker(struct work_struct *work)
>   	 */
>   
>   	p->last_restore_timestamp = get_jiffies_64();
> -	ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> -						     &p->ef);
> +	/* VMs may not have been acquired yet during debugging. */
> +	if (p->kgd_process_info)
> +		ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> +							     &p->ef);
>   	if (ret) {
>   		pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
>   			 p->pasid, PROCESS_BACK_OFF_TIME_MS);

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 11/32] drm/amdgpu: add configurable grace period for unmap queues
  2023-01-25 19:53 ` [PATCH 11/32] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
@ 2023-03-20 19:19   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-20 19:19 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel

On 2023-01-25 14:53, Jonathan Kim wrote:
> The HWS schedule allows a grace period for wave completion prior to
> preemption for better performance by avoiding CWSR on waves that can
> potentially complete quickly. The debugger, on the other hand, will
> want to inspect wave status immediately after it actively triggers
> preemption (a suspend function to be provided).
>
> To minimize latency between preemption and debugger wave inspection, allow
> immediate preemption by setting the grace period to 0.
>
> Note that setting the preemption grace period to 0 will result in an
> infinite grace period being set due to a CP FW bug so set it to 1 for now.
>
> v2: clarify purpose in the description of this patch
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 43 ++++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  6 ++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 43 ++++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  9 ++-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 61 ++++++++++++-----
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 +
>   .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 32 +++++++++
>   .../drm/amd/amdkfd/kfd_packet_manager_v9.c    | 39 +++++++++++
>   .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   | 65 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 ++
>   13 files changed, 291 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 89868f9927ae..a64a53f9efe6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -81,5 +81,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
>   	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
> +	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index d5bb86ccd617..ef8befc31fc6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -410,6 +410,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   				kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
>   	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
> +	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index c09b45de02d0..2491402afd58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -801,6 +801,47 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +/* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
> + * The values read are:
> + *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> + *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
> + *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
> + *     gws_wait_time            -- Wait Count for Global Wave Syncs.
> + *     que_sleep_wait_time      -- Wait Count for Dequeue Retry.
> + *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
> + *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
> + *     deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
> + */
> +void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev,
> +					uint32_t *wait_times)
> +
> +{
> +	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
> +}
> +
> +void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
> +						uint32_t wait_times,
> +						uint32_t grace_period,
> +						uint32_t *reg_offset,
> +						uint32_t *reg_data)
> +{
> +	*reg_data = wait_times;
> +
> +	/*
> +	 * The CP cannot handle a 0 grace period input and will result in
> +	 * an infinite grace period being set so set to 1 to prevent this.
> +	 */
> +	if (grace_period == 0)
> +		grace_period = 1;
> +
> +	*reg_data = REG_SET_FIELD(*reg_data,
> +			CP_IQ_WAIT_TIME2,
> +			SCH_WAVE,
> +			grace_period);
> +
> +	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
> +}
> +
>   static void program_trap_handler_settings(struct amdgpu_device *adev,
>   		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
>   {
> @@ -845,5 +886,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
> +	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.program_trap_handler_settings = program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> index 370d6c312981..0abc1e805180 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -26,3 +26,9 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
>   					bool keep_trap_enabled,
>   					uint32_t vmid);
> +void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
> +void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
> +					       uint32_t wait_times,
> +					       uint32_t grace_period,
> +					       uint32_t *reg_offset,
> +					       uint32_t *reg_data);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index 73e3b9ae1fb0..c57f2a6b6e23 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -670,6 +670,8 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info_v10_3,
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3,
>   	.program_trap_handler_settings = program_trap_handler_settings_v10_3,
> +	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 94a9fd9bd984..4a8bd266d3f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -737,6 +737,24 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +/* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
> + * The values read are:
> + *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> + *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
> + *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
> + *     gws_wait_time            -- Wait Count for Global Wave Syncs.
> + *     que_sleep_wait_time      -- Wait Count for Dequeue Retry.
> + *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
> + *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
> + *     deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
> + */
> +void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev,
> +					uint32_t *wait_times)
> +
> +{
> +	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
> +}
> +
>   void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
>   			uint32_t vmid, uint64_t page_table_base)
>   {
> @@ -921,6 +939,29 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
>   				adev->gfx.cu_info.max_waves_per_simd;
>   }
>   
> +void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
> +		uint32_t wait_times,
> +		uint32_t grace_period,
> +		uint32_t *reg_offset,
> +		uint32_t *reg_data)
> +{
> +	*reg_data = wait_times;
> +
> +	/*
> +	 * The CP cannot handle a 0 grace period input and will result in
> +	 * an infinite grace period being set so set to 1 to prevent this.
> +	 */
> +	if (grace_period == 0)
> +		grace_period = 1;
> +
> +	*reg_data = REG_SET_FIELD(*reg_data,
> +			CP_IQ_WAIT_TIME2,
> +			SCH_WAVE,
> +			grace_period);
> +
> +	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
> +}
> +
>   void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
>                           uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
>   {
> @@ -964,6 +1005,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
> +	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index d39256162616..c0866497cb5c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -20,8 +20,6 @@
>    * OTHER DEALINGS IN THE SOFTWARE.
>    */
>   
> -
> -
>   void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid,
>   		uint32_t sh_mem_config,
>   		uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
> @@ -51,7 +49,6 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
>   					uint32_t sq_cmd);
>   bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
>   					uint8_t vmid, uint16_t *p_pasid);
> -
>   void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
>   			uint32_t vmid, uint64_t page_table_base);
>   void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
> @@ -67,3 +64,9 @@ uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   					bool keep_trap_enabled,
>   					uint32_t vmid);
> +void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
> +void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
> +					       uint32_t wait_times,
> +					       uint32_t grace_period,
> +					       uint32_t *reg_offset,
> +					       uint32_t *reg_data);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index a2ac98d06e71..7556f80d41e4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -46,10 +46,13 @@ static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
>   
>   static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
> -				uint32_t filter_param);
> +				uint32_t filter_param,
> +				uint32_t grace_period);
>   static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
> -				uint32_t filter_param, bool reset);
> +				uint32_t filter_param,
> +				uint32_t grace_period,
> +				bool reset);
>   
>   static int map_queues_cpsch(struct device_queue_manager *dqm);
>   
> @@ -839,7 +842,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
>   	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
>   		if (!dqm->dev->shared_resources.enable_mes)
>   			retval = unmap_queues_cpsch(dqm,
> -						    KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false);
> +						    KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
>   		else if (prev_active)
>   			retval = remove_queue_mes(dqm, q, &pdd->qpd);
>   
> @@ -1015,7 +1018,8 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   		retval = execute_queues_cpsch(dqm,
>   					      qpd->is_debug ?
>   					      KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
> -					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> +					      USE_DEFAULT_GRACE_PERIOD);
>   
>   out:
>   	dqm_unlock(dqm);
> @@ -1155,7 +1159,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   	}
>   	if (!dqm->dev->shared_resources.enable_mes)
>   		retval = execute_queues_cpsch(dqm,
> -					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
>   	qpd->evicted = 0;
>   	eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
>   	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
> @@ -1490,6 +1494,9 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
>   
>   	init_sdma_bitmaps(dqm);
>   
> +	if (dqm->dev->kfd2kgd->get_iq_wait_times)
> +		dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev,
> +					&dqm->wait_times);
>   	return 0;
>   }
>   
> @@ -1529,7 +1536,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	dqm->is_resetting = false;
>   	dqm->sched_running = true;
>   	if (!dqm->dev->shared_resources.enable_mes)
> -		execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +		execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
>   	dqm_unlock(dqm);
>   
>   	return 0;
> @@ -1554,7 +1561,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>   
>   	if (!dqm->is_hws_hang) {
>   		if (!dqm->dev->shared_resources.enable_mes)
> -			unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
> +			unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
>   		else
>   			remove_all_queues_mes(dqm);
>   	}
> @@ -1596,7 +1603,8 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
>   	list_add(&kq->list, &qpd->priv_queue_list);
>   	increment_queue_count(dqm, qpd, kq->queue);
>   	qpd->is_debug = true;
> -	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> +			USE_DEFAULT_GRACE_PERIOD);
>   	dqm_unlock(dqm);
>   
>   	return 0;
> @@ -1610,7 +1618,8 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
>   	list_del(&kq->list);
>   	decrement_queue_count(dqm, qpd, kq->queue);
>   	qpd->is_debug = false;
> -	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
> +			USE_DEFAULT_GRACE_PERIOD);
>   	/*
>   	 * Unconditionally decrement this counter, regardless of the queue's
>   	 * type.
> @@ -1687,7 +1696,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   
>   		if (!dqm->dev->shared_resources.enable_mes)
>   			retval = execute_queues_cpsch(dqm,
> -					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
>   		else
>   			retval = add_queue_mes(dqm, q, qpd);
>   		if (retval)
> @@ -1776,7 +1785,9 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>   /* dqm->lock mutex has to be locked before calling this function */
>   static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
> -				uint32_t filter_param, bool reset)
> +				uint32_t filter_param,
> +				uint32_t grace_period,
> +				bool reset)
>   {
>   	int retval = 0;
>   	struct mqd_manager *mqd_mgr;
> @@ -1788,6 +1799,12 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   	if (!dqm->active_runlist)
>   		return retval;
>   
> +	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
> +		retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
> +		if (retval)
> +			return retval;
> +	}
> +
>   	retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset);
>   	if (retval)
>   		return retval;
> @@ -1820,6 +1837,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   		return -ETIME;
>   	}
>   
> +	/* We need to reset the grace period value for this device */
> +	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
> +		if (pm_update_grace_period(&dqm->packet_mgr,
> +					USE_DEFAULT_GRACE_PERIOD))
> +			pr_err("Failed to reset grace period\n");
> +	}
> +
>   	pm_release_ib(&dqm->packet_mgr);
>   	dqm->active_runlist = false;
>   
> @@ -1835,7 +1859,7 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
>   	dqm_lock(dqm);
>   
>   	retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
> -			pasid, true);
> +			pasid, USE_DEFAULT_GRACE_PERIOD, true);
>   
>   	dqm_unlock(dqm);
>   	return retval;
> @@ -1844,13 +1868,14 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
>   /* dqm->lock mutex has to be locked before calling this function */
>   static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
> -				uint32_t filter_param)
> +				uint32_t filter_param,
> +				uint32_t grace_period)
>   {
>   	int retval;
>   
>   	if (dqm->is_hws_hang)
>   		return -EIO;
> -	retval = unmap_queues_cpsch(dqm, filter, filter_param, false);
> +	retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false);
>   	if (retval)
>   		return retval;
>   
> @@ -1908,7 +1933,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   		if (!dqm->dev->shared_resources.enable_mes) {
>   			decrement_queue_count(dqm, qpd, q);
>   			retval = execute_queues_cpsch(dqm,
> -						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> +						      USE_DEFAULT_GRACE_PERIOD);
>   			if (retval == -ETIME)
>   				qpd->reset_wavefronts = true;
>   		} else {
> @@ -2193,7 +2219,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	}
>   
>   	if (!dqm->dev->shared_resources.enable_mes)
> -		retval = execute_queues_cpsch(dqm, filter, 0);
> +		retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD);
>   
>   	if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
>   		pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
> @@ -2537,7 +2563,8 @@ int dqm_debugfs_hang_hws(struct device_queue_manager *dqm)
>   		return r;
>   	}
>   	dqm->active_runlist = true;
> -	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
> +				0, USE_DEFAULT_GRACE_PERIOD);
>   	dqm_unlock(dqm);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index a537b9ef3e16..fb48b124161f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -37,6 +37,7 @@
>   
>   #define KFD_MES_PROCESS_QUANTUM		100000
>   #define KFD_MES_GANG_QUANTUM		10000
> +#define USE_DEFAULT_GRACE_PERIOD 0xffffffff
>   
>   struct device_process_node {
>   	struct qcm_process_device *qpd;
> @@ -256,6 +257,7 @@ struct device_queue_manager {
>   	struct work_struct	hw_exception_work;
>   	struct kfd_mem_obj	hiq_sdma_mqd;
>   	bool			sched_running;
> +	uint32_t		wait_times;
>   };
>   
>   void device_queue_manager_init_cik(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index ed02b6d8bf63..c57f9a46dfcc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -369,6 +369,38 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
>   	return retval;
>   }
>   
> +int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period)
> +{
> +	int retval = 0;
> +	uint32_t *buffer, size;
> +
> +	size = pm->pmf->set_grace_period_size;
> +
> +	mutex_lock(&pm->lock);
> +
> +	if (size) {
> +		kq_acquire_packet_buffer(pm->priv_queue,
> +			size / sizeof(uint32_t),
> +			(unsigned int **)&buffer);
> +
> +		if (!buffer) {
> +			pr_err("Failed to allocate buffer on kernel queue\n");
> +			retval = -ENOMEM;
> +			goto out;
> +		}
> +
> +		retval = pm->pmf->set_grace_period(pm, buffer, grace_period);
> +		if (!retval)
> +			kq_submit_packet(pm->priv_queue);
> +		else
> +			kq_rollback_packet(pm->priv_queue);
> +	}
> +
> +out:
> +	mutex_unlock(&pm->lock);
> +	return retval;
> +}
> +
>   int pm_send_unmap_queue(struct packet_manager *pm,
>   			enum kfd_unmap_queues_filter filter,
>   			uint32_t filter_param, bool reset)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> index 18250845a989..f0cdc8695b8c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> @@ -251,6 +251,41 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
>   	return 0;
>   }
>   
> +static int pm_set_grace_period_v9(struct packet_manager *pm,
> +		uint32_t *buffer,
> +		uint32_t grace_period)
> +{
> +	struct pm4_mec_write_data_mmio *packet;
> +	uint32_t reg_offset = 0;
> +	uint32_t reg_data = 0;
> +
> +	pm->dqm->dev->kfd2kgd->build_grace_period_packet_info(
> +			pm->dqm->dev->adev,
> +			pm->dqm->wait_times,
> +			grace_period,
> +			&reg_offset,
> +			&reg_data);
> +
> +	if (grace_period == USE_DEFAULT_GRACE_PERIOD)
> +		reg_data = pm->dqm->wait_times;
> +
> +	packet = (struct pm4_mec_write_data_mmio *)buffer;
> +	memset(buffer, 0, sizeof(struct pm4_mec_write_data_mmio));
> +
> +	packet->header.u32All = pm_build_pm4_header(IT_WRITE_DATA,
> +					sizeof(struct pm4_mec_write_data_mmio));
> +
> +	packet->bitfields2.dst_sel  = dst_sel___write_data__mem_mapped_register;
> +	packet->bitfields2.addr_incr =
> +			addr_incr___write_data__do_not_increment_address;
> +
> +	packet->bitfields3.dst_mmreg_addr = reg_offset;
> +
> +	packet->data = reg_data;
> +
> +	return 0;
> +}
> +
>   static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
>   			enum kfd_unmap_queues_filter filter,
>   			uint32_t filter_param, bool reset)
> @@ -333,6 +368,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
>   	.set_resources		= pm_set_resources_v9,
>   	.map_queues		= pm_map_queues_v9,
>   	.unmap_queues		= pm_unmap_queues_v9,
> +	.set_grace_period       = pm_set_grace_period_v9,
>   	.query_status		= pm_query_status_v9,
>   	.release_mem		= NULL,
>   	.map_process_size	= sizeof(struct pm4_mes_map_process),
> @@ -340,6 +376,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
>   	.set_resources_size	= sizeof(struct pm4_mes_set_resources),
>   	.map_queues_size	= sizeof(struct pm4_mes_map_queues),
>   	.unmap_queues_size	= sizeof(struct pm4_mes_unmap_queues),
> +	.set_grace_period_size  = sizeof(struct pm4_mec_write_data_mmio),
>   	.query_status_size	= sizeof(struct pm4_mes_query_status),
>   	.release_mem_size	= 0,
>   };
> @@ -350,6 +387,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
>   	.set_resources		= pm_set_resources_v9,
>   	.map_queues		= pm_map_queues_v9,
>   	.unmap_queues		= pm_unmap_queues_v9,
> +	.set_grace_period       = pm_set_grace_period_v9,
>   	.query_status		= pm_query_status_v9,
>   	.release_mem		= NULL,
>   	.map_process_size	= sizeof(struct pm4_mes_map_process_aldebaran),
> @@ -357,6 +395,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
>   	.set_resources_size	= sizeof(struct pm4_mes_set_resources),
>   	.map_queues_size	= sizeof(struct pm4_mes_map_queues),
>   	.unmap_queues_size	= sizeof(struct pm4_mes_unmap_queues),
> +	.set_grace_period_size  = sizeof(struct pm4_mec_write_data_mmio),
>   	.query_status_size	= sizeof(struct pm4_mes_query_status),
>   	.release_mem_size	= 0,
>   };
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
> index a666710ed403..795001c947e1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
> @@ -583,6 +583,71 @@ struct pm4_mec_release_mem {
>   
>   #endif
>   
> +#ifndef PM4_MEC_WRITE_DATA_DEFINED
> +#define PM4_MEC_WRITE_DATA_DEFINED
> +
> +enum WRITE_DATA_dst_sel_enum {
> +	dst_sel___write_data__mem_mapped_register = 0,
> +	dst_sel___write_data__tc_l2 = 2,
> +	dst_sel___write_data__gds = 3,
> +	dst_sel___write_data__memory = 5,
> +	dst_sel___write_data__memory_mapped_adc_persistent_state = 6,
> +};
> +
> +enum WRITE_DATA_addr_incr_enum {
> +	addr_incr___write_data__increment_address = 0,
> +	addr_incr___write_data__do_not_increment_address = 1
> +};
> +
> +enum WRITE_DATA_wr_confirm_enum {
> +	wr_confirm___write_data__do_not_wait_for_write_confirmation = 0,
> +	wr_confirm___write_data__wait_for_write_confirmation = 1
> +};
> +
> +enum WRITE_DATA_cache_policy_enum {
> +	cache_policy___write_data__lru = 0,
> +	cache_policy___write_data__stream = 1
> +};
> +
> +
> +struct pm4_mec_write_data_mmio {
> +	union {
> +		union PM4_MES_TYPE_3_HEADER header;     /*header */
> +		unsigned int ordinal1;
> +	};
> +
> +	union {
> +		struct {
> +			unsigned int reserved1:8;
> +			unsigned int dst_sel:4;
> +			unsigned int reserved2:4;
> +			unsigned int addr_incr:1;
> +			unsigned int reserved3:2;
> +			unsigned int resume_vf:1;
> +			unsigned int wr_confirm:1;
> +			unsigned int reserved4:4;
> +			unsigned int cache_policy:2;
> +			unsigned int reserved5:5;
> +		} bitfields2;
> +		unsigned int ordinal2;
> +	};
> +
> +	union {
> +		struct {
> +			unsigned int dst_mmreg_addr:18;
> +			unsigned int reserved6:14;
> +		} bitfields3;
> +		unsigned int ordinal3;
> +	};
> +
> +	uint32_t reserved7;
> +
> +	uint32_t data;
> +
> +};
> +
> +#endif
> +
>   enum {
>   	CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
>   };
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 62b75ba28425..d557a7ae756c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1300,6 +1300,8 @@ struct packet_manager_funcs {
>   	int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer,
>   			enum kfd_unmap_queues_filter mode,
>   			uint32_t filter_param, bool reset);
> +	int (*set_grace_period)(struct packet_manager *pm, uint32_t *buffer,
> +			uint32_t grace_period);
>   	int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
>   			uint64_t fence_address,	uint64_t fence_value);
>   	int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
> @@ -1310,6 +1312,7 @@ struct packet_manager_funcs {
>   	int set_resources_size;
>   	int map_queues_size;
>   	int unmap_queues_size;
> +	int set_grace_period_size;
>   	int query_status_size;
>   	int release_mem_size;
>   };
> @@ -1332,6 +1335,8 @@ int pm_send_unmap_queue(struct packet_manager *pm,
>   
>   void pm_release_ib(struct packet_manager *pm);
>   
> +int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period);
> +
>   /* Following PM funcs can be shared among VI and AI */
>   unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
>   

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 12/32] drm/amdkfd: prepare map process for single process debug devices
  2023-01-25 19:53 ` [PATCH 12/32] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
@ 2023-03-20 20:06   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-20 20:06 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Older HW only supports debugging on a single process because the
> SPI debug mode setting registers are device global.
>
> The HWS has supplied a single pinned VMID (0xf) for MAP_PROCESS
> for debug purposes. To pin the VMID, the KFD will remove the VMID from
> the HWS dynamic VMID allocation via SET_RESOURCES so that a debugged
> process will never migrate away from its pinned VMID.
>
> The KFD is responsible for reserving and releasing this pinned VMID
> accordingly whenever the debugger attaches and detaches respectively.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 101 +++++++++++++++++-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |   5 +
>   .../drm/amd/amdkfd/kfd_packet_manager_v9.c    |   9 ++
>   .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   |   5 +-
>   4 files changed, 114 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 7556f80d41e4..0cd3a5e9ff25 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1490,7 +1490,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
>   	dqm->active_cp_queue_count = 0;
>   	dqm->gws_queue_count = 0;
>   	dqm->active_runlist = false;
> -	INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
> +	dqm->trap_debug_vmid = 0;

Are you removing the INIT_WORK on purpose here? Looks like a mistake 
that would break GPU recovery.


>   
>   	init_sdma_bitmaps(dqm);
>   
> @@ -1933,8 +1933,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   		if (!dqm->dev->shared_resources.enable_mes) {
>   			decrement_queue_count(dqm, qpd, q);
>   			retval = execute_queues_cpsch(dqm,
> -						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> -						      USE_DEFAULT_GRACE_PERIOD);
> +						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);

Unnecessary formatting change.


>   			if (retval == -ETIME)
>   				qpd->reset_wavefronts = true;
>   		} else {
> @@ -2463,6 +2462,98 @@ static void kfd_process_hw_exception(struct work_struct *work)
>   	amdgpu_amdkfd_gpu_reset(dqm->dev->adev);
>   }
>   
> +int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd)
> +{
> +	int r;
> +	int updated_vmid_mask;
> +
> +	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> +		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
> +		return -EINVAL;
> +	}
> +
> +	dqm_lock(dqm);
> +
> +	if (dqm->trap_debug_vmid != 0) {
> +		pr_err("Trap debug id already reserved\n");
> +		r = -EBUSY;
> +		goto out_unlock;
> +	}
> +
> +	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
> +			USE_DEFAULT_GRACE_PERIOD, false);
> +	if (r)
> +		goto out_unlock;
> +
> +	updated_vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
> +	updated_vmid_mask &= ~(1 << dqm->dev->vm_info.last_vmid_kfd);
> +
> +	dqm->dev->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
> +	dqm->trap_debug_vmid = dqm->dev->vm_info.last_vmid_kfd;
> +	r = set_sched_resources(dqm);
> +	if (r)
> +		goto out_unlock;
> +
> +	r = map_queues_cpsch(dqm);
> +	if (r)
> +		goto out_unlock;
> +
> +	pr_debug("Reserved VMID for trap debug: %i\n", dqm->trap_debug_vmid);
> +
> +out_unlock:
> +	dqm_unlock(dqm);
> +	return r;
> +}
> +
> +/*
> + * Releases vmid for the trap debugger
> + */
> +int release_debug_trap_vmid(struct device_queue_manager *dqm,
> +			struct qcm_process_device *qpd)
> +{
> +	int r;
> +	int updated_vmid_mask;
> +	uint32_t trap_debug_vmid;
> +
> +	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> +		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
> +		return -EINVAL;
> +	}
> +
> +	dqm_lock(dqm);
> +	trap_debug_vmid = dqm->trap_debug_vmid;
> +	if (dqm->trap_debug_vmid == 0) {
> +		pr_err("Trap debug id is not reserved\n");
> +		r = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
> +			USE_DEFAULT_GRACE_PERIOD, false);
> +	if (r)
> +		goto out_unlock;
> +
> +	updated_vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
> +	updated_vmid_mask |= (1 << dqm->dev->vm_info.last_vmid_kfd);
> +
> +	dqm->dev->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
> +	dqm->trap_debug_vmid = 0;
> +	r = set_sched_resources(dqm);
> +	if (r)
> +		goto out_unlock;
> +
> +	r = map_queues_cpsch(dqm);
> +	if (r)
> +		goto out_unlock;
> +
> +	pr_debug("Released VMID for trap debug: %i\n", trap_debug_vmid);
> +
> +out_unlock:
> +	dqm_unlock(dqm);
> +	return r;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   static void seq_reg_dump(struct seq_file *m,
> @@ -2563,8 +2654,8 @@ int dqm_debugfs_hang_hws(struct device_queue_manager *dqm)
>   		return r;
>   	}
>   	dqm->active_runlist = true;
> -	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
> -				0, USE_DEFAULT_GRACE_PERIOD);
> +	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
> +			USE_DEFAULT_GRACE_PERIOD);

Unnecessary formatting change.

Regards,
   Felix


>   	dqm_unlock(dqm);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index fb48b124161f..0cb1504d24cf 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -250,6 +250,7 @@ struct device_queue_manager {
>   	struct kfd_mem_obj	*fence_mem;
>   	bool			active_runlist;
>   	int			sched_policy;
> +	uint32_t		trap_debug_vmid;
>   
>   	/* hw exception  */
>   	bool			is_hws_hang;
> @@ -281,6 +282,10 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
>   unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
>   unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
>   unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
> +int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
> +			struct qcm_process_device *qpd);
> +int release_debug_trap_vmid(struct device_queue_manager *dqm,
> +			struct qcm_process_device *qpd);
>   
>   static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> index f0cdc8695b8c..363cf8e005cc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> @@ -34,6 +34,9 @@ static int pm_map_process_v9(struct packet_manager *pm,
>   {
>   	struct pm4_mes_map_process *packet;
>   	uint64_t vm_page_table_base_addr = qpd->page_table_base;
> +	struct kfd_dev *kfd = pm->dqm->dev;
> +	struct kfd_process_device *pdd =
> +			container_of(qpd, struct kfd_process_device, qpd);
>   
>   	packet = (struct pm4_mes_map_process *)buffer;
>   	memset(buffer, 0, sizeof(struct pm4_mes_map_process));
> @@ -49,6 +52,12 @@ static int pm_map_process_v9(struct packet_manager *pm,
>   	packet->bitfields14.sdma_enable = 1;
>   	packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
>   
> +	if (kfd->dqm->trap_debug_vmid && pdd->process->debug_trap_enabled &&
> +			pdd->process->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
> +		packet->bitfields2.debug_vmid = kfd->dqm->trap_debug_vmid;
> +		packet->bitfields2.new_debug = 1;
> +	}
> +
>   	packet->sh_mem_config = qpd->sh_mem_config;
>   	packet->sh_mem_bases = qpd->sh_mem_bases;
>   	if (qpd->tba_addr) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
> index 795001c947e1..bb6edbc27de7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
> @@ -146,7 +146,10 @@ struct pm4_mes_map_process {
>   	union {
>   		struct {
>   			uint32_t pasid:16;
> -			uint32_t reserved1:8;
> +			uint32_t reserved1:2;
> +			uint32_t debug_vmid:4;
> +			uint32_t new_debug:1;
> +			uint32_t reserved2:1;
>   			uint32_t diq_enable:1;
>   			uint32_t process_quantum:7;
>   		} bitfields2;

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 13/32] drm/amdgpu: prepare map process for multi-process debug devices
  2023-01-25 19:53 ` [PATCH 13/32] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
@ 2023-03-20 20:16   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-20 20:16 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Unlike single process debug devices, multi-process debug devices allow
> debug mode setting per-VMID (non-device-global).
>
> Because the HWS manages PASID-VMID mapping, the new MAP_PROCESS API allows
> the KFD to forward the required SPI debug register write requests.
>
> To request a new debug mode setting change, the KFD must be able to
> preempt all queues then remap all queues with these new setting
> requests for MAP_PROCESS to take effect.
>
> Note that by default, trap enablement in non-debug mode must be disabled
> for performance reasons for multi-process debug devices due to setup
> overhead in FW.
>
> v2: remove asic family code name comment in per vmid support check
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  7 +++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 50 +++++++++++++++++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
>   .../drm/amd/amdkfd/kfd_packet_manager_v9.c    | 15 ++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  9 ++++
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  5 ++
>   6 files changed, 89 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 8aa7a3ad4e97..53c5a3e55bd2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -32,5 +32,12 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
>   int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   			void __user *runtime_info,
>   			uint32_t *runtime_info_size);
> +
> +static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
> +{
> +	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
> +}
> +
>   void debug_event_write_work_handler(struct work_struct *work);
> +
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 0cd3a5e9ff25..2517716d7cbc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -2554,6 +2554,56 @@ int release_debug_trap_vmid(struct device_queue_manager *dqm,
>   	return r;
>   }
>   
> +int debug_lock_and_unmap(struct device_queue_manager *dqm)
> +{
> +	int r;
> +
> +	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> +		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
> +		return -EINVAL;
> +	}
> +
> +	if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
> +		return 0;
> +
> +	dqm_lock(dqm);
> +
> +	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, 0, false);
> +	if (r)
> +		dqm_unlock(dqm);
> +
> +	return r;
> +}
> +
> +int debug_map_and_unlock(struct device_queue_manager *dqm)
> +{
> +	int r;
> +
> +	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> +		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
> +		return -EINVAL;
> +	}
> +
> +	if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
> +		return 0;
> +
> +	r = map_queues_cpsch(dqm);
> +
> +	dqm_unlock(dqm);
> +
> +	return r;
> +}
> +
> +int debug_refresh_runlist(struct device_queue_manager *dqm)
> +{
> +	int r = debug_lock_and_unmap(dqm);
> +
> +	if (r)
> +		return r;
> +
> +	return debug_map_and_unlock(dqm);
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   static void seq_reg_dump(struct seq_file *m,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 0cb1504d24cf..bef3be84c5cc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -286,6 +286,9 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
>   			struct qcm_process_device *qpd);
>   int release_debug_trap_vmid(struct device_queue_manager *dqm,
>   			struct qcm_process_device *qpd);
> +int debug_lock_and_unmap(struct device_queue_manager *dqm);
> +int debug_map_and_unlock(struct device_queue_manager *dqm);
> +int debug_refresh_runlist(struct device_queue_manager *dqm);
>   
>   static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> index 363cf8e005cc..f19c506da23d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> @@ -88,6 +88,10 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
>   {
>   	struct pm4_mes_map_process_aldebaran *packet;
>   	uint64_t vm_page_table_base_addr = qpd->page_table_base;
> +	struct kfd_dev *kfd = pm->dqm->dev;
> +	struct kfd_process_device *pdd =
> +			container_of(qpd, struct kfd_process_device, qpd);
> +	int i;
>   
>   	packet = (struct pm4_mes_map_process_aldebaran *)buffer;
>   	memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran));
> @@ -102,6 +106,17 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
>   	packet->bitfields14.num_oac = qpd->num_oac;
>   	packet->bitfields14.sdma_enable = 1;
>   	packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
> +	/* TRAP_EN is set on boot so keep it set in non-debug mode. */

I don't understand how the comment relates to the code here. Where is 
that TRAP_ENABLE bit being set or preserved?

Regards,
   Felix


> +	packet->spi_gdbg_per_vmid_cntl = pdd->spi_dbg_override |
> +						pdd->spi_dbg_launch_mode;
> +
> +	if (pdd->process->debug_trap_enabled) {
> +		for (i = 0; i < kfd->device_info.num_of_watch_points; i++)
> +			packet->tcp_watch_cntl[i] = pdd->watch_points[i];
> +
> +		packet->bitfields2.single_memops =
> +				!!(pdd->process->dbg_flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP);
> +	}
>   
>   	packet->sh_mem_config = qpd->sh_mem_config;
>   	packet->sh_mem_bases = qpd->sh_mem_bases;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index d557a7ae756c..8f1e2f9023db 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -782,6 +782,12 @@ struct kfd_process_device {
>   	uint64_t faults;
>   	uint64_t page_in;
>   	uint64_t page_out;
> +
> +	/* Tracks debug per-vmid request settings */
> +	uint32_t spi_dbg_override;
> +	uint32_t spi_dbg_launch_mode;
> +	uint32_t watch_points[4];
> +
>   	/*
>   	 * If this process has been checkpointed before, then the user
>   	 * application will use the original gpu_id on the
> @@ -918,6 +924,9 @@ struct kfd_process {
>   
>   	bool xnack_enabled;
>   
> +	/* Tracks debug per-vmid request for debug flags */
> +	bool dbg_flags;
> +
>   	/* Work area for debugger event writer worker. */
>   	struct work_struct debug_event_workarea;
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index e935158ab311..94c6545a58b4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1556,6 +1556,11 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>   	}
>   
>   	p->pdds[p->n_pdds++] = pdd;
> +	if (kfd_dbg_is_per_vmid_supported(pdd->dev))
> +		pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap(
> +							pdd->dev->adev,
> +							false,
> +							0);
>   
>   	/* Init idr used for memory handle translation */
>   	idr_init(&pdd->alloc_idr);

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 14/32] drm/amdgpu: expose debug api for mes
  2023-01-25 19:53 ` [PATCH 14/32] drm/amdgpu: expose debug api for mes Jonathan Kim
@ 2023-03-20 20:47   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-20 20:47 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel

On 2023-01-25 14:53, Jonathan Kim wrote:
> Similar to the F32 HWS, the RS64 HWS for GFX11 now supports a multi-process
> debug API.
>
> The skip_process_ctx_clear ADD_QUEUE requirement is to prevent the MES
> from clearing the process context when the first queue is added to the
> scheduler in order to maintain debug mode settings during queue preemption
> and restore.  The MES clears the process context in this case due to an
> unresolved FW caching bug during normal mode operations.
> During debug mode, the KFD will hold a reference to the target process
> so the process context should never go stale and MES can afford to skip
> this requirement.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       | 32 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       | 20 ++++++++++++
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        | 12 +++++++
>   drivers/gpu/drm/amd/include/mes_v11_api_def.h | 21 +++++++++++-
>   4 files changed, 84 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 82e27bd4f038..4916e0b0156f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -924,6 +924,38 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg,
>   	return r;
>   }
>   
> +int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
> +				uint64_t process_context_addr,
> +				uint32_t spi_gdbg_per_vmid_cntl,
> +				const uint32_t *tcp_watch_cntl,
> +				uint32_t flags)
> +{
> +	struct mes_misc_op_input op_input = {0};
> +	int r;
> +
> +	if (!adev->mes.funcs->misc_op) {
> +		DRM_ERROR("mes set shader debugger is not supported!\n");
> +		return -EINVAL;
> +	}
> +
> +	op_input.op = MES_MISC_OP_SET_SHADER_DEBUGGER;
> +	op_input.set_shader_debugger.process_context_addr = process_context_addr;
> +	op_input.set_shader_debugger.flags.u32all = flags;
> +	op_input.set_shader_debugger.spi_gdbg_per_vmid_cntl = spi_gdbg_per_vmid_cntl;
> +	memcpy(op_input.set_shader_debugger.tcp_watch_cntl, tcp_watch_cntl,
> +			sizeof(op_input.set_shader_debugger.tcp_watch_cntl));
> +
> +	amdgpu_mes_lock(&adev->mes);
> +
> +	r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
> +	if (r)
> +		DRM_ERROR("failed to set_shader_debugger\n");
> +
> +	amdgpu_mes_unlock(&adev->mes);
> +
> +	return r;
> +}
> +
>   static void
>   amdgpu_mes_ring_to_queue_props(struct amdgpu_device *adev,
>   			       struct amdgpu_ring *ring,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 547ec35691fa..d20df0cf0d88 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -256,6 +256,7 @@ enum mes_misc_opcode {
>   	MES_MISC_OP_READ_REG,
>   	MES_MISC_OP_WRM_REG_WAIT,
>   	MES_MISC_OP_WRM_REG_WR_WAIT,
> +	MES_MISC_OP_SET_SHADER_DEBUGGER,
>   };
>   
>   struct mes_misc_op_input {
> @@ -278,6 +279,20 @@ struct mes_misc_op_input {
>   			uint32_t                   reg0;
>   			uint32_t                   reg1;
>   		} wrm_reg;
> +
> +		struct {
> +			uint64_t process_context_addr;
> +			union {
> +				struct {
> +					uint64_t single_memop : 1;
> +					uint64_t single_alu_op : 1;
> +					uint64_t reserved: 30;
> +				};
> +				uint32_t u32all;
> +			} flags;
> +			uint32_t spi_gdbg_per_vmid_cntl;
> +			uint32_t tcp_watch_cntl[4];
> +		} set_shader_debugger;
>   	};
>   };
>   
> @@ -340,6 +355,11 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg,
>   int amdgpu_mes_reg_write_reg_wait(struct amdgpu_device *adev,
>   				  uint32_t reg0, uint32_t reg1,
>   				  uint32_t ref, uint32_t mask);
> +int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
> +				uint64_t process_context_addr,
> +				uint32_t spi_gdbg_per_vmid_cntl,
> +				const uint32_t *tcp_watch_cntl,
> +				uint32_t flags);
>   
>   int amdgpu_mes_add_ring(struct amdgpu_device *adev, int gang_id,
>   			int queue_type, int idx,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 62cdd2113135..fbacdc42efac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -334,6 +334,18 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes,
>   		misc_pkt.wait_reg_mem.reg_offset1 = input->wrm_reg.reg0;
>   		misc_pkt.wait_reg_mem.reg_offset2 = input->wrm_reg.reg1;
>   		break;
> +	case MES_MISC_OP_SET_SHADER_DEBUGGER:
> +		misc_pkt.opcode = MESAPI_MISC__SET_SHADER_DEBUGGER;
> +		misc_pkt.set_shader_debugger.process_context_addr =
> +				input->set_shader_debugger.process_context_addr;
> +		misc_pkt.set_shader_debugger.flags.u32all =
> +				input->set_shader_debugger.flags.u32all;
> +		misc_pkt.set_shader_debugger.spi_gdbg_per_vmid_cntl =
> +				input->set_shader_debugger.spi_gdbg_per_vmid_cntl;
> +		memcpy(misc_pkt.set_shader_debugger.tcp_watch_cntl,
> +				input->set_shader_debugger.tcp_watch_cntl,
> +				sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl));
> +		break;
>   	default:
>   		DRM_ERROR("unsupported misc op (%d) \n", input->op);
>   		return -EINVAL;
> diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> index dc694cb246d9..f3c15f18ddb5 100644
> --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> @@ -274,7 +274,8 @@ union MESAPI__ADD_QUEUE {
>   			uint32_t is_kfd_process		: 1;
>   			uint32_t trap_en		: 1;
>   			uint32_t is_aql_queue		: 1;
> -			uint32_t reserved		: 20;
> +			uint32_t skip_process_ctx_clear : 1;
> +			uint32_t reserved		: 19;
>   		};
>   		struct MES_API_STATUS		api_status;
>   		uint64_t                        tma_addr;
> @@ -523,6 +524,7 @@ enum MESAPI_MISC_OPCODE {
>   	MESAPI_MISC__QUERY_STATUS,
>   	MESAPI_MISC__READ_REG,
>   	MESAPI_MISC__WAIT_REG_MEM,
> +	MESAPI_MISC__SET_SHADER_DEBUGGER,
>   	MESAPI_MISC__MAX,
>   };
>   
> @@ -561,6 +563,20 @@ struct QUERY_STATUS {
>   	uint32_t context_id;
>   };
>   
> +struct SET_SHADER_DEBUGGER {
> +	uint64_t process_context_addr;
> +	union {
> +		struct {
> +			uint32_t single_memop : 1;  /* SQ_DEBUG.single_memop */
> +			uint32_t single_alu_op : 1; /* SQ_DEBUG.single_alu_op */
> +			uint32_t reserved : 30;
> +		};
> +		uint32_t u32all;
> +	} flags;
> +	uint32_t spi_gdbg_per_vmid_cntl;
> +	uint32_t tcp_watch_cntl[4]; /* TCP_WATCHx_CNTL */
> +};
> +
>   union MESAPI__MISC {
>   	struct {
>   		union MES_API_HEADER	header;
> @@ -573,6 +589,9 @@ union MESAPI__MISC {
>   			struct		QUERY_STATUS query_status;
>   			struct		READ_REG read_reg;
>   			struct          WAIT_REG_MEM wait_reg_mem;
> +			struct		SET_SHADER_DEBUGGER set_shader_debugger;
> +			enum MES_AMD_PRIORITY_LEVEL queue_sch_level;
> +
>   			uint32_t	data[MISC_DATA_MAX_SIZE_IN_DWORDS];
>   		};
>   	};

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11
  2023-01-25 19:53 ` [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11 Jonathan Kim
@ 2023-03-20 21:49   ` Felix Kuehling
  2023-03-23 13:50     ` Kim, Jonathan
  0 siblings, 1 reply; 68+ messages in thread
From: Felix Kuehling @ 2023-03-20 21:49 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Due to a HW bug, waves in only half the shader arrays can enter trap.
>
> When starting a debug session, relocate all waves to the first shader
> array of each shader engine and mask off the 2nd shader array as
> unavailable.
>
> When ending a debug session, re-enable the 2nd shader array per
> shader engine.
>
> User CU masking per queue cannot be guaranteed to remain functional
> if requested during debugging (e.g. user cu mask requests only 2nd shader
> array as an available resource leading to zero HW resources available)
> nor can runtime be alerted of any of these changes during execution.
>
> Make user CU masking and debugging mutually exclusive with respect to
> availability.
>
> If the debugger tries to attach to a process with a user cu masked
> queue, return the runtime status as enabled but busy.
>
> If the debugger tries to attach and fails to reallocate queue waves to
> the first shader array of each shader engine, return the runtime status
> as enabled but with an error.
>
> In addition, like any other multi-process debug supported devices,
> disable trap temporary setup per-process to avoid performance impact from
> setup overhead.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |  2 +
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        |  7 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  2 -
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 64 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  3 +-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |  7 ++
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  3 +-
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  3 +-
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  | 42 ++++++++----
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  3 +-
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +-
>   .../amd/amdkfd/kfd_process_queue_manager.c    |  9 ++-
>   13 files changed, 124 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index d20df0cf0d88..b5f5eed2b5ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -219,6 +219,8 @@ struct mes_add_queue_input {
>   	uint32_t        gws_size;
>   	uint64_t	tba_addr;
>   	uint64_t	tma_addr;
> +	uint32_t	trap_en;
> +	uint32_t	skip_process_ctx_clear;
>   	uint32_t	is_kfd_process;
>   	uint32_t	is_aql_queue;
>   	uint32_t	queue_size;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index fbacdc42efac..38c7a0cbf264 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -197,17 +197,14 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
>   	mes_add_queue_pkt.gws_size = input->gws_size;
>   	mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
>   	mes_add_queue_pkt.tma_addr = input->tma_addr;
> +	mes_add_queue_pkt.trap_en = input->trap_en;
> +	mes_add_queue_pkt.skip_process_ctx_clear = input->skip_process_ctx_clear;
>   	mes_add_queue_pkt.is_kfd_process = input->is_kfd_process;
>   
>   	/* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */
>   	mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
>   	mes_add_queue_pkt.gds_size = input->queue_size;
>   
> -	if (!(((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 4) &&
> -		  (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) &&
> -		  (adev->ip_versions[GC_HWIP][0] <= IP_VERSION(11, 0, 3))))
> -		mes_add_queue_pkt.trap_en = 1;
> -
>   	/* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */
>   	mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
>   	mes_add_queue_pkt.gds_size = input->queue_size;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index ee05c2e54ef6..f5f639de28f0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -530,8 +530,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
>   		goto out;
>   	}
>   
> -	minfo.update_flag = UPDATE_FLAG_CU_MASK;
> -
>   	mutex_lock(&p->mutex);
>   
>   	retval = pqm_update_mqd(&p->pqm, args->queue_id, &minfo);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index f6ea6db266b4..6e99a0160275 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -37,6 +37,70 @@ void debug_event_write_work_handler(struct work_struct *work)
>   	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
>   }
>   
> +static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
> +{
> +	struct mqd_update_info minfo = {0};
> +	int err;
> +
> +	if (!q || (!q->properties.is_dbg_wa && !enable))

Should this condition be:

     if (!q || q->properties.is_dbg_wa != enable)


> +		return 0;
> +
> +	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
> +			KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))

Indentation. It would be more readable if the KFD_GC_VERSIONs were aligned.


> +		return 0;
> +
> +	if (enable && q->properties.is_user_cu_masked)
> +		return -EBUSY;
> +
> +	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
> +
> +	q->properties.is_dbg_wa = enable;
> +	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
> +	if (err)
> +		q->properties.is_dbg_wa = false;
> +
> +	return err;
> +}
> +
> +static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
> +{
> +	struct process_queue_manager *pqm = &target->pqm;
> +	struct process_queue_node *pqn;
> +	int r = 0;
> +
> +	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
> +		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
> +		if (enable && r)
> +			goto unwind;
> +	}
> +
> +	return 0;
> +
> +unwind:
> +	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
> +		kfd_dbg_set_queue_workaround(pqn->q, false);
> +
> +	if (enable) {
> +		target->runtime_info.runtime_state = r == -EBUSY ?
> +				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
> +				DEBUG_RUNTIME_STATE_ENABLED_ERROR;
> +	}

Braces are not needed here.


> +
> +	return r;
> +}
> +
> +static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
> +{
> +	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
> +	uint32_t flags = pdd->process->dbg_flags;
> +
> +	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
> +		return 0;
> +
> +	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
> +						pdd->watch_points, flags);
> +}
> +

You're adding some unused static functions here. This will cause compile 
warnings until the patch that starts using them. You could avoid this by 
reordering this and the next patch and moving the function calls into 
this patch. That would also make it more obvious where the workaround 
plugs into the debug code.

Regards,
   Felix


>   int kfd_dbg_trap_disable(struct kfd_process *target)
>   {
>   	if (!target->debug_trap_enabled)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 53c5a3e55bd2..0c09f1729325 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -35,7 +35,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   
>   static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
>   {
> -	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
> +	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
> +	       KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0);
>   }
>   
>   void debug_event_write_work_handler(struct work_struct *work);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 2517716d7cbc..be1985b87ea7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -214,6 +214,10 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	queue_input.paging = false;
>   	queue_input.tba_addr = qpd->tba_addr;
>   	queue_input.tma_addr = qpd->tma_addr;
> +	queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
> +			      KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0) ||
> +			      q->properties.is_dbg_wa;
> +	queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled;
>   
>   	queue_type = convert_to_mes_queue_type(q->properties.type);
>   	if (queue_type < 0) {
> @@ -1679,6 +1683,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   	 * updates the is_evicted flag but is a no-op otherwise.
>   	 */
>   	q->properties.is_evicted = !!qpd->evicted;
> +	q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled &&
> +			KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) &&
> +			KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0);
>   
>   	if (qd)
>   		mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> index 4889865c725c..c2a7226fc588 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> @@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>   	struct cik_mqd *m;
>   	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>   
> -	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> -	    !minfo->cu_mask.ptr)
> +	if (!minfo || !minfo->cu_mask.ptr)
>   		return;
>   
>   	mqd_symmetrically_map_cu_mask(mm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> index cb484ace17de..8248e77751e7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> @@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>   	struct v10_compute_mqd *m;
>   	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>   
> -	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> -	    !minfo->cu_mask.ptr)
> +	if (!minfo || !minfo->cu_mask.ptr)
>   		return;
>   
>   	mqd_symmetrically_map_cu_mask(mm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> index ac7c8fc83c94..18ab613e787c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> @@ -46,15 +46,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>   {
>   	struct v11_compute_mqd *m;
>   	uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
> +	bool has_wa_flag = minfo && (minfo->update_flag & (UPDATE_FLAG_DBG_WA_ENABLE |
> +			UPDATE_FLAG_DBG_WA_DISABLE));
>   
> -	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> -	    !minfo->cu_mask.ptr)
> +	if (!minfo || !(has_wa_flag || minfo->cu_mask.ptr))
>   		return;
>   
> +	m = get_mqd(mqd);
> +
> +	if (has_wa_flag) {
> +		uint32_t wa_mask = minfo->update_flag == UPDATE_FLAG_DBG_WA_ENABLE ?
> +						0xffff : 0xffffffff;
> +
> +		m->compute_static_thread_mgmt_se0 = wa_mask;
> +		m->compute_static_thread_mgmt_se1 = wa_mask;
> +		m->compute_static_thread_mgmt_se2 = wa_mask;
> +		m->compute_static_thread_mgmt_se3 = wa_mask;
> +		m->compute_static_thread_mgmt_se4 = wa_mask;
> +		m->compute_static_thread_mgmt_se5 = wa_mask;
> +		m->compute_static_thread_mgmt_se6 = wa_mask;
> +		m->compute_static_thread_mgmt_se7 = wa_mask;
> +
> +		return;
> +	}
> +
>   	mqd_symmetrically_map_cu_mask(mm,
>   		minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask);
>   
> -	m = get_mqd(mqd);
>   	m->compute_static_thread_mgmt_se0 = se_mask[0];
>   	m->compute_static_thread_mgmt_se1 = se_mask[1];
>   	m->compute_static_thread_mgmt_se2 = se_mask[2];
> @@ -109,6 +127,7 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
>   	uint64_t addr;
>   	struct v11_compute_mqd *m;
>   	int size;
> +	uint32_t wa_mask = q->is_dbg_wa ? 0xffff : 0xffffffff;
>   
>   	m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr;
>   	addr = mqd_mem_obj->gpu_addr;
> @@ -122,14 +141,15 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
>   
>   	m->header = 0xC0310800;
>   	m->compute_pipelinestat_enable = 1;
> -	m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
> -	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
> -	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
> -	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
> -	m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
> -	m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
> -	m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
> -	m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;
> +
> +	m->compute_static_thread_mgmt_se0 = wa_mask;
> +	m->compute_static_thread_mgmt_se1 = wa_mask;
> +	m->compute_static_thread_mgmt_se2 = wa_mask;
> +	m->compute_static_thread_mgmt_se3 = wa_mask;
> +	m->compute_static_thread_mgmt_se4 = wa_mask;
> +	m->compute_static_thread_mgmt_se5 = wa_mask;
> +	m->compute_static_thread_mgmt_se6 = wa_mask;
> +	m->compute_static_thread_mgmt_se7 = wa_mask;
>   
>   	m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
>   			0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> index 86f1cf090246..50da16dd4c96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> @@ -49,8 +49,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>   	struct v9_mqd *m;
>   	uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
>   
> -	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> -	    !minfo->cu_mask.ptr)
> +	if (!minfo || !minfo->cu_mask.ptr)
>   		return;
>   
>   	mqd_symmetrically_map_cu_mask(mm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> index 530ba6f5b57e..58b40bff3e0c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> @@ -51,8 +51,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>   	struct vi_mqd *m;
>   	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>   
> -	if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> -	    !minfo->cu_mask.ptr)
> +	if (!minfo || !minfo->cu_mask.ptr)
>   		return;
>   
>   	mqd_symmetrically_map_cu_mask(mm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 8f1e2f9023db..75521d96e937 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -479,6 +479,8 @@ struct queue_properties {
>   	bool is_evicted;
>   	bool is_active;
>   	bool is_gws;
> +	bool is_dbg_wa;
> +	bool is_user_cu_masked;
>   	/* Not relevant for user mode queues in cp scheduling */
>   	unsigned int vmid;
>   	/* Relevant only for sdma queues*/
> @@ -501,7 +503,8 @@ struct queue_properties {
>   			    !(q).is_evicted)
>   
>   enum mqd_update_flag {
> -	UPDATE_FLAG_CU_MASK = 0,
> +	UPDATE_FLAG_DBG_WA_ENABLE = 1,
> +	UPDATE_FLAG_DBG_WA_DISABLE = 2,
>   };
>   
>   struct mqd_update_info {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 5137476ec18e..d8f032214481 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -498,8 +498,12 @@ int pqm_update_mqd(struct process_queue_manager *pqm,
>   		return -EFAULT;
>   	}
>   
> +	/* CUs are masked for debugger requirements so deny user mask  */
> +	if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr)
> +		return -EBUSY;
> +
>   	/* ASICs that have WGPs must enforce pairwise enabled mask checks. */
> -	if (minfo && minfo->update_flag == UPDATE_FLAG_CU_MASK && minfo->cu_mask.ptr &&
> +	if (minfo && minfo->cu_mask.ptr &&
>   			KFD_GC_VERSION(pqn->q->device) >= IP_VERSION(10, 0, 0)) {
>   		int i;
>   
> @@ -518,6 +522,9 @@ int pqm_update_mqd(struct process_queue_manager *pqm,
>   	if (retval != 0)
>   		return retval;
>   
> +	if (minfo && minfo->cu_mask.ptr)
> +		pqn->q->properties.is_user_cu_masked = true;
> +
>   	return 0;
>   }
>   

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 16/32] drm/amdkfd: add per process hw trap enable and disable functions
  2023-01-25 19:53 ` [PATCH 16/32] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
@ 2023-03-20 23:06   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-20 23:06 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> To enable HW debug mode per process, all devices must be debug enabled
> successfully.  If a failure occurs, rewind the enablement of debug mode
> on the enabled devices.
>
> A power management scenario that needs to be considered is HW
> debug mode setting during GFXOFF.  During GFXOFF, these registers
> will be unreachable so we have to transiently disable GFXOFF when
> setting.  Also, some devices don't support the RLC save restore
> function for these debug registers so we have to disable GFXOFF
> completely during a debug session.
>
> Cooperative launch also has debugging restrictions based on HW/FW bugs.
> If such bugs exist, the debugger cannot attach to a process that uses GWS
> resources nor can GWS resources be requested if a process is being
> debugged.
>
> Multi-process debug devices can only enable trap temporaries based
> on certain runtime scenarios, which will be explained when the
> runtime enable functions are implemented in a follow up patch.
>
> v2: add gfx11 support. fix fw checks. remove asic family name comments.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |   5 +
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 148 +++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  29 ++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |   1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   9 ++
>   5 files changed, 190 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f5f639de28f0..628178126d3b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1453,6 +1453,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
>   		goto out_unlock;
>   	}
>   
> +	if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
> +		retval = -EBUSY;
> +		goto out_unlock;
> +	}
> +
>   	retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
>   	mutex_unlock(&p->mutex);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 6e99a0160275..659dfc7411fe 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -21,6 +21,7 @@
>    */
>   
>   #include "kfd_debug.h"
> +#include "kfd_device_queue_manager.h"
>   #include <linux/file.h>
>   
>   void debug_event_write_work_handler(struct work_struct *work)
> @@ -101,11 +102,68 @@ static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
>   						pdd->watch_points, flags);
>   }
>   
> +/* kfd_dbg_trap_deactivate:
> + *	target: target process
> + *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
> + *	unwind_count:
> + *		If unwind == true, how far down the pdd list we need
> + *				to unwind
> + *		else: ignored
> + */
> +static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
> +{
> +	int i, count = 0;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		/* If this is an unwind, and we have unwound the required
> +		 * enable calls on the pdd list, we need to stop now
> +		 * otherwise we may mess up another debugger session.
> +		 */
> +		if (unwind && count == unwind_count)
> +			break;
> +
> +		/* GFX off is already disabled by debug activate if not RLC restore supported. */
> +		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> +			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +		pdd->spi_dbg_override =
> +				pdd->dev->kfd2kgd->disable_debug_trap(
> +				pdd->dev->adev,
> +				target->runtime_info.ttmp_setup,
> +				pdd->dev->vm_info.last_vmid_kfd);
> +		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> +			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

Shouldn't this reenable GFXOFF unconditionally? It should not stay 
disabled on devices without RLC restore support, because we're ending 
the debug session here.


> +
> +		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
> +				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
> +			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
> +
> +		if (!pdd->dev->shared_resources.enable_mes)
> +			debug_refresh_runlist(pdd->dev->dqm);
> +		else
> +			kfd_dbg_set_mes_debug_mode(pdd);
> +
> +		count++;

Isn't count the same as i? Why do we need a separate variable here?

Regards,
   Felix


> +	}
> +
> +	kfd_dbg_set_workaround(target, false);
> +}
> +
>   int kfd_dbg_trap_disable(struct kfd_process *target)
>   {
>   	if (!target->debug_trap_enabled)
>   		return 0;
>   
> +	/*
> +	 * Defer deactivation to runtime if runtime not enabled otherwise reset
> +	 * attached running target runtime state to enable for re-attach.
> +	 */
> +	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
> +		kfd_dbg_trap_deactivate(target, false, 0);
> +	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
> +		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
> +
>   	fput(target->dbg_ev_file);
>   	target->dbg_ev_file = NULL;
>   
> @@ -120,16 +178,96 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
>   	return 0;
>   }
>   
> +static int kfd_dbg_trap_activate(struct kfd_process *target)
> +{
> +	int i, r = 0, unwind_count = 0;
> +
> +	r = kfd_dbg_set_workaround(target, true);
> +	if (r)
> +		return r;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
> +			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
> +
> +			if (r) {
> +				target->runtime_info.runtime_state = (r == -EBUSY) ?
> +							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
> +							DEBUG_RUNTIME_STATE_ENABLED_ERROR;
> +
> +				goto unwind_err;
> +			}
> +		}
> +
> +		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
> +		 * If RLC restore of debug registers is not supported and runtime enable
> +		 * hasn't done so already on ttmp setup request, restore the trap config registers.
> +		 *
> +		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
> +		 * the debug session.
> +		 */
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
> +						target->runtime_info.ttmp_setup))
> +			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
> +								pdd->dev->vm_info.last_vmid_kfd);
> +
> +		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
> +					pdd->dev->adev,
> +					false,
> +					pdd->dev->vm_info.last_vmid_kfd);
> +
> +		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> +			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +		if (!pdd->dev->shared_resources.enable_mes)
> +			r = debug_refresh_runlist(pdd->dev->dqm);
> +		else
> +			r = kfd_dbg_set_mes_debug_mode(pdd);
> +
> +		if (r) {
> +			target->runtime_info.runtime_state =
> +					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
> +			goto unwind_err;
> +		}
> +
> +		/* Increment unwind_count as the last step */
> +		unwind_count++;

Similar to above. I think unwind_count is redundant. It'll have the same 
value as "i" in the next loop iteration.


> +	}
> +
> +	return 0;
> +
> +unwind_err:
> +	/* Enabling debug failed, we need to disable on
> +	 * all GPUs so the enable is all or nothing.
> +	 */
> +	kfd_dbg_trap_deactivate(target, true, unwind_count);
> +	return r;
> +}
> +
>   int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   			void __user *runtime_info, uint32_t *runtime_size)
>   {
>   	struct file *f;
>   	uint32_t copy_size;
> -	int r = 0;
> +	int i, r = 0;
>   
>   	if (target->debug_trap_enabled)
>   		return -EALREADY;
>   
> +	/* Enable pre-checks */
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		if (!KFD_IS_SOC15(pdd->dev))
> +			return -ENODEV;
> +
> +		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
> +			return -EBUSY;
> +	}
> +
>   	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
>   
>   	f = fget(fd);
> @@ -140,6 +278,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   
>   	target->dbg_ev_file = f;
>   
> +	/* defer activation to runtime if not runtime enabled */
> +	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
> +		kfd_dbg_trap_activate(target);
> +
>   	/* We already hold the process reference but hold another one for the
>   	 * debug session.
>   	 */
> @@ -149,8 +291,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   	if (target->debugger_process)
>   		atomic_inc(&target->debugger_process->debugged_process_count);
>   
> -	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
> +	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
> +		kfd_dbg_trap_deactivate(target, false, 0);
>   		r = -EFAULT;
> +	}
>   
>   	*runtime_size = sizeof(target->runtime_info);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 0c09f1729325..f199698d8d60 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -41,4 +41,33 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
>   
>   void debug_event_write_work_handler(struct work_struct *work);
>   
> +/*
> + * If GFX off is enabled, chips that do not support RLC restore for the debug
> + * registers will disable GFX off temporarily for the entire debug session.
> + * See disable_on_trap_action_entry and enable_on_trap_action_exit for details.
> + */
> +static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_dev *dev)
> +{
> +	return !(KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 10) ||
> +		 KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1));
> +}
> +
> +static inline bool kfd_dbg_has_gws_support(struct kfd_dev *dev)
> +{
> +	if ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
> +			&& dev->mec2_fw_version < 0x81b6) ||
> +		(KFD_GC_VERSION(dev) >= IP_VERSION(9, 1, 0)
> +			&& KFD_GC_VERSION(dev) <= IP_VERSION(9, 2, 2)
> +			&& dev->mec2_fw_version < 0x1b6) ||
> +		(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0)
> +			&& dev->mec2_fw_version < 0x1b6) ||
> +		(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1)
> +			&& dev->mec2_fw_version < 0x30) ||
> +		(KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) &&
> +			KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0)))
> +		return false;
> +
> +	/* Assume debugging and cooperative launch supported otherwise. */
> +	return true;
> +}
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index be1985b87ea7..3b747e51684e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -36,6 +36,7 @@
>   #include "kfd_kernel_queue.h"
>   #include "amdgpu_amdkfd.h"
>   #include "mes_api_def.h"
> +#include "kfd_debug.h"
>   
>   /* Size of the per-pipe EOP queue */
>   #define CIK_HPD_EOP_BYTES_LOG2 11
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 94c6545a58b4..0ef2d00af8b1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1181,6 +1181,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>   					struct mm_struct *mm)
>   {
>   	struct kfd_process *p;
> +	int i;
>   
>   	/*
>   	 * The kfd_process structure can not be free because the
> @@ -1198,6 +1199,14 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>   	cancel_delayed_work_sync(&p->eviction_work);
>   	cancel_delayed_work_sync(&p->restore_work);
>   
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +
> +		/* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */
> +		if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup)
> +			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +	}
> +
>   	/* Indicate to other users that MM is no longer valid */
>   	p->mm = NULL;
>   

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 17/32] drm/amdkfd: add raise exception event function
  2023-01-25 19:53 ` [PATCH 17/32] drm/amdkfd: add raise exception event function Jonathan Kim
@ 2023-03-20 23:18   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-20 23:18 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Exception events can be generated from interrupts or queue activity.
>
> The raise event function will save exception status of a queue, device
> or process then notify the debugger of the status change by writing to
> a debugger polled file descriptor that the debugger provides during
> debug attach.
>
> For memory violation exceptions, extra exception data will be saved.
>
> The debugger will be able to query the saved exception states by a query
> operation that will be provided by follow up patches.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 91 +++++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h |  5 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  7 ++
>   3 files changed, 102 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 659dfc7411fe..fcd064b13f6a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -38,6 +38,93 @@ void debug_event_write_work_handler(struct work_struct *work)
>   	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
>   }
>   
> +/* update process/device/queue exception status, write to descriptor
> + * only if exception_status is enabled.
> + */
> +bool kfd_dbg_ev_raise(uint64_t event_mask,
> +			struct kfd_process *process, struct kfd_dev *dev,
> +			unsigned int source_id, bool use_worker,
> +			void *exception_data, size_t exception_data_size)
> +{
> +	struct process_queue_manager *pqm;
> +	struct process_queue_node *pqn;
> +	int i;
> +	static const char write_data = '.';
> +	loff_t pos = 0;
> +	bool is_subscribed = true;
> +
> +	if (!(process && process->debug_trap_enabled))
> +		return false;
> +
> +	mutex_lock(&process->event_mutex);
> +
> +	if (event_mask & KFD_EC_MASK_DEVICE) {
> +		for (i = 0; i < process->n_pdds; i++) {
> +			struct kfd_process_device *pdd = process->pdds[i];
> +
> +			if (pdd->dev != dev)
> +				continue;
> +
> +			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
> +
> +			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
> +				if (!pdd->vm_fault_exc_data) {
> +					pdd->vm_fault_exc_data = kmemdup(
> +							exception_data,
> +							exception_data_size,
> +							GFP_KERNEL);
> +					if (!pdd->vm_fault_exc_data)
> +						pr_debug("Failed to allocate exception data memory");
> +				} else {
> +					pr_debug("Debugger exception data not saved\n");
> +					print_hex_dump_bytes("exception data: ",
> +							DUMP_PREFIX_OFFSET,
> +							exception_data,
> +							exception_data_size);
> +				}
> +			}
> +			break;
> +		}
> +	} else if (event_mask & KFD_EC_MASK_PROCESS) {
> +		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
> +	} else {
> +		pqm = &process->pqm;
> +		list_for_each_entry(pqn, &pqm->queues,
> +				process_queue_list) {
> +			int target_id;
> +
> +			if (!pqn->q)
> +				continue;
> +
> +			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
> +					pqn->q->properties.queue_id :
> +							pqn->q->doorbell_id;
> +
> +			if (pqn->q->device != dev || target_id != source_id)
> +				continue;
> +
> +			pqn->q->properties.exception_status |= event_mask;
> +			break;
> +		}
> +	}
> +
> +	if (process->exception_enable_mask & event_mask) {
> +		if (use_worker)
> +			schedule_work(&process->debug_event_workarea);

The worker definition should be in the same patch.


> +		else
> +			kernel_write(process->dbg_ev_file,
> +					&write_data,
> +					1,
> +					&pos);
> +	} else {
> +		is_subscribed = false;
> +	}
> +
> +	mutex_unlock(&process->event_mutex);
> +
> +	return is_subscribed;
> +}
> +
>   static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
>   {
>   	struct mqd_update_info minfo = {0};
> @@ -88,7 +175,6 @@ static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
>   	}
>   
>   	return r;
> -}

This looks wrong.

Regards,
   Felix


>   
>   static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
>   {
> @@ -114,6 +200,9 @@ static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int
>   {
>   	int i, count = 0;
>   
> +	if (!unwind)
> +		cancel_work_sync(&target->debug_event_workarea);
> +
>   	for (i = 0; i < target->n_pdds; i++) {
>   		struct kfd_process_device *pdd = target->pdds[i];
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index f199698d8d60..2d5bc102f6b4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -28,6 +28,11 @@
>   void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
>   					uint32_t vmid,
>   					bool stall);
> +bool kfd_dbg_ev_raise(uint64_t event_mask,
> +			struct kfd_process *process, struct kfd_dev *dev,
> +			unsigned int source_id, bool use_worker,
> +			void *exception_data,
> +			size_t exception_data_size);
>   int kfd_dbg_trap_disable(struct kfd_process *target);
>   int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   			void __user *runtime_info,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 75521d96e937..e503bd94dda6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -495,6 +495,7 @@ struct queue_properties {
>   	uint32_t ctl_stack_size;
>   	uint64_t tba_addr;
>   	uint64_t tma_addr;
> +	uint64_t exception_status;
>   };
>   
>   #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
> @@ -786,6 +787,11 @@ struct kfd_process_device {
>   	uint64_t page_in;
>   	uint64_t page_out;
>   
> +	/* Exception code status*/
> +	uint64_t exception_status;
> +	void *vm_fault_exc_data;
> +	size_t vm_fault_exc_data_size;
> +
>   	/* Tracks debug per-vmid request settings */
>   	uint32_t spi_dbg_override;
>   	uint32_t spi_dbg_launch_mode;
> @@ -921,6 +927,7 @@ struct kfd_process {
>   
>   	/* Exception code enable mask and status */
>   	uint64_t exception_enable_mask;
> +	uint64_t exception_status;
>   
>   	/* shared virtual memory registered by this process */
>   	struct svm_range_list svms;

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 18/32] drm/amdkfd: add send exception operation
  2023-01-25 19:53 ` [PATCH 18/32] drm/amdkfd: add send exception operation Jonathan Kim
@ 2023-03-20 23:26   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-20 23:26 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> Add a debug operation that allows the debugger to send an exception
> directly to runtime through a payload address.
>
> For memory violations, normal vmfault signals will be applied to
> notify runtime instead after passing in the saved exception data
> when a memory violation was raised to the debugger.
>
> For runtime exceptions, this will unblock the runtime enable
> function which will be explained and implemented in a follow up
> patch.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/cik_event_interrupt.c  |  4 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  5 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 44 ++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  5 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_events.c       |  3 +-
>   .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  2 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  7 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 71 ++++++++++++++++++-
>   8 files changed, 135 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 5c8023cba196..62a38cd820fc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -118,9 +118,9 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>   			return;
>   
>   		if (info.vmid == vmid)
> -			kfd_signal_vm_fault_event(dev, pasid, &info);
> +			kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
>   		else
> -			kfd_signal_vm_fault_event(dev, pasid, NULL);
> +			kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 628178126d3b..09fe8576dc8c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2738,6 +2738,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		r = kfd_dbg_trap_disable(target);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
> +		r = kfd_dbg_send_exception_to_runtime(target,
> +				args->send_runtime_event.gpu_id,
> +				args->send_runtime_event.queue_id,
> +				args->send_runtime_event.exception_mask);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index fcd064b13f6a..4174b479ea6f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -125,6 +125,49 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
>   	return is_subscribed;
>   }
>   
> +int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
> +					unsigned int dev_id,
> +					unsigned int queue_id,
> +					uint64_t error_reason)
> +{
> +	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
> +		struct kfd_process_device *pdd = NULL;
> +		struct kfd_hsa_memory_exception_data *data;
> +		int i;
> +
> +		for (i = 0; i < p->n_pdds; i++) {
> +			if (p->pdds[i]->dev->id == dev_id) {
> +				pdd = p->pdds[i];
> +				break;
> +			}
> +		}
> +
> +		if (!pdd)
> +			return -ENODEV;
> +
> +		data = (struct kfd_hsa_memory_exception_data *)
> +						pdd->vm_fault_exc_data;
> +
> +		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
> +		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
> +		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
> +	}
> +
> +	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
> +		/*
> +		 * block should only happen after the debugger receives runtime
> +		 * enable notice.
> +		 */
> +		up(&p->runtime_enable_sema);
> +		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
> +	}
> +
> +	if (error_reason)
> +		return kfd_send_exception_to_runtime(p, queue_id, error_reason);
> +
> +	return 0;
> +}
> +
>   static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
>   {
>   	struct mqd_update_info minfo = {0};
> @@ -175,6 +218,7 @@ static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
>   	}
>   
>   	return r;
> +}

Ah, here you're fixing up the mistake from the last patch. Hint: An easy 
way to compile-test every patch in a large patch series is with 
something like this:

     git rebase -i HEAD~32 --exec build_kernel.sh

Regards,
   Felix


>   
>   static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 2d5bc102f6b4..fefb9dc5cf69 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -38,6 +38,11 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   			void __user *runtime_info,
>   			uint32_t *runtime_info_size);
>   
> +int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
> +					unsigned int dev_id,
> +					unsigned int queue_id,
> +					uint64_t error_reason);
> +
>   static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
>   {
>   	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index 729d26d648af..0efd447762d6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -1225,7 +1225,8 @@ void kfd_signal_hw_exception_event(u32 pasid)
>   }
>   
>   void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
> -				struct kfd_vm_fault_info *info)
> +				struct kfd_vm_fault_info *info,
> +				struct kfd_hsa_memory_exception_data *data)
>   {
>   	struct kfd_event *ev;
>   	uint32_t id;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index 0b75a37b689b..e092563f22de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -362,7 +362,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>   
>   		kfd_smi_event_update_vmfault(dev, pasid);
>   		kfd_dqm_evict_pasid(dev->dqm, pasid);
> -		kfd_signal_vm_fault_event(dev, pasid, &info);
> +		kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index e503bd94dda6..4cb433a21e3d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -945,6 +945,7 @@ struct kfd_process {
>   	bool queues_paused;
>   
>   	/* Tracks runtime enable status */
> +	struct semaphore runtime_enable_sema;
>   	struct kfd_runtime_info runtime_info;
>   
>   };
> @@ -1394,7 +1395,8 @@ int kfd_get_num_events(struct kfd_process *p);
>   int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
>   
>   void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
> -				struct kfd_vm_fault_info *info);
> +				struct kfd_vm_fault_info *info,
> +				struct kfd_hsa_memory_exception_data *data);
>   
>   void kfd_signal_reset_event(struct kfd_dev *dev);
>   
> @@ -1410,6 +1412,9 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
>   	       KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0);
>   }
>   
> +int kfd_send_exception_to_runtime(struct kfd_process *p,
> +				unsigned int queue_id,
> +				uint64_t error_reason);
>   bool kfd_is_locked(void);
>   
>   /* Compute profile */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 0ef2d00af8b1..8519604f7249 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1403,6 +1403,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>   	process->debugger_process = NULL;
>   	process->exception_enable_mask = 0;
>   	atomic_set(&process->debugged_process_count, 0);
> +	sema_init(&process->runtime_enable_sema, 0);
>   
>   	process->pasid = kfd_pasid_alloc();
>   	if (process->pasid == 0) {
> @@ -2058,6 +2059,75 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
>   	}
>   }
>   
> +struct send_exception_work_handler_workarea {
> +	struct work_struct work;
> +	struct kfd_process *p;
> +	unsigned int queue_id;
> +	uint64_t error_reason;
> +};
> +
> +static void send_exception_work_handler(struct work_struct *work)
> +{
> +	struct send_exception_work_handler_workarea *workarea;
> +	struct kfd_process *p;
> +	struct queue *q;
> +	struct mm_struct *mm;
> +	struct kfd_context_save_area_header __user *csa_header;
> +	uint64_t __user *err_payload_ptr;
> +	uint64_t cur_err;
> +	uint32_t ev_id;
> +
> +	workarea = container_of(work,
> +				struct send_exception_work_handler_workarea,
> +				work);
> +	p = workarea->p;
> +
> +	mm = get_task_mm(p->lead_thread);
> +
> +	if (!mm)
> +		return;
> +
> +	kthread_use_mm(mm);
> +
> +	q = pqm_get_user_queue(&p->pqm, workarea->queue_id);
> +
> +	if (!q)
> +		goto out;
> +
> +	csa_header = (void __user *)q->properties.ctx_save_restore_area_address;
> +
> +	get_user(err_payload_ptr, (uint64_t __user **)&csa_header->err_payload_addr);
> +	get_user(cur_err, err_payload_ptr);
> +	cur_err |= workarea->error_reason;
> +	put_user(cur_err, err_payload_ptr);
> +	get_user(ev_id, &csa_header->err_event_id);
> +
> +	kfd_set_event(p, ev_id);
> +
> +out:
> +	kthread_unuse_mm(mm);
> +	mmput(mm);
> +}
> +
> +int kfd_send_exception_to_runtime(struct kfd_process *p,
> +			unsigned int queue_id,
> +			uint64_t error_reason)
> +{
> +	struct send_exception_work_handler_workarea worker;
> +
> +	INIT_WORK_ONSTACK(&worker.work, send_exception_work_handler);
> +
> +	worker.p = p;
> +	worker.queue_id = queue_id;
> +	worker.error_reason = error_reason;
> +
> +	schedule_work(&worker.work);
> +	flush_work(&worker.work);
> +	destroy_work_on_stack(&worker.work);
> +
> +	return 0;
> +}
> +
>   struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id)
>   {
>   	int i;
> @@ -2117,4 +2187,3 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
>   }
>   
>   #endif
> -

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 19/32] drm/amdkfd: add runtime enable operation
  2023-01-25 19:53 ` [PATCH 19/32] drm/amdkfd: add runtime enable operation Jonathan Kim
@ 2023-03-21  0:31   ` Felix Kuehling
  2023-03-23 19:45     ` Kim, Jonathan
  0 siblings, 1 reply; 68+ messages in thread
From: Felix Kuehling @ 2023-03-21  0:31 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> The debugger can attach to a process prior to HSA enablement (i.e.
> inferior is spawned by the debugger and attached to immediately before
> target process has been enabled for HSA dispatches) or it
> can attach to a running target that is already HSA enabled.  Either
> way, the debugger needs to know the enablement status to know when
> it can inspect queues.
>
> For the scenario where the debugger spawns the target process,
> it will have to wait for ROCr's runtime enable request from the target.
> The runtime enable request will be able to see that its process has been
> debug attached.  ROCr raises an EC_PROCESS_RUNTIME signal to the
> debugger then blocks the target process while waiting for the debugger's
> response. Once the debugger has received the runtime signal, it will
> unblock the target process.
>
> For the scenario where the debugger attaches to a running target
> process, ROCr will set the target process' runtime status as enabled so
> that on an attach request, the debugger will be able to see this
> status and will continue with debug enablement as normal.
>
> A secondary requirement is to conditionally enable the trap temporaries only
> if the user requests it (env var HSA_ENABLE_DEBUG=1) or if the debugger
> attaches with HSA runtime enabled.  This is because setting up the trap
> temporaries incurs a performance overhead that is unacceptable for
> microbench performance in normal mode for certain customers.
>
> In the scenario where the debugger spawns the target process, when ROCr
> detects that the debugger has attached during the runtime enable
> request, it will enable the trap temporaries before it blocks the target
> process while waiting for the debugger to respond.
>
> In the scenario where the debugger attaches to a running target process,
> it will enable the trap temporaries itself.
>
> Finally, there is an additional restriction that is required to be
> enforced with runtime enable and HW debug mode setting. The debugger must
> first ensure that HW debug mode has been enabled before permitting HW debug
> mode operations.
>
> With single process debug devices, allowing the debugger to set debug
> HW modes prior to trap activation means that debug HW mode setting can
> occur before the KFD has reserved the debug VMID (0xf) from the hardware
> scheduler's VMID allocation resource pool.  This can result in the
> hardware scheduler assigning VMID 0xf to a non-debugged process and
> having that process inherit debug HW mode settings intended for the
> debugged target process instead, which is both incorrect and potentially
> fatal for normal mode operation.
>
> With multi process debug devices, allowing the debugger to set debug
> HW modes prior to trap activation means that non-debugged processes
> migrating to a new VMID could inherit unintended debug settings.
>
> All debug operations that touch HW settings must require trap activation
> where trap activation is triggered by both debug attach and runtime
> enablement (target has KFD opened and is ready to dispatch work).
>
> v2: fix up hierarchy of semantics in description.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 150 ++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |   6 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   4 +
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   1 +
>   4 files changed, 157 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 09fe8576dc8c..46f9d453dc5e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2654,11 +2654,147 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
>   	return ret;
>   }
>   
> -static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
> +static int runtime_enable(struct kfd_process *p, uint64_t r_debug,
> +			bool enable_ttmp_setup)
>   {
> +	int i = 0, ret = 0;
> +
> +	if (p->is_runtime_retry)
> +		goto retry;
> +
> +	if (p->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
> +		return -EBUSY;
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +
> +		if (pdd->qpd.queue_count)
> +			return -EEXIST;
> +	}
> +
> +	p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
> +	p->runtime_info.r_debug = r_debug;
> +	p->runtime_info.ttmp_setup = enable_ttmp_setup;
> +
> +	if (p->runtime_info.ttmp_setup) {
> +		for (i = 0; i < p->n_pdds; i++) {
> +			struct kfd_process_device *pdd = p->pdds[i];
> +
> +			if (!kfd_dbg_is_rlc_restore_supported(pdd->dev)) {
> +				amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +				pdd->dev->kfd2kgd->enable_debug_trap(
> +						pdd->dev->adev,
> +						true,
> +						pdd->dev->vm_info.last_vmid_kfd);
> +			}
> +
> +			if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {

Should this be else-if? It seems weird that enable_debug_trap could be 
called twice in a row. If RLC restore is only applicable on 
single-process debug devices, then maybe put the per-VMID case first.


> +				pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
> +						pdd->dev->adev,
> +						false,
> +						pdd->dev->vm_info.last_vmid_kfd);
> +
> +				if (!pdd->dev->shared_resources.enable_mes)
> +					debug_refresh_runlist(pdd->dev->dqm);
> +				else
> +					kfd_dbg_set_mes_debug_mode(pdd);

Do we really need to update the runlist here? When the runtime gets 
enabled, there are no queues yet for the process. So there should be no 
change to the runlist until the process creates its first queue.


> +			}
> +		}
> +	}
> +
> +retry:
> +	if (p->debug_trap_enabled) {
> +		if (!p->is_runtime_retry) {
> +			kfd_dbg_trap_activate(p);
> +			kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
> +					p, NULL, 0, false, NULL, 0);
> +		}
> +
> +		mutex_unlock(&p->mutex);
> +		ret = down_interruptible(&p->runtime_enable_sema);
> +		mutex_lock(&p->mutex);
> +
> +		p->is_runtime_retry = !!ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static int runtime_disable(struct kfd_process *p)
> +{
> +	int i = 0, ret;
> +	bool was_enabled = p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED;
> +
> +	p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_DISABLED;
> +	p->runtime_info.r_debug = 0;
> +
> +	if (p->debug_trap_enabled) {
> +		if (was_enabled)
> +			kfd_dbg_trap_deactivate(p, false, 0);

Does this call kfd_dbg_trap_deactivate multiple times on retry? Is that 
a problem?

Regards,
   Felix


> +
> +		if (!p->is_runtime_retry)
> +			kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
> +					p, NULL, 0, false, NULL, 0);
> +
> +		mutex_unlock(&p->mutex);
> +		ret = down_interruptible(&p->runtime_enable_sema);
> +		mutex_lock(&p->mutex);
> +
> +		p->is_runtime_retry = !!ret;
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (was_enabled && p->runtime_info.ttmp_setup) {
> +		for (i = 0; i < p->n_pdds; i++) {
> +			struct kfd_process_device *pdd = p->pdds[i];
> +
> +			if (!kfd_dbg_is_rlc_restore_supported(pdd->dev))
> +				amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +		}
> +	}
> +
> +	p->runtime_info.ttmp_setup = false;
> +
> +	/* disable DISPATCH_PTR save */
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +
> +		if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
> +			pdd->spi_dbg_override =
> +					pdd->dev->kfd2kgd->disable_debug_trap(
> +					pdd->dev->adev,
> +					false,
> +					pdd->dev->vm_info.last_vmid_kfd);
> +
> +			if (!pdd->dev->shared_resources.enable_mes)
> +				debug_refresh_runlist(pdd->dev->dqm);
> +			else
> +				kfd_dbg_set_mes_debug_mode(pdd);
> +		}
> +	}
> +
>   	return 0;
>   }
>   
> +static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
> +{
> +	struct kfd_ioctl_runtime_enable_args *args = data;
> +	int r;
> +
> +	mutex_lock(&p->mutex);
> +
> +	if (args->mode_mask & KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK)
> +		r = runtime_enable(p, args->r_debug,
> +				!!(args->mode_mask & KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK));
> +	else
> +		r = runtime_disable(p);
> +
> +	mutex_unlock(&p->mutex);
> +
> +	return r;
> +}
> +
>   static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
>   {
>   	struct kfd_ioctl_dbg_trap_args *args = data;
> @@ -2720,6 +2856,18 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		goto unlock_out;
>   	}
>   
> +	if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_ENABLED &&
> +			(args->op == KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE ||
> +			 args->op == KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE ||
> +			 args->op == KFD_IOC_DBG_TRAP_SUSPEND_QUEUES ||
> +			 args->op == KFD_IOC_DBG_TRAP_RESUME_QUEUES ||
> +			 args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
> +			 args->op == KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH ||
> +			 args->op == KFD_IOC_DBG_TRAP_SET_FLAGS)) {
> +		r = -EPERM;
> +		goto unlock_out;
> +	}
> +
>   	switch (args->op) {
>   	case KFD_IOC_DBG_TRAP_ENABLE:
>   		if (target != p)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 4174b479ea6f..47f8425a0db3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -220,7 +220,7 @@ static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
>   	return r;
>   }
>   
> -static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
> +int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
>   {
>   	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
>   	uint32_t flags = pdd->process->dbg_flags;
> @@ -240,7 +240,7 @@ static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
>    *				to unwind
>    *		else: ignored
>    */
> -static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
> +void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
>   {
>   	int i, count = 0;
>   
> @@ -311,7 +311,7 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
>   	return 0;
>   }
>   
> -static int kfd_dbg_trap_activate(struct kfd_process *target)
> +int kfd_dbg_trap_activate(struct kfd_process *target)
>   {
>   	int i, r = 0, unwind_count = 0;
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index fefb9dc5cf69..22707f7a2368 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -28,6 +28,8 @@
>   void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
>   					uint32_t vmid,
>   					bool stall);
> +void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
> +int kfd_dbg_trap_activate(struct kfd_process *target);
>   bool kfd_dbg_ev_raise(uint64_t event_mask,
>   			struct kfd_process *process, struct kfd_dev *dev,
>   			unsigned int source_id, bool use_worker,
> @@ -80,4 +82,6 @@ static inline bool kfd_dbg_has_gws_support(struct kfd_dev *dev)
>   	/* Assume debugging and cooperative launch supported otherwise. */
>   	return true;
>   }
> +
> +int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 4cb433a21e3d..63c59ad2a4ca 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -946,6 +946,7 @@ struct kfd_process {
>   
>   	/* Tracks runtime enable status */
>   	struct semaphore runtime_enable_sema;
> +	bool is_runtime_retry;
>   	struct kfd_runtime_info runtime_info;
>   
>   };

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 21/32] drm/amdkfd: update process interrupt handling for debug events
  2023-01-25 19:53 ` [PATCH 21/32] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
@ 2023-03-21 21:07   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-21 21:07 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> The debugger must be notified of any debugger-subscribed exception
> that comes from hardware interrupts.
>
> If a debugger session exits, any exceptions it subscribed to may still
> have interrupts in the interrupt ring buffer or KGD/KFD pipeline.
> To prevent a new session from inheriting stale interrupts, when a new
> queue is created, open an interrupt drain and allow the IH ring to drain
> from a timestamped checkpoint.  Then inject a custom IV so that once
> the custom IV is picked up by the KFD, it's safe to close the drain
> and proceed with queue creation.
>
> The drain must also be done on debug disable as SW interrupts may still
> be processed.  Drain at this time and clear all the exception status.
>
> The debugger may also not be attached nor subscribed to certain
> exceptions so forward them directly to the runtime.
>
> GFX10 also requires its own IV processing, hence the creation of
> kfd_int_process_v10.c.  This is because the IVs from SQ interrupts are
> packed into a new contiguous format, unlike GFX9. To make this clear,
> a separate interrupt handling code file was created.
>
> v3: enable gfx11 interrupts
> v2: fix interrupt drain on debug disable.
> fix interrupt drain on queue create during -ERESTARTSYS.
> fix up macros naming for ECODE parsing.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Some indentation nit-picks inline. With those fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  16 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |   2 +
>   drivers/gpu/drm/amd/amdkfd/Makefile           |   1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        |  85 ++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |   6 +
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |   4 +-
>   .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  | 405 ++++++++++++++++++
>   .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  21 +-
>   .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  98 ++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  12 +
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  47 ++
>   .../amd/amdkfd/kfd_process_queue_manager.c    |   4 +
>   12 files changed, 681 insertions(+), 20 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 8816853e50c0..60c3b0449d86 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -763,6 +763,22 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
>   	amdgpu_umc_poison_handler(adev, reset);
>   }
>   
> +int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
> +					uint32_t *payload)
> +{
> +	int ret;
> +
> +	/* Device or IH ring is not ready so bail. */
> +	ret = amdgpu_ih_wait_on_checkpoint_process_ts(adev, &adev->irq.ih);
> +	if (ret)
> +		return ret;
> +
> +	/* Send payload to fence KFD interrupts */
> +	amdgpu_amdkfd_interrupt(adev, payload);
> +
> +	return 0;
> +}
> +
>   bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
>   {
>   	if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 333780491867..df782274a4c8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -241,6 +241,8 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
>   					    struct amdgpu_device *src,
>   					    bool is_min);
>   int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_min);
> +int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
> +					uint32_t *payload);
>   
>   /* Read user wptr from a specified user address space with page fault
>    * disabled. The memory must be pinned and mapped to the hardware when
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 747754428073..2ec8f27c5366 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,6 +53,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>   		$(AMDKFD_PATH)/kfd_events.o \
>   		$(AMDKFD_PATH)/cik_event_interrupt.o \
>   		$(AMDKFD_PATH)/kfd_int_process_v9.o \
> +		$(AMDKFD_PATH)/kfd_int_process_v10.o \
>   		$(AMDKFD_PATH)/kfd_int_process_v11.o \
>   		$(AMDKFD_PATH)/kfd_smi_events.o \
>   		$(AMDKFD_PATH)/kfd_crat.o \
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 16acf3d416eb..0c876172db4b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -125,6 +125,65 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
>   	return is_subscribed;
>   }
>   
> +/* set pending event queue entry from ring entry  */
> +bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
> +				   unsigned int pasid,
> +				   uint32_t doorbell_id,
> +				   uint64_t trap_mask,
> +				   void *exception_data,
> +				   size_t exception_data_size)
> +{
> +	struct kfd_process *p;
> +	bool signaled_to_debugger_or_runtime = false;
> +
> +	p = kfd_lookup_process_by_pasid(pasid);
> +
> +	if (!p)
> +		return false;
> +
> +	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
> +					exception_data, exception_data_size)) {

There are some coding style issues in this function with the 
indentation. For readability the second line should be aligned with the 
open parenthesis in the line above.


> +		struct process_queue_manager *pqm;
> +		struct process_queue_node *pqn;
> +
> +		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
> +				p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {

Same as above.


> +			mutex_lock(&p->mutex);
> +
> +			pqm = &p->pqm;
> +			list_for_each_entry(pqn, &pqm->queues,
> +							process_queue_list) {
> +
> +				if (!(pqn->q && pqn->q->device == dev &&
> +						pqn->q->doorbell_id == doorbell_id))

Same as above.


> +					continue;
> +
> +				kfd_send_exception_to_runtime(p,
> +						pqn->q->properties.queue_id,
> +						trap_mask);

Same as above.


> +
> +				signaled_to_debugger_or_runtime = true;
> +
> +				break;
> +			}
> +
> +			mutex_unlock(&p->mutex);
> +		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
> +			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
> +			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
> +							exception_data);
> +
> +			signaled_to_debugger_or_runtime = true;
> +		}
> +	} else {
> +		signaled_to_debugger_or_runtime = true;
> +	}
> +
> +	kfd_unref_process(p);
> +
> +	return signaled_to_debugger_or_runtime;
> +}
> +

[snip]

@@ -2074,6 +2076,51 @@ void kfd_flush_tlb(struct kfd_process_device 
*pdd, enum TLB_FLUSH_TYPE type)
>   	}
>   }
>   
> +/* assumes caller holds process lock. */
> +int kfd_process_drain_interrupts(struct kfd_process_device *pdd)
> +{
> +	uint32_t irq_drain_fence[8];
> +	int r = 0;
> +
> +	if (!KFD_IS_SOC15(pdd->dev))
> +		return 0;
> +
> +	pdd->process->irq_drain_is_open = true;
> +
> +	memset(irq_drain_fence, 0, sizeof(irq_drain_fence));
> +	irq_drain_fence[0] = (KFD_IRQ_FENCE_SOURCEID << 8) |
> +							KFD_IRQ_FENCE_CLIENTID;
> +	irq_drain_fence[3] = pdd->process->pasid;
> +
> +	/* ensure stale irqs scheduled KFD interrupts and send drain fence. */
> +	if (amdgpu_amdkfd_send_close_event_drain_irq(pdd->dev->adev,
> +							irq_drain_fence)) {

Same as above.


> +		pdd->process->irq_drain_is_open = false;
> +		return 0;
> +	}
> +
> +	r = wait_event_interruptible(pdd->process->wait_irq_drain,
> +				!READ_ONCE(pdd->process->irq_drain_is_open));

Same as above.

Regards,
   Felix


> +	if (r)
> +		pdd->process->irq_drain_is_open = false;
> +
> +	return r;
> +}
> +
> +void kfd_process_close_interrupt_drain(unsigned int pasid)
> +{
> +	struct kfd_process *p;
> +
> +	p = kfd_lookup_process_by_pasid(pasid);
> +
> +	if (!p)
> +		return;
> +
> +	WRITE_ONCE(p->irq_drain_is_open, false);
> +	wake_up_all(&p->wait_irq_drain);
> +	kfd_unref_process(p);
> +}
> +
>   struct send_exception_work_handler_workarea {
>   	struct work_struct work;
>   	struct kfd_process *p;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d8f032214481..0ae6026c7d69 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -330,6 +330,10 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>   		kq->queue->properties.queue_id = *qid;
>   		pqn->kq = kq;
>   		pqn->q = NULL;
> +		retval = kfd_process_drain_interrupts(pdd);
> +		if (retval)
> +			break;
> +
>   		retval = dev->dqm->ops.create_kernel_queue(dev->dqm,
>   							kq, &pdd->qpd);
>   		break;

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 23/32] drm/amdkfd: add debug wave launch override operation
  2023-01-25 19:53 ` [PATCH 23/32] drm/amdkfd: add debug wave launch override operation Jonathan Kim
@ 2023-03-21 21:37   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-21 21:37 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> This operation allows the debugger to override the enabled HW
> exceptions on the device.
>
> On debug devices that only support the debugging of a single process,
> the HW exceptions are global and set through the SPI_GDBG_TRAP_MASK
> register.
> Because they are global, only address watch exceptions are allowed to
> be enabled.  In other words, the debugger must preserve all non-address
> watch exception states in normal mode operation by barring a full
> replacement override or a non-address watch override request.
>
> For multi-process debugging, all HW exception overrides are per-VMID so
> all exceptions can be overridden or fully replaced.
>
> In order for the debugger to know what is permissible, return the
> supported override mask back to the debugger along with the previously
> enabled overrides.
>
> v3: v2 was reviewed but requesting re-review for added GFX11 support.
>
> v2: switch unsupported override mode return from EPERM to EINVAL to
> support unique EPERM on PTRACE failure.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 47 ++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 55 ++++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    | 10 +++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  5 +-
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    | 86 ++++++++++++++++++-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 55 ++++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 10 +++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  7 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 69 +++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  6 ++
>   11 files changed, 350 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index a64a53f9efe6..84a9d9391ea4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -25,6 +25,7 @@
>   #include "amdgpu_amdkfd_gfx_v9.h"
>   #include "gc/gc_9_4_2_offset.h"
>   #include "gc/gc_9_4_2_sh_mask.h"
> +#include <uapi/linux/kfd_ioctl.h>
>   
>   /**
>    * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
> @@ -62,6 +63,50 @@ static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
>   	return data;
>   }
>   
> +static int kgd_aldebaran_validate_trap_override_request(struct amdgpu_device *adev,
> +							uint32_t trap_override,
> +							uint32_t *trap_mask_supported)
> +{
> +	*trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
> +				KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
> +				KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
> +				KFD_DBG_TRAP_MASK_FP_OVERFLOW |
> +				KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
> +				KFD_DBG_TRAP_MASK_FP_INEXACT |
> +				KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
> +				KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
> +				KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;
> +
> +	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
> +			trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
> +		return -EPERM;
> +
> +	return 0;
> +}
> +
> +/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
> +static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					uint32_t vmid,
> +					uint32_t trap_override,
> +					uint32_t trap_mask_bits,
> +					uint32_t trap_mask_request,
> +					uint32_t *trap_mask_prev,
> +					uint32_t kfd_dbg_trap_cntl_prev)
> +
> +{
> +	uint32_t data = 0;
> +
> +	*trap_mask_prev = REG_GET_FIELD(kfd_dbg_trap_cntl_prev, SPI_GDBG_PER_VMID_CNTL, EXCP_EN);
> +	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
> +		(*trap_mask_prev & ~trap_mask_request);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, trap_mask_bits);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, trap_override);
> +
> +	return data;
> +}
> +
>   const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>   	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -81,6 +126,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
>   	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
> +	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index ef8befc31fc6..0405725e95e3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -410,6 +410,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   				kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
>   	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
> +	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 2491402afd58..32a6e5fbeacd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -31,6 +31,7 @@
>   #include "v10_structs.h"
>   #include "nv.h"
>   #include "nvd.h"
> +#include <uapi/linux/kfd_ioctl.h>
>   
>   enum hqd_dequeue_request_type {
>   	NO_ACTION = 0,
> @@ -801,6 +802,58 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int kgd_gfx_v10_validate_trap_override_request(struct amdgpu_device *adev,
> +					      uint32_t trap_override,
> +					      uint32_t *trap_mask_supported)
> +{
> +	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;
> +
> +	/* The SPI_GDBG_TRAP_MASK register is global and affects all
> +	 * processes. Only allow OR-ing the address-watch bit, since
> +	 * this only affects processes under the debugger. Other bits
> +	 * should stay 0 to avoid the debugger interfering with other
> +	 * processes.
> +	 */
> +	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					      uint32_t vmid,
> +					      uint32_t trap_override,
> +					      uint32_t trap_mask_bits,
> +					      uint32_t trap_mask_request,
> +					      uint32_t *trap_mask_prev,
> +					      uint32_t kfd_dbg_trap_cntl_prev)
> +{
> +	uint32_t data, wave_cntl_prev;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
> +	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);
> +
> +	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
> +		(*trap_mask_prev & ~trap_mask_request);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
> +
> +	/* We need to preserve wave launch mode stall settings. */
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -886,6 +939,8 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
> +	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
>   	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.program_trap_handler_settings = program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> index 0abc1e805180..85c929fc2926 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -26,6 +26,16 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
>   					bool keep_trap_enabled,
>   					uint32_t vmid);
> +int kgd_gfx_v10_validate_trap_override_request(struct amdgpu_device *adev,
> +					     uint32_t trap_override,
> +					     uint32_t *trap_mask_supported);
> +uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					     uint32_t vmid,
> +					     uint32_t trap_override,
> +					     uint32_t trap_mask_bits,
> +					     uint32_t trap_mask_request,
> +					     uint32_t *trap_mask_prev,
> +					     uint32_t kfd_dbg_trap_cntl_prev);
>   void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index c57f2a6b6e23..ae3ead207df4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -673,5 +673,8 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
> -	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
> +	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
> +	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override
> +
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> index 34aeff692eba..3fb81e6e9422 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> @@ -643,6 +643,88 @@ static uint32_t kgd_gfx_v11_disable_debug_trap(struct amdgpu_device *adev,
>   	return data;
>   }
>   
> +static int kgd_gfx_v11_validate_trap_override_request(struct amdgpu_device *adev,
> +							uint32_t trap_override,
> +							uint32_t *trap_mask_supported)
> +{
> +	*trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
> +				KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
> +				KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
> +				KFD_DBG_TRAP_MASK_FP_OVERFLOW |
> +				KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
> +				KFD_DBG_TRAP_MASK_FP_INEXACT |
> +				KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
> +				KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
> +				KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;
> +
> +	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 4))
> +		*trap_mask_supported |= KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START |
> +					KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END;
> +
> +	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
> +			trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
> +		return -EPERM;
> +
> +	return 0;
> +}
> +
> +static uint32_t trap_mask_map_sw_to_hw(uint32_t mask)
> +{
> +	uint32_t trap_on_start = (mask & KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START) ? 1 : 0;
> +	uint32_t trap_on_end = (mask & KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END) ? 1 : 0;
> +	uint32_t excp_en = mask & (KFD_DBG_TRAP_MASK_FP_INVALID |
> +			KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
> +			KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
> +			KFD_DBG_TRAP_MASK_FP_OVERFLOW |
> +			KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
> +			KFD_DBG_TRAP_MASK_FP_INEXACT |
> +			KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
> +			KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
> +			KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION);
> +	uint32_t ret;
> +
> +	ret = REG_SET_FIELD(0, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, excp_en);
> +	ret = REG_SET_FIELD(ret, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_START, trap_on_start);
> +	ret = REG_SET_FIELD(ret, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_END, trap_on_end);
> +
> +	return ret;
> +}
> +
> +static uint32_t trap_mask_map_hw_to_sw(uint32_t mask)
> +{
> +	uint32_t ret = REG_GET_FIELD(mask, SPI_GDBG_PER_VMID_CNTL, EXCP_EN);
> +
> +	if (REG_GET_FIELD(mask, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_START))
> +		ret |= KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START;
> +
> +	if (REG_GET_FIELD(mask, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_END))
> +		ret |= KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END;
> +
> +	return ret;
> +}
> +
> +/* Returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
> +static uint32_t kgd_gfx_v11_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					uint32_t vmid,
> +					uint32_t trap_override,
> +					uint32_t trap_mask_bits,
> +					uint32_t trap_mask_request,
> +					uint32_t *trap_mask_prev,
> +					uint32_t kfd_dbg_trap_cntl_prev)
> +{
> +	uint32_t data = 0;
> +
> +	*trap_mask_prev = trap_mask_map_hw_to_sw(kfd_dbg_trap_cntl_prev);
> +
> +	data = (trap_mask_bits & trap_mask_request) | (*trap_mask_prev & ~trap_mask_request);
> +	data = trap_mask_map_sw_to_hw(data);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, trap_override);
> +
> +	return data;
> +}
> +
>   const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
>   	.program_sh_mem_settings = program_sh_mem_settings_v11,
>   	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
> @@ -660,5 +742,7 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
>   	.get_atc_vmid_pasid_mapping_info = NULL,
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base_v11,
>   	.enable_debug_trap = kgd_gfx_v11_enable_debug_trap,
> -	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap
> +	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap,
> +	.validate_trap_override_request = kgd_gfx_v11_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 4a8bd266d3f6..81643385512a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -38,6 +38,7 @@
>   #include "soc15d.h"
>   #include "gfx_v9_0.h"
>   #include "amdgpu_amdkfd_gfx_v9.h"
> +#include <uapi/linux/kfd_ioctl.h>
>   
>   enum hqd_dequeue_request_type {
>   	NO_ACTION = 0,
> @@ -737,6 +738,58 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
> +					uint32_t trap_override,
> +					uint32_t *trap_mask_supported)
> +{
> +	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;
> +
> +	/* The SPI_GDBG_TRAP_MASK register is global and affects all
> +	 * processes. Only allow OR-ing the address-watch bit, since
> +	 * this only affects processes under the debugger. Other bits
> +	 * should stay 0 to avoid the debugger interfering with other
> +	 * processes.
> +	 */
> +	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					     uint32_t vmid,
> +					     uint32_t trap_override,
> +					     uint32_t trap_mask_bits,
> +					     uint32_t trap_mask_request,
> +					     uint32_t *trap_mask_prev,
> +					     uint32_t kfd_dbg_cntl_prev)
> +{
> +	uint32_t data, wave_cntl_prev;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
> +	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);
> +
> +	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
> +		(*trap_mask_prev & ~trap_mask_request);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
> +
> +	/* We need to preserve wave launch mode stall settings. */
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -1005,6 +1058,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
> +	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index c0866497cb5c..47cff392b434 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -64,6 +64,16 @@ uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   					bool keep_trap_enabled,
>   					uint32_t vmid);
> +int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
> +					     uint32_t trap_override,
> +					     uint32_t *trap_mask_supported);
> +uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					     uint32_t vmid,
> +					     uint32_t trap_override,
> +					     uint32_t trap_mask_bits,
> +					     uint32_t trap_mask_request,
> +					     uint32_t *trap_mask_prev,
> +					     uint32_t kfd_dbg_trap_cntl_prev);
>   void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 9b87ba351eff..28b9db5806f4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2896,6 +2896,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				args->set_exceptions_enabled.exception_mask);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> +		r = kfd_dbg_trap_set_wave_launch_override(target,
> +				args->launch_override.override_mode,
> +				args->launch_override.enable_mask,
> +				args->launch_override.support_request_mask,
> +				&args->launch_override.enable_mask,
> +				&args->launch_override.support_request_mask);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
>   	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
>   	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 3ea53aaa776b..a9b52f114ac6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -530,6 +530,75 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   	return r;
>   }
>   
> +static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
> +						uint32_t trap_override,
> +						uint32_t trap_mask_request,
> +						uint32_t *trap_mask_supported)
> +{
> +	int i = 0;
> +
> +	*trap_mask_supported = 0xffffffff;
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
> +								pdd->dev->adev,
> +								trap_override,
> +								trap_mask_supported);
> +
> +		if (err)
> +			return err;
> +	}
> +
> +	if (trap_mask_request & ~*trap_mask_supported)
> +		return -EACCES;
> +
> +	return 0;
> +}
> +
> +int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
> +					uint32_t trap_override,
> +					uint32_t trap_mask_bits,
> +					uint32_t trap_mask_request,
> +					uint32_t *trap_mask_prev,
> +					uint32_t *trap_mask_supported)
> +{
> +	int r = 0, i;
> +
> +	r = kfd_dbg_validate_trap_override_request(target,
> +						trap_override,
> +						trap_mask_request,
> +						trap_mask_supported);
> +
> +	if (r)
> +		return r;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);

I think the gfxoff stuff is not needed on HW that supports multi-process 
debugging because these functions don't touch HW registers in that case. 
I think the gfxoff stuff should be done in the HW-specific functions 
only if needed. Maybe that comment applies to more patches than just 
this one. It's an improvement we could do in a follow up to make sure 
we're not missing any cases.

The patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> +		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
> +				pdd->dev->adev,
> +				pdd->dev->vm_info.last_vmid_kfd,
> +				trap_override,
> +				trap_mask_bits,
> +				trap_mask_request,
> +				trap_mask_prev,
> +				pdd->spi_dbg_override);
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +		if (!pdd->dev->shared_resources.enable_mes)
> +			r = debug_refresh_runlist(pdd->dev->dqm);
> +		else
> +			r = kfd_dbg_set_mes_debug_mode(pdd);
> +
> +		if (r)
> +			break;
> +	}
> +
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 81557579ab04..864eb01f8973 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -45,6 +45,12 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
>   int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   			void __user *runtime_info,
>   			uint32_t *runtime_info_size);
> +int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
> +					uint32_t trap_override,
> +					uint32_t trap_mask_bits,
> +					uint32_t trap_mask_request,
> +					uint32_t *trap_mask_prev,
> +					uint32_t *trap_mask_supported);
>   
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 24/32] drm/amdkfd: add debug wave launch mode operation
  2023-01-25 19:53 ` [PATCH 24/32] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
@ 2023-03-21 21:42   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-21 21:42 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel

On 2023-01-25 14:53, Jonathan Kim wrote:
> Allow the debugger to set wave behaviour to either operate normally,
> halt at launch, trap on every instruction, terminate immediately or
> stall on allocation.
>
> v2: add gfx11 support and remove deprecated launch mode options
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 12 +++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  1 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 25 +++++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  3 ++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  3 +-
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    | 14 +++++++-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 25 +++++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  3 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  3 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 36 ++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  5 ++-
>   11 files changed, 124 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 84a9d9391ea4..4de2066215b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -107,6 +107,17 @@ static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device
>   	return data;
>   }
>   
> +static uint32_t kgd_aldebaran_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE, wave_launch_mode);
> +
> +	return data;
> +}
> +
>   const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>   	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -128,6 +139,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
>   	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 0405725e95e3..500013540356 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -412,6 +412,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 32a6e5fbeacd..7591145bc69f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -854,6 +854,30 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +	bool is_mode_set = !!wave_launch_mode;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +			VMID_MASK, is_mode_set ? 1 << vmid : 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +			MODE, is_mode_set ? wave_launch_mode : 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -941,6 +965,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.program_trap_handler_settings = program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> index 85c929fc2926..34c04a2bb83b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -36,6 +36,9 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   					     uint32_t trap_mask_request,
>   					     uint32_t *trap_mask_prev,
>   					     uint32_t kfd_dbg_trap_cntl_prev);
> +uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
> +					 uint8_t wave_launch_mode,
> +					 uint32_t vmid);
>   void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index ae3ead207df4..8627c5458973 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -675,6 +675,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
> -	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override
> +	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode
>   
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> index 3fb81e6e9422..4fdc25222dcd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> @@ -725,6 +725,17 @@ static uint32_t kgd_gfx_v11_set_wave_launch_trap_override(struct amdgpu_device *
>   	return data;
>   }
>   
> +static uint32_t kgd_gfx_v11_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE, wave_launch_mode);
> +
> +	return data;
> +}
> +
>   const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
>   	.program_sh_mem_settings = program_sh_mem_settings_v11,
>   	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
> @@ -744,5 +755,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
>   	.enable_debug_trap = kgd_gfx_v11_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v11_validate_trap_override_request,
> -	.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override
> +	.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 81643385512a..a3c8f5578788 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -790,6 +790,30 @@ uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +	bool is_mode_set = !!wave_launch_mode;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +		VMID_MASK, is_mode_set ? 1 << vmid : 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +		MODE, is_mode_set ? wave_launch_mode : 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -1060,6 +1084,7 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index 47cff392b434..2a2ab42037e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -67,6 +67,9 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
>   					     uint32_t trap_override,
>   					     uint32_t *trap_mask_supported);
> +uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid);
>   uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   					     uint32_t vmid,
>   					     uint32_t trap_override,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 28b9db5806f4..205a487d91d2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2904,6 +2904,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->launch_override.support_request_mask);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
> +		r = kfd_dbg_trap_set_wave_launch_mode(target,
> +				args->launch_mode.launch_mode);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
>   	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
>   	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index a9b52f114ac6..b630633609b0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -303,8 +303,10 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>   {
>   	int i, count = 0;
>   
> -	if (!unwind)
> +	if (!unwind) {
>   		cancel_work_sync(&target->debug_event_workarea);
> +		kfd_dbg_trap_set_wave_launch_mode(target, 0);
> +	}
>   
>   	for (i = 0; i < target->n_pdds; i++) {
>   		struct kfd_process_device *pdd = target->pdds[i];
> @@ -599,6 +601,38 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
>   	return r;
>   }
>   
> +int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
> +					uint8_t wave_launch_mode)
> +{
> +	int r = 0, i;
> +
> +	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
> +			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
> +			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
> +		return -EINVAL;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);

Same comment as in the last patch. This should probably be done in the 
HW-specific code, only if it actually touches HW registers. Maybe in a 
separate patch to make sure we catch all the cases.


> +		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
> +				pdd->dev->adev,
> +				wave_launch_mode,
> +				pdd->dev->vm_info.last_vmid_kfd);
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +		if (!pdd->dev->shared_resources.enable_mes)
> +			r = debug_refresh_runlist(pdd->dev->dqm);
> +		else
> +			r = kfd_dbg_set_mes_debug_mode(pdd);
> +
> +		if (r)
> +			break;
> +	}
> +
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 864eb01f8973..0d70f162d6d8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -25,9 +25,6 @@
>   
>   #include "kfd_priv.h"
>   
> -void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> -					uint32_t vmid,
> -					bool stall);

This seems unrelated to this patch.

Regards,
   Felix


>   void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
>   int kfd_dbg_trap_activate(struct kfd_process *target);
>   bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
> @@ -51,6 +48,8 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
>   					uint32_t trap_mask_request,
>   					uint32_t *trap_mask_prev,
>   					uint32_t *trap_mask_supported);
> +int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
> +					uint8_t wave_launch_mode);
>   
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 25/32] drm/amdkfd: add debug suspend and resume process queues operation
  2023-01-25 19:53 ` [PATCH 25/32] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
@ 2023-03-21 22:16   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-21 22:16 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


On 2023-01-25 14:53, Jonathan Kim wrote:
> In order to inspect waves from the saved context at any point during a
> debug session, the debugger must be able to preempt queues to trigger
> context save by suspending them.
>
> On queue suspend, the KFD will copy the context save header information
> so that the debugger can correctly crawl the appropriate size of the saved
> context. The debugger must then also be allowed to resume suspended queues.
>
> A queue that is newly created cannot be suspended because queue ids are
> recycled after destruction so the debugger needs to know that this has
> occurred.  Query functions will be later added that will clear a given
> queue of its new queue status.
>
> A queue cannot be destroyed while it is suspended to preserve its saved
> context during debugger inspection.  Have queue destruction block while
> a queue is suspended and unblocked when it is resumed.  Likewise, if a
> queue is about to be destroyed, it cannot be suspended.
>
> Return the number of queues successfully suspended or resumed along with
> a per queue status array where the upper bits per queue status show that
> the request was invalid (new/destroyed queue suspend request, missing
> queue) or an error occurred (HWS in a fatal state so it can't suspend or
> resume queues).
>
> v2: add gfx11/mes support.
> prevent header copy on suspend from overwriting user fields.
> simplify resume_queues function.
> address other nit-picks
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |   5 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |   1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  11 +
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        |   7 +
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 446 +++++++++++++++++-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  10 +
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  14 +
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  11 +-
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  18 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   5 +-
>   10 files changed, 518 insertions(+), 10 deletions(-)
>
[snip]
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> index 50da16dd4c96..047c43418a1a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> @@ -288,6 +288,11 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
>   			  u32 *save_area_used_size)
>   {
>   	struct v9_mqd *m;
> +	struct kfd_context_save_area_header header;
> +	size_t header_copy_size = sizeof(header.control_stack_size) +
> +		sizeof(header.wave_state_size) +
> +		sizeof(header.wave_state_offset) +
> +		sizeof(header.control_stack_offset);

This makes assumptions about the structure layout. I'd feel better if 
these fields were in a sub-structure, which would make this easier and 
safer to handle.

struct kfd_context_save_area_header {
	struct {
		__u32 control_stack_offset;
		__u32 control_stack_size;
		__u32 wave_state_offset;
		__u32 wave_state_size;
	} wave_state;
	...
};

...

static int get_wave_state(...)
{
	struct kfd_context_save_area_header header;
	...
	header.wave_state.control_stack_size = *ctl_stack_used_size;
	header.wave_state.wave_state_size = *save_area_used_size;
	header.wave_state.wave_state_offset = m->cp_hqd_wg_state_offset;
	header.wave_state.control_stack_offset = m->cp_hqd_cntl_stack_offset;

	if (copy_to_user(ctl_stack, &header.wave_state, sizeof(header.wave_state)))
		return -EFAULT;
	...
}

This way you're sure you only copy initialized data. The only assumption 
this still makes is that wave_state is at the start of the header 
structure.

Regards,
   Felix


>   
>   	/* Control stack is located one page after MQD. */
>   	void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
> @@ -299,7 +304,18 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
>   	*save_area_used_size = m->cp_hqd_wg_state_offset -
>   		m->cp_hqd_cntl_stack_size;
>   
> -	if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
> +	header.control_stack_size = *ctl_stack_used_size;
> +	header.wave_state_size = *save_area_used_size;
> +
> +	header.wave_state_offset = m->cp_hqd_wg_state_offset;
> +	header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
> +
> +	if (copy_to_user(ctl_stack, &header, header_copy_size))
> +		return -EFAULT;
> +
> +	if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset,
> +				mqd_ctl_stack + m->cp_hqd_cntl_stack_offset,
> +				*ctl_stack_used_size))
>   		return -EFAULT;
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 6f7dc23af104..8dc7cc1e18a5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -477,6 +477,8 @@ struct queue_properties {
>   	uint32_t doorbell_off;
>   	bool is_interop;
>   	bool is_evicted;
> +	bool is_suspended;
> +	bool is_being_destroyed;
>   	bool is_active;
>   	bool is_gws;
>   	bool is_dbg_wa;
> @@ -501,7 +503,8 @@ struct queue_properties {
>   #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
>   			    (q).queue_address != 0 &&	\
>   			    (q).queue_percent > 0 &&	\
> -			    !(q).is_evicted)
> +			    !(q).is_evicted &&		\
> +			    !(q).is_suspended)
>   
>   enum mqd_update_flag {
>   	UPDATE_FLAG_DBG_WA_ENABLE = 1,

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 26/32] drm/amdkfd: add debug set and clear address watch points operation
  2023-01-25 19:53 ` [PATCH 26/32] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
@ 2023-03-22 21:38   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-22 21:38 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


Am 2023-01-25 um 14:53 schrieb Jonathan Kim:
> Shader read, write and atomic memory operations can be alerted to the
> debugger as an address watch exception.
>
> Allow the debugger to pass in a watch point to a particular memory
> address per device.
>
> Note that there exist only 4 watch points per device to date, so have
> the KFD keep track of which watch points are allocated or not.
>
> v3: add gfx11 support.
> cleanup gfx9 kgd calls to set and clear address watch.
> use per device spinlock to set watch points.
> fixup runlist refresh calls on set/clear address watch.
>
> v2: change dev_id arg to gpu_id for consistency
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  51 +++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   2 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  78 ++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |   8 ++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   5 +-
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    |  52 ++++++-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  77 ++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   8 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  24 ++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 136 ++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |   8 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |   1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   6 +-
>   13 files changed, 451 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 4de2066215b4..18baf1cd8c01 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -118,6 +118,55 @@ static uint32_t kgd_aldebaran_set_wave_launch_mode(struct amdgpu_device *adev,
>   	return data;
>   }
>   
> +#define TCP_WATCH_STRIDE (regTCP_WATCH1_ADDR_H - regTCP_WATCH0_ADDR_H)
> +static uint32_t kgd_gfx_aldebaran_set_address_watch(
> +					struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid)
> +{
> +	uint32_t watch_address_high;
> +	uint32_t watch_address_low;
> +	uint32_t watch_address_cntl;
> +
> +	watch_address_cntl = 0;
> +	watch_address_low = lower_32_bits(watch_address);
> +	watch_address_high = upper_32_bits(watch_address) & 0xffff;
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MODE,
> +			watch_mode);
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MASK,
> +			watch_address_mask >> 6);
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			1);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_H) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_high);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_L) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_low);
> +
> +	return watch_address_cntl;
> +}
> +
> +uint32_t kgd_gfx_aldebaran_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id)
> +{
> +	return 0;
> +}
> +
>   const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>   	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -140,6 +189,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
>   	.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_aldebaran_set_address_watch,
> +	.clear_address_watch = kgd_gfx_aldebaran_clear_address_watch,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 500013540356..a7fb5ef13166 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -413,6 +413,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
>   	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v9_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 7591145bc69f..c9246370984c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -878,6 +878,82 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
> +uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid)
> +{
> +	uint32_t watch_address_high;
> +	uint32_t watch_address_low;
> +	uint32_t watch_address_cntl;
> +
> +	watch_address_cntl = 0;
> +
> +	watch_address_low = lower_32_bits(watch_address);
> +	watch_address_high = upper_32_bits(watch_address) & 0xffff;
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VMID,
> +			debug_vmid);
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MODE,
> +			watch_mode);
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MASK,
> +			watch_address_mask >> 7);
> +
> +	/* Turning off this watch point until we set all the registers */
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			0);
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_high);
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_low);
> +
> +	/* Enable the watch point */
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			1);
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id)
> +{
> +	uint32_t watch_address_cntl;
> +
> +	watch_address_cntl = 0;
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	return 0;
> +}
> +
> +
>   /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -966,6 +1042,8 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
>   	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v10_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v10_clear_address_watch,
>   	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.program_trap_handler_settings = program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> index 34c04a2bb83b..334ff16e25db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -39,6 +39,14 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
>   					 uint8_t wave_launch_mode,
>   					 uint32_t vmid);
> +uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid);
> +uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id);
>   void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index 8627c5458973..ee36ba045dcf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -676,6 +676,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> -	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode
> -
> +	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v10_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v10_clear_address_watch
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> index 4fdc25222dcd..358c219fb704 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> @@ -736,6 +736,54 @@ static uint32_t kgd_gfx_v11_set_wave_launch_mode(struct amdgpu_device *adev,
>   	return data;
>   }
>   
> +#define TCP_WATCH_STRIDE (regTCP_WATCH1_ADDR_H - regTCP_WATCH0_ADDR_H)
> +static uint32_t kgd_gfx_v11_set_address_watch(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid)
> +{
> +	uint32_t watch_address_high;
> +	uint32_t watch_address_low;
> +	uint32_t watch_address_cntl;
> +
> +	watch_address_cntl = 0;
> +	watch_address_low = lower_32_bits(watch_address);
> +	watch_address_high = upper_32_bits(watch_address) & 0xffff;
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MODE,
> +			watch_mode);
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MASK,
> +			watch_address_mask >> 7);
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			1);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_H) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_high);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_L) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_low);
> +
> +	return watch_address_cntl;
> +}
> +
> +uint32_t kgd_gfx_v11_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id)
> +{
> +	return 0;
> +}
> +
>   const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
>   	.program_sh_mem_settings = program_sh_mem_settings_v11,
>   	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
> @@ -756,5 +804,7 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
>   	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v11_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override,
> -	.set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode
> +	.set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v11_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v11_clear_address_watch
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index a3c8f5578788..43296b78d888 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -814,6 +814,81 @@ uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
> +uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid)
> +{
> +	uint32_t watch_address_high;
> +	uint32_t watch_address_low;
> +	uint32_t watch_address_cntl;
> +
> +	watch_address_cntl = 0;
> +
> +	watch_address_low = lower_32_bits(watch_address);
> +	watch_address_high = upper_32_bits(watch_address) & 0xffff;
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VMID,
> +			debug_vmid);
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MODE,
> +			watch_mode);
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MASK,
> +			watch_address_mask >> 6);
> +
> +	/* Turning off this watch point until we set all the registers */
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			0);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_high);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_low);
> +
> +	/* Enable the watch point */
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			1);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id)
> +{
> +	uint32_t watch_address_cntl;
> +
> +	watch_address_cntl = 0;
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -1085,6 +1160,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
>   	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v9_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index 2a2ab42037e4..ba52b61b68c5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -77,6 +77,14 @@ uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   					     uint32_t trap_mask_request,
>   					     uint32_t *trap_mask_prev,
>   					     uint32_t kfd_dbg_trap_cntl_prev);
> +uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid);
> +uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id);
>   void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index b62e93b35a44..8f2ede781863 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2802,6 +2802,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   	struct task_struct *thread = NULL;
>   	struct pid *pid = NULL;
>   	struct kfd_process *target = NULL;
> +	struct kfd_process_device *pdd = NULL;
>   	int r = 0;
>   
>   	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> @@ -2869,6 +2870,20 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		goto unlock_out;
>   	}
>   
> +	if (args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
> +			args->op == KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH) {

The indentation is still messed up here. The second line should be 
aligned with the open parenthesis. With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> +		int user_gpu_id = kfd_process_get_user_gpu_id(target,
> +				args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ?
> +					args->set_node_address_watch.gpu_id :
> +					args->clear_node_address_watch.gpu_id);
> +
> +		pdd = kfd_process_device_data_by_id(target, user_gpu_id);
> +		if (user_gpu_id == -EINVAL || !pdd) {
> +			r = -ENODEV;
> +			goto unlock_out;
> +		}
> +	}
> +
>   	switch (args->op) {
>   	case KFD_IOC_DBG_TRAP_ENABLE:
>   		if (target != p)
> @@ -2921,7 +2936,16 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				(uint32_t *)args->resume_queues.queue_array_ptr);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
> +		r = kfd_dbg_trap_set_dev_address_watch(pdd,
> +				args->set_node_address_watch.address,
> +				args->set_node_address_watch.mask,
> +				&args->set_node_address_watch.id,
> +				args->set_node_address_watch.mode);
> +		break;
>   	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
> +		r = kfd_dbg_trap_clear_dev_address_watch(pdd,
> +				args->clear_node_address_watch.id);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SET_FLAGS:
>   	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 730e53584113..8d2e1adb442d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -24,6 +24,8 @@
>   #include "kfd_device_queue_manager.h"
>   #include <linux/file.h>
>   
> +#define MAX_WATCH_ADDRESSES	4
> +
>   void debug_event_write_work_handler(struct work_struct *work)
>   {
>   	struct kfd_process *process;
> @@ -291,6 +293,139 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
>   						pdd->watch_points, flags);
>   }
>   
> +#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
> +static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
> +{
> +	int i;
> +
> +	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
> +
> +	spin_lock(&pdd->dev->watch_points_lock);
> +
> +	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
> +		/* device watchpoint in use so skip */
> +		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
> +			continue;
> +
> +		pdd->alloc_watch_ids |= 0x1 << i;
> +		pdd->dev->alloc_watch_ids |= 0x1 << i;
> +		*watch_id = i;
> +		spin_unlock(&pdd->dev->watch_points_lock);
> +		return 0;
> +	}
> +
> +	spin_unlock(&pdd->dev->watch_points_lock);
> +
> +	return -ENOMEM;
> +}
> +
> +static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
> +{
> +	spin_lock(&pdd->dev->watch_points_lock);
> +
> +	/* process owns device watch point so safe to clear */
> +	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
> +		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
> +		pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
> +	}
> +
> +	spin_unlock(&pdd->dev->watch_points_lock);
> +}
> +
> +static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
> +{
> +	bool owns_watch_id = false;
> +
> +	spin_lock(&pdd->dev->watch_points_lock);
> +	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
> +			((pdd->alloc_watch_ids >> watch_id) & 0x1);
> +
> +	spin_unlock(&pdd->dev->watch_points_lock);
> +
> +	return owns_watch_id;
> +}
> +
> +int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
> +					uint32_t watch_id)
> +{
> +	int r;
> +
> +	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
> +		return -EINVAL;
> +
> +	if (!pdd->dev->shared_resources.enable_mes) {
> +		r = debug_lock_and_unmap(pdd->dev->dqm);
> +		if (r)
> +			return r;
> +	}
> +
> +	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
> +							pdd->dev->adev,
> +							watch_id);
> +	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +	if (!pdd->dev->shared_resources.enable_mes)
> +		r = debug_map_and_unlock(pdd->dev->dqm);
> +	else
> +		r = kfd_dbg_set_mes_debug_mode(pdd);
> +
> +	kfd_dbg_clear_dev_watch_id(pdd, watch_id);
> +
> +	return r;
> +}
> +
> +int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t *watch_id,
> +					uint32_t watch_mode)
> +{
> +	int r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
> +
> +	if (r)
> +		return r;
> +
> +	if (!pdd->dev->shared_resources.enable_mes) {
> +		r = debug_lock_and_unmap(pdd->dev->dqm);
> +		if (r) {
> +			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
> +			return r;
> +		}
> +	}
> +
> +	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +	pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
> +				pdd->dev->adev,
> +				watch_address,
> +				watch_address_mask,
> +				*watch_id,
> +				watch_mode,
> +				pdd->dev->vm_info.last_vmid_kfd);
> +	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +	if (!pdd->dev->shared_resources.enable_mes)
> +		r = debug_map_and_unlock(pdd->dev->dqm);
> +	else
> +		r = kfd_dbg_set_mes_debug_mode(pdd);
> +
> +	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
> +	if (r)
> +		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
> +
> +	return 0;
> +}
> +
> +static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
> +{
> +	int i, j;
> +
> +	for (i = 0; i < target->n_pdds; i++)
> +		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
> +			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
> +}
> +
> +
>   /* kfd_dbg_trap_deactivate:
>    *	target: target process
>    *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
> @@ -305,6 +440,7 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>   
>   	if (!unwind) {
>   		cancel_work_sync(&target->debug_event_workarea);
> +		kfd_dbg_clear_process_address_watch(target);
>   		kfd_dbg_trap_set_wave_launch_mode(target, 0);
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 0d70f162d6d8..63c716ce5ab9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -50,7 +50,13 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
>   					uint32_t *trap_mask_supported);
>   int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
>   					uint8_t wave_launch_mode);
> -
> +int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
> +					uint32_t watch_id);
> +int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t *watch_id,
> +					uint32_t watch_mode);
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,
>   					unsigned int queue_id,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 6e25238d18f9..ca849cd051d5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -641,6 +641,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>   	}
>   
>   	kfd_smi_init(kfd);
> +	spin_lock_init(&kfd->watch_points_lock);
>   
>   	kfd->init_complete = true;
>   	dev_info(kfd_device, "added device %x:%x\n", kfd->adev->pdev->vendor,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 8dc7cc1e18a5..cfc50d1690c7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -348,6 +348,10 @@ struct kfd_dev {
>   
>   	/* HMM page migration MEMORY_DEVICE_PRIVATE mapping */
>   	struct dev_pagemap pgmap;
> +
> +	/* Track per device allocated watch points */
> +	uint32_t alloc_watch_ids;
> +	spinlock_t watch_points_lock;
>   };
>   
>   enum kfd_mempool {
> @@ -799,6 +803,7 @@ struct kfd_process_device {
>   	uint32_t spi_dbg_override;
>   	uint32_t spi_dbg_launch_mode;
>   	uint32_t watch_points[4];
> +	uint32_t alloc_watch_ids;
>   
>   	/*
>   	 * If this process has been checkpointed before, then the user
> @@ -955,7 +960,6 @@ struct kfd_process {
>   	struct semaphore runtime_enable_sema;
>   	bool is_runtime_retry;
>   	struct kfd_runtime_info runtime_info;
> -
>   };
>   
>   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 27/32] drm/amdkfd: add debug set flags operation
  2023-01-25 19:53 ` [PATCH 27/32] drm/amdkfd: add debug set flags operation Jonathan Kim
@ 2023-03-22 21:47   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-22 21:47 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


Am 2023-01-25 um 14:53 schrieb Jonathan Kim:
> Allow the debugger to set single memory and single ALU operations.
>
> Some exceptions are imprecise (memory violations, address watch) in the
> sense that a trap occurs only when the exception interrupt occurs and
> not at the non-halting faulty instruction.  Trap temporaries 0 & 1 save
> the program counter address, which means that these values will not point
> to the faulty instruction address but to whenever the interrupt was
> raised.
>
> Setting the Single Memory Operations flag will inject an automatic wait
> on every memory operation instruction forcing imprecise memory exceptions
> to become precise at the cost of performance.  This setting is not
> permitted on debug devices that support only a global setting of this
> option.
>
> Return the previous set flags to the debugger as well.
>
> v3: make precise mem op the only available flag for now.
>
> v2: add gfx11 support.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 38 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  1 +
>   3 files changed, 41 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 8f2ede781863..c34caa14b84e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2947,6 +2947,8 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				args->clear_node_address_watch.id);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_FLAGS:
> +		r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
> +		break;
>   	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 8d2e1adb442d..77ba7da2bb9d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -23,6 +23,7 @@
>   #include "kfd_debug.h"
>   #include "kfd_device_queue_manager.h"
>   #include <linux/file.h>
> +#include <uapi/linux/kfd_ioctl.h>
>   
>   #define MAX_WATCH_ADDRESSES	4
>   
> @@ -425,6 +426,40 @@ static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
>   			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
>   }
>   
> +int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
> +{
> +	uint32_t prev_flags = target->dbg_flags;
> +	int i, r = 0;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
> +			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
> +			*flags = prev_flags;
> +			return -EACCES;
> +		}
> +	}
> +
> +	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
> +	*flags = prev_flags;
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
> +			continue;
> +
> +		if (!pdd->dev->shared_resources.enable_mes)
> +			r = debug_refresh_runlist(pdd->dev->dqm);
> +		else
> +			r = kfd_dbg_set_mes_debug_mode(pdd);
> +
> +		if (r) {
> +			target->dbg_flags = prev_flags;
> +			break;

Do we need to roll back changes on the other GPUs when this happens?


> +		}
> +	}
> +
> +	return r;
> +}
>   
>   /* kfd_dbg_trap_deactivate:
>    *	target: target process
> @@ -439,9 +474,12 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>   	int i, count = 0;
>   
>   	if (!unwind) {
> +		uint32_t flags = 0;

checkpatch.pl will complain without an empty line after the variable 
declaration.

Regards,
   Felix


>   		cancel_work_sync(&target->debug_event_workarea);
>   		kfd_dbg_clear_process_address_watch(target);
>   		kfd_dbg_trap_set_wave_launch_mode(target, 0);
> +
> +		kfd_dbg_trap_set_flags(target, &flags);
>   	}
>   
>   	for (i = 0; i < target->n_pdds; i++) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 63c716ce5ab9..782362d82890 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -57,6 +57,7 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
>   					uint32_t watch_address_mask,
>   					uint32_t *watch_id,
>   					uint32_t watch_mode);
> +int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,
>   					unsigned int queue_id,

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 30/32] drm/amdkfd: add debug queue snapshot operation
  2023-01-25 19:53 ` [PATCH 30/32] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
@ 2023-03-22 21:52   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-22 21:52 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel


Am 2023-01-25 um 14:53 schrieb Jonathan Kim:
> Allow the debugger to get a snapshot of a specified number of queues
> containing various queue property information that is copied to the
> debugger.
>
> Since the debugger doesn't know how many queues exist at any given time,
> allow the debugger to pass the requested number of snapshots as 0 to get
> the actual number of potential snapshots to use for a subsequent snapshot
> request for actual information.
>
> To prevent future ABI breakage, pass in the requested entry_size.
> The KFD will return its own entry_size in case the debugger still wants
> to log the information in a core dump on sizing failure.
>
> Also allow the debugger to clear exceptions when doing a snapshot.
>
> v3: fix uninitialized return and change queue snapshot to type void for
> proper increment on buffer copy.
> use memset 0 to init snapshot entry to clear struct padding.
>
> v2: change buf_size arg to num_queues for clarity.
> fix minimum entry size calculation.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 36 ++++++++++++++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +++
>   .../amd/amdkfd/kfd_process_queue_manager.c    | 41 +++++++++++++++++++
>   5 files changed, 91 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index d3d2026b6e65..93b288233577 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2965,6 +2965,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->query_exception_info.info_size);
>   		break;
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> +		r = pqm_get_queue_snapshot(&target->pqm,
> +				args->queue_snapshot.exception_mask,
> +				(void __user *)args->queue_snapshot.snapshot_buf_ptr,
> +				&args->queue_snapshot.num_queues,
> +				&args->queue_snapshot.entry_size);
> +		break;
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
>   		pr_warn("Debug op %i not supported yet\n", args->op);
>   		r = -EACCES;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 7792fe9491c5..5ae504a512f0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -3000,6 +3000,42 @@ int suspend_queues(struct kfd_process *p,
>   	return total_suspended;
>   }
>   
> +static uint32_t set_queue_type_for_user(struct queue_properties *q_props)
> +{
> +	switch (q_props->type) {
> +	case KFD_QUEUE_TYPE_COMPUTE:
> +		return q_props->format == KFD_QUEUE_FORMAT_PM4
> +					? KFD_IOC_QUEUE_TYPE_COMPUTE
> +					: KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
> +	case KFD_QUEUE_TYPE_SDMA:
> +		return KFD_IOC_QUEUE_TYPE_SDMA;
> +	case KFD_QUEUE_TYPE_SDMA_XGMI:
> +		return KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
> +	default:
> +		WARN_ONCE(true, "queue type not recognized!");
> +		return 0xffffffff;
> +	};
> +}
> +
> +void set_queue_snapshot_entry(struct queue *q,
> +			      uint64_t exception_clear_mask,
> +			      struct kfd_queue_snapshot_entry *qss_entry)
> +{
> +	qss_entry->ring_base_address = q->properties.queue_address;
> +	qss_entry->write_pointer_address = (uint64_t)q->properties.write_ptr;
> +	qss_entry->read_pointer_address = (uint64_t)q->properties.read_ptr;
> +	qss_entry->ctx_save_restore_address =
> +				q->properties.ctx_save_restore_area_address;
> +	qss_entry->ctx_save_restore_area_size =
> +				q->properties.ctx_save_restore_area_size;
> +	qss_entry->exception_status = q->properties.exception_status;
> +	qss_entry->queue_id = q->properties.queue_id;
> +	qss_entry->gpu_id = q->device->id;
> +	qss_entry->ring_size = (uint32_t)q->properties.queue_size;
> +	qss_entry->queue_type = set_queue_type_for_user(&q->properties);
> +	q->properties.exception_status &= ~exception_clear_mask;
> +}
> +
>   int debug_lock_and_unmap(struct device_queue_manager *dqm)
>   {
>   	int r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 7ccf8d0d1867..89d4a5b293a5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -296,6 +296,9 @@ int suspend_queues(struct kfd_process *p,
>   int resume_queues(struct kfd_process *p,
>   		uint32_t num_queues,
>   		uint32_t *usr_queue_id_array);
> +void set_queue_snapshot_entry(struct queue *q,
> +			      uint64_t exception_clear_mask,
> +			      struct kfd_queue_snapshot_entry *qss_entry);
>   int debug_lock_and_unmap(struct device_queue_manager *dqm);
>   int debug_map_and_unlock(struct device_queue_manager *dqm);
>   int debug_refresh_runlist(struct device_queue_manager *dqm);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index cfc50d1690c7..cc7816db60eb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1302,6 +1302,11 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
>   		       void __user *ctl_stack,
>   		       u32 *ctl_stack_used_size,
>   		       u32 *save_area_used_size);
> +int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
> +			   uint64_t exception_clear_mask,
> +			   void __user *buf,
> +			   int *num_qss_entries,
> +			   uint32_t *entry_size);
>   
>   int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
>   			      uint64_t fence_value,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 0ae6026c7d69..221cd4b03f1c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -576,6 +576,47 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
>   						       save_area_used_size);
>   }
>   
> +int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
> +			   uint64_t exception_clear_mask,
> +			   void __user *buf,
> +			   int *num_qss_entries,
> +			   uint32_t *entry_size)
> +{
> +	struct process_queue_node *pqn;
> +	uint32_t tmp_entry_size = *entry_size, tmp_qss_entries = *num_qss_entries;
> +	int r = 0;
> +
> +	*num_qss_entries = 0;
> +	if (!(*entry_size))
> +		return -EINVAL;
> +
> +	*entry_size = min_t(size_t, *entry_size, sizeof(struct kfd_queue_snapshot_entry));
> +	mutex_lock(&pqm->process->event_mutex);
> +
> +	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
> +		if (!pqn->q)
> +			continue;
> +
> +		if (*num_qss_entries < tmp_qss_entries) {
> +			struct kfd_queue_snapshot_entry src;
> +
> +			memset(&src, 0, sizeof(src));

I'd move the variable declaration up to the function scope. That way you 
only need to memset it once outside the loop. With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> +
> +			set_queue_snapshot_entry(pqn->q, exception_clear_mask, &src);
> +
> +			if (copy_to_user(buf, &src, *entry_size)) {
> +				r = -EFAULT;
> +				break;
> +			}
> +			buf += tmp_entry_size;
> +		}
> +		*num_qss_entries += 1;
> +	}
> +
> +	mutex_unlock(&pqm->process->event_mutex);
> +	return r;
> +}
> +
>   static int get_queue_data_sizes(struct kfd_process_device *pdd,
>   				struct queue *q,
>   				uint32_t *mqd_size,

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 31/32] drm/amdkfd: add debug device snapshot operation
  2023-01-25 19:54 ` [PATCH 31/32] drm/amdkfd: add debug device " Jonathan Kim
@ 2023-03-22 21:54   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-22 21:54 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel

Am 2023-01-25 um 14:54 schrieb Jonathan Kim:
> Similar to queue snapshot, return an array of device information using
> an entry_size check and return.
> Unlike queue snapshots, the debugger needs to pass the correct number of
> devices that exist.  If it fails to do so, the KFD will return the
> number of actual devices so that the debugger can make a subsequent
> successful call.
>
> v3: was reviewed but re-requesting review with new revision and
> subvendor information.
> memset 0 device info entry to clear padding.
>
> v2: change buf_size arg to num_devices for more clarity.
> expand device entry new members on copy.
> fix minimum entry size calculation for queue and device snapshot.
> change device snapshot implementation to match queue snapshot
> implementation.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  7 ++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 72 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
>   3 files changed, 82 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 93b288233577..da74a6ef4d9b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2972,8 +2972,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->queue_snapshot.entry_size);
>   		break;
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> -		pr_warn("Debug op %i not supported yet\n", args->op);
> -		r = -EACCES;
> +		r = kfd_dbg_trap_device_snapshot(target,
> +				args->device_snapshot.exception_mask,
> +				(void __user *)args->device_snapshot.snapshot_buf_ptr,
> +				&args->device_snapshot.num_devices,
> +				&args->device_snapshot.entry_size);
>   		break;
>   	default:
>   		pr_err("Invalid option: %i\n", args->op);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index db316f0625f8..d1c4eb9652fd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -22,6 +22,7 @@
>   
>   #include "kfd_debug.h"
>   #include "kfd_device_queue_manager.h"
> +#include "kfd_topology.h"
>   #include <linux/file.h>
>   #include <uapi/linux/kfd_ioctl.h>
>   
> @@ -998,6 +999,77 @@ int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
>   	return r;
>   }
>   
> +int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
> +		uint64_t exception_clear_mask,
> +		void __user *user_info,
> +		uint32_t *number_of_device_infos,
> +		uint32_t *entry_size)
> +{
> +	struct kfd_dbg_device_info_entry device_info;
> +	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
> +	int i, r = 0;
> +
> +	if (!(target && user_info && number_of_device_infos && entry_size))
> +		return -EINVAL;
> +
> +	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
> +	*number_of_device_infos = target->n_pdds;
> +	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
> +
> +	if (!tmp_num_devices)
> +		return 0;
> +
> +	memset(&device_info, 0, sizeof(device_info));
> +
> +	mutex_lock(&target->event_mutex);
> +
> +	/* Run over all pdd of the process */
> +	for (i = 0; i < tmp_num_devices; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
> +
> +		device_info.gpu_id = pdd->dev->id;
> +		device_info.exception_status = pdd->exception_status;
> +		device_info.lds_base = pdd->lds_base;
> +		device_info.lds_limit = pdd->lds_limit;
> +		device_info.scratch_base = pdd->scratch_base;
> +		device_info.scratch_limit = pdd->scratch_limit;
> +		device_info.gpuvm_base = pdd->gpuvm_base;
> +		device_info.gpuvm_limit = pdd->gpuvm_limit;
> +		device_info.location_id = topo_dev->node_props.location_id;
> +		device_info.vendor_id = topo_dev->node_props.vendor_id;
> +		device_info.device_id = topo_dev->node_props.device_id;
> +		device_info.revision_id = pdd->dev->adev->pdev->revision;
> +		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
> +		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
> +		device_info.fw_version = pdd->dev->mec_fw_version;
> +		device_info.gfx_target_version =
> +			topo_dev->node_props.gfx_target_version;
> +		device_info.simd_count = topo_dev->node_props.simd_count;
> +		device_info.max_waves_per_simd =
> +			topo_dev->node_props.max_waves_per_simd;
> +		device_info.array_count = topo_dev->node_props.array_count;
> +		device_info.simd_arrays_per_engine =
> +			topo_dev->node_props.simd_arrays_per_engine;
> +		device_info.capability = topo_dev->node_props.capability;
> +		device_info.debug_prop = topo_dev->node_props.debug_prop;
> +
> +		if (exception_clear_mask)
> +			pdd->exception_status &= ~exception_clear_mask;
> +
> +		if (copy_to_user(user_info, &device_info, *entry_size)) {
> +			r = -EFAULT;
> +			break;
> +		}
> +
> +		user_info += tmp_entry_size;
> +	}
> +
> +	mutex_unlock(&target->event_mutex);
> +
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index ee12de5f7adf..b31e453704fc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -81,6 +81,11 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
>   }
>   
>   void debug_event_write_work_handler(struct work_struct *work);
> +int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
> +		uint64_t exception_clear_mask,
> +		void __user *user_info,
> +		uint32_t *number_of_device_infos,
> +		uint32_t *entry_size);
>   
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask);

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 32/32] drm/amdkfd: bump kfd ioctl minor version for debug api availability
  2023-01-25 19:54 ` [PATCH 32/32] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
@ 2023-03-22 21:56   ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-22 21:56 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx, dri-devel

Am 2023-01-25 um 14:54 schrieb Jonathan Kim:
> Bump the minor version to declare debugging capability is now
> available.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>

This needs to be bumped to 1.13 once you rebase on the latest staging. 
With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 -
>   include/uapi/linux/kfd_ioctl.h           | 3 ++-
>   2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index da74a6ef4d9b..c28d4b2dd0ef 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2896,7 +2896,6 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		if (!r)
>   			target->exception_enable_mask = args->enable.exception_mask;
>   
> -		pr_warn("Debug functions limited\n");
>   		break;
>   	case KFD_IOC_DBG_TRAP_DISABLE:
>   		r = kfd_dbg_trap_disable(target);
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 9ef4eed45c19..a0efe1ccdbd6 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -37,9 +37,10 @@
>    * - 1.9 - Add available memory ioctl
>    * - 1.10 - Add SMI profiler event log
>    * - 1.11 - Add unified memory for ctx save/restore area
> + * - 1.12 - Add debugger API
>    */
>   #define KFD_IOCTL_MAJOR_VERSION 1
> -#define KFD_IOCTL_MINOR_VERSION 11
> +#define KFD_IOCTL_MINOR_VERSION 12
>   
>   struct kfd_ioctl_get_version_args {
>   	__u32 major_version;	/* from KFD */

^ permalink raw reply	[flat|nested] 68+ messages in thread

* RE: [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11
  2023-03-20 21:49   ` Felix Kuehling
@ 2023-03-23 13:50     ` Kim, Jonathan
  2023-03-23 14:00       ` Felix Kuehling
  0 siblings, 1 reply; 68+ messages in thread
From: Kim, Jonathan @ 2023-03-23 13:50 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx, dri-devel

[Public]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: Monday, March 20, 2023 5:50 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
> gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> Subject: Re: [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11
>
>
> On 2023-01-25 14:53, Jonathan Kim wrote:
> > Due to a HW bug, waves in only half the shader arrays can enter trap.
> >
> > When starting a debug session, relocate all waves to the first shader
> > array of each shader engine and mask off the 2nd shader array as
> > unavailable.
> >
> > When ending a debug session, re-enable the 2nd shader array per
> > shader engine.
> >
> > User CU masking per queue cannot be guaranteed to remain functional
> > if requested during debugging (e.g. user cu mask requests only 2nd shader
> > array as an available resource leading to zero HW resources available)
> > nor can runtime be alerted of any of these changes during execution.
> >
> > Make user CU masking and debugging mutual exclusive with respect to
> > availability.
> >
> > If the debugger tries to attach to a process with a user cu masked
> > queue, return the runtime status as enabled but busy.
> >
> > If the debugger tries to attach and fails to reallocate queue waves to
> > the first shader array of each shader engine, return the runtime status
> > as enabled but with an error.
> >
> > In addition, like any other mutli-process debug supported devices,
> > disable trap temporary setup per-process to avoid performance impact
> from
> > setup overhead.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |  2 +
> >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        |  7 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  2 -
> >   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 64
> +++++++++++++++++++
> >   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  3 +-
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.c |  7 ++
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  3 +-
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  3 +-
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  | 42 ++++++++----
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  3 +-
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  3 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +-
> >   .../amd/amdkfd/kfd_process_queue_manager.c    |  9 ++-
> >   13 files changed, 124 insertions(+), 29 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > index d20df0cf0d88..b5f5eed2b5ef 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > @@ -219,6 +219,8 @@ struct mes_add_queue_input {
> >     uint32_t        gws_size;
> >     uint64_t        tba_addr;
> >     uint64_t        tma_addr;
> > +   uint32_t        trap_en;
> > +   uint32_t        skip_process_ctx_clear;
> >     uint32_t        is_kfd_process;
> >     uint32_t        is_aql_queue;
> >     uint32_t        queue_size;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index fbacdc42efac..38c7a0cbf264 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -197,17 +197,14 @@ static int mes_v11_0_add_hw_queue(struct
> amdgpu_mes *mes,
> >     mes_add_queue_pkt.gws_size = input->gws_size;
> >     mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
> >     mes_add_queue_pkt.tma_addr = input->tma_addr;
> > +   mes_add_queue_pkt.trap_en = input->trap_en;
> > +   mes_add_queue_pkt.skip_process_ctx_clear = input-
> >skip_process_ctx_clear;
> >     mes_add_queue_pkt.is_kfd_process = input->is_kfd_process;
> >
> >     /* For KFD, gds_size is re-used for queue size (needed in MES for AQL
> queues) */
> >     mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
> >     mes_add_queue_pkt.gds_size = input->queue_size;
> >
> > -   if (!(((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >=
> 4) &&
> > -             (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
> &&
> > -             (adev->ip_versions[GC_HWIP][0] <= IP_VERSION(11, 0, 3))))
> > -           mes_add_queue_pkt.trap_en = 1;
> > -
> >     /* For KFD, gds_size is re-used for queue size (needed in MES for AQL
> queues) */
> >     mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
> >     mes_add_queue_pkt.gds_size = input->queue_size;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index ee05c2e54ef6..f5f639de28f0 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -530,8 +530,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp,
> struct kfd_process *p,
> >             goto out;
> >     }
> >
> > -   minfo.update_flag = UPDATE_FLAG_CU_MASK;
> > -
> >     mutex_lock(&p->mutex);
> >
> >     retval = pqm_update_mqd(&p->pqm, args->queue_id, &minfo);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > index f6ea6db266b4..6e99a0160275 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > @@ -37,6 +37,70 @@ void debug_event_write_work_handler(struct
> work_struct *work)
> >     kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
> >   }
> >
> > +static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
> > +{
> > +   struct mqd_update_info minfo = {0};
> > +   int err;
> > +
> > +   if (!q || (!q->properties.is_dbg_wa && !enable))
>
> Should this condition be:
>
>      if (!q || q->properties.is_dbg_wa != enable)

The latter part should probably be q->properties.is_dbg_wa == enable.  q->properties.is_dbg_wa != enable would always skip a request to change the queue's current workaround state.
I think we can just drop the latter half of this test condition, as a redundant queue workaround update is harmless.
It's a static call from a process-wide call, and the process-wide call is itself static and only gets called twice: once on attach and once on detach.

Thanks,

Jon

>
>
> > +           return 0;
> > +
> > +   if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
> > +                   KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
>
> Indentation. It would be more readable if the KFD_GC_VERSIONs were
> aligned.
>
>
> > +           return 0;
> > +
> > +   if (enable && q->properties.is_user_cu_masked)
> > +           return -EBUSY;
> > +
> > +   minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE :
> UPDATE_FLAG_DBG_WA_DISABLE;
> > +
> > +   q->properties.is_dbg_wa = enable;
> > +   err = q->device->dqm->ops.update_queue(q->device->dqm, q,
> &minfo);
> > +   if (err)
> > +           q->properties.is_dbg_wa = false;
> > +
> > +   return err;
> > +}
> > +
> > +static int kfd_dbg_set_workaround(struct kfd_process *target, bool
> enable)
> > +{
> > +   struct process_queue_manager *pqm = &target->pqm;
> > +   struct process_queue_node *pqn;
> > +   int r = 0;
> > +
> > +   list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
> > +           r = kfd_dbg_set_queue_workaround(pqn->q, enable);
> > +           if (enable && r)
> > +                   goto unwind;
> > +   }
> > +
> > +   return 0;
> > +
> > +unwind:
> > +   list_for_each_entry(pqn, &pqm->queues, process_queue_list)
> > +           kfd_dbg_set_queue_workaround(pqn->q, false);
> > +
> > +   if (enable) {
> > +           target->runtime_info.runtime_state = r == -EBUSY ?
> > +                           DEBUG_RUNTIME_STATE_ENABLED_BUSY :
> > +                           DEBUG_RUNTIME_STATE_ENABLED_ERROR;
> > +   }
>
> Braces are not needed here.
>
>
> > +
> > +   return r;
> > +}
> > +
> > +static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
> > +{
> > +   uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd-
> >spi_dbg_launch_mode;
> > +   uint32_t flags = pdd->process->dbg_flags;
> > +
> > +   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
> > +           return 0;
> > +
> > +   return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd-
> >proc_ctx_gpu_addr, spi_dbg_cntl,
> > +                                           pdd->watch_points, flags);
> > +}
> > +
>
> You're adding some unused static functions here. This will cause compile
> warnings until the patch that starts using them. You could avoid this by
> reordering this and the next patch and moving the function calls into
> this patch. That would also make it more obvious where the workaround
> plugs into the debug code.
>
> Regards,
>    Felix
>
>
> >   int kfd_dbg_trap_disable(struct kfd_process *target)
> >   {
> >     if (!target->debug_trap_enabled)
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > index 53c5a3e55bd2..0c09f1729325 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > @@ -35,7 +35,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target,
> uint32_t fd,
> >
> >   static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
> >   {
> > -   return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
> > +   return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
> > +          KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0);
> >   }
> >
> >   void debug_event_write_work_handler(struct work_struct *work);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index 2517716d7cbc..be1985b87ea7 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -214,6 +214,10 @@ static int add_queue_mes(struct
> device_queue_manager *dqm, struct queue *q,
> >     queue_input.paging = false;
> >     queue_input.tba_addr = qpd->tba_addr;
> >     queue_input.tma_addr = qpd->tma_addr;
> > +   queue_input.trap_en = KFD_GC_VERSION(q->device) <
> IP_VERSION(11, 0, 0) ||
> > +                         KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0,
> 0) ||
> > +                         q->properties.is_dbg_wa;
> > +   queue_input.skip_process_ctx_clear = qpd->pqm->process-
> >debug_trap_enabled;
> >
> >     queue_type = convert_to_mes_queue_type(q->properties.type);
> >     if (queue_type < 0) {
> > @@ -1679,6 +1683,9 @@ static int create_queue_cpsch(struct
> device_queue_manager *dqm, struct queue *q,
> >      * updates the is_evicted flag but is a no-op otherwise.
> >      */
> >     q->properties.is_evicted = !!qpd->evicted;
> > +   q->properties.is_dbg_wa = qpd->pqm->process-
> >debug_trap_enabled &&
> > +                   KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0)
> &&
> > +                   KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0);
> >
> >     if (qd)
> >             mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q-
> >mqd_mem_obj, &q->gart_mqd_addr,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> > index 4889865c725c..c2a7226fc588 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> > @@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager
> *mm, void *mqd,
> >     struct cik_mqd *m;
> >     uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
> >
> > -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> > -       !minfo->cu_mask.ptr)
> > +   if (!minfo || !minfo->cu_mask.ptr)
> >             return;
> >
> >     mqd_symmetrically_map_cu_mask(mm,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> > index cb484ace17de..8248e77751e7 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> > @@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager
> *mm, void *mqd,
> >     struct v10_compute_mqd *m;
> >     uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
> >
> > -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> > -       !minfo->cu_mask.ptr)
> > +   if (!minfo || !minfo->cu_mask.ptr)
> >             return;
> >
> >     mqd_symmetrically_map_cu_mask(mm,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> > index ac7c8fc83c94..18ab613e787c 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> > @@ -46,15 +46,33 @@ static void update_cu_mask(struct mqd_manager
> *mm, void *mqd,
> >   {
> >     struct v11_compute_mqd *m;
> >     uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
> > +   bool has_wa_flag = minfo && (minfo->update_flag &
> (UPDATE_FLAG_DBG_WA_ENABLE |
> > +                   UPDATE_FLAG_DBG_WA_DISABLE));
> >
> > -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> > -       !minfo->cu_mask.ptr)
> > +   if (!minfo || !(has_wa_flag || minfo->cu_mask.ptr))
> >             return;
> >
> > +   m = get_mqd(mqd);
> > +
> > +   if (has_wa_flag) {
> > +           uint32_t wa_mask = minfo->update_flag ==
> UPDATE_FLAG_DBG_WA_ENABLE ?
> > +                                           0xffff : 0xffffffff;
> > +
> > +           m->compute_static_thread_mgmt_se0 = wa_mask;
> > +           m->compute_static_thread_mgmt_se1 = wa_mask;
> > +           m->compute_static_thread_mgmt_se2 = wa_mask;
> > +           m->compute_static_thread_mgmt_se3 = wa_mask;
> > +           m->compute_static_thread_mgmt_se4 = wa_mask;
> > +           m->compute_static_thread_mgmt_se5 = wa_mask;
> > +           m->compute_static_thread_mgmt_se6 = wa_mask;
> > +           m->compute_static_thread_mgmt_se7 = wa_mask;
> > +
> > +           return;
> > +   }
> > +
> >     mqd_symmetrically_map_cu_mask(mm,
> >             minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask);
> >
> > -   m = get_mqd(mqd);
> >     m->compute_static_thread_mgmt_se0 = se_mask[0];
> >     m->compute_static_thread_mgmt_se1 = se_mask[1];
> >     m->compute_static_thread_mgmt_se2 = se_mask[2];
> > @@ -109,6 +127,7 @@ static void init_mqd(struct mqd_manager *mm,
> void **mqd,
> >     uint64_t addr;
> >     struct v11_compute_mqd *m;
> >     int size;
> > +   uint32_t wa_mask = q->is_dbg_wa ? 0xffff : 0xffffffff;
> >
> >     m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr;
> >     addr = mqd_mem_obj->gpu_addr;
> > @@ -122,14 +141,15 @@ static void init_mqd(struct mqd_manager *mm,
> void **mqd,
> >
> >     m->header = 0xC0310800;
> >     m->compute_pipelinestat_enable = 1;
> > -   m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
> > -   m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
> > -   m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
> > -   m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
> > -   m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
> > -   m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
> > -   m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
> > -   m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;
> > +
> > +   m->compute_static_thread_mgmt_se0 = wa_mask;
> > +   m->compute_static_thread_mgmt_se1 = wa_mask;
> > +   m->compute_static_thread_mgmt_se2 = wa_mask;
> > +   m->compute_static_thread_mgmt_se3 = wa_mask;
> > +   m->compute_static_thread_mgmt_se4 = wa_mask;
> > +   m->compute_static_thread_mgmt_se5 = wa_mask;
> > +   m->compute_static_thread_mgmt_se6 = wa_mask;
> > +   m->compute_static_thread_mgmt_se7 = wa_mask;
> >
> >     m->cp_hqd_persistent_state =
> CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
> >                     0x55 <<
> CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > index 86f1cf090246..50da16dd4c96 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > @@ -49,8 +49,7 @@ static void update_cu_mask(struct mqd_manager
> *mm, void *mqd,
> >     struct v9_mqd *m;
> >     uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
> >
> > -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> > -       !minfo->cu_mask.ptr)
> > +   if (!minfo || !minfo->cu_mask.ptr)
> >             return;
> >
> >     mqd_symmetrically_map_cu_mask(mm,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> > index 530ba6f5b57e..58b40bff3e0c 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> > @@ -51,8 +51,7 @@ static void update_cu_mask(struct mqd_manager
> *mm, void *mqd,
> >     struct vi_mqd *m;
> >     uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
> >
> > -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
> > -       !minfo->cu_mask.ptr)
> > +   if (!minfo || !minfo->cu_mask.ptr)
> >             return;
> >
> >     mqd_symmetrically_map_cu_mask(mm,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index 8f1e2f9023db..75521d96e937 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -479,6 +479,8 @@ struct queue_properties {
> >     bool is_evicted;
> >     bool is_active;
> >     bool is_gws;
> > +   bool is_dbg_wa;
> > +   bool is_user_cu_masked;
> >     /* Not relevant for user mode queues in cp scheduling */
> >     unsigned int vmid;
> >     /* Relevant only for sdma queues*/
> > @@ -501,7 +503,8 @@ struct queue_properties {
> >                         !(q).is_evicted)
> >
> >   enum mqd_update_flag {
> > -   UPDATE_FLAG_CU_MASK = 0,
> > +   UPDATE_FLAG_DBG_WA_ENABLE = 1,
> > +   UPDATE_FLAG_DBG_WA_DISABLE = 2,
> >   };
> >
> >   struct mqd_update_info {
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > index 5137476ec18e..d8f032214481 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > @@ -498,8 +498,12 @@ int pqm_update_mqd(struct
> process_queue_manager *pqm,
> >             return -EFAULT;
> >     }
> >
> > +   /* CUs are masked for debugger requirements so deny user mask  */
> > +   if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr)
> > +           return -EBUSY;
> > +
> >     /* ASICs that have WGPs must enforce pairwise enabled mask
> checks. */
> > -   if (minfo && minfo->update_flag == UPDATE_FLAG_CU_MASK &&
> minfo->cu_mask.ptr &&
> > +   if (minfo && minfo->cu_mask.ptr &&
> >                     KFD_GC_VERSION(pqn->q->device) >= IP_VERSION(10,
> 0, 0)) {
> >             int i;
> >
> > @@ -518,6 +522,9 @@ int pqm_update_mqd(struct
> process_queue_manager *pqm,
> >     if (retval != 0)
> >             return retval;
> >
> > +   if (minfo && minfo->cu_mask.ptr)
> > +           pqn->q->properties.is_user_cu_masked = true;
> > +
> >     return 0;
> >   }
> >

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11
  2023-03-23 13:50     ` Kim, Jonathan
@ 2023-03-23 14:00       ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-23 14:00 UTC (permalink / raw)
  To: Kim, Jonathan, amd-gfx, dri-devel


Am 2023-03-23 um 09:50 schrieb Kim, Jonathan:
> [Public]
>
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Sent: Monday, March 20, 2023 5:50 PM
>> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
>> gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org
>> Subject: Re: [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11
>>
>>
>> On 2023-01-25 14:53, Jonathan Kim wrote:
>>> Due to a HW bug, waves in only half the shader arrays can enter trap.
>>>
>>> When starting a debug session, relocate all waves to the first shader
>>> array of each shader engine and mask off the 2nd shader array as
>>> unavailable.
>>>
>>> When ending a debug session, re-enable the 2nd shader array per
>>> shader engine.
>>>
>>> User CU masking per queue cannot be guaranteed to remain functional
>>> if requested during debugging (e.g. user cu mask requests only 2nd shader
>>> array as an available resource leading to zero HW resources available)
>>> nor can runtime be alerted of any of these changes during execution.
>>>
>>> Make user CU masking and debugging mutual exclusive with respect to
>>> availability.
>>>
>>> If the debugger tries to attach to a process with a user cu masked
>>> queue, return the runtime status as enabled but busy.
>>>
>>> If the debugger tries to attach and fails to reallocate queue waves to
>>> the first shader array of each shader engine, return the runtime status
>>> as enabled but with an error.
>>>
>>> In addition, like any other mutli-process debug supported devices,
>>> disable trap temporary setup per-process to avoid performance impact
>> from
>>> setup overhead.
>>>
>>> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |  2 +
>>>    drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        |  7 +-
>>>    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  2 -
>>>    drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 64
>> +++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  3 +-
>>>    .../drm/amd/amdkfd/kfd_device_queue_manager.c |  7 ++
>>>    .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  3 +-
>>>    .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  3 +-
>>>    .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  | 42 ++++++++----
>>>    .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  3 +-
>>>    .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  3 +-
>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +-
>>>    .../amd/amdkfd/kfd_process_queue_manager.c    |  9 ++-
>>>    13 files changed, 124 insertions(+), 29 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> index d20df0cf0d88..b5f5eed2b5ef 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>> @@ -219,6 +219,8 @@ struct mes_add_queue_input {
>>>      uint32_t        gws_size;
>>>      uint64_t        tba_addr;
>>>      uint64_t        tma_addr;
>>> +   uint32_t        trap_en;
>>> +   uint32_t        skip_process_ctx_clear;
>>>      uint32_t        is_kfd_process;
>>>      uint32_t        is_aql_queue;
>>>      uint32_t        queue_size;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> index fbacdc42efac..38c7a0cbf264 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>> @@ -197,17 +197,14 @@ static int mes_v11_0_add_hw_queue(struct
>> amdgpu_mes *mes,
>>>      mes_add_queue_pkt.gws_size = input->gws_size;
>>>      mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
>>>      mes_add_queue_pkt.tma_addr = input->tma_addr;
>>> +   mes_add_queue_pkt.trap_en = input->trap_en;
>>> +   mes_add_queue_pkt.skip_process_ctx_clear = input-
>>> skip_process_ctx_clear;
>>>      mes_add_queue_pkt.is_kfd_process = input->is_kfd_process;
>>>
>>>      /* For KFD, gds_size is re-used for queue size (needed in MES for AQL
>> queues) */
>>>      mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
>>>      mes_add_queue_pkt.gds_size = input->queue_size;
>>>
>>> -   if (!(((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >=
>> 4) &&
>>> -             (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
>> &&
>>> -             (adev->ip_versions[GC_HWIP][0] <= IP_VERSION(11, 0, 3))))
>>> -           mes_add_queue_pkt.trap_en = 1;
>>> -
>>>      /* For KFD, gds_size is re-used for queue size (needed in MES for AQL
>> queues) */
>>>      mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
>>>      mes_add_queue_pkt.gds_size = input->queue_size;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index ee05c2e54ef6..f5f639de28f0 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -530,8 +530,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp,
>> struct kfd_process *p,
>>>              goto out;
>>>      }
>>>
>>> -   minfo.update_flag = UPDATE_FLAG_CU_MASK;
>>> -
>>>      mutex_lock(&p->mutex);
>>>
>>>      retval = pqm_update_mqd(&p->pqm, args->queue_id, &minfo);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>> index f6ea6db266b4..6e99a0160275 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>> @@ -37,6 +37,70 @@ void debug_event_write_work_handler(struct
>> work_struct *work)
>>>      kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
>>>    }
>>>
>>> +static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
>>> +{
>>> +   struct mqd_update_info minfo = {0};
>>> +   int err;
>>> +
>>> +   if (!q || (!q->properties.is_dbg_wa && !enable))
>> Should this condition be:
>>
>>       if (!q || q->properties.is_dbg_wa != enable)
> The latter part should probably be q->properties.is_dbg_wa == enable.  q->properties.is_dbg_wa != enable would always skip a request to change the queue's current workaround state.

Yeah, that's what I meant.


> I think we can just drop the latter half of this test condition as a redundant queue workaround update is harmless.
> It's a static call from a process wide call and the process wide call is static itself and only gets called twice, once on attach and once on detach.

Sounds good.

Thanks,
   Felix


>
> Thanks,
>
> Jon
>
>>> +           return 0;
>>> +
>>> +   if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
>>> +                   KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
>> Indentation. It would be more readable if the KFD_GC_VERSIONs were
>> aligned.
>>
>>
>>> +           return 0;
>>> +
>>> +   if (enable && q->properties.is_user_cu_masked)
>>> +           return -EBUSY;
>>> +
>>> +   minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE :
>> UPDATE_FLAG_DBG_WA_DISABLE;
>>> +
>>> +   q->properties.is_dbg_wa = enable;
>>> +   err = q->device->dqm->ops.update_queue(q->device->dqm, q,
>> &minfo);
>>> +   if (err)
>>> +           q->properties.is_dbg_wa = false;
>>> +
>>> +   return err;
>>> +}
>>> +
>>> +static int kfd_dbg_set_workaround(struct kfd_process *target, bool
>> enable)
>>> +{
>>> +   struct process_queue_manager *pqm = &target->pqm;
>>> +   struct process_queue_node *pqn;
>>> +   int r = 0;
>>> +
>>> +   list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
>>> +           r = kfd_dbg_set_queue_workaround(pqn->q, enable);
>>> +           if (enable && r)
>>> +                   goto unwind;
>>> +   }
>>> +
>>> +   return 0;
>>> +
>>> +unwind:
>>> +   list_for_each_entry(pqn, &pqm->queues, process_queue_list)
>>> +           kfd_dbg_set_queue_workaround(pqn->q, false);
>>> +
>>> +   if (enable) {
>>> +           target->runtime_info.runtime_state = r == -EBUSY ?
>>> +                           DEBUG_RUNTIME_STATE_ENABLED_BUSY :
>>> +                           DEBUG_RUNTIME_STATE_ENABLED_ERROR;
>>> +   }
>> Braces are not needed here.
>>
>>
>>> +
>>> +   return r;
>>> +}
>>> +
>>> +static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
>>> +{
>>> +   uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd-
>>> spi_dbg_launch_mode;
>>> +   uint32_t flags = pdd->process->dbg_flags;
>>> +
>>> +   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
>>> +           return 0;
>>> +
>>> +   return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd-
>>> proc_ctx_gpu_addr, spi_dbg_cntl,
>>> +                                           pdd->watch_points, flags);
>>> +}
>>> +
>> You're adding some unused static functions here. This will cause compile
>> warnings until the patch that starts using them. You could avoid this by
>> reordering this and the next patch and moving the function calls into
>> this patch. That would also make it more obvious where the workaround
>> plugs into the debug code.
>>
>> Regards,
>>     Felix
>>
>>
>>>    int kfd_dbg_trap_disable(struct kfd_process *target)
>>>    {
>>>      if (!target->debug_trap_enabled)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
>>> index 53c5a3e55bd2..0c09f1729325 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
>>> @@ -35,7 +35,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target,
>> uint32_t fd,
>>>    static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
>>>    {
>>> -   return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
>>> +   return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
>>> +          KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0);
>>>    }
>>>
>>>    void debug_event_write_work_handler(struct work_struct *work);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index 2517716d7cbc..be1985b87ea7 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -214,6 +214,10 @@ static int add_queue_mes(struct
>> device_queue_manager *dqm, struct queue *q,
>>>      queue_input.paging = false;
>>>      queue_input.tba_addr = qpd->tba_addr;
>>>      queue_input.tma_addr = qpd->tma_addr;
>>> +   queue_input.trap_en = KFD_GC_VERSION(q->device) <
>> IP_VERSION(11, 0, 0) ||
>>> +                         KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0,
>> 0) ||
>>> +                         q->properties.is_dbg_wa;
>>> +   queue_input.skip_process_ctx_clear = qpd->pqm->process-
>>> debug_trap_enabled;
>>>
>>>      queue_type = convert_to_mes_queue_type(q->properties.type);
>>>      if (queue_type < 0) {
>>> @@ -1679,6 +1683,9 @@ static int create_queue_cpsch(struct
>> device_queue_manager *dqm, struct queue *q,
>>>       * updates the is_evicted flag but is a no-op otherwise.
>>>       */
>>>      q->properties.is_evicted = !!qpd->evicted;
>>> +   q->properties.is_dbg_wa = qpd->pqm->process-
>>> debug_trap_enabled &&
>>> +                   KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0)
>> &&
>>> +                   KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0);
>>>
>>>      if (qd)
>>>              mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q-
>>> mqd_mem_obj, &q->gart_mqd_addr,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
>>> index 4889865c725c..c2a7226fc588 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
>>> @@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager
>> *mm, void *mqd,
>>>      struct cik_mqd *m;
>>>      uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>>>
>>> -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
>>> -       !minfo->cu_mask.ptr)
>>> +   if (!minfo || !minfo->cu_mask.ptr)
>>>              return;
>>>
>>>      mqd_symmetrically_map_cu_mask(mm,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
>>> index cb484ace17de..8248e77751e7 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
>>> @@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager
>> *mm, void *mqd,
>>>      struct v10_compute_mqd *m;
>>>      uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>>>
>>> -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
>>> -       !minfo->cu_mask.ptr)
>>> +   if (!minfo || !minfo->cu_mask.ptr)
>>>              return;
>>>
>>>      mqd_symmetrically_map_cu_mask(mm,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>> index ac7c8fc83c94..18ab613e787c 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>> @@ -46,15 +46,33 @@ static void update_cu_mask(struct mqd_manager
>> *mm, void *mqd,
>>>    {
>>>      struct v11_compute_mqd *m;
>>>      uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
>>> +   bool has_wa_flag = minfo && (minfo->update_flag &
>> (UPDATE_FLAG_DBG_WA_ENABLE |
>>> +                   UPDATE_FLAG_DBG_WA_DISABLE));
>>>
>>> -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
>>> -       !minfo->cu_mask.ptr)
>>> +   if (!minfo || !(has_wa_flag || minfo->cu_mask.ptr))
>>>              return;
>>>
>>> +   m = get_mqd(mqd);
>>> +
>>> +   if (has_wa_flag) {
>>> +           uint32_t wa_mask = minfo->update_flag ==
>> UPDATE_FLAG_DBG_WA_ENABLE ?
>>> +                                           0xffff : 0xffffffff;
>>> +
>>> +           m->compute_static_thread_mgmt_se0 = wa_mask;
>>> +           m->compute_static_thread_mgmt_se1 = wa_mask;
>>> +           m->compute_static_thread_mgmt_se2 = wa_mask;
>>> +           m->compute_static_thread_mgmt_se3 = wa_mask;
>>> +           m->compute_static_thread_mgmt_se4 = wa_mask;
>>> +           m->compute_static_thread_mgmt_se5 = wa_mask;
>>> +           m->compute_static_thread_mgmt_se6 = wa_mask;
>>> +           m->compute_static_thread_mgmt_se7 = wa_mask;
>>> +
>>> +           return;
>>> +   }
>>> +
>>>      mqd_symmetrically_map_cu_mask(mm,
>>>              minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask);
>>>
>>> -   m = get_mqd(mqd);
>>>      m->compute_static_thread_mgmt_se0 = se_mask[0];
>>>      m->compute_static_thread_mgmt_se1 = se_mask[1];
>>>      m->compute_static_thread_mgmt_se2 = se_mask[2];
>>> @@ -109,6 +127,7 @@ static void init_mqd(struct mqd_manager *mm,
>> void **mqd,
>>>      uint64_t addr;
>>>      struct v11_compute_mqd *m;
>>>      int size;
>>> +   uint32_t wa_mask = q->is_dbg_wa ? 0xffff : 0xffffffff;
>>>
>>>      m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr;
>>>      addr = mqd_mem_obj->gpu_addr;
>>> @@ -122,14 +141,15 @@ static void init_mqd(struct mqd_manager *mm,
>> void **mqd,
>>>      m->header = 0xC0310800;
>>>      m->compute_pipelinestat_enable = 1;
>>> -   m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
>>> -   m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
>>> -   m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
>>> -   m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
>>> -   m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
>>> -   m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
>>> -   m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
>>> -   m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;
>>> +
>>> +   m->compute_static_thread_mgmt_se0 = wa_mask;
>>> +   m->compute_static_thread_mgmt_se1 = wa_mask;
>>> +   m->compute_static_thread_mgmt_se2 = wa_mask;
>>> +   m->compute_static_thread_mgmt_se3 = wa_mask;
>>> +   m->compute_static_thread_mgmt_se4 = wa_mask;
>>> +   m->compute_static_thread_mgmt_se5 = wa_mask;
>>> +   m->compute_static_thread_mgmt_se6 = wa_mask;
>>> +   m->compute_static_thread_mgmt_se7 = wa_mask;
>>>
>>>      m->cp_hqd_persistent_state =
>> CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
>>>                      0x55 <<
>> CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>>> index 86f1cf090246..50da16dd4c96 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>>> @@ -49,8 +49,7 @@ static void update_cu_mask(struct mqd_manager
>> *mm, void *mqd,
>>>      struct v9_mqd *m;
>>>      uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
>>>
>>> -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
>>> -       !minfo->cu_mask.ptr)
>>> +   if (!minfo || !minfo->cu_mask.ptr)
>>>              return;
>>>
>>>      mqd_symmetrically_map_cu_mask(mm,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
>>> index 530ba6f5b57e..58b40bff3e0c 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
>>> @@ -51,8 +51,7 @@ static void update_cu_mask(struct mqd_manager
>> *mm, void *mqd,
>>>      struct vi_mqd *m;
>>>      uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>>>
>>> -   if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
>>> -       !minfo->cu_mask.ptr)
>>> +   if (!minfo || !minfo->cu_mask.ptr)
>>>              return;
>>>
>>>      mqd_symmetrically_map_cu_mask(mm,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 8f1e2f9023db..75521d96e937 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -479,6 +479,8 @@ struct queue_properties {
>>>      bool is_evicted;
>>>      bool is_active;
>>>      bool is_gws;
>>> +   bool is_dbg_wa;
>>> +   bool is_user_cu_masked;
>>>      /* Not relevant for user mode queues in cp scheduling */
>>>      unsigned int vmid;
>>>      /* Relevant only for sdma queues*/
>>> @@ -501,7 +503,8 @@ struct queue_properties {
>>>                          !(q).is_evicted)
>>>
>>>    enum mqd_update_flag {
>>> -   UPDATE_FLAG_CU_MASK = 0,
>>> +   UPDATE_FLAG_DBG_WA_ENABLE = 1,
>>> +   UPDATE_FLAG_DBG_WA_DISABLE = 2,
>>>    };
>>>
>>>    struct mqd_update_info {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index 5137476ec18e..d8f032214481 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -498,8 +498,12 @@ int pqm_update_mqd(struct
>> process_queue_manager *pqm,
>>>              return -EFAULT;
>>>      }
>>>
>>> +   /* CUs are masked for debugger requirements so deny user mask  */
>>> +   if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr)
>>> +           return -EBUSY;
>>> +
>>>      /* ASICs that have WGPs must enforce pairwise enabled mask
>> checks. */
>>> -   if (minfo && minfo->update_flag == UPDATE_FLAG_CU_MASK &&
>> minfo->cu_mask.ptr &&
>>> +   if (minfo && minfo->cu_mask.ptr &&
>>>                      KFD_GC_VERSION(pqn->q->device) >= IP_VERSION(10,
>> 0, 0)) {
>>>              int i;
>>>
>>> @@ -518,6 +522,9 @@ int pqm_update_mqd(struct
>> process_queue_manager *pqm,
>>>      if (retval != 0)
>>>              return retval;
>>>
>>> +   if (minfo && minfo->cu_mask.ptr)
>>> +           pqn->q->properties.is_user_cu_masked = true;
>>> +
>>>      return 0;
>>>    }
>>>

^ permalink raw reply	[flat|nested] 68+ messages in thread

* RE: [PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable
  2023-02-16 23:44   ` Felix Kuehling
@ 2023-03-23 19:12     ` Kim, Jonathan
  2023-03-23 20:08       ` Felix Kuehling
  0 siblings, 1 reply; 68+ messages in thread
From: Kim, Jonathan @ 2023-03-23 19:12 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx, dri-devel

[Public]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: Thursday, February 16, 2023 6:44 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
> gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> Subject: Re: [PATCH 03/32] drm/amdkfd: prepare per-process debug enable
> and disable
>
>
> On 2023-01-25 14:53, Jonathan Kim wrote:
> > The ROCm debugger will attach to a process to debug by PTRACE and will
> > expect the KFD to prepare a process for the target PID, whether the
> > target PID has opened the KFD device or not.
> >
> > This patch is to explicitly handle this requirement.  Further HW mode
> > setting and runtime coordination requirements will be handled in
> > following patches.
> >
> > In the case where the target process has not opened the KFD device,
> > a new KFD process must be created for the target PID.
> > The debugger as well as the target process for this case will not have
> > acquired any VMs, so handle process restoration to correctly account for
> > this.
> >
> > To coordinate with HSA runtime, the debugger must be aware of the target
> > process' runtime enablement status and will copy the runtime status
> > information into the debugged KFD process for later query.
> >
> > On enablement, the debugger will subscribe to a set of exceptions where
> > each exception event will notify the debugger through a pollable FIFO
> > file descriptor that the debugger provides to the KFD to manage.
> > Some events will be synchronously raised while others are scheduled,
> > which is why a debug_event_workarea worker is initialized.
> >
> > Finally on process termination of either the debugger or the target,
> > debugging must be disabled if that has not already been done.
> >
> > v3: fix typo on debug trap disable and PTRACE ATTACH relax check.
> > remove unnecessary queue eviction counter reset when there's nothing
> > to evict.
> > change err code to EALREADY if attaching to an already attached process.
> > move debug disable to release worker to avoid race with disable from
> > ioctl call.
> >
> > v2: relax debug trap disable and PTRACE ATTACH requirement.
> >
> > Signed-off-by: Jonathan Kim<jonathan.kim@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdkfd/Makefile           |  3 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 88 ++++++++++++++++-
> >   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 94
> +++++++++++++++++++
> >   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        | 33 +++++++
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 22 ++++-
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 34 ++++++-
> >   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 63 +++++++++----
> >   7 files changed, 308 insertions(+), 29 deletions(-)
> >   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> >   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile
> b/drivers/gpu/drm/amd/amdkfd/Makefile
> > index e758c2a24cd0..747754428073 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> > +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> > @@ -55,7 +55,8 @@ AMDKFD_FILES      := $(AMDKFD_PATH)/kfd_module.o \
> >             $(AMDKFD_PATH)/kfd_int_process_v9.o \
> >             $(AMDKFD_PATH)/kfd_int_process_v11.o \
> >             $(AMDKFD_PATH)/kfd_smi_events.o \
> > -           $(AMDKFD_PATH)/kfd_crat.o
> > +           $(AMDKFD_PATH)/kfd_crat.o \
> > +           $(AMDKFD_PATH)/kfd_debug.o
> >
> >   ifneq ($(CONFIG_AMD_IOMMU_V2),)
> >   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index d3b019e64093..ee05c2e54ef6 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -44,6 +44,7 @@
> >   #include "amdgpu_amdkfd.h"
> >   #include "kfd_smi_events.h"
> >   #include "amdgpu_dma_buf.h"
> > +#include "kfd_debug.h"
> >
> >   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
> >   static int kfd_open(struct inode *, struct file *);
> > @@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct
> file *filep)
> >             return -EPERM;
> >     }
> >
> > -   process = kfd_create_process(filep);
> > +   process = kfd_create_process(current);
> >     if (IS_ERR(process))
> >             return PTR_ERR(process);
> >
> > +   if (kfd_process_init_cwsr_apu(process, filep)) {
> > +           kfd_unref_process(process);
> > +           return -EFAULT;
> > +   }
> > +
> >     if (kfd_is_locked()) {
> >             dev_dbg(kfd_device, "kfd is locked!\n"
> >                             "process %d unreferenced", process->pasid);
> > @@ -2653,6 +2659,9 @@ static int kfd_ioctl_runtime_enable(struct file
> *filep, struct kfd_process *p, v
> >   static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process
> *p, void *data)
> >   {
> >     struct kfd_ioctl_dbg_trap_args *args = data;
> > +   struct task_struct *thread = NULL;
> > +   struct pid *pid = NULL;
> > +   struct kfd_process *target = NULL;
> >     int r = 0;
> >
> >     if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> > @@ -2660,9 +2669,71 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> >             return -EINVAL;
> >     }
> >
> > +   pid = find_get_pid(args->pid);
> > +   if (!pid) {
> > +           pr_debug("Cannot find pid info for %i\n", args->pid);
> > +           r = -ESRCH;
> > +           goto out;
> > +   }
> > +
> > +   thread = get_pid_task(pid, PIDTYPE_PID);
> > +
> > +   if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
> > +           bool create_process;
> > +
> > +           rcu_read_lock();
> > +           create_process = thread && thread != current &&
> ptrace_parent(thread) == current;
> > +           rcu_read_unlock();
> > +
> > +           target = create_process ? kfd_create_process(thread) :
> > +                                   kfd_lookup_process_by_pid(pid);
> > +   } else {
> > +           target = kfd_lookup_process_by_pid(pid);
> > +   }
> > +
> > +   if (!target) {
> > +           pr_debug("Cannot find process PID %i to debug\n", args-
> >pid);
> > +           r = -ESRCH;
> > +           goto out;
> > +   }
> > +
> > +   /* Check if target is still PTRACED. */
> > +   rcu_read_lock();
> > +   if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
> > +                           && ptrace_parent(target->lead_thread) !=
> current) {
> > +           pr_err("PID %i is not PTRACED and cannot be debugged\n",
> args->pid);
> > +           r = -EPERM;
> > +   }
> > +   rcu_read_unlock();
> > +
> > +   if (r)
> > +           goto out;
> > +
> > +   mutex_lock(&target->mutex);
> > +
> > +   if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target-
> >debug_trap_enabled) {
> > +           pr_err("PID %i not debug enabled for op %i\n", args->pid,
> args->op);
> > +           r = -EINVAL;
> > +           goto unlock_out;
> > +   }
> > +
> >     switch (args->op) {
> >     case KFD_IOC_DBG_TRAP_ENABLE:
> > +           if (target != p)
> > +                   target->debugger_process = p;
> > +
> > +           r = kfd_dbg_trap_enable(target,
> > +                                   args->enable.dbg_fd,
> > +                                   (void __user *)args->enable.rinfo_ptr,
> > +                                   &args->enable.rinfo_size);
> > +           if (!r)
> > +                   target->exception_enable_mask = args-
> >enable.exception_mask;
> > +
> > +           pr_warn("Debug functions limited\n");
> > +           break;
> >     case KFD_IOC_DBG_TRAP_DISABLE:
> > +           r = kfd_dbg_trap_disable(target);
> > +           break;
> >     case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
> >     case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
> >     case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> > @@ -2676,7 +2747,7 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> >     case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> >     case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> >     case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> > -           pr_warn("Debugging not supported yet\n");
> > +           pr_warn("Debug op %i not supported yet\n", args->op);
> >             r = -EACCES;
> >             break;
> >     default:
> > @@ -2684,6 +2755,19 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> >             r = -EINVAL;
> >     }
> >
> > +unlock_out:
> > +   mutex_unlock(&target->mutex);
> > +
> > +out:
> > +   if (thread)
> > +           put_task_struct(thread);
> > +
> > +   if (pid)
> > +           put_pid(pid);
> > +
> > +   if (target)
> > +           kfd_unref_process(target);
> > +
> >     return r;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > new file mode 100644
> > index 000000000000..f6ea6db266b4
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > @@ -0,0 +1,94 @@
> > +/*
> > + * Copyright 2022 Advanced Micro Devices, Inc.
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a
> > + * copy of this software and associated documentation files (the
> "Software"),
> > + * to deal in the Software without restriction, including without limitation
> > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be included
> in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> KIND, EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> > + * OTHER DEALINGS IN THE SOFTWARE.
> > + */
> > +
> > +#include "kfd_debug.h"
> > +#include <linux/file.h>
> > +
> > +void debug_event_write_work_handler(struct work_struct *work)
> > +{
> > +   struct kfd_process *process;
> > +
> > +   static const char write_data = '.';
> > +   loff_t pos = 0;
> > +
> > +   process = container_of(work,
> > +                   struct kfd_process,
> > +                   debug_event_workarea);
> > +
> > +   kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
> > +}
> > +
> > +int kfd_dbg_trap_disable(struct kfd_process *target)
> > +{
> > +   if (!target->debug_trap_enabled)
> > +           return 0;
> > +
> > +   fput(target->dbg_ev_file);
> > +   target->dbg_ev_file = NULL;
> > +
> > +   if (target->debugger_process) {
> > +           atomic_dec(&target->debugger_process-
> >debugged_process_count);
> > +           target->debugger_process = NULL;
> > +   }
> > +
> > +   target->debug_trap_enabled = false;
> > +   kfd_unref_process(target);
> > +
> > +   return 0;
> > +}
> > +
> > +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> > +                   void __user *runtime_info, uint32_t *runtime_size)
> > +{
> > +   struct file *f;
> > +   uint32_t copy_size;
> > +   int r = 0;
> > +
> > +   if (target->debug_trap_enabled)
> > +           return -EALREADY;
> > +
> > +   copy_size = min((size_t)(*runtime_size), sizeof(target-
> >runtime_info));
> > +
> > +   f = fget(fd);
> > +   if (!f) {
> > +           pr_err("Failed to get file for (%i)\n", fd);
> > +           return -EBADF;
> > +   }
> > +
> > +   target->dbg_ev_file = f;
> > +
> > +   /* We already hold the process reference but hold another one for
> the
> > +    * debug session.
> > +    */
> > +   kref_get(&target->ref);
> > +   target->debug_trap_enabled = true;
> > +
> > +   if (target->debugger_process)
> > +           atomic_inc(&target->debugger_process-
> >debugged_process_count);
> > +
> > +   if (copy_to_user(runtime_info, (void *)&target->runtime_info,
> copy_size))
> > +           r = -EFAULT;
> > +
> > +   *runtime_size = sizeof(target->runtime_info);
> > +
> > +   return r;
> > +}
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > new file mode 100644
> > index 000000000000..b2217eb1399c
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > @@ -0,0 +1,33 @@
> > +/*
> > + * Copyright 2022 Advanced Micro Devices, Inc.
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a
> > + * copy of this software and associated documentation files (the
> "Software"),
> > + * to deal in the Software without restriction, including without limitation
> > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be included
> in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> KIND, EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> > + * OTHER DEALINGS IN THE SOFTWARE.
> > + */
> > +
> > +#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
> > +#define KFD_DEBUG_EVENTS_H_INCLUDED
> > +
> > +#include "kfd_priv.h"
> > +
> > +int kfd_dbg_trap_disable(struct kfd_process *target);
> > +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> > +                   void __user *runtime_info,
> > +                   uint32_t *runtime_info_size);
> > +void debug_event_write_work_handler(struct work_struct *work);
> > +#endif
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index c06ada0844ba..a2ac98d06e71 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct
> device_queue_manager *dqm,
> >             goto out;
> >
> >     pdd = qpd_to_pdd(qpd);
> > +
> > +   /* The debugger creates processes that temporarily have not
> acquired
> > +    * all VMs for all devices and has no VMs itself.
> > +    * Skip queue eviction on process eviction.
> > +    */
> > +   if (!pdd->drm_priv)
> > +           goto out;
> > +
> This should be before qpd->

Sorry, I didn't quite catch what you were saying here (did your comment get cut off?).
Did you mean the pdd->drm_priv check needs to go before the if (qpd->evicted++ > 0) /* already evicted, do nothing */ check?

Thanks,

Jon

> >     pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
> >                         pdd->process->pasid);
> >
> > @@ -1100,13 +1108,10 @@ static int restore_process_queues_cpsch(struct
> device_queue_manager *dqm,
> >   {
> >     struct queue *q;
> >     struct kfd_process_device *pdd;
> > -   uint64_t pd_base;
> >     uint64_t eviction_duration;
> >     int retval = 0;
> >
> >     pdd = qpd_to_pdd(qpd);
> > -   /* Retrieve PD base */
> > -   pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd-
> >drm_priv);
> >
> >     dqm_lock(dqm);
> >     if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing
> */
> > @@ -1116,12 +1121,19 @@ static int restore_process_queues_cpsch(struct
> device_queue_manager *dqm,
> >             goto out;
> >     }
> >
> > +   /* The debugger creates processes that temporarily have not
> acquired
> > +    * all VMs for all devices and has no VMs itself.
> > +    * Skip queue restore on process restore.
> > +    */
> > +   if (!pdd->drm_priv)
> > +           goto out;
> > +
>
> I had a comment here that "qpd->evicted = 0;" was duplicated. It is
> still needed in this case. Otherwise the process will end up being
> created with all queues in an evicted state and no way to execute
> anything on the GPU.
>
> You only need one instance of "qpd->evicted = 0;", but it needs to be in
> the right place (after the vm_not_acquired label you had in v1 of this
> patch).
>
> Regards,
>    Felix
>
>
> >     pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
> >                         pdd->process->pasid);
> >
> >     /* Update PD Base in QPD */
> > -   qpd->page_table_base = pd_base;
> > -   pr_debug("Updated PD address to 0x%llx\n", pd_base);
> > +   qpd->page_table_base =
> amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
> > +   pr_debug("Updated PD address to 0x%llx\n", qpd-
> >page_table_base);
> >
> >     /* activate all active queues on the qpd */
> >     list_for_each_entry(q, &qpd->queues_list, list) {
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index bfa30d12406b..62b75ba28425 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -886,19 +886,48 @@ struct kfd_process {
> >      */
> >     unsigned long last_restore_timestamp;
> >
> > +   /* Indicates device process is debug attached with reserved vmid. */
> > +   bool debug_trap_enabled;
> > +
> > +   /* per-process-per device debug event fd file */
> > +   struct file *dbg_ev_file;
> > +
> > +   /* If the process is a kfd debugger, we need to know so we can clean
> > +    * up at exit time.  If a process enables debugging on itself, it does
> > +    * its own clean-up, so we don't set the flag here.  We track this by
> > +    * counting the number of processes this process is debugging.
> > +    */
> > +   atomic_t debugged_process_count;
> > +
> > +   /* If the process is a debugged, this is the debugger process */
> > +   struct kfd_process *debugger_process;
> > +
> >     /* Kobj for our procfs */
> >     struct kobject *kobj;
> >     struct kobject *kobj_queues;
> >     struct attribute attr_pasid;
> >
> > +   /* Keep track cwsr init */
> > +   bool has_cwsr;
> > +
> > +   /* Exception code enable mask and status */
> > +   uint64_t exception_enable_mask;
> > +
> >     /* shared virtual memory registered by this process */
> >     struct svm_range_list svms;
> >
> >     bool xnack_enabled;
> >
> > +   /* Work area for debugger event writer worker. */
> > +   struct work_struct debug_event_workarea;
> > +
> >     atomic_t poison;
> >     /* Queues are in paused stated because we are in the process of
> doing a CRIU checkpoint */
> >     bool queues_paused;
> > +
> > +   /* Tracks runtime enable status */
> > +   struct kfd_runtime_info runtime_info;
> > +
> >   };
> >
> >   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> > @@ -928,7 +957,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
> >
> >   int kfd_process_create_wq(void);
> >   void kfd_process_destroy_wq(void);
> > -struct kfd_process *kfd_create_process(struct file *filep);
> > +struct kfd_process *kfd_create_process(struct task_struct *thread);
> >   struct kfd_process *kfd_get_process(const struct task_struct *task);
> >   struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
> >   struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct
> *mm);
> > @@ -1055,6 +1084,9 @@ void kfd_process_set_trap_handler(struct
> qcm_process_device *qpd,
> >                               uint64_t tba_addr,
> >                               uint64_t tma_addr);
> >
> > +/* CWSR initialization */
> > +int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file
> *filep);
> > +
> >   /* CRIU */
> >   /*
> >    * Need to increment KFD_CRIU_PRIV_VERSION each time a change is
> made to any of the CRIU private
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > index 72df6286e240..e935158ab311 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > @@ -44,6 +44,7 @@ struct mm_struct;
> >   #include "kfd_iommu.h"
> >   #include "kfd_svm.h"
> >   #include "kfd_smi_events.h"
> > +#include "kfd_debug.h"
> >
> >   /*
> >    * List of struct kfd_process (field kfd_process).
> > @@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct
> task_struct *thread,
> >                                     bool ref);
> >   static void kfd_process_ref_release(struct kref *ref);
> >   static struct kfd_process *create_process(const struct task_struct
> *thread);
> > -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file
> *filep);
> >
> >   static void evict_process_worker(struct work_struct *work);
> >   static void restore_process_worker(struct work_struct *work);
> > @@ -798,18 +798,19 @@ static void
> kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
> >     kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
> >   }
> >
> > -struct kfd_process *kfd_create_process(struct file *filep)
> > +struct kfd_process *kfd_create_process(struct task_struct *thread)
> >   {
> >     struct kfd_process *process;
> > -   struct task_struct *thread = current;
> >     int ret;
> >
> > -   if (!thread->mm)
> > +   if (!(thread->mm && mmget_not_zero(thread->mm)))
> >             return ERR_PTR(-EINVAL);
> >
> >     /* Only the pthreads threading model is supported. */
> > -   if (thread->group_leader->mm != thread->mm)
> > +   if (thread->group_leader->mm != thread->mm) {
> > +           mmput(thread->mm);
> >             return ERR_PTR(-EINVAL);
> > +   }
> >
> >     /*
> >      * take kfd processes mutex before starting of process creation
> > @@ -827,10 +828,6 @@ struct kfd_process *kfd_create_process(struct file
> *filep)
> >             if (IS_ERR(process))
> >                     goto out;
> >
> > -           ret = kfd_process_init_cwsr_apu(process, filep);
> > -           if (ret)
> > -                   goto out_destroy;
> > -
> >             if (!procfs.kobj)
> >                     goto out;
> >
> > @@ -864,16 +861,9 @@ struct kfd_process *kfd_create_process(struct file
> *filep)
> >     if (!IS_ERR(process))
> >             kref_get(&process->ref);
> >     mutex_unlock(&kfd_processes_mutex);
> > +   mmput(thread->mm);
> >
> >     return process;
> > -
> > -out_destroy:
> > -   hash_del_rcu(&process->kfd_processes);
> > -   mutex_unlock(&kfd_processes_mutex);
> > -   synchronize_srcu(&kfd_processes_srcu);
> > -   /* kfd_process_free_notifier will trigger the cleanup */
> > -   mmu_notifier_put(&process->mmu_notifier);
> > -   return ERR_PTR(ret);
> >   }
> >
> >   struct kfd_process *kfd_get_process(const struct task_struct *thread)
> > @@ -1115,6 +1105,26 @@ static void kfd_process_wq_release(struct
> work_struct *work)
> >     struct kfd_process *p = container_of(work, struct kfd_process,
> >                                          release_work);
> >
> > +   kfd_dbg_trap_disable(p);
> > +
> > +   if (atomic_read(&p->debugged_process_count) > 0) {
> > +           struct kfd_process *target;
> > +           unsigned int temp;
> > +           int idx = srcu_read_lock(&kfd_processes_srcu);
> > +
> > +           hash_for_each_rcu(kfd_processes_table, temp, target,
> kfd_processes) {
> > +                   if (target->debugger_process && target-
> >debugger_process == p) {
> > +                           mutex_lock(&target->mutex);
> > +                           kfd_dbg_trap_disable(target);
> > +                           mutex_unlock(&target->mutex);
> > +                           if (atomic_read(&p-
> >debugged_process_count) == 0)
> > +                                   break;
> > +                   }
> > +           }
> > +
> > +           srcu_read_unlock(&kfd_processes_srcu, idx);
> > +   }
> > +
> >     kfd_process_dequeue_from_all_devices(p);
> >     pqm_uninit(&p->pqm);
> >
> > @@ -1200,11 +1210,14 @@ static const struct mmu_notifier_ops
> kfd_process_mmu_notifier_ops = {
> >     .free_notifier = kfd_process_free_notifier,
> >   };
> >
> > -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file
> *filep)
> > +int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> >   {
> >     unsigned long  offset;
> >     int i;
> >
> > +   if (p->has_cwsr)
> > +           return 0;
> > +
> >     for (i = 0; i < p->n_pdds; i++) {
> >             struct kfd_dev *dev = p->pdds[i]->dev;
> >             struct qcm_process_device *qpd = &p->pdds[i]->qpd;
> > @@ -1233,6 +1246,8 @@ static int kfd_process_init_cwsr_apu(struct
> kfd_process *p, struct file *filep)
> >                     qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> >     }
> >
> > +   p->has_cwsr = true;
> > +
> >     return 0;
> >   }
> >
> > @@ -1375,6 +1390,10 @@ static struct kfd_process *create_process(const
> struct task_struct *thread)
> >     if (err)
> >             goto err_event_init;
> >     process->is_32bit_user_mode = in_compat_syscall();
> > +   process->debug_trap_enabled = false;
> > +   process->debugger_process = NULL;
> > +   process->exception_enable_mask = 0;
> > +   atomic_set(&process->debugged_process_count, 0);
> >
> >     process->pasid = kfd_pasid_alloc();
> >     if (process->pasid == 0) {
> > @@ -1422,6 +1441,8 @@ static struct kfd_process *create_process(const
> struct task_struct *thread)
> >     kfd_unref_process(process);
> >     get_task_struct(process->lead_thread);
> >
> > +   INIT_WORK(&process->debug_event_workarea,
> debug_event_write_work_handler);
> > +
> >     return process;
> >
> >   err_register_notifier:
> > @@ -1908,8 +1929,10 @@ static void restore_process_worker(struct
> work_struct *work)
> >      */
> >
> >     p->last_restore_timestamp = get_jiffies_64();
> > -   ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p-
> >kgd_process_info,
> > -                                                &p->ef);
> > +   /* VMs may not have been acquired yet during debugging. */
> > +   if (p->kgd_process_info)
> > +           ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p-
> >kgd_process_info,
> > +                                                        &p->ef);
> >     if (ret) {
> >             pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d
> ms\n",
> >                      p->pasid, PROCESS_BACK_OFF_TIME_MS);

^ permalink raw reply	[flat|nested] 68+ messages in thread

* RE: [PATCH 19/32] drm/amdkfd: add runtime enable operation
  2023-03-21  0:31   ` Felix Kuehling
@ 2023-03-23 19:45     ` Kim, Jonathan
  0 siblings, 0 replies; 68+ messages in thread
From: Kim, Jonathan @ 2023-03-23 19:45 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx, dri-devel

[AMD Official Use Only - General]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: Monday, March 20, 2023 8:31 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
> gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> Subject: Re: [PATCH 19/32] drm/amdkfd: add runtime enable operation
>
>
> On 2023-01-25 14:53, Jonathan Kim wrote:
> > The debugger can attach to a process prior to HSA enablement (i.e.
> > inferior is spawned by the debugger and attached to immediately before
> > target process has been enabled for HSA dispatches) or it
> > can attach to a running target that is already HSA enabled.  Either
> > way, the debugger needs to know the enablement status to know when
> > it can inspect queues.
> >
> > For the scenario where the debugger spawns the target process,
> > it will have to wait for ROCr's runtime enable request from the target.
> > The runtime enable request will be able to see that its process has been
> > debug attached.  ROCr raises an EC_PROCESS_RUNTIME signal to the
> > debugger then blocks the target process while waiting for the debugger's
> > response. Once the debugger has received the runtime signal, it will
> > unblock the target process.
> >
> > For the scenario where the debugger attaches to a running target
> > process, ROCr will set the target process' runtime status as enabled so
> > that on an attach request, the debugger will be able to see this
> > status and will continue with debug enablement as normal.
> >
> > A secondary requirement is to conditionally enable the trap temporaries
> only
> > if the user requests it (env var HSA_ENABLE_DEBUG=1) or if the debugger
> > attaches with HSA runtime enabled.  This is because setting up the trap
> > temporaries incurs a performance overhead that is unacceptable for
> > microbench performance in normal mode for certain customers.
> >
> > In the scenario where the debugger spawns the target process, when ROCr
> > detects that the debugger has attached during the runtime enable
> > request, it will enable the trap temporaries before it blocks the target
> > process while waiting for the debugger to respond.
> >
> > In the scenario where the debugger attaches to a running target process,
> > it will enable the trap temporaries itself.
> >
> > Finally, there is an additional restriction that is required to be
> > enforced with runtime enable and HW debug mode setting. The debugger
> must
> > first ensure that HW debug mode has been enabled before permitting HW
> debug
> > mode operations.
> >
> > With single process debug devices, allowing the debugger to set debug
> > HW modes prior to trap activation means that debug HW mode setting can
> > occur before the KFD has reserved the debug VMID (0xf) from the hardware
> > scheduler's VMID allocation resource pool.  This can result in the
> > hardware scheduler assigning VMID 0xf to a non-debugged process and
> > having that process inherit debug HW mode settings intended for the
> > debugged target process instead, which is both incorrect and potentially
> > fatal for normal mode operation.
> >
> > With multi process debug devices, allowing the debugger to set debug
> > HW modes prior to trap activation means that non-debugged processes
> > migrating to a new VMID could inherit unintended debug settings.
> >
> > All debug operations that touch HW settings must require trap activation
> > where trap activation is triggered by both debug attach and runtime
> > enablement (target has KFD opened and is ready to dispatch work).
> >
> > v2: fix up hierarchy of semantics in description.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 150
> ++++++++++++++++++++++-
> >   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |   6 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   4 +
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   1 +
> >   4 files changed, 157 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index 09fe8576dc8c..46f9d453dc5e 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -2654,11 +2654,147 @@ static int kfd_ioctl_criu(struct file *filep,
> struct kfd_process *p, void *data)
> >     return ret;
> >   }
> >
> > -static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p,
> void *data)
> > +static int runtime_enable(struct kfd_process *p, uint64_t r_debug,
> > +                   bool enable_ttmp_setup)
> >   {
> > +   int i = 0, ret = 0;
> > +
> > +   if (p->is_runtime_retry)
> > +           goto retry;
> > +
> > +   if (p->runtime_info.runtime_state !=
> DEBUG_RUNTIME_STATE_DISABLED)
> > +           return -EBUSY;
> > +
> > +   for (i = 0; i < p->n_pdds; i++) {
> > +           struct kfd_process_device *pdd = p->pdds[i];
> > +
> > +           if (pdd->qpd.queue_count)
> > +                   return -EEXIST;
> > +   }
> > +
> > +   p->runtime_info.runtime_state =
> DEBUG_RUNTIME_STATE_ENABLED;
> > +   p->runtime_info.r_debug = r_debug;
> > +   p->runtime_info.ttmp_setup = enable_ttmp_setup;
> > +
> > +   if (p->runtime_info.ttmp_setup) {
> > +           for (i = 0; i < p->n_pdds; i++) {
> > +                   struct kfd_process_device *pdd = p->pdds[i];
> > +
> > +                   if (!kfd_dbg_is_rlc_restore_supported(pdd->dev)) {
> > +                           amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> > +                           pdd->dev->kfd2kgd->enable_debug_trap(
> > +                                           pdd->dev->adev,
> > +                                           true,
> > +                                           pdd->dev-
> >vm_info.last_vmid_kfd);
> > +                   }
> > +
> > +                   if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
>
> Should this be else-if? It seems weird that enable_debug_trap could be
> called twice in a row. If RLC restore is only applicable on
> single-process debug devices, then maybe put the per-VMID case first.
>
>
> > +                           pdd->spi_dbg_override = pdd->dev->kfd2kgd-
> >enable_debug_trap(
> > +                                           pdd->dev->adev,
> > +                                           false,
> > +                                           pdd->dev-
> >vm_info.last_vmid_kfd);
> > +
> > +                           if (!pdd->dev-
> >shared_resources.enable_mes)
> > +                                   debug_refresh_runlist(pdd->dev-
> >dqm);
> > +                           else
> > +                                   kfd_dbg_set_mes_debug_mode(pdd);
>
> Do we really need to update the runlist here? When the runtime gets
> enabled, there are no queues yet for the process. So there should be no
> change to the runlist until the process creates its first queue.

Acked.  Yeah this seems unnecessary and can be dropped.
>
>
> > +                   }
> > +           }
> > +   }
> > +
> > +retry:
> > +   if (p->debug_trap_enabled) {
> > +           if (!p->is_runtime_retry) {
> > +                   kfd_dbg_trap_activate(p);
> > +
>       kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
> > +                                   p, NULL, 0, false, NULL, 0);
> > +           }
> > +
> > +           mutex_unlock(&p->mutex);
> > +           ret = down_interruptible(&p->runtime_enable_sema);
> > +           mutex_lock(&p->mutex);
> > +
> > +           p->is_runtime_retry = !!ret;
> > +   }
> > +
> > +   return ret;
> > +}
> > +
> > +static int runtime_disable(struct kfd_process *p)
> > +{
> > +   int i = 0, ret;
> > +   bool was_enabled = p->runtime_info.runtime_state ==
> DEBUG_RUNTIME_STATE_ENABLED;
> > +
> > +   p->runtime_info.runtime_state =
> DEBUG_RUNTIME_STATE_DISABLED;
> > +   p->runtime_info.r_debug = 0;
> > +
> > +   if (p->debug_trap_enabled) {
> > +           if (was_enabled)
> > +                   kfd_dbg_trap_deactivate(p, false, 0);
>
> Does this call kfd_dbg_trap_deactivate multiple times on retry? Is that
> a problem?

I don't think kfd_dbg_trap_deactivate gets called again on retry.  Before the deactivate call, the per-process runtime state is already set to DEBUG_RUNTIME_STATE_DISABLED, and runtime_disable returns early on a down_interruptible error.  If the caller retries after that error, was_enabled is false on the second pass, so we never deactivate again.

Thanks,

Jon

>
> Regards,
>    Felix
>
>
> > +
> > +           if (!p->is_runtime_retry)
> > +
>       kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
> > +                                   p, NULL, 0, false, NULL, 0);
> > +
> > +           mutex_unlock(&p->mutex);
> > +           ret = down_interruptible(&p->runtime_enable_sema);
> > +           mutex_lock(&p->mutex);
> > +
> > +           p->is_runtime_retry = !!ret;
> > +           if (ret)
> > +                   return ret;
> > +   }
> > +
> > +   if (was_enabled && p->runtime_info.ttmp_setup) {
> > +           for (i = 0; i < p->n_pdds; i++) {
> > +                   struct kfd_process_device *pdd = p->pdds[i];
> > +
> > +                   if (!kfd_dbg_is_rlc_restore_supported(pdd->dev))
> > +                           amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> > +           }
> > +   }
> > +
> > +   p->runtime_info.ttmp_setup = false;
> > +
> > +   /* disable DISPATCH_PTR save */
> > +   for (i = 0; i < p->n_pdds; i++) {
> > +           struct kfd_process_device *pdd = p->pdds[i];
> > +
> > +           if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
> > +                   pdd->spi_dbg_override =
> > +                                   pdd->dev->kfd2kgd-
> >disable_debug_trap(
> > +                                   pdd->dev->adev,
> > +                                   false,
> > +                                   pdd->dev->vm_info.last_vmid_kfd);
> > +
> > +                   if (!pdd->dev->shared_resources.enable_mes)
> > +                           debug_refresh_runlist(pdd->dev->dqm);
> > +                   else
> > +                           kfd_dbg_set_mes_debug_mode(pdd);
> > +           }
> > +   }
> > +
> >     return 0;
> >   }
> >
> > +static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process
> *p, void *data)
> > +{
> > +   struct kfd_ioctl_runtime_enable_args *args = data;
> > +   int r;
> > +
> > +   mutex_lock(&p->mutex);
> > +
> > +   if (args->mode_mask &
> KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK)
> > +           r = runtime_enable(p, args->r_debug,
> > +                           !!(args->mode_mask &
> KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK));
> > +   else
> > +           r = runtime_disable(p);
> > +
> > +   mutex_unlock(&p->mutex);
> > +
> > +   return r;
> > +}
> > +
> >   static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process
> *p, void *data)
> >   {
> >     struct kfd_ioctl_dbg_trap_args *args = data;
> > @@ -2720,6 +2856,18 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> >             goto unlock_out;
> >     }
> >
> > +   if (target->runtime_info.runtime_state !=
> DEBUG_RUNTIME_STATE_ENABLED &&
> > +                   (args->op ==
> KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE ||
> > +                    args->op ==
> KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE ||
> > +                    args->op == KFD_IOC_DBG_TRAP_SUSPEND_QUEUES
> ||
> > +                    args->op == KFD_IOC_DBG_TRAP_RESUME_QUEUES
> ||
> > +                    args->op ==
> KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
> > +                    args->op ==
> KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH ||
> > +                    args->op == KFD_IOC_DBG_TRAP_SET_FLAGS)) {
> > +           r = -EPERM;
> > +           goto unlock_out;
> > +   }
> > +
> >     switch (args->op) {
> >     case KFD_IOC_DBG_TRAP_ENABLE:
> >             if (target != p)
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > index 4174b479ea6f..47f8425a0db3 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > @@ -220,7 +220,7 @@ static int kfd_dbg_set_workaround(struct
> kfd_process *target, bool enable)
> >     return r;
> >   }
> >
> > -static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
> > +int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
> >   {
> >     uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd-
> >spi_dbg_launch_mode;
> >     uint32_t flags = pdd->process->dbg_flags;
> > @@ -240,7 +240,7 @@ static int kfd_dbg_set_mes_debug_mode(struct
> kfd_process_device *pdd)
> >    *                                to unwind
> >    *                else: ignored
> >    */
> > -static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool
> unwind, int unwind_count)
> > +void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind,
> int unwind_count)
> >   {
> >     int i, count = 0;
> >
> > @@ -311,7 +311,7 @@ int kfd_dbg_trap_disable(struct kfd_process
> *target)
> >     return 0;
> >   }
> >
> > -static int kfd_dbg_trap_activate(struct kfd_process *target)
> > +int kfd_dbg_trap_activate(struct kfd_process *target)
> >   {
> >     int i, r = 0, unwind_count = 0;
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > index fefb9dc5cf69..22707f7a2368 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > @@ -28,6 +28,8 @@
> >   void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> >                                     uint32_t vmid,
> >                                     bool stall);
> > +void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind,
> int unwind_count);
> > +int kfd_dbg_trap_activate(struct kfd_process *target);
> >   bool kfd_dbg_ev_raise(uint64_t event_mask,
> >                     struct kfd_process *process, struct kfd_dev *dev,
> >                     unsigned int source_id, bool use_worker,
> > @@ -80,4 +82,6 @@ static inline bool kfd_dbg_has_gws_support(struct
> kfd_dev *dev)
> >     /* Assume debugging and cooperative launch supported otherwise.
> */
> >     return true;
> >   }
> > +
> > +int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd);
> >   #endif
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index 4cb433a21e3d..63c59ad2a4ca 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -946,6 +946,7 @@ struct kfd_process {
> >
> >     /* Tracks runtime enable status */
> >     struct semaphore runtime_enable_sema;
> > +   bool is_runtime_retry;
> >     struct kfd_runtime_info runtime_info;
> >
> >   };

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable
  2023-03-23 19:12     ` Kim, Jonathan
@ 2023-03-23 20:08       ` Felix Kuehling
  0 siblings, 0 replies; 68+ messages in thread
From: Felix Kuehling @ 2023-03-23 20:08 UTC (permalink / raw)
  To: Kim, Jonathan, amd-gfx, dri-devel

Sorry, I think that was just a stray comment that I messed up while 
editing my response. You can ignore it.

Regards,
   Felix


Am 2023-03-23 um 15:12 schrieb Kim, Jonathan:
>>> index c06ada0844ba..a2ac98d06e71 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct
>> device_queue_manager *dqm,
>>>              goto out;
>>>
>>>      pdd = qpd_to_pdd(qpd);
>>> +
>>> +   /* The debugger creates processes that temporarily have not
>> acquired
>>> +    * all VMs for all devices and has no VMs itself.
>>> +    * Skip queue eviction on process eviction.
>>> +    */
>>> +   if (!pdd->drm_priv)
>>> +           goto out;
>>> +
>> This should be before qpd->
> Sorry, I didn't quite catch what you were saying here (did your comment get cut off?).
> Did you mean the pdd->drm_priv check needs to go before the if (qpd->evicted++ > 0) /* already evicted, do nothing */ check?
>
> Thanks,
>
> Jon
>

^ permalink raw reply	[flat|nested] 68+ messages in thread

end of thread, other threads:[~2023-03-23 20:08 UTC | newest]

Thread overview: 68+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
2023-01-25 19:53 ` [PATCH 01/32] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
2023-02-16 22:16   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 02/32] drm/amdkfd: display debug capabilities Jonathan Kim
2023-02-16 22:24   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
2023-02-16 23:44   ` Felix Kuehling
2023-03-23 19:12     ` Kim, Jonathan
2023-03-23 20:08       ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 04/32] drm/amdgpu: add kgd hw debug mode setting interface Jonathan Kim
2023-01-25 19:53 ` [PATCH 05/32] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
2023-02-16 22:39   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
2023-01-29  5:12   ` kernel test robot
2023-02-16 22:54   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 07/32] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
2023-01-29  6:34   ` kernel test robot
2023-02-16 23:01   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 08/32] drm/amdgpu: add gfx10 " Jonathan Kim
2023-01-29  7:55   ` kernel test robot
2023-02-16 23:11   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 09/32] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
2023-02-16 23:14   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 10/32] drm/amdgpu: add gfx11 " Jonathan Kim
2023-02-16 23:19   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 11/32] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
2023-03-20 19:19   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 12/32] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
2023-03-20 20:06   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 13/32] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
2023-03-20 20:16   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 14/32] drm/amdgpu: expose debug api for mes Jonathan Kim
2023-03-20 20:47   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11 Jonathan Kim
2023-03-20 21:49   ` Felix Kuehling
2023-03-23 13:50     ` Kim, Jonathan
2023-03-23 14:00       ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 16/32] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
2023-03-20 23:06   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 17/32] drm/amdkfd: add raise exception event function Jonathan Kim
2023-03-20 23:18   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 18/32] drm/amdkfd: add send exception operation Jonathan Kim
2023-03-20 23:26   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 19/32] drm/amdkfd: add runtime enable operation Jonathan Kim
2023-03-21  0:31   ` Felix Kuehling
2023-03-23 19:45     ` Kim, Jonathan
2023-01-25 19:53 ` [PATCH 20/32] drm/amdkfd: add debug trap enabled flag to tma Jonathan Kim
2023-01-25 19:53 ` [PATCH 21/32] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
2023-03-21 21:07   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 22/32] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
2023-01-25 19:53 ` [PATCH 23/32] drm/amdkfd: add debug wave launch override operation Jonathan Kim
2023-03-21 21:37   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 24/32] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
2023-03-21 21:42   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 25/32] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
2023-03-21 22:16   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 26/32] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
2023-03-22 21:38   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 27/32] drm/amdkfd: add debug set flags operation Jonathan Kim
2023-03-22 21:47   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 28/32] drm/amdkfd: add debug query event operation Jonathan Kim
2023-01-25 19:53 ` [PATCH 29/32] drm/amdkfd: add debug query exception info operation Jonathan Kim
2023-01-25 19:53 ` [PATCH 30/32] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
2023-03-22 21:52   ` Felix Kuehling
2023-01-25 19:54 ` [PATCH 31/32] drm/amdkfd: add debug device " Jonathan Kim
2023-03-22 21:54   ` Felix Kuehling
2023-01-25 19:54 ` [PATCH 32/32] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
2023-03-22 21:56   ` Felix Kuehling

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).