* [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface
@ 2022-10-31 16:23 Jonathan Kim
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Introduce the GPU debug operations interface.

For ROCm-GDB to extend the GNU Debugger's ability to inspect the AMD GPU
instruction set, provide the necessary interface to allow the debugger
to set HW debug mode and to query exceptions per HSA queue, process or
device.

The runtime_enable interface coordinates exception handling with the
HSA runtime.
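
As a rough, hypothetical sketch (not part of this patch), the runtime side
of that coordination could invoke the new ioctl as follows; kfd_fd and
rocr_rdebug are placeholder names:

    struct kfd_ioctl_runtime_enable_args r_args = {0};

    /* r_debug points at a runtime-owned struct shared with the debugger */
    r_args.r_debug = (uint64_t)(uintptr_t)&rocr_rdebug;
    r_args.mode_mask = KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK |
                       KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK;
    ioctl(kfd_fd, AMDKFD_IOC_RUNTIME_ENABLE, &r_args);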

Usage documentation is available in the kernel docs at uapi/linux/kfd_ioctl.h.
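
For illustration only, a debugger that has PTRACE-attached to the target
might enable a debug session along these lines (error handling elided;
kfd_fd and target_pid are placeholders):

    struct kfd_ioctl_dbg_trap_args args = {0};
    struct kfd_runtime_info rinfo = {0};
    int pipe_fds[2];

    pipe(pipe_fds);                    /* pollable notification fd for KFD */
    args.pid = target_pid;
    args.op = KFD_IOC_DBG_TRAP_ENABLE;
    args.enable.dbg_fd = pipe_fds[1];
    args.enable.exception_mask = KFD_EC_MASK(EC_QUEUE_WAVE_TRAP);
    args.enable.rinfo_ptr = (uint64_t)(uintptr_t)&rinfo;
    args.enable.rinfo_size = sizeof(rinfo);
    ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);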

v2: add more documentation on semantics and error returns.
expand kfd_dbg_device_info_entry with new fields.
update device_snapshot semantics to match queue snapshot semantics.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  48 ++
 include/uapi/linux/kfd_ioctl.h           | 655 ++++++++++++++++++++++-
 2 files changed, 702 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 5feaba6a77de..11a960c83fb2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2644,6 +2644,48 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
 	return ret;
 }
 
+static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
+{
+	return 0;
+}
+
+static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_dbg_trap_args *args = data;
+	int r = 0;
+
+	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Debugging does not support sched_policy %i", sched_policy);
+		return -EINVAL;
+	}
+
+	switch (args->op) {
+	case KFD_IOC_DBG_TRAP_ENABLE:
+	case KFD_IOC_DBG_TRAP_DISABLE:
+	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
+	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
+	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
+	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
+	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
+	case KFD_IOC_DBG_TRAP_SET_FLAGS:
+	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
+	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
+	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
+	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
+		pr_warn("Debugging not supported yet\n");
+		r = -EACCES;
+		break;
+	default:
+		pr_err("Invalid option: %i\n", args->op);
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
 			    .cmd_drv = 0, .name = #ioctl}
@@ -2753,6 +2795,12 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
 			kfd_ioctl_get_available_memory, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_RUNTIME_ENABLE,
+			kfd_ioctl_runtime_enable, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
+			kfd_ioctl_set_debug_trap, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 42b60198b6c5..bedf1b823f57 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -109,6 +109,28 @@ struct kfd_ioctl_get_available_memory_args {
 	__u32 pad;
 };
 
+struct kfd_dbg_device_info_entry {
+	__u64 exception_status;
+	__u64 lds_base;
+	__u64 lds_limit;
+	__u64 scratch_base;
+	__u64 scratch_limit;
+	__u64 gpuvm_base;
+	__u64 gpuvm_limit;
+	__u32 gpu_id;
+	__u32 location_id;
+	__u32 vendor_id;
+	__u32 device_id;
+	__u32 fw_version;
+	__u32 gfx_target_version;
+	__u32 simd_count;
+	__u32 max_waves_per_simd;
+	__u32 array_count;
+	__u32 simd_arrays_per_engine;
+	__u32 capability;
+	__u32 debug_prop;
+};
+
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
 #define KFD_IOC_CACHE_POLICY_COHERENT 0
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
@@ -766,6 +788,631 @@ struct kfd_ioctl_set_xnack_mode_args {
 	__s32 xnack_enabled;
 };
 
+/* Wave launch override modes */
+enum kfd_dbg_trap_override_mode {
+	KFD_DBG_TRAP_OVERRIDE_OR = 0,
+	KFD_DBG_TRAP_OVERRIDE_REPLACE = 1
+};
+
+/* Wave launch overrides */
+enum kfd_dbg_trap_mask {
+	KFD_DBG_TRAP_MASK_FP_INVALID = 1,
+	KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL = 2,
+	KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO = 4,
+	KFD_DBG_TRAP_MASK_FP_OVERFLOW = 8,
+	KFD_DBG_TRAP_MASK_FP_UNDERFLOW = 16,
+	KFD_DBG_TRAP_MASK_FP_INEXACT = 32,
+	KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO = 64,
+	KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH = 128,
+	KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION = 256
+};
+
+/* Wave launch modes */
+enum kfd_dbg_trap_wave_launch_mode {
+	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL = 0,
+	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT = 1,
+	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_KILL = 2,
+	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG = 3,
+	KFD_DBG_TRAP_WAVE_LAUNCH_MODE_STALL = 4
+};
+
+/* Address watch modes */
+enum kfd_dbg_trap_address_watch_mode {
+	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_READ = 0,
+	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_NONREAD = 1,
+	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ATOMIC = 2,
+	KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ALL = 3
+};
+
+/* Additional wave settings */
+enum kfd_dbg_trap_flags {
+	KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1,
+	KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP = 2
+};
+
+/* Trap exceptions */
+enum kfd_dbg_trap_exception_code {
+	EC_NONE = 0,
+	/* per queue */
+	EC_QUEUE_WAVE_ABORT = 1,
+	EC_QUEUE_WAVE_TRAP = 2,
+	EC_QUEUE_WAVE_MATH_ERROR = 3,
+	EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION = 4,
+	EC_QUEUE_WAVE_MEMORY_VIOLATION = 5,
+	EC_QUEUE_WAVE_APERTURE_VIOLATION = 6,
+	EC_QUEUE_PACKET_DISPATCH_DIM_INVALID = 16,
+	EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID = 17,
+	EC_QUEUE_PACKET_DISPATCH_CODE_INVALID = 18,
+	EC_QUEUE_PACKET_RESERVED = 19,
+	EC_QUEUE_PACKET_UNSUPPORTED = 20,
+	EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
+	EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
+	EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
+	EC_QUEUE_PREEMPTION_ERROR = 30,
+	EC_QUEUE_NEW = 31,
+	/* per device */
+	EC_DEVICE_QUEUE_DELETE = 32,
+	EC_DEVICE_MEMORY_VIOLATION = 33,
+	EC_DEVICE_RAS_ERROR = 34,
+	EC_DEVICE_FATAL_HALT = 35,
+	EC_DEVICE_NEW = 36,
+	/* per process */
+	EC_PROCESS_RUNTIME = 48,
+	EC_PROCESS_DEVICE_REMOVE = 49,
+	EC_MAX
+};
+
+/* Mask generated by ecode in kfd_dbg_trap_exception_code */
+#define KFD_EC_MASK(ecode)	(1ULL << (ecode - 1))
+
+/* Masks for exception code type checks below */
+#define KFD_EC_MASK_QUEUE	(KFD_EC_MASK(EC_QUEUE_WAVE_ABORT) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_TRAP) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_MATH_ERROR) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_MEMORY_VIOLATION) |	\
+				 KFD_EC_MASK(EC_QUEUE_WAVE_APERTURE_VIOLATION) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |	\
+				 KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED)	|	\
+				 KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR)	|	\
+				 KFD_EC_MASK(EC_QUEUE_NEW))
+#define KFD_EC_MASK_DEVICE	(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |		\
+				 KFD_EC_MASK(EC_DEVICE_RAS_ERROR) |		\
+				 KFD_EC_MASK(EC_DEVICE_FATAL_HALT) |		\
+				 KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION) |	\
+				 KFD_EC_MASK(EC_DEVICE_NEW))
+#define KFD_EC_MASK_PROCESS	(KFD_EC_MASK(EC_PROCESS_RUNTIME) |	\
+				 KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE))
+
+/* Checks for exception code types for KFD search */
+#define KFD_DBG_EC_TYPE_IS_QUEUE(ecode)					\
+			(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_QUEUE))
+#define KFD_DBG_EC_TYPE_IS_DEVICE(ecode)				\
+			(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_DEVICE))
+#define KFD_DBG_EC_TYPE_IS_PROCESS(ecode)				\
+			(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_PROCESS))
+
+
+/* Runtime enable states */
+enum kfd_dbg_runtime_state {
+	DEBUG_RUNTIME_STATE_DISABLED = 0,
+	DEBUG_RUNTIME_STATE_ENABLED = 1,
+	DEBUG_RUNTIME_STATE_ENABLED_BUSY = 2,
+	DEBUG_RUNTIME_STATE_ENABLED_ERROR = 3
+};
+
+/* Runtime enable status */
+struct kfd_runtime_info {
+	__u64 r_debug;
+	__u32 runtime_state;
+	__u32 ttmp_setup;
+};
+
+/* Enable modes for runtime enable */
+#define KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK	1
+#define KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK	2
+
+/**
+ * kfd_ioctl_runtime_enable_args - Arguments for runtime enable
+ *
+ * Coordinates debug exception signalling and debug device enablement with runtime.
+ *
+ * @r_debug - pointer to user struct for sharing information between ROCr and the debugger
+ * @mode_mask - mask to set mode
+ *	KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK - enable runtime for debugging, otherwise disable
+ *	KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK - enable trap temporary setup (ignore on disable)
+ *
+ * Return - 0 on SUCCESS.
+ *	  - EBUSY if runtime enable call already pending.
+ *	  - EEXIST if user queues already active prior to call.
+ *	    If process is debug enabled, runtime enable will enable debug devices and
+ *	    wait for debugger process to send runtime exception EC_PROCESS_RUNTIME
+ *	    to unblock - see kfd_ioctl_dbg_trap_args.
+ *
+ */
+struct kfd_ioctl_runtime_enable_args {
+	__u64 r_debug;
+	__u32 mode_mask;
+};
+
+/* Queue information */
+struct kfd_queue_snapshot_entry {
+	__u64 exception_status;
+	__u64 ring_base_address;
+	__u64 write_pointer_address;
+	__u64 read_pointer_address;
+	__u64 ctx_save_restore_address;
+	__u32 queue_id;
+	__u32 gpu_id;
+	__u32 ring_size;
+	__u32 queue_type;
+	__u32 ctx_save_restore_area_size;
+	__u32 reserved;
+};
+
+/* Queue status return for suspend/resume */
+#define KFD_DBG_QUEUE_ERROR_BIT		30
+#define KFD_DBG_QUEUE_INVALID_BIT	31
+#define KFD_DBG_QUEUE_ERROR_MASK	(1 << KFD_DBG_QUEUE_ERROR_BIT)
+#define KFD_DBG_QUEUE_INVALID_MASK	(1 << KFD_DBG_QUEUE_INVALID_BIT)
+
+/* Context save area header information */
+struct kfd_context_save_area_header {
+	__u32 control_stack_offset;
+	__u32 control_stack_size;
+	__u32 wave_state_offset;
+	__u32 wave_state_size;
+	__u32 debug_offset;
+	__u32 debug_size;
+	__u64 err_payload_addr;
+	__u32 err_event_id;
+	__u32 reserved1;
+};
+
+/*
+ * Debug operations
+ *
+ * For specifics on usage and return values, see documentation per operation
+ * below.  Otherwise, generic error returns apply:
+ * 	- ESRCH if the process to debug does not exist.
+ *
+ * 	- EINVAL (with KFD_IOC_DBG_TRAP_ENABLE exempt) if operation
+ * 	  KFD_IOC_DBG_TRAP_ENABLE has not succeeded prior.
+ *	  Also returns this error if GPU hardware scheduling is not supported.
+ *
+ * 	- EPERM (with KFD_IOC_DBG_TRAP_DISABLE exempt) if target process is not
+ *		 PTRACE_ATTACHED.  KFD_IOC_DBG_TRAP_DISABLE is exempt to allow
+ *		 clean up of debug mode as long as process is debug enabled.
+ *
+ * 	- EACCES if any DBG_HW_OP (debug hardware operation) is requested when
+ *		 AMDKFD_IOC_RUNTIME_ENABLE has not succeeded prior.
+ *
+ *	- ENODEV if any GPU does not support debugging on a DBG_HW_OP call.
+ *
+ *	- Other errors may be returned when a DBG_HW_OP occurs while the GPU
+ *	  is in a fatal state.
+ *
+ */
+enum kfd_dbg_trap_operations {
+	KFD_IOC_DBG_TRAP_ENABLE = 0,
+	KFD_IOC_DBG_TRAP_DISABLE = 1,
+	KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT = 2,
+	KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED = 3,
+	KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE = 4,  /* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE = 5,      /* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_SUSPEND_QUEUES = 6,		/* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_RESUME_QUEUES = 7,		/* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH = 8,	/* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH = 9,	/* DBG_HW_OP */
+	KFD_IOC_DBG_TRAP_SET_FLAGS = 10,
+	KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT = 11,
+	KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO = 12,
+	KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT = 13,
+	KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT = 14
+};
+
+/**
+ * kfd_ioctl_dbg_trap_enable_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_ENABLE.
+ *
+ *     Enables debug session for target process. Call @op KFD_IOC_DBG_TRAP_DISABLE in
+ *     kfd_ioctl_dbg_trap_args to disable debug session.
+ *
+ *     @exception_mask (IN)	- exceptions to raise to the debugger
+ *     @rinfo_ptr      (IN)	- pointer to runtime info buffer (see kfd_runtime_info)
+ *     @rinfo_size     (IN/OUT)	- size of runtime info buffer in bytes
+ *     @dbg_fd	       (IN)	- fd the KFD will use to notify the debugger of raised
+ *				  exceptions set in exception_mask.
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *		Copies KFD saved kfd_runtime_info to @rinfo_ptr on enable.
+ *		Size of the kfd_runtime_info saved by the KFD is returned in @rinfo_size.
+ *            - EBADF if KFD cannot get a reference to dbg_fd.
+ *            - EFAULT if KFD cannot copy runtime info to rinfo_ptr.
+ *            - EINVAL if target process is already debug enabled.
+ *
+ */
+struct kfd_ioctl_dbg_trap_enable_args {
+	__u64 exception_mask;
+	__u64 rinfo_ptr;
+	__u32 rinfo_size;
+	__u32 dbg_fd;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_send_runtime_event_args
+ *
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT.
+ *     Raises exceptions to runtime.
+ *
+ *     @exception_mask (IN) - exceptions to raise to runtime
+ *     @gpu_id	       (IN) - target device id
+ *     @queue_id       (IN) - target queue id
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *	      - ENODEV if gpu_id not found.
+ *		If exception_mask contains EC_PROCESS_RUNTIME, unblocks pending
+ *		AMDKFD_IOC_RUNTIME_ENABLE call - see kfd_ioctl_runtime_enable_args.
+ *		All other exceptions are raised to runtime through err_payload_addr.
+ *		See kfd_context_save_area_header.
+ */
+struct kfd_ioctl_dbg_trap_send_runtime_event_args {
+	__u64 exception_mask;
+	__u32 gpu_id;
+	__u32 queue_id;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_exceptions_enabled_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED
+ *     Set new exceptions to be raised to the debugger.
+ *
+ *     @exception_mask (IN) - new exceptions to raise to the debugger
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ */
+struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args {
+	__u64 exception_mask;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_wave_launch_override_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE
+ *     Enable HW exceptions to raise trap.
+ *
+ *     @override_mode	     (IN)     - see kfd_dbg_trap_override_mode
+ *     @enable_mask	     (IN/OUT) - reference kfd_dbg_trap_mask.
+ *					IN is the mask of override modes requested to be enabled.
+ *     					OUT is referenced in Return below.
+ *     @support_request_mask (IN/OUT) - reference kfd_dbg_trap_mask.
+ *     					IN is the mask of override modes requested for support check.
+ *     					OUT is referenced in Return below.
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *		Previous enablement is returned in @enable_mask.
+ *		Actual override support is returned in @support_request_mask.
+ *	      - EINVAL if override mode is not supported.
+ *	      - EACCES if the requested trap support is not actually supported,
+ *		i.e. enable_mask (IN) is not a subset of support_request_mask (OUT).
+ *		Otherwise generic errors apply (see kfd_dbg_trap_operations).
+ */
+struct kfd_ioctl_dbg_trap_set_wave_launch_override_args {
+	__u32 override_mode;
+	__u32 enable_mask;
+	__u32 support_request_mask;
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_wave_launch_mode_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE
+ *     Set wave launch mode.
+ *
+ *     @launch_mode (IN) - see kfd_dbg_trap_wave_launch_mode
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ */
+struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args {
+	__u32 launch_mode;
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_suspend_queues_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SUSPEND_QUEUES
+ *     Suspend queues.
+ *
+ *     @exception_mask	(IN) - raised exceptions to clear
+ *     @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
+ *			       to suspend
+ *     @num_queues	(IN) - number of queues to suspend in @queue_array_ptr
+ *     @grace_period	(IN) - wave time allowance before preemption
+ *     			       per 1K GPU clock cycle unit
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - Number of queues suspended on SUCCESS.
+ *		KFD_DBG_QUEUE_ERROR_MASK or KFD_DBG_QUEUE_INVALID_MASK is set in
+ *		each queue id in the @queue_array_ptr array that failed to suspend,
+ *		reporting the reason:
+ *		KFD_DBG_QUEUE_ERROR_MASK = HW failure.
+ *		KFD_DBG_QUEUE_INVALID_MASK = queue does not exist, is new or
+ *		is being destroyed.
+ *		Destruction of a suspended queue is blocked until the queue is
+ *		resumed.  This allows the debugger to access queue information and
+ *		its context save area without running into a race condition on
+ *		queue destruction.
+ *		Automatically copies per queue context save area header information
+ *		into the save area base
+ *		(see kfd_queue_snapshot_entry and kfd_context_save_area_header).
+ */
+struct kfd_ioctl_dbg_trap_suspend_queues_args {
+	__u64 exception_mask;
+	__u64 queue_array_ptr;
+	__u32 num_queues;
+	__u32 grace_period;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_resume_queues_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_RESUME_QUEUES
+ *     Resume queues.
+ *
+ *     @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
+ *			       to resume
+ *     @num_queues	(IN) - number of queues to resume in @queue_array_ptr
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - Number of queues resumed on SUCCESS.
+ *		KFD_DBG_QUEUE_ERROR_MASK or KFD_DBG_QUEUE_INVALID_MASK is set in
+ *		each queue id in the @queue_array_ptr array that failed to resume,
+ *		reporting the reason:
+ *		KFD_DBG_QUEUE_ERROR_MASK = HW failure.
+ *		KFD_DBG_QUEUE_INVALID_MASK = queue does not exist.
+ */
+struct kfd_ioctl_dbg_trap_resume_queues_args {
+	__u64 queue_array_ptr;
+	__u32 num_queues;
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_node_address_watch_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH
+ *     Sets address watch for device.
+ *
+ *     @address	(IN)  - watch address to set
+ *     @mode    (IN)  - see kfd_dbg_trap_address_watch_mode
+ *     @mask    (IN)  - watch address mask
+ *     @gpu_id  (IN)  - target gpu to set watch point
+ *     @id      (OUT) - watch id allocated
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *		Allocated watch ID returned to @id.
+ *	      - ENODEV if gpu_id not found.
+ *	      - ENOMEM if no watch IDs can be allocated.
+ */
+struct kfd_ioctl_dbg_trap_set_node_address_watch_args {
+	__u64 address;
+	__u32 mode;
+	__u32 mask;
+	__u32 gpu_id;
+	__u32 id;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_clear_node_address_watch_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH
+ *     Clear address watch for device.
+ *
+ *     @gpu_id  (IN)  - target device to clear watch point
+ *     @id      (IN) - allocated watch id to clear
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *	      - ENODEV if gpu_id not found.
+ *     	      - EINVAL if watch ID has not been allocated.
+ */
+struct kfd_ioctl_dbg_trap_clear_node_address_watch_args {
+	__u32 gpu_id;
+	__u32 id;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_set_flags_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_SET_FLAGS
+ *     Sets flags for wave behaviour.
+ *
+ *     @flags (IN/OUT) - IN = flags to enable, OUT = flags previously enabled
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *	      - EACCES if any debug device does not allow flag options.
+ */
+struct kfd_ioctl_dbg_trap_set_flags_args {
+	__u32 flags;
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_query_debug_event_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT
+ *
+ *     Find one or more raised exceptions. This function can return multiple
+ *     exceptions from a single queue or a single device with one call. To find
+ *     all raised exceptions, this function must be called repeatedly until it
+ *     returns -EAGAIN. Returned exceptions can optionally be cleared by
+ *     setting the corresponding bit in the @exception_mask input parameter.
+ *     However, clearing an exception prevents retrieving further information
+ *     about it with KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO.
+ *
+ *     @exception_mask (IN/OUT) - exceptions to clear (IN) and exceptions raised (OUT)
+ *     @gpu_id	       (OUT)    - gpu id of exceptions raised
+ *     @queue_id       (OUT)    - queue id of exceptions raised
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 if a raised exception is found.
+ *              Raised exceptions found are returned in @exception_mask
+ *              with the reporting source id returned in @gpu_id or @queue_id.
+ *            - EAGAIN if no raised exception has been found.
+ */
+struct kfd_ioctl_dbg_trap_query_debug_event_args {
+	__u64 exception_mask;
+	__u32 gpu_id;
+	__u32 queue_id;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_query_exception_info_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO
+ *     Get additional info on raised exception.
+ *
+ *     @info_ptr	(IN)	 - pointer to exception info buffer to copy to
+ *     @info_size	(IN/OUT) - exception info buffer size (bytes)
+ *     @source_id	(IN)     - target gpu or queue id
+ *     @exception_code	(IN)     - target exception
+ *     @clear_exception	(IN)     - clear raised @exception_code exception
+ *				   (0 = false, 1 = true)
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *              If @exception_code is EC_DEVICE_MEMORY_VIOLATION, copy @info_size(OUT)
+ *		bytes of memory exception data to @info_ptr.
+ *              If @exception_code is EC_PROCESS_RUNTIME, copy saved
+ *              kfd_runtime_info to @info_ptr.
+ *              Actual required @info_ptr size (bytes) is returned in @info_size.
+ */
+struct kfd_ioctl_dbg_trap_query_exception_info_args {
+	__u64 info_ptr;
+	__u32 info_size;
+	__u32 source_id;
+	__u32 exception_code;
+	__u32 clear_exception;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_get_queue_snapshot_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT
+ *     Get queue information.
+ *
+ *     @exception_mask	 (IN)	  - exceptions raised to clear
+ *     @snapshot_buf_ptr (IN)	  - queue snapshot entry buffer (see kfd_queue_snapshot_entry)
+ *     @num_queues	 (IN/OUT) - number of queue snapshot entries
+ *         The debugger specifies the size of the array allocated in @num_queues.
+ *         KFD returns the number of queues that actually existed. If this is
+ *         larger than the size specified by the debugger, KFD will not overflow
+ *         the array allocated by the debugger.
+ *
+ *     @entry_size	 (IN/OUT) - size per entry in bytes
+ *         The debugger specifies sizeof(struct kfd_queue_snapshot_entry) in
+ *         @entry_size. KFD returns the number of bytes actually populated per
+ *         entry. The debugger should use the KFD_IOCTL_MINOR_VERSION to determine
+ *         which fields in struct kfd_queue_snapshot_entry are valid. This allows
+ *         growing the ABI in a backwards compatible manner.
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *              Copies @num_queues(IN) queue snapshot entries of size @entry_size(IN)
+ *              into @snapshot_buf_ptr if @num_queues(IN) > 0.
+ *              Otherwise, the number of queue snapshot entries that exist is
+ *              returned in @num_queues(OUT).
+ */
+struct kfd_ioctl_dbg_trap_queue_snapshot_args {
+	__u64 exception_mask;
+	__u64 snapshot_buf_ptr;
+	__u32 num_queues;
+	__u32 entry_size;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_get_device_snapshot_args
+ *
+ *     Arguments for KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT
+ *     Get device information.
+ *
+ *     @exception_mask	 (IN)	  - exceptions raised to clear
+ *     @snapshot_buf_ptr (IN)	  - pointer to snapshot buffer (see kfd_dbg_device_info_entry)
+ *     @num_devices	 (IN/OUT) - number of debug devices to snapshot
+ *         The debugger specifies the size of the array allocated in @num_devices.
+ *         KFD returns the number of devices that actually existed. If this is
+ *         larger than the size specified by the debugger, KFD will not overflow
+ *         the array allocated by the debugger.
+ *
+ *     @entry_size	 (IN/OUT) - size per entry in bytes
+ *         The debugger specifies sizeof(struct kfd_dbg_device_info_entry) in
+ *         @entry_size. KFD returns the number of bytes actually populated. The
+ *         debugger should use KFD_IOCTL_MINOR_VERSION to determine which fields
+ *         in struct kfd_dbg_device_info_entry are valid. This allows growing the
+ *         ABI in a backwards compatible manner.
+ *
+ *     Generic errors apply (see kfd_dbg_trap_operations).
+ *     Return - 0 on SUCCESS.
+ *              Copies @num_devices(IN) device snapshot entries of size @entry_size(IN)
+ *              into @snapshot_buf_ptr if @num_devices(IN) > 0.
+ *              Otherwise, the number of device snapshot entries that exist is
+ *              returned in @num_devices(OUT).
+ */
+struct kfd_ioctl_dbg_trap_device_snapshot_args {
+	__u64 exception_mask;
+	__u64 snapshot_buf_ptr;
+	__u32 num_devices;
+	__u32 entry_size;
+};
+
+/**
+ * kfd_ioctl_dbg_trap_args
+ *
+ * Arguments to debug target process.
+ *
+ *     @pid - target process to debug
+ *     @op  - debug operation (see kfd_dbg_trap_operations)
+ *
+ *     @op determines which union struct args to use.
+ *     Refer to kern docs for each kfd_ioctl_dbg_trap_*_args struct.
+ */
+struct kfd_ioctl_dbg_trap_args {
+	__u32 pid;
+	__u32 op;
+
+	union {
+		struct kfd_ioctl_dbg_trap_enable_args enable;
+		struct kfd_ioctl_dbg_trap_send_runtime_event_args send_runtime_event;
+		struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args set_exceptions_enabled;
+		struct kfd_ioctl_dbg_trap_set_wave_launch_override_args launch_override;
+		struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args launch_mode;
+		struct kfd_ioctl_dbg_trap_suspend_queues_args suspend_queues;
+		struct kfd_ioctl_dbg_trap_resume_queues_args resume_queues;
+		struct kfd_ioctl_dbg_trap_set_node_address_watch_args set_node_address_watch;
+		struct kfd_ioctl_dbg_trap_clear_node_address_watch_args clear_node_address_watch;
+		struct kfd_ioctl_dbg_trap_set_flags_args set_flags;
+		struct kfd_ioctl_dbg_trap_query_debug_event_args query_debug_event;
+		struct kfd_ioctl_dbg_trap_query_exception_info_args query_exception_info;
+		struct kfd_ioctl_dbg_trap_queue_snapshot_args queue_snapshot;
+		struct kfd_ioctl_dbg_trap_device_snapshot_args device_snapshot;
+	};
+};
+
 #define AMDKFD_IOCTL_BASE 'K'
 #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
 #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
@@ -877,7 +1524,13 @@ struct kfd_ioctl_set_xnack_mode_args {
 #define AMDKFD_IOC_AVAILABLE_MEMORY		\
 		AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args)
 
+#define AMDKFD_IOC_RUNTIME_ENABLE		\
+		AMDKFD_IOWR(0x24, struct kfd_ioctl_runtime_enable_args)
+
+#define AMDKFD_IOC_DBG_TRAP			\
+		AMDKFD_IOWR(0x25, struct kfd_ioctl_dbg_trap_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x24
+#define AMDKFD_COMMAND_END		0x26
 
 #endif
-- 
2.25.1


* [PATCH 02/29] drm/amdkfd: display debug capabilities
@ 2022-10-31 16:23 Jonathan Kim
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Expose debug capabilities in the KFD topology node's HSA capabilities and
debug properties flags.

Ensure correct capabilities are exposed based on firmware support.

Flag definitions can be referenced in uapi/linux/kfd_sysfs.h.
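
As an illustrative sketch (not part of this patch), user space could decode
the new debug_prop value after reading it from the sysfs node properties
file, e.g. /sys/class/kfd/kfd/topology/nodes/<n>/properties:

    /* debug_prop holds the 64-bit value parsed from the properties file */
    unsigned int lo_bit = (debug_prop & HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK)
                          >> HSA_DBG_WATCH_ADDR_MASK_LO_BIT_SHIFT;
    unsigned int hi_bit = (debug_prop & HSA_DBG_WATCH_ADDR_MASK_HI_BIT_MASK)
                          >> HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT;
    bool dispatch_info_valid = debug_prop & HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;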

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 88 +++++++++++++++++++++--
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h |  6 ++
 include/uapi/linux/kfd_sysfs.h            | 15 ++++
 3 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3f0a4a415907..cd5933a594de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -551,6 +551,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 				      dev->gpu->mec_fw_version);
 		sysfs_show_32bit_prop(buffer, offs, "capability",
 				      dev->node_props.capability);
+		sysfs_show_64bit_prop(buffer, offs, "debug_prop",
+				      dev->node_props.debug_prop);
 		sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version",
 				      dev->gpu->sdma_fw_version);
 		sysfs_show_64bit_prop(buffer, offs, "unique_id",
@@ -1593,6 +1595,84 @@ static int kfd_dev_create_p2p_links(void)
 	return ret;
 }
 
+static void kfd_topology_set_dbg_firmware_support(struct kfd_topology_device *dev)
+{
+	bool firmware_supported = true;
+
+	/*
+	 * Note: Any unlisted devices here are assumed to support exception handling.
+	 * Add additional checks here as needed.
+	 */
+	switch (KFD_GC_VERSION(dev->gpu)) {
+	case IP_VERSION(9, 0, 1): /* Vega10 */
+		firmware_supported = dev->gpu->mec_fw_version >= 459 + 32768;
+		break;
+	case IP_VERSION(9, 1, 0): /* Raven */
+	case IP_VERSION(9, 2, 1): /* Vega12 */
+	case IP_VERSION(9, 2, 2): /* Raven */
+	case IP_VERSION(9, 3, 0): /* Renoir */
+	case IP_VERSION(9, 4, 0): /* Vega20 */
+		firmware_supported = dev->gpu->mec_fw_version >= 459;
+		break;
+	case IP_VERSION(9, 4, 1): /* Arcturus */
+		firmware_supported = dev->gpu->mec_fw_version >= 60;
+		break;
+	case IP_VERSION(9, 4, 2): /* Aldebaran */
+		firmware_supported = dev->gpu->mec_fw_version >= 51;
+		break;
+	case IP_VERSION(10, 1, 10): /* Navi10 */
+	case IP_VERSION(10, 1, 2): /* Navi12 */
+	case IP_VERSION(10, 1, 1): /* Navi14 */
+		firmware_supported = dev->gpu->mec_fw_version >= 144;
+		break;
+	case IP_VERSION(10, 3, 0): /* Sienna Cichlid */
+	case IP_VERSION(10, 3, 2): /* Navy Flounder */
+	case IP_VERSION(10, 3, 1): /* Van Gogh */
+	case IP_VERSION(10, 3, 4): /* Dimgrey Cavefish */
+	case IP_VERSION(10, 3, 5): /* Beige Goby */
+		firmware_supported = dev->gpu->mec_fw_version >= 89;
+		break;
+	case IP_VERSION(10, 1, 3): /* Cyan Skillfish */
+	case IP_VERSION(10, 3, 3): /* Yellow Carp */
+		firmware_supported = false;
+		break;
+	default:
+		break;
+	}
+
+	if (firmware_supported)
+		dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED;
+}
+
+static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
+{
+	dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
+				HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+				HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+
+	dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_SUPPORT |
+			HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED |
+			HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED;
+
+	if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) {
+		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 |
+						HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
+
+		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 4, 2))
+			dev->node_props.debug_prop |=
+				HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
+		else
+			dev->node_props.capability |=
+				HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
+	} else {
+		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 |
+					HSA_DBG_WATCH_ADDR_MASK_HI_BIT |
+					HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
+	}
+
+	kfd_topology_set_dbg_firmware_support(dev);
+}
+
 int kfd_topology_add_device(struct kfd_dev *gpu)
 {
 	uint32_t gpu_id;
@@ -1737,13 +1817,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
 		break;
 	default:
-		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 0, 1))
-			dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
-				HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
-				HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
-		else
+		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 0, 1))
 			WARN(1, "Unexpected ASIC family %u",
 			     dev->gpu->adev->asic_type);
+		else
+			kfd_topology_set_capabilities(dev);
 	}
 
 	/*
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 9f6c949186c1..c089c26a0e77 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -31,6 +31,11 @@
 
 #define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 32
 
+#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9	6
+#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10	7
+#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT  \
+			(29 << HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT)
+
 struct kfd_node_properties {
 	uint64_t hive_id;
 	uint32_t cpu_cores_count;
@@ -42,6 +47,7 @@ struct kfd_node_properties {
 	uint32_t cpu_core_id_base;
 	uint32_t simd_id_base;
 	uint32_t capability;
+	uint64_t debug_prop;
 	uint32_t max_waves_per_simd;
 	uint32_t lds_size_in_kb;
 	uint32_t gds_size_in_kb;
diff --git a/include/uapi/linux/kfd_sysfs.h b/include/uapi/linux/kfd_sysfs.h
index 3e330f368917..a51b7331e0b4 100644
--- a/include/uapi/linux/kfd_sysfs.h
+++ b/include/uapi/linux/kfd_sysfs.h
@@ -43,6 +43,11 @@
 #define HSA_CAP_DOORBELL_TYPE_2_0		0x2
 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP		0x00004000
 
+#define HSA_CAP_TRAP_DEBUG_SUPPORT              0x00008000
+#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED  0x00010000
+#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED           0x00020000
+#define HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED  0x00040000
+
 /* Old buggy user mode depends on this being 0 */
 #define HSA_CAP_RESERVED_WAS_SRAM_EDCSUPPORTED	0x00080000
 
@@ -53,8 +58,18 @@
 #define HSA_CAP_SRAM_EDCSUPPORTED		0x04000000
 #define HSA_CAP_SVMAPI_SUPPORTED		0x08000000
 #define HSA_CAP_FLAGS_COHERENTHOSTACCESS	0x10000000
+#define HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED   0x20000000
 #define HSA_CAP_RESERVED			0xe00f8000
 
+/* debug_prop bits in node properties */
+#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK     0x0000000f
+#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_SHIFT    0
+#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_MASK     0x000003f0
+#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT    4
+#define HSA_DBG_DISPATCH_INFO_ALWAYS_VALID      0x00000400
+#define HSA_DBG_WATCHPOINTS_EXCLUSIVE           0x00000800
+#define HSA_DBG_RESERVED                0xfffffffffffff000ull
+
 /* Heap types in memory properties */
 #define HSA_MEM_HEAP_TYPE_SYSTEM	0
 #define HSA_MEM_HEAP_TYPE_FB_PUBLIC	1
-- 
2.25.1


* [PATCH 03/29] drm/amdkfd: prepare per-process debug enable and disable
@ 2022-10-31 16:23 Jonathan Kim
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

The ROCm debugger will attach via PTRACE to the process to be debugged
and will expect the KFD to prepare a process for the target PID, whether
the target PID has opened the KFD device or not.

This patch explicitly handles this requirement.  Further HW mode
setting and runtime coordination requirements will be handled in
following patches.

In the case where the target process has not opened the KFD device,
a new KFD process must be created for the target PID.
In this case, neither the debugger nor the target process will have
acquired any VMs, so process restoration is handled to correctly
account for this.

To coordinate with HSA runtime, the debugger must be aware of the target
process' runtime enablement status and will copy the runtime status
information into the debugged KFD process for later query.

On enablement, the debugger will subscribe to a set of exceptions where
each exception event will notify the debugger through a pollable FIFO
file descriptor that the debugger provides to the KFD to manage.
Some events will be raised synchronously while others are scheduled,
which is why a debug_event_workarea worker is initialized.

Finally, on process termination of either the debugger or the target,
debugging must be disabled if it has not been already.
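
For illustration only (not part of this patch), the debugger side could poll
the FIFO read end for the single-byte notifications that
debug_event_write_work_handler writes; fifo_read_fd is a placeholder:

    struct pollfd pfd = { .fd = fifo_read_fd, .events = POLLIN };
    char drain;

    while (poll(&pfd, 1, -1) > 0) {
        /* consume the '.' written by the KFD worker */
        read(fifo_read_fd, &drain, 1);
        /* then query raised exceptions, e.g. with
         * KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT
         */
    }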

v2: relax debug trap disable and PTRACE ATTACH requirement.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/Makefile           |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 88 +++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 91 +++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        | 33 +++++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 24 ++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 34 ++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 65 +++++++++----
 7 files changed, 309 insertions(+), 29 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index e758c2a24cd0..747754428073 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -55,7 +55,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_int_process_v9.o \
 		$(AMDKFD_PATH)/kfd_int_process_v11.o \
 		$(AMDKFD_PATH)/kfd_smi_events.o \
-		$(AMDKFD_PATH)/kfd_crat.o
+		$(AMDKFD_PATH)/kfd_crat.o \
+		$(AMDKFD_PATH)/kfd_debug.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 11a960c83fb2..d550dbe570fb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -44,6 +44,7 @@
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
 #include "amdgpu_dma_buf.h"
+#include "kfd_debug.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file *filep)
 		return -EPERM;
 	}
 
-	process = kfd_create_process(filep);
+	process = kfd_create_process(current);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 
+	if (kfd_process_init_cwsr_apu(process, filep)) {
+		kfd_unref_process(process);
+		return -EFAULT;
+	}
+
 	if (kfd_is_locked()) {
 		dev_dbg(kfd_device, "kfd is locked!\n"
 				"process %d unreferenced", process->pasid);
@@ -2652,6 +2658,9 @@ static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, v
 static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_dbg_trap_args *args = data;
+	struct task_struct *thread = NULL;
+	struct pid *pid = NULL;
+	struct kfd_process *target = NULL;
 	int r = 0;
 
 	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
@@ -2659,9 +2668,71 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		return -EINVAL;
 	}
 
+	pid = find_get_pid(args->pid);
+	if (!pid) {
+		pr_debug("Cannot find pid info for %i\n", args->pid);
+		r = -ESRCH;
+		goto out;
+	}
+
+	thread = get_pid_task(pid, PIDTYPE_PID);
+
+	if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
+		bool create_process;
+
+		rcu_read_lock();
+		create_process = thread && thread != current && ptrace_parent(thread) == current;
+		rcu_read_unlock();
+
+		target = create_process ? kfd_create_process(thread) :
+					kfd_lookup_process_by_pid(pid);
+	} else {
+		target = kfd_lookup_process_by_pid(pid);
+	}
+
+	if (!target) {
+		pr_debug("Cannot find process PID %i to debug\n", args->pid);
+		r = -ESRCH;
+		goto out;
+	}
+
+	/* Check if target is still PTRACED. */
+	rcu_read_lock();
+	if (target != p && args->op == KFD_IOC_DBG_TRAP_DISABLE
+				&& ptrace_parent(target->lead_thread) != current) {
+		pr_err("PID %i is not PTRACED and cannot be debugged\n", args->pid);
+		r = -EPERM;
+	}
+	rcu_read_unlock();
+
+	if (r)
+		goto out;
+
+	mutex_lock(&target->mutex);
+
+	if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target->debug_trap_enabled) {
+		pr_err("PID %i not debug enabled for op %i\n", args->pid, args->op);
+		r = -EINVAL;
+		goto unlock_out;
+	}
+
 	switch (args->op) {
 	case KFD_IOC_DBG_TRAP_ENABLE:
+		if (target != p)
+			target->debugger_process = p;
+
+		r = kfd_dbg_trap_enable(target,
+					args->enable.dbg_fd,
+					(void __user *)args->enable.rinfo_ptr,
+					&args->enable.rinfo_size);
+		if (!r)
+			target->exception_enable_mask = args->enable.exception_mask;
+
+		pr_warn("Debug functions limited\n");
+		break;
 	case KFD_IOC_DBG_TRAP_DISABLE:
+		r = kfd_dbg_trap_disable(target);
+		break;
 	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
 	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
@@ -2675,7 +2746,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
-		pr_warn("Debugging not supported yet\n");
+		pr_warn("Debug op %i not supported yet\n", args->op);
 		r = -EACCES;
 		break;
 	default:
@@ -2683,6 +2754,19 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		r = -EINVAL;
 	}
 
+unlock_out:
+	mutex_unlock(&target->mutex);
+
+out:
+	if (thread)
+		put_task_struct(thread);
+
+	if (pid)
+		put_pid(pid);
+
+	if (target)
+		kfd_unref_process(target);
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
new file mode 100644
index 000000000000..f967f89903f7
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfd_debug.h"
+#include <linux/file.h>
+
+void debug_event_write_work_handler(struct work_struct *work)
+{
+	struct kfd_process *process;
+
+	static const char write_data = '.';
+	loff_t pos = 0;
+
+	process = container_of(work,
+			struct kfd_process,
+			debug_event_workarea);
+
+	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
+}
+
+int kfd_dbg_trap_disable(struct kfd_process *target)
+{
+	fput(target->dbg_ev_file);
+	target->dbg_ev_file = NULL;
+
+	if (target->debugger_process) {
+		atomic_dec(&target->debugger_process->debugged_process_count);
+		target->debugger_process = NULL;
+	}
+
+	target->debug_trap_enabled = false;
+	kfd_unref_process(target);
+
+	return 0;
+}
+
+int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
+			void __user *runtime_info, uint32_t *runtime_size)
+{
+	struct file *f;
+	uint32_t copy_size;
+	int r = 0;
+
+	if (target->debug_trap_enabled)
+		return -EINVAL;
+
+	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
+
+	f = fget(fd);
+	if (!f) {
+		pr_err("Failed to get file for (%i)\n", fd);
+		return -EBADF;
+	}
+
+	target->dbg_ev_file = f;
+
+	/* We already hold the process reference but hold another one for the
+	 * debug session.
+	 */
+	kref_get(&target->ref);
+	target->debug_trap_enabled = true;
+
+	if (target->debugger_process)
+		atomic_inc(&target->debugger_process->debugged_process_count);
+
+	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
+		r = -EFAULT;
+
+	*runtime_size = sizeof(target->runtime_info);
+
+	return r;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
new file mode 100644
index 000000000000..b2217eb1399c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
+#define KFD_DEBUG_EVENTS_H_INCLUDED
+
+#include "kfd_priv.h"
+
+int kfd_dbg_trap_disable(struct kfd_process *target);
+int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
+			void __user *runtime_info,
+			uint32_t *runtime_info_size);
+void debug_event_write_work_handler(struct work_struct *work);
+#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index ecb4c3abc629..faa5d8c666ee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	pdd = qpd_to_pdd(qpd);
+
+	/* The debugger creates processes that temporarily have not acquired
+	 * all VMs for all devices and has no VMs itself.
+	 * Skip queue eviction on process eviction.
+	 */
+	if (!pdd->drm_priv)
+		goto out;
+
 	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
@@ -1100,13 +1108,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 {
 	struct queue *q;
 	struct kfd_process_device *pdd;
-	uint64_t pd_base;
 	uint64_t eviction_duration;
 	int retval = 0;
 
 	pdd = qpd_to_pdd(qpd);
-	/* Retrieve PD base */
-	pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
 
 	dqm_lock(dqm);
 	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
@@ -1116,12 +1121,19 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 	}
 
+	/* The debugger creates processes that temporarily have not acquired
+	 * all VMs for all devices and has no VMs itself.
+	 * Skip queue restore on process restore.
+	 */
+	if (!pdd->drm_priv)
+		goto vm_not_acquired;
+
 	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Update PD Base in QPD */
-	qpd->page_table_base = pd_base;
-	pr_debug("Updated PD address to 0x%llx\n", pd_base);
+	qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
+	pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
 
 	/* activate all active queues on the qpd */
 	list_for_each_entry(q, &qpd->queues_list, list) {
@@ -1147,6 +1159,8 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 	qpd->evicted = 0;
 	eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
 	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
+vm_not_acquired:
+	qpd->evicted = 0;
 out:
 	dqm_unlock(dqm);
 	return retval;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index bf610e3b683b..3ea61fa1db52 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -886,19 +886,48 @@ struct kfd_process {
 	 */
 	unsigned long last_restore_timestamp;
 
+	/* Indicates device process is debug attached with reserved vmid. */
+	bool debug_trap_enabled;
+
+	/* per-process-per device debug event fd file */
+	struct file *dbg_ev_file;
+
+	/* If the process is a kfd debugger, we need to know so we can clean
+	 * up at exit time.  If a process enables debugging on itself, it does
+	 * its own clean-up, so we don't set the flag here.  We track this by
+	 * counting the number of processes this process is debugging.
+	 */
+	atomic_t debugged_process_count;
+
+	/* If the process is being debugged, this is the debugger process */
+	struct kfd_process *debugger_process;
+
 	/* Kobj for our procfs */
 	struct kobject *kobj;
 	struct kobject *kobj_queues;
 	struct attribute attr_pasid;
 
+	/* Keep track of cwsr init */
+	bool has_cwsr;
+
+	/* Exception code enable mask and status */
+	uint64_t exception_enable_mask;
+
 	/* shared virtual memory registered by this process */
 	struct svm_range_list svms;
 
 	bool xnack_enabled;
 
+	/* Work area for debugger event writer worker. */
+	struct work_struct debug_event_workarea;
+
 	atomic_t poison;
 	/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
 	bool queues_paused;
+
+	/* Tracks runtime enable status */
+	struct kfd_runtime_info runtime_info;
+
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
@@ -928,7 +957,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
 
 int kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
-struct kfd_process *kfd_create_process(struct file *filep);
+struct kfd_process *kfd_create_process(struct task_struct *thread);
 struct kfd_process *kfd_get_process(const struct task_struct *task);
 struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
 struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
@@ -1055,6 +1084,9 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tba_addr,
 				  uint64_t tma_addr);
 
+/* CWSR initialization */
+int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
+
 /* CRIU */
 /*
  * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 951b63677248..56ad38fcd26e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -44,6 +44,7 @@ struct mm_struct;
 #include "kfd_iommu.h"
 #include "kfd_svm.h"
 #include "kfd_smi_events.h"
+#include "kfd_debug.h"
 
 /*
  * List of struct kfd_process (field kfd_process).
@@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct task_struct *thread,
 					bool ref);
 static void kfd_process_ref_release(struct kref *ref);
 static struct kfd_process *create_process(const struct task_struct *thread);
-static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
 
 static void evict_process_worker(struct work_struct *work);
 static void restore_process_worker(struct work_struct *work);
@@ -798,18 +798,19 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
 	kfd_process_free_gpuvm(qpd->ib_mem, pdd, qpd->ib_kaddr);
 }
 
-struct kfd_process *kfd_create_process(struct file *filep)
+struct kfd_process *kfd_create_process(struct task_struct *thread)
 {
 	struct kfd_process *process;
-	struct task_struct *thread = current;
 	int ret;
 
-	if (!thread->mm)
+	if (!(thread->mm && mmget_not_zero(thread->mm)))
 		return ERR_PTR(-EINVAL);
 
 	/* Only the pthreads threading model is supported. */
-	if (thread->group_leader->mm != thread->mm)
+	if (thread->group_leader->mm != thread->mm) {
+		mmput(thread->mm);
 		return ERR_PTR(-EINVAL);
+	}
 
 	/*
 	 * take kfd processes mutex before starting of process creation
@@ -827,10 +828,6 @@ struct kfd_process *kfd_create_process(struct file *filep)
 		if (IS_ERR(process))
 			goto out;
 
-		ret = kfd_process_init_cwsr_apu(process, filep);
-		if (ret)
-			goto out_destroy;
-
 		if (!procfs.kobj)
 			goto out;
 
@@ -864,16 +861,9 @@ struct kfd_process *kfd_create_process(struct file *filep)
 	if (!IS_ERR(process))
 		kref_get(&process->ref);
 	mutex_unlock(&kfd_processes_mutex);
+	mmput(thread->mm);
 
 	return process;
-
-out_destroy:
-	hash_del_rcu(&process->kfd_processes);
-	mutex_unlock(&kfd_processes_mutex);
-	synchronize_srcu(&kfd_processes_srcu);
-	/* kfd_process_free_notifier will trigger the cleanup */
-	mmu_notifier_put(&process->mmu_notifier);
-	return ERR_PTR(ret);
 }
 
 struct kfd_process *kfd_get_process(const struct task_struct *thread)
@@ -1188,6 +1178,28 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 	cancel_delayed_work_sync(&p->eviction_work);
 	cancel_delayed_work_sync(&p->restore_work);
 
+	if (p->debug_trap_enabled)
+		kfd_dbg_trap_disable(p);
+
+	if (atomic_read(&p->debugged_process_count) > 0) {
+		struct kfd_process *target;
+		unsigned int temp;
+		int idx = srcu_read_lock(&kfd_processes_srcu);
+
+		hash_for_each_rcu(kfd_processes_table, temp, target, kfd_processes) {
+			if (target->debugger_process && target->debugger_process == p) {
+				mutex_lock_nested(&target->mutex, 1);
+				if (target->debug_trap_enabled)
+					kfd_dbg_trap_disable(target);
+				mutex_unlock(&target->mutex);
+				if (atomic_read(&p->debugged_process_count) == 0)
+					break;
+			}
+		}
+
+		srcu_read_unlock(&kfd_processes_srcu, idx);
+	}
+
 	/* Indicate to other users that MM is no longer valid */
 	p->mm = NULL;
 
@@ -1200,11 +1212,14 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
 	.free_notifier = kfd_process_free_notifier,
 };
 
-static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
+int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
 {
 	unsigned long  offset;
 	int i;
 
+	if (p->has_cwsr)
+		return 0;
+
 	for (i = 0; i < p->n_pdds; i++) {
 		struct kfd_dev *dev = p->pdds[i]->dev;
 		struct qcm_process_device *qpd = &p->pdds[i]->qpd;
@@ -1233,6 +1248,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
 			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
 	}
 
+	p->has_cwsr = true;
+
 	return 0;
 }
 
@@ -1375,6 +1392,10 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	if (err)
 		goto err_event_init;
 	process->is_32bit_user_mode = in_compat_syscall();
+	process->debug_trap_enabled = false;
+	process->debugger_process = NULL;
+	process->exception_enable_mask = 0;
+	atomic_set(&process->debugged_process_count, 0);
 
 	process->pasid = kfd_pasid_alloc();
 	if (process->pasid == 0) {
@@ -1422,6 +1443,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	kfd_unref_process(process);
 	get_task_struct(process->lead_thread);
 
+	INIT_WORK(&process->debug_event_workarea, debug_event_write_work_handler);
+
 	return process;
 
 err_register_notifier:
@@ -1894,8 +1917,10 @@ static void restore_process_worker(struct work_struct *work)
 	 */
 
 	p->last_restore_timestamp = get_jiffies_64();
-	ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
-						     &p->ef);
+	/* VMs may not have been acquired yet during debugging. */
+	ret = p->kgd_process_info ?
+		amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
+							&p->ef) : 0;
 	if (ret) {
 		pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
 			 p->pasid, PROCESS_BACK_OFF_TIME_MS);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 04/29] drm/amdgpu: add kgd hw debug mode setting interface
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
  2022-10-31 16:23 ` [PATCH 02/29] drm/amdkfd: display debug capabilities Jonathan Kim
  2022-10-31 16:23 ` [PATCH 03/29] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-12-01  0:08   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
                   ` (25 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Introduce the required KGD debug calls that will execute hardware debug
mode setting.
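
Illustrative only, not part of this patch: once an ASIC wires these hooks
into its kfd2kgd_calls table, a KFD-side caller could toggle HW debug mode
roughly as sketched below.  The wrapper name is an assumption made for
illustration; only the two hook members come from this patch.

    /* hypothetical wrapper around the new kfd2kgd debug hooks */
    static uint32_t kfd_dbg_set_hw_debug_mode(struct amdgpu_device *adev,
                                              const struct kfd2kgd_calls *f,
                                              uint32_t vmid, bool enable)
    {
            if (enable)
                    /* restore_dbg_registers = false: fresh debug session */
                    return f->enable_debug_trap(adev, false, vmid);

            /* keep_trap_enabled = false: fully detach from the VMID */
            return f->disable_debug_trap(adev, false, vmid);
    }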

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../gpu/drm/amd/include/kgd_kfd_interface.h   | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 5cb3e8634739..15e7a5c920a0 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -289,6 +289,40 @@ struct kfd2kgd_calls {
 			uint32_t vmid, uint64_t page_table_base);
 	uint32_t (*read_vmid_from_vmfault_reg)(struct amdgpu_device *adev);
 
+	uint32_t (*enable_debug_trap)(struct amdgpu_device *adev,
+					bool restore_dbg_registers,
+					uint32_t vmid);
+	uint32_t (*disable_debug_trap)(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid);
+	int (*validate_trap_override_request)(struct amdgpu_device *adev,
+					uint32_t trap_override,
+					uint32_t *trap_mask_supported);
+	uint32_t (*set_wave_launch_trap_override)(struct amdgpu_device *adev,
+					     uint32_t vmid,
+					     uint32_t trap_override,
+					     uint32_t trap_mask_bits,
+					     uint32_t trap_mask_request,
+					     uint32_t *trap_mask_prev,
+					     uint32_t kfd_dbg_trap_cntl_prev);
+	uint32_t (*set_wave_launch_mode)(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid);
+	uint32_t (*set_address_watch)(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid);
+	uint32_t (*clear_address_watch)(struct amdgpu_device *adev,
+			uint32_t watch_id);
+	void (*get_iq_wait_times)(struct amdgpu_device *adev,
+			uint32_t *wait_times);
+	void (*build_grace_period_packet_info)(struct amdgpu_device *adev,
+			uint32_t wait_times,
+			uint32_t grace_period,
+			uint32_t *reg_offset,
+			uint32_t *reg_data);
 	void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
 			int *wave_cnt, int *max_waves_per_cu);
 	void (*program_trap_handler_settings)(struct amdgpu_device *adev,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (2 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 04/29] drm/amdgpu: add kgd hw debug mode setting interface Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-22 23:38   ` Felix Kuehling
  2022-12-01  0:23   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 06/29] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
                   ` (24 subsequent siblings)
  28 siblings, 2 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Add the missing debug trap register references and initialize all debug
registers on boot by clearing the hardware exception overrides and the
wave allocation ID index.

For debug devices that only support single-process debugging, enable
trap temporary setup by default.

Debug devices that support multi-process debugging require trap
temporary setup to be disabled by default in order to preserve
microbenchmark performance in non-debug mode.

The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
waves onto a dispatch during compute context inspection.
In order to set this up correctly, set the special reserved CP bit by
default whenever the MQD is initialized.
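
As a worked example of the trap config mask set up by the new
*_debug_trap_config_init() helpers below (concrete values are assumptions
for illustration; on many amdgpu parts first_kfd_vmid is 8 and
AMDGPU_NUM_VMID is 16), the loop builds VMID_SEL = 0xff00, i.e. trap
configuration applies to the compute VMIDs 8-15 only:

    uint32_t trap_config_vmid_mask = 0;
    int i;

    for (i = 8; i < 16; i++)                    /* [first_vmid, last_vmid) */
            trap_config_vmid_mask |= 1 << i;    /* result: 0xff00 */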

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 26 +++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 30 ++++++++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
 .../include/asic_reg/gc/gc_10_1_0_offset.h    | 14 ++++
 .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++++++++++++++++++
 .../include/asic_reg/gc/gc_10_3_0_offset.h    | 10 +++
 .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
 8 files changed, 163 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index af94ac580d3e..d49aff0b4ba3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4904,6 +4904,29 @@ static u32 gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
 
 #define DEFAULT_SH_MEM_BASES	(0x6000)
 
+static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
+				uint32_t first_vmid,
+				uint32_t last_vmid)
+{
+	uint32_t data;
+	uint32_t trap_config_vmid_mask = 0;
+	int i;
+
+	/* Calculate trap config vmid mask */
+	for (i = first_vmid; i < last_vmid; i++)
+		trap_config_vmid_mask |= (1 << i);
+
+	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+			VMID_SEL, trap_config_vmid_mask);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+			TRAP_EN, 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
 {
 	int i;
@@ -4935,6 +4958,9 @@ static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
 		WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
 		WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
 	}
+
+	gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
+					AMDGPU_NUM_VMID);
 }
 
 static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 0320be4a5fc6..a0e5ad342f13 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2337,6 +2337,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
 	adev->gfx.config.num_rbs = hweight32(active_rbs);
 }
 
+static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
+				uint32_t first_vmid,
+				uint32_t last_vmid)
+{
+	uint32_t data;
+	uint32_t trap_config_vmid_mask = 0;
+	int i;
+
+	/* Calculate trap config vmid mask */
+	for (i = first_vmid; i < last_vmid; i++)
+		trap_config_vmid_mask |= (1 << i);
+
+	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+			VMID_SEL, trap_config_vmid_mask);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+			TRAP_EN, 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 #define DEFAULT_SH_MEM_BASES	(0x6000)
 static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
 {
@@ -4609,6 +4632,13 @@ static int gfx_v9_0_late_init(void *handle)
 	if (r)
 		return r;
 
+	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+		gfx_v9_4_2_debug_trap_config_init(adev,
+			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
+	else
+		gfx_v9_0_debug_trap_config_init(adev,
+			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
index d3e2b6a599a4..cb484ace17de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
@@ -117,6 +117,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
 			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
 
+	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
+	 * DISPATCH_PTR.  This is required for the kfd debugger
+	 */
+	m->cp_hqd_hq_scheduler0 = 1 << 14;
+
 	if (q->format == KFD_QUEUE_FORMAT_AQL) {
 		m->cp_hqd_aql_control =
 			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 0778e587a2d6..86f1cf090246 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -164,6 +164,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
 			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
 
+	/* Set cp_hqd_hq_status0 bit 14 to 1 to have the CP set up the
+	 * DISPATCH_PTR.  This is required for the kfd debugger
+	 */
+	m->cp_hqd_hq_status0 = 1 << 14;
+
 	if (q->format == KFD_QUEUE_FORMAT_AQL) {
 		m->cp_hqd_aql_control =
 			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
index 18d34bbceebe..7d384f86bd67 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
@@ -5190,6 +5190,20 @@
 #define mmSPI_WCL_PIPE_PERCENT_CS6_BASE_IDX                                                            0
 #define mmSPI_WCL_PIPE_PERCENT_CS7                                                                     0x1f70
 #define mmSPI_WCL_PIPE_PERCENT_CS7_BASE_IDX                                                            0
+#define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
+#define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
+#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
+#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
+#define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
+#define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
+#define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
+#define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
+#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
+#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
+#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
+#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
 #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
 #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
 #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
index 4127896ffcdf..08772ba845b0 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
@@ -19646,6 +19646,75 @@
 //SPI_WCL_PIPE_PERCENT_CS7
 #define SPI_WCL_PIPE_PERCENT_CS7__VALUE__SHIFT                                                                0x0
 #define SPI_WCL_PIPE_PERCENT_CS7__VALUE_MASK                                                                  0x7FL
+//SPI_GDBG_WAVE_CNTL
+#define SPI_GDBG_WAVE_CNTL__STALL_RA__SHIFT                                                                   0x0
+#define SPI_GDBG_WAVE_CNTL__STALL_VMID__SHIFT                                                                 0x1
+#define SPI_GDBG_WAVE_CNTL__STALL_RA_MASK                                                                     0x00000001L
+#define SPI_GDBG_WAVE_CNTL__STALL_VMID_MASK                                                                   0x0001FFFEL
+//SPI_GDBG_TRAP_CONFIG
+#define SPI_GDBG_TRAP_CONFIG__ME_SEL__SHIFT                                                                   0x0
+#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL__SHIFT                                                                 0x2
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL__SHIFT                                                                0x4
+#define SPI_GDBG_TRAP_CONFIG__ME_MATCH__SHIFT                                                                 0x7
+#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH__SHIFT                                                               0x8
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH__SHIFT                                                              0x9
+#define SPI_GDBG_TRAP_CONFIG__TRAP_EN__SHIFT                                                                  0xf
+#define SPI_GDBG_TRAP_CONFIG__VMID_SEL__SHIFT                                                                 0x10
+#define SPI_GDBG_TRAP_CONFIG__ME_SEL_MASK                                                                     0x00000003L
+#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL_MASK                                                                   0x0000000CL
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL_MASK                                                                  0x00000070L
+#define SPI_GDBG_TRAP_CONFIG__ME_MATCH_MASK                                                                   0x00000080L
+#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH_MASK                                                                 0x00000100L
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH_MASK                                                                0x00000200L
+#define SPI_GDBG_TRAP_CONFIG__TRAP_EN_MASK                                                                    0x00008000L
+#define SPI_GDBG_TRAP_CONFIG__VMID_SEL_MASK                                                                   0xFFFF0000L
+//SPI_GDBG_TRAP_MASK
+#define SPI_GDBG_TRAP_MASK__EXCP_EN__SHIFT                                                                    0x0
+#define SPI_GDBG_TRAP_MASK__REPLACE__SHIFT                                                                    0x9
+#define SPI_GDBG_TRAP_MASK__EXCP_EN_MASK                                                                      0x01FFL
+#define SPI_GDBG_TRAP_MASK__REPLACE_MASK                                                                      0x0200L
+//SPI_GDBG_WAVE_CNTL2
+#define SPI_GDBG_WAVE_CNTL2__VMID_MASK__SHIFT                                                                 0x0
+#define SPI_GDBG_WAVE_CNTL2__MODE__SHIFT                                                                      0x10
+#define SPI_GDBG_WAVE_CNTL2__VMID_MASK_MASK                                                                   0x0000FFFFL
+#define SPI_GDBG_WAVE_CNTL2__MODE_MASK                                                                        0x00030000L
+//SPI_GDBG_WAVE_CNTL3
+#define SPI_GDBG_WAVE_CNTL3__STALL_PS__SHIFT                                                                  0x0
+#define SPI_GDBG_WAVE_CNTL3__STALL_VS__SHIFT                                                                  0x1
+#define SPI_GDBG_WAVE_CNTL3__STALL_GS__SHIFT                                                                  0x2
+#define SPI_GDBG_WAVE_CNTL3__STALL_HS__SHIFT                                                                  0x3
+#define SPI_GDBG_WAVE_CNTL3__STALL_CSG__SHIFT                                                                 0x4
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS0__SHIFT                                                                 0x5
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS1__SHIFT                                                                 0x6
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS2__SHIFT                                                                 0x7
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS3__SHIFT                                                                 0x8
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS4__SHIFT                                                                 0x9
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS5__SHIFT                                                                 0xa
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS6__SHIFT                                                                 0xb
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS7__SHIFT                                                                 0xc
+#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION__SHIFT                                                            0xd
+#define SPI_GDBG_WAVE_CNTL3__STALL_MULT__SHIFT                                                                0x1c
+#define SPI_GDBG_WAVE_CNTL3__STALL_PS_MASK                                                                    0x00000001L
+#define SPI_GDBG_WAVE_CNTL3__STALL_VS_MASK                                                                    0x00000002L
+#define SPI_GDBG_WAVE_CNTL3__STALL_GS_MASK                                                                    0x00000004L
+#define SPI_GDBG_WAVE_CNTL3__STALL_HS_MASK                                                                    0x00000008L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CSG_MASK                                                                   0x00000010L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS0_MASK                                                                   0x00000020L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS1_MASK                                                                   0x00000040L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS2_MASK                                                                   0x00000080L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS3_MASK                                                                   0x00000100L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS4_MASK                                                                   0x00000200L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS5_MASK                                                                   0x00000400L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS6_MASK                                                                   0x00000800L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS7_MASK                                                                   0x00001000L
+#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION_MASK                                                              0x0FFFE000L
+#define SPI_GDBG_WAVE_CNTL3__STALL_MULT_MASK                                                                  0x10000000L
+//SPI_GDBG_TRAP_DATA0
+#define SPI_GDBG_TRAP_DATA0__DATA__SHIFT                                                                      0x0
+#define SPI_GDBG_TRAP_DATA0__DATA_MASK                                                                        0xFFFFFFFFL
+//SPI_GDBG_TRAP_DATA1
+#define SPI_GDBG_TRAP_DATA1__DATA__SHIFT                                                                      0x0
+#define SPI_GDBG_TRAP_DATA1__DATA_MASK                                                                        0xFFFFFFFFL
 //SPI_COMPUTE_QUEUE_RESET
 #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                                                                 0x0
 #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                                                                   0x01L
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
index 3973110f149c..d09f1a06f4bf 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
@@ -26,6 +26,8 @@
 #define mmSQ_DEBUG_STS_GLOBAL_BASE_IDX                                                                 0
 #define mmSQ_DEBUG_STS_GLOBAL2                                                                         0x10B0
 #define mmSQ_DEBUG_STS_GLOBAL2_BASE_IDX                                                                0
+#define mmSQ_DEBUG                                                                                     0x10B1
+#define mmSQ_DEBUG_BASE_IDX                                                                            0
 
 // addressBlock: gc_sdma0_sdma0dec
 // base address: 0x4980
@@ -4849,10 +4851,18 @@
 #define mmSPI_WCL_PIPE_PERCENT_CS3_BASE_IDX                                                            0
 #define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
 #define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
+#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
+#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
 #define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
 #define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
 #define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
 #define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
+#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
+#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
+#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
+#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
 #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
 #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
 #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
index d4e8ff22ecb8..fc85aee010fe 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
@@ -47853,6 +47853,10 @@
 
 
 // addressBlock: sqind
+//SQ_DEBUG
+#define SQ_DEBUG__SINGLE_MEMOP_MASK 0x00000001L
+#define SQ_DEBUG__SINGLE_MEMOP__SHIFT 0x00000000
+
 //SQ_DEBUG_STS_GLOBAL
 #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0_MASK 0x000000ffL
 #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0__SHIFT 0x00000000
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 06/29] drm/amdgpu: add gfx9 hw debug mode enable and disable calls
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (3 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-22 23:50   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 07/29] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
                   ` (23 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Implement the per-device calls to enable or disable HW debug mode for
GFX9 prior to GFX9.4.1.

GFX9.4.1 and onward will require their own enable/disable sequences,
added in follow-on patches.

When hardware debug mode setting is requested, waves will inherit
these settings in the Shader Processor Input's (SPI) Sequencer Global
Block (SQG). This means that the KGD must drain all waves from the SPI
into the SQG (approximately 96 SPI clock cycles) prior to debug mode
setting, so that the order of operations the debugger expects, namely a
debug mode setting transaction request followed by wave inheritance of
that mode, is upheld.
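
As a sanity check on those numbers (illustrative only; the figures come
from the helper comment in the diff below): one SPI_GDBG_WAVE_CNTL read
costs roughly 32 SPI clocks, so covering the ~96-cycle drain window takes
ceil(96 / 32) = 3 dummy reads, which is the
KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY value used by the new helper:

    #define SPI_DRAIN_CYCLES        96      /* waves draining SPI -> SQG */
    #define SPI_READ_CYCLES         32      /* one SPI register read     */
    /* DIV_ROUND_UP(SPI_DRAIN_CYCLES, SPI_READ_CYCLES) == 3 reads */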

Also ensure that exception overrides are reset to their original state
prior to debug enable or disable.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 100 ++++++++++++++++--
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   9 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |   3 +
 3 files changed, 102 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 81e3b528bbc9..e1aac6f6d369 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -46,14 +46,14 @@ enum hqd_dequeue_request_type {
 	SAVE_WAVES
 };
 
-static void lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
+static void kgd_gfx_v9_lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
 			uint32_t queue, uint32_t vmid)
 {
 	mutex_lock(&adev->srbm_mutex);
 	soc15_grbm_select(adev, mec, pipe, queue, vmid);
 }
 
-static void unlock_srbm(struct amdgpu_device *adev)
+static void kgd_gfx_v9_unlock_srbm(struct amdgpu_device *adev)
 {
 	soc15_grbm_select(adev, 0, 0, 0, 0);
 	mutex_unlock(&adev->srbm_mutex);
@@ -65,7 +65,7 @@ static void acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id,
 	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
 	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
 
-	lock_srbm(adev, mec, pipe, queue_id, 0);
+	kgd_gfx_v9_lock_srbm(adev, mec, pipe, queue_id, 0);
 }
 
 static uint64_t get_queue_mask(struct amdgpu_device *adev,
@@ -79,7 +79,7 @@ static uint64_t get_queue_mask(struct amdgpu_device *adev,
 
 static void release_queue(struct amdgpu_device *adev)
 {
-	unlock_srbm(adev);
+	kgd_gfx_v9_unlock_srbm(adev);
 }
 
 void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid,
@@ -88,13 +88,13 @@ void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmi
 					uint32_t sh_mem_ape1_limit,
 					uint32_t sh_mem_bases)
 {
-	lock_srbm(adev, 0, 0, 0, vmid);
+	kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid);
 
 	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
 	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
 	/* APE1 no longer exists on GFX9 */
 
-	unlock_srbm(adev);
+	kgd_gfx_v9_unlock_srbm(adev);
 }
 
 int kgd_gfx_v9_set_pasid_vmid_mapping(struct amdgpu_device *adev, u32 pasid,
@@ -164,13 +164,13 @@ int kgd_gfx_v9_init_interrupts(struct amdgpu_device *adev, uint32_t pipe_id)
 	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
 	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
 
-	lock_srbm(adev, mec, pipe, 0, 0);
+	kgd_gfx_v9_lock_srbm(adev, mec, pipe, 0, 0);
 
 	WREG32_SOC15(GC, 0, mmCPC_INT_CNTL,
 		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
 		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);
 
-	unlock_srbm(adev);
+	kgd_gfx_v9_unlock_srbm(adev);
 
 	return 0;
 }
@@ -646,6 +646,84 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
 	return 0;
 }
 
+/*
+ * GFX9 helper for wave launch stall requirements on debug trap setting.
+ *
+ * vmid:
+ *   Target VMID to stall/unstall.
+ *
+ * stall:
+ *   0-unstall wave launch (enable), 1-stall wave launch (disable).
+ *   After wavefront launch has been stalled, allocated waves must drain from
+ *   SPI in order for debug trap settings to take effect on those waves.
+ *   This is roughly a ~96 clock cycle wait on SPI where a read on
+ *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
+ *   KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads required.
+ *
+ *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
+ *   because GFX9.4.1 cannot support multi-process debugging due to trap
+ *   configuration and masking being limited to global scope.  Always assume
+ *   single process conditions.
+ */
+#define KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY	3
+void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
+					uint32_t vmid,
+					bool stall)
+{
+	int i;
+	uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+
+	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1))
+		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
+							stall ? 1 << vmid : 0);
+	else
+		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA,
+							stall ? 1 : 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
+
+	if (!stall)
+		return;
+
+	for (i = 0; i < KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
+		RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+}
+
+uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
+				bool restore_dbg_registers,
+				uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
 			uint32_t vmid, uint64_t page_table_base)
 {
@@ -833,7 +911,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
 void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
                         uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
 {
-	lock_srbm(adev, 0, 0, 0, vmid);
+	kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid);
 
 	/*
 	 * Program TBA registers
@@ -851,7 +929,7 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
 	WREG32_SOC15(GC, 0, mmSQ_SHADER_TMA_HI,
 			upper_32_bits(tma_addr >> 8));
 
-	unlock_srbm(adev);
+	kgd_gfx_v9_unlock_srbm(adev);
 }
 
 const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
@@ -871,6 +949,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info =
 			kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
+	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
+	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index c7ed3bc9053c..d39256162616 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -58,3 +58,12 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
 		int *pasid_wave_cnt, int *max_waves_per_cu);
 void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
 		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
+void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
+					uint32_t vmid,
+					bool stall);
+uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
+				      bool restore_dbg_registers,
+				      uint32_t vmid);
+uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index b2217eb1399c..8aa7a3ad4e97 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -25,6 +25,9 @@
 
 #include "kfd_priv.h"
 
+void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
+					uint32_t vmid,
+					bool stall);
 int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (4 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 06/29] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-22 23:59   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 08/29] drm/amdgpu: add gfx10 " Jonathan Kim
                   ` (22 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

On GFX9.4.1, the implicit wait count instruction on s_barrier is
disabled by default in the driver during normal operation for
performance reasons.

There is a hardware bug in GFX9.4.1 where, if the implicit wait count
instruction after an s_barrier instruction is disabled, any wave that
hits an exception may step over the s_barrier when returning from the
trap handler. The barrier logic has no way to detect this, so the
remaining waves wait at the barrier indefinitely, resulting in a shader
hang.  This bug has been corrected for GFX9.4.2 and onward.

Since the debugger subscribes to hardware exceptions, in order to avoid
this bug, the debugger must enable implicit wait count on s_barrier
for a debug session and disable it on detach.

In order to change this setting in the device-global SQ_CONFIG
register, the GFX pipeline must be idle.  GFX9.4.1, as a compute device,
dispatches work either through the compute ring buffers used for image
post-processing or through the hardware scheduler via the KFD.

Have the KGD block any future KFD process job requests and suspend the
hardware scheduler, then stop and drain the compute ring buffers, before
changing the implicit wait count setting.  Once set, resume all work.
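
In outline, the sequence implemented by set_barrier_auto_waitcnt() in the
diff below is (shown here as a bare call list, not additional code):

    amdgpu_amdkfd_suspend(adev, false);             /* block KFD job submission */
    suspend_resume_compute_scheduler(adev, true);   /* stop + drain compute rings */
    /* RMW SQ_CONFIG: DISABLE_BARRIER_WAITCNT = enable_waitcnt ? 0 : 1 */
    suspend_resume_compute_scheduler(adev, false);  /* restart compute rings */
    amdgpu_amdkfd_resume(adev, false);              /* allow KFD jobs again */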

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   3 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 105 +++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   2 +-
 4 files changed, 110 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0e6ddf05c23c..9f2499f52d2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1034,6 +1034,9 @@ struct amdgpu_device {
 	struct pci_saved_state          *pci_state;
 	pci_channel_state_t		pci_channel_state;
 
+	/* Track auto wait count on s_barrier settings */
+	bool				barrier_has_auto_waitcnt;
+
 	struct amdgpu_reset_control     *reset_cntl;
 	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 4191af5a3f13..13f02a0aa828 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -26,6 +26,7 @@
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
+#include "amdgpu_reset.h"
 #include "sdma0/sdma0_4_2_2_offset.h"
 #include "sdma0/sdma0_4_2_2_sh_mask.h"
 #include "sdma1/sdma1_4_2_2_offset.h"
@@ -48,6 +49,8 @@
 #include "amdgpu_amdkfd_gfx_v9.h"
 #include "gfxhub_v1_0.h"
 #include "mmhub_v9_4.h"
+#include "gc/gc_9_0_offset.h"
+#include "gc/gc_9_0_sh_mask.h"
 
 #define HQD_N_REGS 56
 #define DUMP_REG(addr) do {				\
@@ -276,6 +279,104 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
 	return 0;
 }
 
+/*
+ * Helper used to suspend/resume gfx pipe for image post process work to set
+ * barrier behaviour.
+ */
+static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool suspend)
+{
+	int i, r = 0;
+
+	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+		struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
+
+		if (!(ring && ring->sched.thread))
+			continue;
+
+		/* stop scheduler and drain ring. */
+		if (suspend) {
+			drm_sched_stop(&ring->sched, NULL);
+			r = amdgpu_fence_wait_empty(ring);
+			if (r)
+				goto out;
+		} else {
+			drm_sched_start(&ring->sched, false);
+		}
+	}
+
+out:
+	/* return on resume or failure to drain rings. */
+	if (!suspend || r)
+		return r;
+
+	return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
+}
+
+static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool enable_waitcnt)
+{
+	uint32_t data;
+
+	WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
+
+	if (!down_read_trylock(&adev->reset_domain->sem))
+		return;
+
+	amdgpu_amdkfd_suspend(adev, false);
+
+	if (suspend_resume_compute_scheduler(adev, true))
+		goto out;
+
+	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
+	data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
+						enable_waitcnt ? 0 : 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
+
+out:
+	suspend_resume_compute_scheduler(adev, false);
+
+	amdgpu_amdkfd_resume(adev, false);
+
+	up_read(&adev->reset_domain->sem);
+}
+
+static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device *adev,
+				bool restore_dbg_registers,
+				uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	set_barrier_auto_waitcnt(adev, true);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
+static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	set_barrier_auto_waitcnt(adev, false);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 const struct kfd2kgd_calls arcturus_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -294,6 +395,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
 	.set_vm_context_page_table_base =
 				kgd_gfx_v9_set_vm_context_page_table_base,
+	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
+	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
-	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
+	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index a0e5ad342f13..8ed1b5d255f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2424,8 +2424,8 @@ static void gfx_v9_0_init_sq_config(struct amdgpu_device *adev)
 	switch (adev->ip_versions[GC_HWIP][0]) {
 	case IP_VERSION(9, 4, 1):
 		tmp = RREG32_SOC15(GC, 0, mmSQ_CONFIG);
-		tmp = REG_SET_FIELD(tmp, SQ_CONFIG,
-					DISABLE_BARRIER_WAITCNT, 1);
+		tmp = REG_SET_FIELD(tmp, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
+				READ_ONCE(adev->barrier_has_auto_waitcnt) ? 0 : 1);
 		WREG32_SOC15(GC, 0, mmSQ_CONFIG, tmp);
 		break;
 	default:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 56ad38fcd26e..efb81ccef8f5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1946,7 +1946,7 @@ void kfd_suspend_all_processes(void)
 	WARN(debug_evictions, "Evicting all processes");
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
 		cancel_delayed_work_sync(&p->eviction_work);
-		cancel_delayed_work_sync(&p->restore_work);
+		flush_delayed_work(&p->restore_work);
 
 		if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
 			pr_err("Failed to suspend process 0x%x\n", p->pasid);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 08/29] drm/amdgpu: add gfx10 hw debug mode enable and disable calls
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (5 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 07/29] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-10-31 16:23 ` [PATCH 09/29] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
                   ` (21 subsequent siblings)
  28 siblings, 0 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Similar to GFX9 debug devices, set the hardware debug mode by draining
the SPI appropriately prior to the mode setting request.

Because GFX10 allocates waves on a work group boundary and each SE's SPI
instances do not communicate, the SPI drain time is much longer: roughly
3500 clock cycles (110 register reads at ~32 clocks each) versus ~96
cycles on GFX9.  This long drain time will be fixed for GFX11 onwards.

Also remove a number of deprecated, misplaced references for GFX10.3.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  95 +++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  28 ++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  | 147 +-----------------
 3 files changed, 126 insertions(+), 144 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 9378fc79e9ea..c09b45de02d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -708,6 +708,99 @@ static void set_vm_context_page_table_base(struct amdgpu_device *adev,
 	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
 }
 
+/*
+ * GFX10 helper for wave launch stall requirements on debug trap setting.
+ *
+ * vmid:
+ *   Target VMID to stall/unstall.
+ *
+ * stall:
+ *   0-unstall wave launch (enable), 1-stall wave launch (disable).
+ *   After wavefront launch has been stalled, allocated waves must drain from
+ *   SPI in order for debug trap settings to take effect on those waves.
+ *   This is roughly a ~3500 clock cycle wait on SPI where a read on
+ *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
+ *   KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads required.
+ *
+ *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
+ *   because current GFX10 chips cannot support multi-process debugging due to
+ *   trap configuration and masking being limited to global scope.  Always
+ *   assume single process conditions.
+ *
+ */
+
+#define KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY	110
+static void kgd_gfx_v10_set_wave_launch_stall(struct amdgpu_device *adev, uint32_t vmid, bool stall)
+{
+	uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+	int i;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
+							stall ? 1 << vmid : 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
+
+	if (!stall)
+		return;
+
+	for (i = 0; i < KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
+		RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+}
+
+uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
+				bool restore_dbg_registers,
+				uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+	/* assume gfx off is disabled for the debug session if rlc restore not supported. */
+	if (restore_dbg_registers) {
+		uint32_t data = 0;
+
+		data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+				VMID_SEL, 1 << vmid);
+		data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+				TRAP_EN, 1);
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+
+		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+		mutex_unlock(&adev->grbm_idx_mutex);
+
+		return 0;
+	}
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid)
+{
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 static void program_trap_handler_settings(struct amdgpu_device *adev,
 		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
 {
@@ -750,5 +843,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info =
 			get_atc_vmid_pasid_mapping_info,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base,
+	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
+	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
 	.program_trap_handler_settings = program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
new file mode 100644
index 000000000000..370d6c312981
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
+				      bool restore_dbg_registers,
+				      uint32_t vmid);
+uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
+					bool keep_trap_enabled,
+					uint32_t vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index ba21ec6b35e0..73e3b9ae1fb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -22,6 +22,7 @@
 #include <linux/mmu_context.h>
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_amdkfd_gfx_v10.h"
 #include "gc/gc_10_3_0_offset.h"
 #include "gc/gc_10_3_0_sh_mask.h"
 #include "oss/osssys_5_0_0_offset.h"
@@ -652,142 +653,6 @@ static void program_trap_handler_settings_v10_3(struct amdgpu_device *adev,
 	unlock_srbm(adev);
 }
 
-#if 0
-uint32_t enable_debug_trap_v10_3(struct amdgpu_device *adev,
-				uint32_t trap_debug_wave_launch_mode,
-				uint32_t vmid)
-{
-	uint32_t data = 0;
-	uint32_t orig_wave_cntl_value;
-	uint32_t orig_stall_vmid;
-
-	mutex_lock(&adev->grbm_idx_mutex);
-
-	orig_wave_cntl_value = RREG32(SOC15_REG_OFFSET(GC,
-				0,
-				mmSPI_GDBG_WAVE_CNTL));
-	orig_stall_vmid = REG_GET_FIELD(orig_wave_cntl_value,
-			SPI_GDBG_WAVE_CNTL,
-			STALL_VMID);
-
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 1);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
-
-	data = 0;
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
-
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), orig_stall_vmid);
-
-	mutex_unlock(&adev->grbm_idx_mutex);
-
-	return 0;
-}
-
-uint32_t disable_debug_trap_v10_3(struct amdgpu_device *adev)
-{
-	mutex_lock(&adev->grbm_idx_mutex);
-
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
-
-	mutex_unlock(&adev->grbm_idx_mutex);
-
-	return 0;
-}
-
-uint32_t set_wave_launch_trap_override_v10_3(struct amdgpu_device *adev,
-						uint32_t trap_override,
-						uint32_t trap_mask)
-{
-	uint32_t data = 0;
-
-	mutex_lock(&adev->grbm_idx_mutex);
-
-	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 1);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
-
-	data = 0;
-	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK,
-			EXCP_EN, trap_mask);
-	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK,
-			REPLACE, trap_override);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
-
-	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 0);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
-
-	mutex_unlock(&adev->grbm_idx_mutex);
-
-	return 0;
-}
-
-uint32_t set_wave_launch_mode_v10_3(struct amdgpu_device *adev,
-					uint8_t wave_launch_mode,
-					uint32_t vmid)
-{
-	uint32_t data = 0;
-	bool is_stall_mode;
-	bool is_mode_set;
-
-	is_stall_mode = (wave_launch_mode == 4);
-	is_mode_set = (wave_launch_mode != 0 && wave_launch_mode != 4);
-
-	mutex_lock(&adev->grbm_idx_mutex);
-
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
-			VMID_MASK, is_mode_set ? 1 << vmid : 0);
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
-			MODE, is_mode_set ? wave_launch_mode : 0);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
-
-	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL,
-			STALL_VMID, is_stall_mode ? 1 << vmid : 0);
-	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL,
-			STALL_RA, is_stall_mode ? 1 : 0);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
-
-	mutex_unlock(&adev->grbm_idx_mutex);
-
-	return 0;
-}
-
-/* kgd_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
- * The values read are:
- *	ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
- *	atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
- *	wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
- *	gws_wait_time            -- Wait Count for Global Wave Syncs.
- *	que_sleep_wait_time      -- Wait Count for Dequeue Retry.
- *	sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
- *	sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
- *	deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
- */
-void get_iq_wait_times_v10_3(struct amdgpu_device *adev,
-					uint32_t *wait_times)
-
-{
-	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
-}
-
-void build_grace_period_packet_info_v10_3(struct amdgpu_device *adev,
-						uint32_t wait_times,
-						uint32_t grace_period,
-						uint32_t *reg_offset,
-						uint32_t *reg_data)
-{
-	*reg_data = wait_times;
-
-	*reg_data = REG_SET_FIELD(*reg_data,
-			CP_IQ_WAIT_TIME2,
-			SCH_WAVE,
-			grace_period);
-
-	*reg_offset = mmCP_IQ_WAIT_TIME2;
-}
-#endif
-
 const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.program_sh_mem_settings = program_sh_mem_settings_v10_3,
 	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v10_3,
@@ -805,12 +670,6 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info_v10_3,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3,
 	.program_trap_handler_settings = program_trap_handler_settings_v10_3,
-#if 0
-	.enable_debug_trap = enable_debug_trap_v10_3,
-	.disable_debug_trap = disable_debug_trap_v10_3,
-	.set_wave_launch_trap_override = set_wave_launch_trap_override_v10_3,
-	.set_wave_launch_mode = set_wave_launch_mode_v10_3,
-	.get_iq_wait_times = get_iq_wait_times_v10_3,
-	.build_grace_period_packet_info = build_grace_period_packet_info_v10_3,
-#endif
+	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
+	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 09/29] drm/amdgpu: add gfx9.4.2 hw debug mode enable and disable calls
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (6 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 08/29] drm/amdgpu: add gfx10 " Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-10-31 16:23 ` [PATCH 10/29] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
                   ` (20 subsequent siblings)
  28 siblings, 0 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

GFX9.4.2 now supports a per-VMID debug mode control register
(SPI_GDBG_PER_VMID_CNTL).

Because the KFD lets the HWS handle PASID-VMID mapping, the KFD will
forward all debug mode setting register writes to the HWS scheduler
using a new MAP_PROCESS API.  So instead of writing to the registers
directly, return the register values that the HWS needs to write on
debug enable and disable.
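
For illustration (the packet field name here is an assumption; only the
two hooks come from the diff below): the returned word is what the HWS
would program into SPI_GDBG_PER_VMID_CNTL for the target process, e.g.

    uint32_t vmid_cntl;

    /* ask the ASIC layer for the per-VMID debug control word... */
    vmid_cntl = dev->kfd2kgd->enable_debug_trap(dev->adev, false, vmid);
    /* ...and carry it to the HWS in the extended MAP_PROCESS packet */
    packet->spi_gdbg_per_vmid_cntl = vmid_cntl;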

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 34 ++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index c8935d718207..42491a31f352 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,36 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include "gc/gc_9_4_2_offset.h"
+#include "gc/gc_9_4_2_sh_mask.h"
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_enable_debug_trap(struct amdgpu_device *adev,
+					    bool restore_dbg_registers,
+					    uint32_t vmid)
+{
+	uint32_t data = 0;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+	return data;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
+						bool keep_trap_enabled,
+						uint32_t vmid)
+{
+	uint32_t data = 0;
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, keep_trap_enabled);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+	return data;
+}
 
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -41,5 +71,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info =
 				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
-	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
+	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
+	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 10/29] drm/amdgpu: add configurable grace period for unmap queues
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (7 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 09/29] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-23  0:21   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 11/29] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
                   ` (19 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

The HWS scheduler allows a grace period for wave completion prior to
preemption, but the debugger needs preemption to be fast since it
preempts on every HW debug mode setting transaction request.

For good performance, allow immediate preemption by setting the grace
period to 0.

Note that setting the preemption grace period to 0 results in an
infinite grace period being set due to a CP FW bug, so set it to 1 for
now.
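
As a sketch, a debugger-path caller can then request (near-)immediate
preemption like so; the packet builder quietly clamps 0 to 1:

	/* preempt all queues with the minimal grace period */
	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
			       0 /* filter_param */, 0 /* grace_period */,
			       false /* reset */);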

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 43 ++++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  6 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 43 ++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  9 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 61 ++++++++++++-----
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 +
 .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 32 +++++++++
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c    | 39 +++++++++++
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   | 65 +++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 ++
 13 files changed, 291 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 42491a31f352..c9629fc5460c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -73,5 +73,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
 	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 13f02a0aa828..60a204f767ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -397,6 +397,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 				kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
 	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
+	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index c09b45de02d0..2491402afd58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -801,6 +801,47 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
 	return 0;
 }
 
+/* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
+ * The values read are:
+ *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
+ *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
+ *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
+ *     gws_wait_time            -- Wait Count for Global Wave Syncs.
+ *     que_sleep_wait_time      -- Wait Count for Dequeue Retry.
+ *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
+ *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
+ *     deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
+ */
+void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev,
+					uint32_t *wait_times)
+
+{
+	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
+}
+
+void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
+						uint32_t wait_times,
+						uint32_t grace_period,
+						uint32_t *reg_offset,
+						uint32_t *reg_data)
+{
+	*reg_data = wait_times;
+
+	/*
+	 * The CP cannot handle a grace period input of 0; it would result
+	 * in an infinite grace period being set, so set it to 1 instead.
+	 */
+	if (grace_period == 0)
+		grace_period = 1;
+
+	*reg_data = REG_SET_FIELD(*reg_data,
+			CP_IQ_WAIT_TIME2,
+			SCH_WAVE,
+			grace_period);
+
+	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
+}
+
 static void program_trap_handler_settings(struct amdgpu_device *adev,
 		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
 {
@@ -845,5 +886,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.set_vm_context_page_table_base = set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
+	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.program_trap_handler_settings = program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
index 370d6c312981..0abc1e805180 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -26,3 +26,9 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
 					bool keep_trap_enabled,
 					uint32_t vmid);
+void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
+void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
+					       uint32_t wait_times,
+					       uint32_t grace_period,
+					       uint32_t *reg_offset,
+					       uint32_t *reg_data);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index 73e3b9ae1fb0..c57f2a6b6e23 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -670,6 +670,8 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info_v10_3,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3,
 	.program_trap_handler_settings = program_trap_handler_settings_v10_3,
+	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index e1aac6f6d369..673c99c5523d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -724,6 +724,24 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 	return 0;
 }
 
+/* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
+ * The values read are:
+ *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
+ *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
+ *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
+ *     gws_wait_time            -- Wait Count for Global Wave Syncs.
+ *     que_sleep_wait_time      -- Wait Count for Dequeue Retry.
+ *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
+ *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
+ *     deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
+ */
+void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev,
+					uint32_t *wait_times)
+
+{
+	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
+}
+
 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
 			uint32_t vmid, uint64_t page_table_base)
 {
@@ -908,6 +926,29 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
 				adev->gfx.cu_info.max_waves_per_simd;
 }
 
+void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
+		uint32_t wait_times,
+		uint32_t grace_period,
+		uint32_t *reg_offset,
+		uint32_t *reg_data)
+{
+	*reg_data = wait_times;
+
+	/*
+	 * The CP cannot handle a grace period input of 0; it would result
+	 * in an infinite grace period being set, so set it to 1 instead.
+	 */
+	if (grace_period == 0)
+		grace_period = 1;
+
+	*reg_data = REG_SET_FIELD(*reg_data,
+			CP_IQ_WAIT_TIME2,
+			SCH_WAVE,
+			grace_period);
+
+	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
+}
+
 void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
                         uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
 {
@@ -951,6 +992,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
+	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index d39256162616..c0866497cb5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -20,8 +20,6 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-
-
 void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid,
 		uint32_t sh_mem_config,
 		uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
@@ -51,7 +49,6 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
 					uint32_t sq_cmd);
 bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
 					uint8_t vmid, uint16_t *p_pasid);
-
 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
 			uint32_t vmid, uint64_t page_table_base);
 void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
@@ -67,3 +64,9 @@ uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 					bool keep_trap_enabled,
 					uint32_t vmid);
+void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
+void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
+					       uint32_t wait_times,
+					       uint32_t grace_period,
+					       uint32_t *reg_offset,
+					       uint32_t *reg_data);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index faa5d8c666ee..fbcf4ee07936 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -46,10 +46,13 @@ static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
 
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param);
+				uint32_t filter_param,
+				uint32_t grace_period);
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param, bool reset);
+				uint32_t filter_param,
+				uint32_t grace_period,
+				bool reset);
 
 static int map_queues_cpsch(struct device_queue_manager *dqm);
 
@@ -839,7 +842,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
 	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
 		if (!dqm->dev->shared_resources.enable_mes)
 			retval = unmap_queues_cpsch(dqm,
-						    KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false);
+						    KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
 		else if (prev_active)
 			retval = remove_queue_mes(dqm, q, &pdd->qpd);
 
@@ -1015,7 +1018,8 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		retval = execute_queues_cpsch(dqm,
 					      qpd->is_debug ?
 					      KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
-					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
+					      USE_DEFAULT_GRACE_PERIOD);
 
 out:
 	dqm_unlock(dqm);
@@ -1155,7 +1159,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 	}
 	if (!dqm->dev->shared_resources.enable_mes)
 		retval = execute_queues_cpsch(dqm,
-					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 	qpd->evicted = 0;
 	eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
 	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
@@ -1492,6 +1496,9 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 
 	init_sdma_bitmaps(dqm);
 
+	if (dqm->dev->kfd2kgd->get_iq_wait_times)
+		dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev,
+					&dqm->wait_times);
 	return 0;
 }
 
@@ -1531,7 +1538,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm->is_resetting = false;
 	dqm->sched_running = true;
 	if (!dqm->dev->shared_resources.enable_mes)
-		execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+		execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 	dqm_unlock(dqm);
 
 	return 0;
@@ -1556,7 +1563,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 
 	if (!dqm->is_hws_hang) {
 		if (!dqm->dev->shared_resources.enable_mes)
-			unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
+			unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
 		else
 			remove_all_queues_mes(dqm);
 	}
@@ -1598,7 +1605,8 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	list_add(&kq->list, &qpd->priv_queue_list);
 	increment_queue_count(dqm, qpd, kq->queue);
 	qpd->is_debug = true;
-	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD);
 	dqm_unlock(dqm);
 
 	return 0;
@@ -1612,7 +1620,8 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	list_del(&kq->list);
 	decrement_queue_count(dqm, qpd, kq->queue);
 	qpd->is_debug = false;
-	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD);
 	/*
 	 * Unconditionally decrement this counter, regardless of the queue's
 	 * type.
@@ -1689,7 +1698,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
 		if (!dqm->dev->shared_resources.enable_mes)
 			retval = execute_queues_cpsch(dqm,
-					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 		else
 			retval = add_queue_mes(dqm, q, qpd);
 		if (retval)
@@ -1778,7 +1787,9 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 /* dqm->lock mutex has to be locked before calling this function */
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param, bool reset)
+				uint32_t filter_param,
+				uint32_t grace_period,
+				bool reset)
 {
 	int retval = 0;
 	struct mqd_manager *mqd_mgr;
@@ -1790,6 +1801,12 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	if (!dqm->active_runlist)
 		return retval;
 
+	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
+		retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
+		if (retval)
+			return retval;
+	}
+
 	retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset);
 	if (retval)
 		return retval;
@@ -1822,6 +1839,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 		return -ETIME;
 	}
 
+	/* We need to reset the grace period value for this device */
+	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
+		if (pm_update_grace_period(&dqm->packet_mgr,
+					USE_DEFAULT_GRACE_PERIOD))
+			pr_err("Failed to reset grace period\n");
+	}
+
 	pm_release_ib(&dqm->packet_mgr);
 	dqm->active_runlist = false;
 
@@ -1837,7 +1861,7 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
 	dqm_lock(dqm);
 
 	retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
-			pasid, true);
+			pasid, USE_DEFAULT_GRACE_PERIOD, true);
 
 	dqm_unlock(dqm);
 	return retval;
@@ -1846,13 +1870,14 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
 /* dqm->lock mutex has to be locked before calling this function */
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
-				uint32_t filter_param)
+				uint32_t filter_param,
+				uint32_t grace_period)
 {
 	int retval;
 
 	if (dqm->is_hws_hang)
 		return -EIO;
-	retval = unmap_queues_cpsch(dqm, filter, filter_param, false);
+	retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false);
 	if (retval)
 		return retval;
 
@@ -1910,7 +1935,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 		if (!dqm->dev->shared_resources.enable_mes) {
 			decrement_queue_count(dqm, qpd, q);
 			retval = execute_queues_cpsch(dqm,
-						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
+						      USE_DEFAULT_GRACE_PERIOD);
 			if (retval == -ETIME)
 				qpd->reset_wavefronts = true;
 		} else {
@@ -2195,7 +2221,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	}
 
 	if (!dqm->dev->shared_resources.enable_mes)
-		retval = execute_queues_cpsch(dqm, filter, 0);
+		retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD);
 
 	if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
 		pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
@@ -2539,7 +2565,8 @@ int dqm_debugfs_hang_hws(struct device_queue_manager *dqm)
 		return r;
 	}
 	dqm->active_runlist = true;
-	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
+				0, USE_DEFAULT_GRACE_PERIOD);
 	dqm_unlock(dqm);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a537b9ef3e16..fb48b124161f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -37,6 +37,7 @@
 
 #define KFD_MES_PROCESS_QUANTUM		100000
 #define KFD_MES_GANG_QUANTUM		10000
+#define USE_DEFAULT_GRACE_PERIOD 0xffffffff
 
 struct device_process_node {
 	struct qcm_process_device *qpd;
@@ -256,6 +257,7 @@ struct device_queue_manager {
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
 	bool			sched_running;
+	uint32_t		wait_times;
 };
 
 void device_queue_manager_init_cik(
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index ed02b6d8bf63..c57f9a46dfcc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -369,6 +369,38 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 	return retval;
 }
 
+int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period)
+{
+	int retval = 0;
+	uint32_t *buffer, size;
+
+	size = pm->pmf->set_grace_period_size;
+
+	mutex_lock(&pm->lock);
+
+	if (size) {
+		kq_acquire_packet_buffer(pm->priv_queue,
+			size / sizeof(uint32_t),
+			(unsigned int **)&buffer);
+
+		if (!buffer) {
+			pr_err("Failed to allocate buffer on kernel queue\n");
+			retval = -ENOMEM;
+			goto out;
+		}
+
+		retval = pm->pmf->set_grace_period(pm, buffer, grace_period);
+		if (!retval)
+			kq_submit_packet(pm->priv_queue);
+		else
+			kq_rollback_packet(pm->priv_queue);
+	}
+
+out:
+	mutex_unlock(&pm->lock);
+	return retval;
+}
+
 int pm_send_unmap_queue(struct packet_manager *pm,
 			enum kfd_unmap_queues_filter filter,
 			uint32_t filter_param, bool reset)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index 18250845a989..f0cdc8695b8c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -251,6 +251,41 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 	return 0;
 }
 
+static int pm_set_grace_period_v9(struct packet_manager *pm,
+		uint32_t *buffer,
+		uint32_t grace_period)
+{
+	struct pm4_mec_write_data_mmio *packet;
+	uint32_t reg_offset = 0;
+	uint32_t reg_data = 0;
+
+	pm->dqm->dev->kfd2kgd->build_grace_period_packet_info(
+			pm->dqm->dev->adev,
+			pm->dqm->wait_times,
+			grace_period,
+			&reg_offset,
+			&reg_data);
+
+	if (grace_period == USE_DEFAULT_GRACE_PERIOD)
+		reg_data = pm->dqm->wait_times;
+
+	packet = (struct pm4_mec_write_data_mmio *)buffer;
+	memset(buffer, 0, sizeof(struct pm4_mec_write_data_mmio));
+
+	packet->header.u32All = pm_build_pm4_header(IT_WRITE_DATA,
+					sizeof(struct pm4_mec_write_data_mmio));
+
+	packet->bitfields2.dst_sel  = dst_sel___write_data__mem_mapped_register;
+	packet->bitfields2.addr_incr =
+			addr_incr___write_data__do_not_increment_address;
+
+	packet->bitfields3.dst_mmreg_addr = reg_offset;
+
+	packet->data = reg_data;
+
+	return 0;
+}
+
 static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 			enum kfd_unmap_queues_filter filter,
 			uint32_t filter_param, bool reset)
@@ -333,6 +368,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
 	.set_resources		= pm_set_resources_v9,
 	.map_queues		= pm_map_queues_v9,
 	.unmap_queues		= pm_unmap_queues_v9,
+	.set_grace_period       = pm_set_grace_period_v9,
 	.query_status		= pm_query_status_v9,
 	.release_mem		= NULL,
 	.map_process_size	= sizeof(struct pm4_mes_map_process),
@@ -340,6 +376,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
 	.set_resources_size	= sizeof(struct pm4_mes_set_resources),
 	.map_queues_size	= sizeof(struct pm4_mes_map_queues),
 	.unmap_queues_size	= sizeof(struct pm4_mes_unmap_queues),
+	.set_grace_period_size  = sizeof(struct pm4_mec_write_data_mmio),
 	.query_status_size	= sizeof(struct pm4_mes_query_status),
 	.release_mem_size	= 0,
 };
@@ -350,6 +387,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
 	.set_resources		= pm_set_resources_v9,
 	.map_queues		= pm_map_queues_v9,
 	.unmap_queues		= pm_unmap_queues_v9,
+	.set_grace_period       = pm_set_grace_period_v9,
 	.query_status		= pm_query_status_v9,
 	.release_mem		= NULL,
 	.map_process_size	= sizeof(struct pm4_mes_map_process_aldebaran),
@@ -357,6 +395,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
 	.set_resources_size	= sizeof(struct pm4_mes_set_resources),
 	.map_queues_size	= sizeof(struct pm4_mes_map_queues),
 	.unmap_queues_size	= sizeof(struct pm4_mes_unmap_queues),
+	.set_grace_period_size  = sizeof(struct pm4_mec_write_data_mmio),
 	.query_status_size	= sizeof(struct pm4_mes_query_status),
 	.release_mem_size	= 0,
 };
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
index a666710ed403..795001c947e1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -583,6 +583,71 @@ struct pm4_mec_release_mem {
 
 #endif
 
+#ifndef PM4_MEC_WRITE_DATA_DEFINED
+#define PM4_MEC_WRITE_DATA_DEFINED
+
+enum WRITE_DATA_dst_sel_enum {
+	dst_sel___write_data__mem_mapped_register = 0,
+	dst_sel___write_data__tc_l2 = 2,
+	dst_sel___write_data__gds = 3,
+	dst_sel___write_data__memory = 5,
+	dst_sel___write_data__memory_mapped_adc_persistent_state = 6,
+};
+
+enum WRITE_DATA_addr_incr_enum {
+	addr_incr___write_data__increment_address = 0,
+	addr_incr___write_data__do_not_increment_address = 1
+};
+
+enum WRITE_DATA_wr_confirm_enum {
+	wr_confirm___write_data__do_not_wait_for_write_confirmation = 0,
+	wr_confirm___write_data__wait_for_write_confirmation = 1
+};
+
+enum WRITE_DATA_cache_policy_enum {
+	cache_policy___write_data__lru = 0,
+	cache_policy___write_data__stream = 1
+};
+
+
+struct pm4_mec_write_data_mmio {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;     /*header */
+		unsigned int ordinal1;
+	};
+
+	union {
+		struct {
+			unsigned int reserved1:8;
+			unsigned int dst_sel:4;
+			unsigned int reserved2:4;
+			unsigned int addr_incr:1;
+			unsigned int reserved3:2;
+			unsigned int resume_vf:1;
+			unsigned int wr_confirm:1;
+			unsigned int reserved4:4;
+			unsigned int cache_policy:2;
+			unsigned int reserved5:5;
+		} bitfields2;
+		unsigned int ordinal2;
+	};
+
+	union {
+		struct {
+			unsigned int dst_mmreg_addr:18;
+			unsigned int reserved6:14;
+		} bitfields3;
+		unsigned int ordinal3;
+	};
+
+	uint32_t reserved7;
+
+	uint32_t data;
+
+};
+
+#endif
+
 enum {
 	CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
 };
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 3ea61fa1db52..a851f814bc9d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1300,6 +1300,8 @@ struct packet_manager_funcs {
 	int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer,
 			enum kfd_unmap_queues_filter mode,
 			uint32_t filter_param, bool reset);
+	int (*set_grace_period)(struct packet_manager *pm, uint32_t *buffer,
+			uint32_t grace_period);
 	int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
 			uint64_t fence_address,	uint64_t fence_value);
 	int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
@@ -1310,6 +1312,7 @@ struct packet_manager_funcs {
 	int set_resources_size;
 	int map_queues_size;
 	int unmap_queues_size;
+	int set_grace_period_size;
 	int query_status_size;
 	int release_mem_size;
 };
@@ -1332,6 +1335,8 @@ int pm_send_unmap_queue(struct packet_manager *pm,
 
 void pm_release_ib(struct packet_manager *pm);
 
+int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period);
+
 /* Following PM funcs can be shared among VI and AI */
 unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 11/29] drm/amdkfd: prepare map process for single process debug devices
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (8 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 10/29] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-10-31 16:23 ` [PATCH 12/29] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
                   ` (18 subsequent siblings)
  28 siblings, 0 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Older HW only supports debugging on a single process because the
SPI debug mode setting registers are device-global.

The HWS supplies a single pinned VMID (0xf) via MAP_PROCESS for debug
purposes.  To pin the VMID, the KFD removes the VMID from the HWS
dynamic VMID allocation via SET_RESOURCES so that a debugged process
will never migrate away from its pinned VMID.

The KFD is responsible for reserving and releasing this pinned VMID
accordingly whenever the debugger attaches and detaches respectively.
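
The reservation itself is simple bitmap arithmetic; for example,
assuming the usual compute_vmid_bitmap of 0xff00 (VMIDs 8-15) and
last_vmid_kfd == 15:

	/* 0x0000ff00 -> 0x00007f00: VMID 15 leaves the dynamic pool */
	updated_vmid_mask = compute_vmid_bitmap & ~(1 << last_vmid_kfd);

Releasing the VMID on detach ORs the bit back in.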

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 101 +++++++++++++++++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |   5 +
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c    |   9 ++
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   |   5 +-
 4 files changed, 114 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index fbcf4ee07936..62bb92ef1acd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1492,7 +1492,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 	dqm->active_cp_queue_count = 0;
 	dqm->gws_queue_count = 0;
 	dqm->active_runlist = false;
-	INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
+	dqm->trap_debug_vmid = 0;
 
 	init_sdma_bitmaps(dqm);
 
@@ -1935,8 +1935,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 		if (!dqm->dev->shared_resources.enable_mes) {
 			decrement_queue_count(dqm, qpd, q);
 			retval = execute_queues_cpsch(dqm,
-						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
-						      USE_DEFAULT_GRACE_PERIOD);
+						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 			if (retval == -ETIME)
 				qpd->reset_wavefronts = true;
 		} else {
@@ -2465,6 +2464,98 @@ static void kfd_process_hw_exception(struct work_struct *work)
 	amdgpu_amdkfd_gpu_reset(dqm->dev->adev);
 }
 
+int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
+				struct qcm_process_device *qpd)
+{
+	int r;
+	int updated_vmid_mask;
+
+	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		return -EINVAL;
+	}
+
+	dqm_lock(dqm);
+
+	if (dqm->trap_debug_vmid != 0) {
+		pr_err("Trap debug id already reserved\n");
+		r = -EBUSY;
+		goto out_unlock;
+	}
+
+	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD, false);
+	if (r)
+		goto out_unlock;
+
+	updated_vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
+	updated_vmid_mask &= ~(1 << dqm->dev->vm_info.last_vmid_kfd);
+
+	dqm->dev->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
+	dqm->trap_debug_vmid = dqm->dev->vm_info.last_vmid_kfd;
+	r = set_sched_resources(dqm);
+	if (r)
+		goto out_unlock;
+
+	r = map_queues_cpsch(dqm);
+	if (r)
+		goto out_unlock;
+
+	pr_debug("Reserved VMID for trap debug: %i\n", dqm->trap_debug_vmid);
+
+out_unlock:
+	dqm_unlock(dqm);
+	return r;
+}
+
+/*
+ * Releases vmid for the trap debugger
+ */
+int release_debug_trap_vmid(struct device_queue_manager *dqm,
+			struct qcm_process_device *qpd)
+{
+	int r;
+	int updated_vmid_mask;
+	uint32_t trap_debug_vmid;
+
+	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		return -EINVAL;
+	}
+
+	dqm_lock(dqm);
+	trap_debug_vmid = dqm->trap_debug_vmid;
+	if (dqm->trap_debug_vmid == 0) {
+		pr_err("Trap debug id is not reserved\n");
+		r = -EINVAL;
+		goto out_unlock;
+	}
+
+	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD, false);
+	if (r)
+		goto out_unlock;
+
+	updated_vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
+	updated_vmid_mask |= (1 << dqm->dev->vm_info.last_vmid_kfd);
+
+	dqm->dev->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
+	dqm->trap_debug_vmid = 0;
+	r = set_sched_resources(dqm);
+	if (r)
+		goto out_unlock;
+
+	r = map_queues_cpsch(dqm);
+	if (r)
+		goto out_unlock;
+
+	pr_debug("Released VMID for trap debug: %i\n", trap_debug_vmid);
+
+out_unlock:
+	dqm_unlock(dqm);
+	return r;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
@@ -2565,8 +2656,8 @@ int dqm_debugfs_hang_hws(struct device_queue_manager *dqm)
 		return r;
 	}
 	dqm->active_runlist = true;
-	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
-				0, USE_DEFAULT_GRACE_PERIOD);
+	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+			USE_DEFAULT_GRACE_PERIOD);
 	dqm_unlock(dqm);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index fb48b124161f..0cb1504d24cf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -250,6 +250,7 @@ struct device_queue_manager {
 	struct kfd_mem_obj	*fence_mem;
 	bool			active_runlist;
 	int			sched_policy;
+	uint32_t		trap_debug_vmid;
 
 	/* hw exception  */
 	bool			is_hws_hang;
@@ -281,6 +282,10 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
 unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
 unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
+int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
+			struct qcm_process_device *qpd);
+int release_debug_trap_vmid(struct device_queue_manager *dqm,
+			struct qcm_process_device *qpd);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index f0cdc8695b8c..363cf8e005cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -34,6 +34,9 @@ static int pm_map_process_v9(struct packet_manager *pm,
 {
 	struct pm4_mes_map_process *packet;
 	uint64_t vm_page_table_base_addr = qpd->page_table_base;
+	struct kfd_dev *kfd = pm->dqm->dev;
+	struct kfd_process_device *pdd =
+			container_of(qpd, struct kfd_process_device, qpd);
 
 	packet = (struct pm4_mes_map_process *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_mes_map_process));
@@ -49,6 +52,12 @@ static int pm_map_process_v9(struct packet_manager *pm,
 	packet->bitfields14.sdma_enable = 1;
 	packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
 
+	if (kfd->dqm->trap_debug_vmid && pdd->process->debug_trap_enabled &&
+			pdd->process->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
+		packet->bitfields2.debug_vmid = kfd->dqm->trap_debug_vmid;
+		packet->bitfields2.new_debug = 1;
+	}
+
 	packet->sh_mem_config = qpd->sh_mem_config;
 	packet->sh_mem_bases = qpd->sh_mem_bases;
 	if (qpd->tba_addr) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
index 795001c947e1..bb6edbc27de7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -146,7 +146,10 @@ struct pm4_mes_map_process {
 	union {
 		struct {
 			uint32_t pasid:16;
-			uint32_t reserved1:8;
+			uint32_t reserved1:2;
+			uint32_t debug_vmid:4;
+			uint32_t new_debug:1;
+			uint32_t reserved2:1;
 			uint32_t diq_enable:1;
 			uint32_t process_quantum:7;
 		} bitfields2;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 12/29] drm/amdgpu: prepare map process for multi-process debug devices
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (9 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 11/29] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-10-31 16:23 ` [PATCH 13/29] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
                   ` (17 subsequent siblings)
  28 siblings, 0 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Unlike single process debug devices, multi-process debug devices allow
debug mode setting per-VMID (non-device-global).

Because the HWS manages PASID-VMID mapping, the new MAP_PROCESS API allows
the KFD to forward the required SPI debug register write requests.

To request a debug mode setting change, the KFD must be able to
preempt all queues, then remap all queues so that the new settings
carried by MAP_PROCESS take effect.

Note that debug mode is disabled by default on multi-process debug
devices for performance reasons.
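
A sketch of the resulting update flow, using the helpers this patch
adds (new_launch_mode is a hypothetical stand-in for a staged setting):

	r = debug_lock_and_unmap(dqm);	/* preempt with grace period 0 */
	if (!r) {
		/* stage per-VMID settings that MAP_PROCESS will carry */
		pdd->spi_dbg_launch_mode = new_launch_mode;
		r = debug_map_and_unlock(dqm);	/* remap; HWS writes regs */
	}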

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  7 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 50 +++++++++++++++++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c    | 15 ++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  9 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  5 ++
 6 files changed, 89 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 8aa7a3ad4e97..b7ecd603f277 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -32,5 +32,12 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
 			uint32_t *runtime_info_size);
+
+static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
+{
+	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2); /* Aldebaran */
+}
+
 void debug_event_write_work_handler(struct work_struct *work);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 62bb92ef1acd..1634cc2ee202 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2556,6 +2556,56 @@ int release_debug_trap_vmid(struct device_queue_manager *dqm,
 	return r;
 }
 
+int debug_lock_and_unmap(struct device_queue_manager *dqm)
+{
+	int r;
+
+	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		return -EINVAL;
+	}
+
+	if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
+		return 0;
+
+	dqm_lock(dqm);
+
+	r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, 0, false);
+	if (r)
+		dqm_unlock(dqm);
+
+	return r;
+}
+
+int debug_map_and_unlock(struct device_queue_manager *dqm)
+{
+	int r;
+
+	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		return -EINVAL;
+	}
+
+	if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
+		return 0;
+
+	r = map_queues_cpsch(dqm);
+
+	dqm_unlock(dqm);
+
+	return r;
+}
+
+int debug_refresh_runlist(struct device_queue_manager *dqm)
+{
+	int r = debug_lock_and_unmap(dqm);
+
+	if (r)
+		return r;
+
+	return debug_map_and_unlock(dqm);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 0cb1504d24cf..bef3be84c5cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -286,6 +286,9 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
 int release_debug_trap_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
+int debug_lock_and_unmap(struct device_queue_manager *dqm);
+int debug_map_and_unlock(struct device_queue_manager *dqm);
+int debug_refresh_runlist(struct device_queue_manager *dqm);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index 363cf8e005cc..f19c506da23d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -88,6 +88,10 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
 {
 	struct pm4_mes_map_process_aldebaran *packet;
 	uint64_t vm_page_table_base_addr = qpd->page_table_base;
+	struct kfd_dev *kfd = pm->dqm->dev;
+	struct kfd_process_device *pdd =
+			container_of(qpd, struct kfd_process_device, qpd);
+	int i;
 
 	packet = (struct pm4_mes_map_process_aldebaran *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran));
@@ -102,6 +106,17 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
 	packet->bitfields14.num_oac = qpd->num_oac;
 	packet->bitfields14.sdma_enable = 1;
 	packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+	/* TRAP_EN is set on boot so keep it set in non-debug mode. */
+	packet->spi_gdbg_per_vmid_cntl = pdd->spi_dbg_override |
+						pdd->spi_dbg_launch_mode;
+
+	if (pdd->process->debug_trap_enabled) {
+		for (i = 0; i < kfd->device_info.num_of_watch_points; i++)
+			packet->tcp_watch_cntl[i] = pdd->watch_points[i];
+
+		packet->bitfields2.single_memops =
+				!!(pdd->process->dbg_flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP);
+	}
 
 	packet->sh_mem_config = qpd->sh_mem_config;
 	packet->sh_mem_bases = qpd->sh_mem_bases;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a851f814bc9d..6360b365973c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -782,6 +782,12 @@ struct kfd_process_device {
 	uint64_t faults;
 	uint64_t page_in;
 	uint64_t page_out;
+
+	/* Tracks debug per-vmid request settings */
+	uint32_t spi_dbg_override;
+	uint32_t spi_dbg_launch_mode;
+	uint32_t watch_points[4];
+
 	/*
 	 * If this process has been checkpointed before, then the user
 	 * application will use the original gpu_id on the
@@ -918,6 +924,9 @@ struct kfd_process {
 
 	bool xnack_enabled;
 
+	/* Tracks debug per-vmid request for debug flags */
+	bool dbg_flags;
+
 	/* Work area for debugger event writer worker. */
 	struct work_struct debug_event_workarea;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index efb81ccef8f5..a57a55f6ccee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1558,6 +1558,11 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	}
 
 	p->pdds[p->n_pdds++] = pdd;
+	if (kfd_dbg_is_per_vmid_supported(pdd->dev))
+		pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap(
+							pdd->dev->adev,
+							false,
+							0);
 
 	/* Init idr used for memory handle translation */
 	idr_init(&pdd->alloc_idr);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 13/29] drm/amdkfd: add per process hw trap enable and disable functions
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (10 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 12/29] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-10-31 16:23 ` [PATCH 14/29] drm/amdkfd: add raise exception event function Jonathan Kim
                   ` (16 subsequent siblings)
  28 siblings, 0 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

To enable HW debug mode per process, all devices must be debug enabled
successfully.  If a failure occurs, rewind the enablement of debug mode
on the already-enabled devices.
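
This is the usual partial-failure unwind pattern; roughly
(enable_on_device is a hypothetical stand-in for the per-device
enable steps):

	for (i = 0; i < target->n_pdds; i++) {
		r = enable_on_device(target->pdds[i]);
		if (r)
			goto unwind;
		unwind_count++;	/* count only fully enabled devices */
	}
	return 0;
unwind:
	kfd_dbg_trap_deactivate(target, true, unwind_count);
	return r;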

A power management scenario that needs to be considered is HW
debug mode setting during GFXOFF.  During GFXOFF, these registers
are unreachable, so GFXOFF must be transiently disabled while they
are being set.  Also, some devices don't support the RLC save/restore
function for these debug registers, so GFXOFF has to stay disabled
for the entire debug session on those devices.
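
A minimal sketch of the transient case, assuming the touched registers
are GFXOFF-gated (the middle line stands in for the real register
accesses):

	amdgpu_gfx_off_ctrl(adev, false);	/* keep GFX powered up */
	/* read/write the SPI debug mode registers here */
	amdgpu_gfx_off_ctrl(adev, true);	/* allow GFXOFF again */

On devices without RLC save/restore, the second call is deferred until
the debug session ends.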

Cooperative launch also has debugging restrictions based on FW bugs.
Where such bugs exist, the debugger cannot attach to a process that
uses GWS resources, nor can GWS resources be requested while a process
is being debugged.

Also, multi-process debug devices can only enable trap temporaries in
certain runtime scenarios, which will be explained when the runtime
enable functions are implemented in a follow-up patch.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |   5 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 134 +++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  23 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   9 ++
 5 files changed, 170 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d550dbe570fb..aeaedd0efa54 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1450,6 +1450,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
 		goto out_unlock;
 	}
 
+	if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+		retval = -EBUSY;
+		goto out_unlock;
+	}
+
 	retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
 	mutex_unlock(&p->mutex);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index f967f89903f7..f5a5d17cde14 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -21,6 +21,7 @@
  */
 
 #include "kfd_debug.h"
+#include "kfd_device_queue_manager.h"
 #include <linux/file.h>
 
 void debug_event_write_work_handler(struct work_struct *work)
@@ -37,8 +38,59 @@ void debug_event_write_work_handler(struct work_struct *work)
 	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
 }
 
+/* kfd_dbg_trap_deactivate:
+ *	target: target process
+ *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
+ *	unwind_count:
+ *		If unwind == true, how far down the pdd list we need
+ *				to unwind
+ *		else: ignored
+ */
+static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
+{
+	int i, count = 0;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		/* If this is an unwind, and we have unwound the required
+		 * enable calls on the pdd list, we need to stop now
+		 * otherwise we may mess up another debugger session.
+		 */
+		if (unwind && count == unwind_count)
+			break;
+
+		/* GFX off is already disabled by debug activate if not RLC restore supported. */
+		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+		pdd->spi_dbg_override =
+				pdd->dev->kfd2kgd->disable_debug_trap(
+				pdd->dev->adev,
+				target->runtime_info.ttmp_setup,
+				pdd->dev->vm_info.last_vmid_kfd);
+		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+		if (release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
+			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
+
+		debug_refresh_runlist(pdd->dev->dqm);
+
+		count++;
+	}
+}
+
 int kfd_dbg_trap_disable(struct kfd_process *target)
 {
+	/*
+	 * Defer deactivation to runtime if runtime not enabled otherwise reset
+	 * attached running target runtime state to enable for re-attach.
+	 */
+	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+		kfd_dbg_trap_deactivate(target, false, 0);
+	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
+		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
+
 	fput(target->dbg_ev_file);
 	target->dbg_ev_file = NULL;
 
@@ -53,16 +105,88 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
 	return 0;
 }
 
+static int kfd_dbg_trap_activate(struct kfd_process *target)
+{
+	int i, r = 0, unwind_count = 0;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
+
+			if (r) {
+				target->runtime_info.runtime_state = (r == -EBUSY) ?
+							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
+							DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+
+				goto unwind_err;
+			}
+		}
+
+		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
+		 * If RLC restore of debug registers is not supported and runtime enable
+		 * hasn't done so already on ttmp setup request, restore the trap config registers.
+		 *
+		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
+		 * the debug session.
+		 */
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
+						target->runtime_info.ttmp_setup))
+			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
+								pdd->dev->vm_info.last_vmid_kfd);
+
+		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
+					pdd->dev->adev,
+					false,
+					pdd->dev->vm_info.last_vmid_kfd);
+
+		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+		r = debug_refresh_runlist(pdd->dev->dqm);
+		if (r) {
+			target->runtime_info.runtime_state =
+					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+			goto unwind_err;
+		}
+
+		/* Increment unwind_count as the last step */
+		unwind_count++;
+	}
+
+	return 0;
+
+unwind_err:
+	/* Enabling debug failed, we need to disable on
+	 * all GPUs so the enable is all or nothing.
+	 */
+	kfd_dbg_trap_deactivate(target, true, unwind_count);
+	return r;
+}
+
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info, uint32_t *runtime_size)
 {
 	struct file *f;
 	uint32_t copy_size;
-	int r = 0;
+	int i, r = 0;
 
 	if (target->debug_trap_enabled)
 		return -EINVAL;
 
+	/* Enable pre-checks */
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		if (!KFD_IS_SOC15(pdd->dev))
+			return -ENODEV;
+
+		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
+			return -EBUSY;
+	}
+
 	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
 
 	f = fget(fd);
@@ -73,6 +197,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 
 	target->dbg_ev_file = f;
 
+	/* defer activation to runtime if not runtime enabled */
+	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+		kfd_dbg_trap_activate(target);
+
 	/* We already hold the process reference but hold another one for the
 	 * debug session.
 	 */
@@ -82,8 +210,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 	if (target->debugger_process)
 		atomic_inc(&target->debugger_process->debugged_process_count);
 
-	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
+	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
+		kfd_dbg_trap_deactivate(target, false, 0);
 		r = -EFAULT;
+	}
 
 	*runtime_size = sizeof(target->runtime_info);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index b7ecd603f277..1053b7ca24c5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -40,4 +40,27 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 
 void debug_event_write_work_handler(struct work_struct *work);
 
+/*
+ * If GFX off is enabled, chips that do not support RLC restore for the debug
+ * registers will disable GFX off temporarily for the entire debug session.
+ * See disable_on_trap_action_entry and enable_on_trap_action_exit for details.
+ */
+static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_dev *dev)
+{
+	return !(KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 10) || /* Navi10 */
+		 KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1));	 /* Navi14 */
+}
+
+static inline bool kfd_dbg_has_gws_support(struct kfd_dev *dev)
+{
+	return ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
+			&& dev->mec2_fw_version >= 0x81b6) ||
+		(KFD_GC_VERSION(dev) >= IP_VERSION(9, 1, 0)
+			&& KFD_GC_VERSION(dev) <= IP_VERSION(9, 2, 2)
+			&& dev->mec2_fw_version >= 0x1b6) ||
+		(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0)
+			&& dev->mec2_fw_version >= 0x1b6) ||
+		(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1)
+			&& dev->mec2_fw_version >= 0x30));
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 1634cc2ee202..bf4787b4dc6c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -36,6 +36,7 @@
 #include "kfd_kernel_queue.h"
 #include "amdgpu_amdkfd.h"
 #include "mes_api_def.h"
+#include "kfd_debug.h"
 
 /* Size of the per-pipe EOP queue */
 #define CIK_HPD_EOP_BYTES_LOG2 11
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a57a55f6ccee..928fe5c42c1e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1161,6 +1161,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 					struct mm_struct *mm)
 {
 	struct kfd_process *p;
+	int i;
 
 	/*
 	 * The kfd_process structure can not be free because the
@@ -1178,6 +1179,14 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 	cancel_delayed_work_sync(&p->eviction_work);
 	cancel_delayed_work_sync(&p->restore_work);
 
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+
+		/* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */
+		if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup)
+			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+	}
+
 	if (p->debug_trap_enabled)
 		kfd_dbg_trap_disable(p);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 14/29] drm/amdkfd: add raise exception event function
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (11 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 13/29] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-10-31 16:23 ` [PATCH 15/29] drm/amdkfd: add send exception operation Jonathan Kim
                   ` (15 subsequent siblings)
  28 siblings, 0 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Exception events can be generated from interrupts or queue activity.

The raise event function saves the exception status of a queue, device
or process, then notifies the debugger of the status change by writing
to a debugger-polled file descriptor that the debugger provides during
debug attach.

For memory violation exceptions, extra exception data will be saved.

The debugger will be able to query the saved exception states via a
query operation provided by follow-up patches.
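
For example, a device-level memory violation would be raised roughly
as follows (the interrupt-handler call site arrives in a later patch;
exc_data is assumed to be filled in from the fault information):

	struct kfd_hsa_memory_exception_data exc_data;

	/* use_worker = true: callers that cannot write synchronously */
	kfd_dbg_ev_raise(KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
			 p, dev, 0 /* source_id unused for device events */,
			 true, &exc_data, sizeof(exc_data));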

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 90 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  7 ++
 3 files changed, 102 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index f5a5d17cde14..b7a4b2ec5b36 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -38,6 +38,93 @@ void debug_event_write_work_handler(struct work_struct *work)
 	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
 }
 
+/* Update process/device/queue exception status; write to the descriptor
+ * only if the process has subscribed to the raised exception through its
+ * exception_enable_mask.
+ */
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+			struct kfd_process *process, struct kfd_dev *dev,
+			unsigned int source_id, bool use_worker,
+			void *exception_data, size_t exception_data_size)
+{
+	struct process_queue_manager *pqm;
+	struct process_queue_node *pqn;
+	int i;
+	static const char write_data = '.';
+	loff_t pos = 0;
+	bool is_subscribed = true;
+
+	if (!(process && process->debug_trap_enabled))
+		return false;
+
+	mutex_lock(&process->event_mutex);
+
+	if (event_mask & KFD_EC_MASK_DEVICE) {
+		for (i = 0; i < process->n_pdds; i++) {
+			struct kfd_process_device *pdd = process->pdds[i];
+
+			if (pdd->dev != dev)
+				continue;
+
+			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
+
+			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+				if (!pdd->vm_fault_exc_data) {
+					pdd->vm_fault_exc_data = kmemdup(
+							exception_data,
+							exception_data_size,
+							GFP_KERNEL);
+					if (!pdd->vm_fault_exc_data)
+						pr_debug("Failed to allocate exception data memory\n");
+				} else {
+					pr_debug("Debugger exception data not saved\n");
+					print_hex_dump_bytes("exception data: ",
+							DUMP_PREFIX_OFFSET,
+							exception_data,
+							exception_data_size);
+				}
+			}
+			break;
+		}
+	} else if (event_mask & KFD_EC_MASK_PROCESS) {
+		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
+	} else {
+		pqm = &process->pqm;
+		list_for_each_entry(pqn, &pqm->queues,
+				process_queue_list) {
+			int target_id;
+
+			if (!pqn->q)
+				continue;
+
+			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
+					pqn->q->properties.queue_id :
+							pqn->q->doorbell_id;
+
+			if (pqn->q->device != dev || target_id != source_id)
+				continue;
+
+			pqn->q->properties.exception_status |= event_mask;
+			break;
+		}
+	}
+
+	if (process->exception_enable_mask & event_mask) {
+		if (use_worker)
+			schedule_work(&process->debug_event_workarea);
+		else
+			kernel_write(process->dbg_ev_file,
+					&write_data,
+					1,
+					&pos);
+	} else {
+		is_subscribed = false;
+	}
+
+	mutex_unlock(&process->event_mutex);
+
+	return is_subscribed;
+}
+
 /* kfd_dbg_trap_deactivate:
  *	target: target process
  *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
@@ -50,6 +137,9 @@ static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int
 {
 	int i, count = 0;
 
+	if (!unwind)
+		cancel_work_sync(&target->debug_event_workarea);
+
 	for (i = 0; i < target->n_pdds; i++) {
 		struct kfd_process_device *pdd = target->pdds[i];
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 1053b7ca24c5..a288ca1941a6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -28,6 +28,11 @@
 void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
 					uint32_t vmid,
 					bool stall);
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+			struct kfd_process *process, struct kfd_dev *dev,
+			unsigned int source_id, bool use_worker,
+			void *exception_data,
+			size_t exception_data_size);
 int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6360b365973c..40a695619eab 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -493,6 +493,7 @@ struct queue_properties {
 	uint32_t ctl_stack_size;
 	uint64_t tba_addr;
 	uint64_t tma_addr;
+	uint64_t exception_status;
 };
 
 #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
@@ -783,6 +784,11 @@ struct kfd_process_device {
 	uint64_t page_in;
 	uint64_t page_out;
 
+	/* Exception code status */
+	uint64_t exception_status;
+	void *vm_fault_exc_data;
+	size_t vm_fault_exc_data_size;
+
 	/* Tracks debug per-vmid request settings */
 	uint32_t spi_dbg_override;
 	uint32_t spi_dbg_launch_mode;
@@ -918,6 +924,7 @@ struct kfd_process {
 
 	/* Exception code enable mask and status */
 	uint64_t exception_enable_mask;
+	uint64_t exception_status;
 
 	/* shared virtual memory registered by this process */
 	struct svm_range_list svms;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 15/29] drm/amdkfd: add send exception operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (12 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 14/29] drm/amdkfd: add raise exception event function Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-10-31 16:23 ` [PATCH 16/29] drm/amdkfd: add runtime enable operation Jonathan Kim
                   ` (14 subsequent siblings)
  28 siblings, 0 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Add a debug operation that allows the debugger to send an exception
directly to runtime through a payload address.

For memory violations, normal vm fault signals will be applied to
notify the runtime instead, passing in the exception data that was
saved when the memory violation was first raised to the debugger.

For runtime exceptions, this will unblock the runtime enable
function, which will be explained and implemented in a follow-up
patch.
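
As a sketch (not part of this patch), the debugger side of this
operation reduces to an ioctl like the following; kfd_fd is the open
/dev/kfd descriptor and the pid field name is an assumption about the
args layout introduced earlier in the series:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static int send_runtime_event(int kfd_fd, int target_pid,
			      uint32_t gpu_id, uint32_t queue_id)
{
	struct kfd_ioctl_dbg_trap_args args;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;		/* assumed field name */
	args.op = KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT;
	args.send_runtime_event.gpu_id = gpu_id;
	args.send_runtime_event.queue_id = queue_id;
	args.send_runtime_event.exception_mask =
			KFD_EC_MASK(EC_PROCESS_RUNTIME);

	/* Wakes a target blocked in its runtime enable/disable call. */
	return ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
}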

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../gpu/drm/amd/amdkfd/cik_event_interrupt.c  |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 43 +++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c       |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 71 ++++++++++++++++++-
 8 files changed, 134 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 5c8023cba196..62a38cd820fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -118,9 +118,9 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 			return;
 
 		if (info.vmid == vmid)
-			kfd_signal_vm_fault_event(dev, pasid, &info);
+			kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
 		else
-			kfd_signal_vm_fault_event(dev, pasid, NULL);
+			kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index aeaedd0efa54..4b4c4200d8fb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2739,6 +2739,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		r = kfd_dbg_trap_disable(target);
 		break;
 	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+		r = kfd_dbg_send_exception_to_runtime(target,
+				args->send_runtime_event.gpu_id,
+				args->send_runtime_event.queue_id,
+				args->send_runtime_event.exception_mask);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index b7a4b2ec5b36..87a23b1d4d49 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,49 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
 	return is_subscribed;
 }
 
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+					unsigned int dev_id,
+					unsigned int queue_id,
+					uint64_t error_reason)
+{
+	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+		struct kfd_process_device *pdd = NULL;
+		struct kfd_hsa_memory_exception_data *data;
+		int i;
+
+		for (i = 0; i < p->n_pdds; i++) {
+			if (p->pdds[i]->dev->id == dev_id) {
+				pdd = p->pdds[i];
+				break;
+			}
+		}
+
+		if (!pdd)
+			return -ENODEV;
+
+		data = (struct kfd_hsa_memory_exception_data *)
+						pdd->vm_fault_exc_data;
+
+		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
+		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
+		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
+	}
+
+	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
+		/*
+		 * block should only happen after the debugger receives runtime
+		 * enable notice.
+		 */
+		up(&p->runtime_enable_sema);
+		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
+	}
+
+	if (error_reason)
+		return kfd_send_exception_to_runtime(p, queue_id, error_reason);
+
+	return 0;
+}
+
 /* kfd_dbg_trap_deactivate:
  *	target: target process
  *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index a288ca1941a6..8aa52cc3af17 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -38,6 +38,11 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
 			uint32_t *runtime_info_size);
 
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+					unsigned int dev_id,
+					unsigned int queue_id,
+					uint64_t error_reason);
+
 static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 {
 	return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2); /* Aldebaran */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 83e3ce9f6049..6958c5389fbe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1226,7 +1226,8 @@ void kfd_signal_hw_exception_event(u32 pasid)
 }
 
 void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
-				struct kfd_vm_fault_info *info)
+				struct kfd_vm_fault_info *info,
+				struct kfd_hsa_memory_exception_data *data)
 {
 	struct kfd_event *ev;
 	uint32_t id;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 0b75a37b689b..e092563f22de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -362,7 +362,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 
 		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_dqm_evict_pasid(dev->dqm, pasid);
-		kfd_signal_vm_fault_event(dev, pasid, &info);
+		kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 40a695619eab..b69f2f94a50e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -942,6 +942,7 @@ struct kfd_process {
 	bool queues_paused;
 
 	/* Tracks runtime enable status */
+	struct semaphore runtime_enable_sema;
 	struct kfd_runtime_info runtime_info;
 
 };
@@ -1391,7 +1392,8 @@ int kfd_get_num_events(struct kfd_process *p);
 int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
 
 void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
-				struct kfd_vm_fault_info *info);
+				struct kfd_vm_fault_info *info,
+				struct kfd_hsa_memory_exception_data *data);
 
 void kfd_signal_reset_event(struct kfd_dev *dev);
 
@@ -1407,6 +1409,9 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
 	       KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0);
 }
 
+int kfd_send_exception_to_runtime(struct kfd_process *p,
+				unsigned int queue_id,
+				uint64_t error_reason);
 bool kfd_is_locked(void);
 
 /* Compute profile */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 928fe5c42c1e..59c4c38833b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1405,6 +1405,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	process->debugger_process = NULL;
 	process->exception_enable_mask = 0;
 	atomic_set(&process->debugged_process_count, 0);
+	sema_init(&process->runtime_enable_sema, 0);
 
 	process->pasid = kfd_pasid_alloc();
 	if (process->pasid == 0) {
@@ -2046,6 +2047,75 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
 	}
 }
 
+struct send_exception_work_handler_workarea {
+	struct work_struct work;
+	struct kfd_process *p;
+	unsigned int queue_id;
+	uint64_t error_reason;
+};
+
+static void send_exception_work_handler(struct work_struct *work)
+{
+	struct send_exception_work_handler_workarea *workarea;
+	struct kfd_process *p;
+	struct queue *q;
+	struct mm_struct *mm;
+	struct kfd_context_save_area_header __user *csa_header;
+	uint64_t __user *err_payload_ptr;
+	uint64_t cur_err;
+	uint32_t ev_id;
+
+	workarea = container_of(work,
+				struct send_exception_work_handler_workarea,
+				work);
+	p = workarea->p;
+
+	mm = get_task_mm(p->lead_thread);
+
+	if (!mm)
+		return;
+
+	kthread_use_mm(mm);
+
+	q = pqm_get_user_queue(&p->pqm, workarea->queue_id);
+
+	if (!q)
+		goto out;
+
+	csa_header = (void __user *)q->properties.ctx_save_restore_area_address;
+
+	get_user(err_payload_ptr, (uint64_t __user **)&csa_header->err_payload_addr);
+	get_user(cur_err, err_payload_ptr);
+	cur_err |= workarea->error_reason;
+	put_user(cur_err, err_payload_ptr);
+	get_user(ev_id, &csa_header->err_event_id);
+
+	kfd_set_event(p, ev_id);
+
+out:
+	kthread_unuse_mm(mm);
+	mmput(mm);
+}
+
+int kfd_send_exception_to_runtime(struct kfd_process *p,
+			unsigned int queue_id,
+			uint64_t error_reason)
+{
+	struct send_exception_work_handler_workarea worker;
+
+	INIT_WORK_ONSTACK(&worker.work, send_exception_work_handler);
+
+	worker.p = p;
+	worker.queue_id = queue_id;
+	worker.error_reason = error_reason;
+
+	schedule_work(&worker.work);
+	flush_work(&worker.work);
+	destroy_work_on_stack(&worker.work);
+
+	return 0;
+}
+
 struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id)
 {
 	int i;
@@ -2105,4 +2175,3 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
 }
 
 #endif
-
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 16/29] drm/amdkfd: add runtime enable operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (13 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 15/29] drm/amdkfd: add send exception operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-23  0:52   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA Jonathan Kim
                   ` (13 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

This operation coordinates the debugger with the target HSA runtime
process.

The main motive for this coordination is the CP performance overhead
incurred when enabling trap temporaries via SPI_GDBG_PER_VMID_CNTL.Trap_en.
This overhead is unacceptable for microbenchmark performance in normal mode
for certain customers.

ROCr allows the user to bypass trap temporary setup through the
HSA_ENABLE_DEBUG environment variable.  As a result, the debugger has
to consider two scenarios.

For the first scenario, if the runtime enable of the target has already
occurred prior to the debugger attaching, then the debugger will go ahead
and set up the trap temporaries whether the runtime has requested them or
not.  The debugger will be able to query the runtime status on attach.

For the second scenario where the debugger spawns the target process,
it will have to wait for ROCr's runtime enable request from the target.
The runtime enable request will be able to see that its process has been
debug attached.  It then enables the trap temporaries since it now
knows it's in debug mode, raises an EC_PROCESS_RUNTIME signal to the
debugger, then waits for the debugger's response. Once the debugger has
received the runtime signal, it will wake the target process.

In addition, a restriction must be enforced with runtime enable and
HW debug mode setting: the debugger must first ensure that HW debug
mode has been enabled before permitting HW debug mode operations.

With single process debug devices, allowing the debugger to set debug
HW modes prior to trap activation means that debug HW mode setting can
occur before the KFD has reserved the debug VMID (0xf) from the hardware
scheduler's VMID allocation resource pool.  This can result in the
hardware scheduler assigning VMID 0xf to a non-debugged process and
having that process inherit debug HW mode settings intended for the
debugged target process instead, which is both incorrect and potentially
fatal for normal mode operation.

With multi process debug devices, allowing the debugger to set debug
HW modes prior to trap activation means that non-debugged processes
migrating to a new VMID could inherit unintended debug settings.

All debug operations that touch HW settings must therefore require trap
activation, where trap activation is triggered by both debug attach and
runtime enablement (the target has KFD opened and is ready to dispatch work).
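
For illustration, the runtime side of this handshake reduces to an
ioctl like the sketch below; the r_debug value is whatever ROCr
publishes for the debugger and is only an assumption here:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static int hsa_runtime_enable(int kfd_fd, uint64_t r_debug_va,
			      int want_ttmp_setup)
{
	struct kfd_ioctl_runtime_enable_args args = {0};

	args.r_debug = r_debug_va;
	args.mode_mask = KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK;
	if (want_ttmp_setup)	/* e.g. HSA_ENABLE_DEBUG is set */
		args.mode_mask |= KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK;

	/* If a debugger is attached, this blocks in runtime_enable()
	 * until the debugger answers with SEND_RUNTIME_EVENT.
	 */
	return ioctl(kfd_fd, AMDKFD_IOC_RUNTIME_ENABLE, &args);
}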

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 144 ++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |   4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   1 +
 4 files changed, 148 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4b4c4200d8fb..27cd5af72521 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2655,11 +2655,141 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
 	return ret;
 }
 
-static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
+static int runtime_enable(struct kfd_process *p, uint64_t r_debug,
+			bool enable_ttmp_setup)
 {
+	int i = 0, ret = 0;
+
+	if (p->is_runtime_retry)
+		goto retry;
+
+	if (p->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
+		return -EBUSY;
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+
+		if (pdd->qpd.queue_count)
+			return -EEXIST;
+	}
+
+	p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
+	p->runtime_info.r_debug = r_debug;
+	p->runtime_info.ttmp_setup = enable_ttmp_setup;
+
+	if (p->runtime_info.ttmp_setup) {
+		for (i = 0; i < p->n_pdds; i++) {
+			struct kfd_process_device *pdd = p->pdds[i];
+
+			if (!kfd_dbg_is_rlc_restore_supported(pdd->dev)) {
+				amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+				pdd->dev->kfd2kgd->enable_debug_trap(
+						pdd->dev->adev,
+						true,
+						pdd->dev->vm_info.last_vmid_kfd);
+			}
+
+			if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+				pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
+						pdd->dev->adev,
+						false,
+						pdd->dev->vm_info.last_vmid_kfd);
+
+				debug_refresh_runlist(pdd->dev->dqm);
+			}
+		}
+	}
+
+retry:
+	if (p->debug_trap_enabled) {
+		if (!p->is_runtime_retry) {
+			kfd_dbg_trap_activate(p);
+			kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
+					p, NULL, 0, false, NULL, 0);
+		}
+
+		mutex_unlock(&p->mutex);
+		ret = down_interruptible(&p->runtime_enable_sema);
+		mutex_lock(&p->mutex);
+
+		p->is_runtime_retry = !!ret;
+	}
+
+	return ret;
+}
+
+static int runtime_disable(struct kfd_process *p)
+{
+	int i = 0, ret;
+	bool was_enabled = p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED;
+
+	p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_DISABLED;
+	p->runtime_info.r_debug = 0;
+
+	if (p->debug_trap_enabled) {
+		if (was_enabled)
+			kfd_dbg_trap_deactivate(p, false, 0);
+
+		if (!p->is_runtime_retry)
+			kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
+					p, NULL, 0, false, NULL, 0);
+
+		mutex_unlock(&p->mutex);
+		ret = down_interruptible(&p->runtime_enable_sema);
+		mutex_lock(&p->mutex);
+
+		p->is_runtime_retry = !!ret;
+		if (ret)
+			return ret;
+	}
+
+	if (was_enabled && p->runtime_info.ttmp_setup) {
+		for (i = 0; i < p->n_pdds; i++) {
+			struct kfd_process_device *pdd = p->pdds[i];
+
+			if (!kfd_dbg_is_rlc_restore_supported(pdd->dev))
+				amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+		}
+	}
+
+	p->runtime_info.ttmp_setup = false;
+
+	/* disable DISPATCH_PTR save */
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+
+		if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+			pdd->spi_dbg_override =
+					pdd->dev->kfd2kgd->disable_debug_trap(
+					pdd->dev->adev,
+					false,
+					pdd->dev->vm_info.last_vmid_kfd);
+
+			debug_refresh_runlist(pdd->dev->dqm);
+		}
+	}
+
 	return 0;
 }
 
+static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_runtime_enable_args *args = data;
+	int r;
+
+	mutex_lock(&p->mutex);
+
+	if (args->mode_mask & KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK)
+		r = runtime_enable(p, args->r_debug,
+				!!(args->mode_mask & KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK));
+	else
+		r = runtime_disable(p);
+
+	mutex_unlock(&p->mutex);
+
+	return r;
+}
+
 static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_dbg_trap_args *args = data;
@@ -2721,6 +2851,18 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		goto unlock_out;
 	}
 
+	if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_ENABLED &&
+			(args->op == KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE ||
+			 args->op == KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE ||
+			 args->op == KFD_IOC_DBG_TRAP_SUSPEND_QUEUES ||
+			 args->op == KFD_IOC_DBG_TRAP_RESUME_QUEUES ||
+			 args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
+			 args->op == KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH ||
+			 args->op == KFD_IOC_DBG_TRAP_SET_FLAGS)) {
+		r = -EPERM;
+		goto unlock_out;
+	}
+
 	switch (args->op) {
 	case KFD_IOC_DBG_TRAP_ENABLE:
 		if (target != p)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 87a23b1d4d49..ae6e701a2656 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -176,7 +176,7 @@ int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
  *				to unwind
  *		else: ignored
  */
-static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
+void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
 {
 	int i, count = 0;
 
@@ -238,7 +238,7 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
 	return 0;
 }
 
-static int kfd_dbg_trap_activate(struct kfd_process *target)
+int kfd_dbg_trap_activate(struct kfd_process *target)
 {
 	int i, r = 0, unwind_count = 0;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 8aa52cc3af17..e31c9bb0e848 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -28,6 +28,8 @@
 void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
 					uint32_t vmid,
 					bool stall);
+void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
+int kfd_dbg_trap_activate(struct kfd_process *target);
 bool kfd_dbg_ev_raise(uint64_t event_mask,
 			struct kfd_process *process, struct kfd_dev *dev,
 			unsigned int source_id, bool use_worker,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index b69f2f94a50e..9690a2adb9ed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -943,6 +943,7 @@ struct kfd_process {
 
 	/* Tracks runtime enable status */
 	struct semaphore runtime_enable_sema;
+	bool is_runtime_retry;
 	struct kfd_runtime_info runtime_info;
 
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (14 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 16/29] drm/amdkfd: add runtime enable operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-23  0:44   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 18/29] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
                   ` (12 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

From: Jay Cornwall <jay.cornwall@amd.com>

Trap handler behavior will differ when a debugger is attached.

Make the debug trap flag available in the trap handler TMA.
Update it when the debug trap ioctl is invoked.
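
For reference, a standalone sketch of the flag placement this relies
on (KFD_CWSR_TMA_OFFSET is the kernel constant used by the helper
added below and is only illustrative here):

#include <stddef.h>
#include <stdint.h>

/* The flag occupies the third 64-bit word of the TMA.  The driver
 * writes it through the CWSR kernel mapping; the trap handler reads
 * the same memory through its TMA pointer.
 */
static void set_trap_debug_flag(void *cwsr_kaddr, size_t tma_offset,
				int enabled)
{
	uint64_t *tma = (uint64_t *)((char *)cwsr_kaddr + tma_offset);

	tma[2] = enabled;
}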

v3: Rebase for upstream

v2:
Add missing debug flag setup on APUs

Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |  4 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16 ++++++++++++++++
 3 files changed, 22 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index ae6e701a2656..d4f87f2adada 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -193,6 +193,8 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 		if (unwind && count == unwind_count)
 			break;
 
+		kfd_process_set_trap_debug_flag(&pdd->qpd, false);
+
 		/* GFX off is already disabled by debug activate if not RLC restore supported. */
 		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
 			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
@@ -278,6 +280,8 @@ int kfd_dbg_trap_activate(struct kfd_process *target)
 		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
 			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 
+		kfd_process_set_trap_debug_flag(&pdd->qpd, true);
+
 		r = debug_refresh_runlist(pdd->dev->dqm);
 		if (r) {
 			target->runtime_info.runtime_state =
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 9690a2adb9ed..82b28588ab72 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1101,6 +1101,8 @@ int kfd_init_apertures(struct kfd_process *process);
 void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tba_addr,
 				  uint64_t tma_addr);
+void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
+				     bool enabled);
 
 /* CWSR initialization */
 int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 59c4c38833b6..d62e0c62df76 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1252,6 +1252,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
 
 		memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
 
+		kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
+
 		qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
 		pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
 			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
@@ -1288,6 +1290,9 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
 
 	memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
 
+	kfd_process_set_trap_debug_flag(&pdd->qpd,
+					pdd->process->debug_trap_enabled);
+
 	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
 	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
 		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
@@ -1374,6 +1379,17 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
 	return true;
 }
 
+void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
+				     bool enabled)
+{
+	/* If the TMA doesn't exist yet, the flag will be set during allocation. */
+	if (qpd->cwsr_kaddr) {
+		uint64_t *tma =
+			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
+		tma[2] = enabled;
+	}
+}
+
 /*
  * On return the kfd_process is fully operational and will be freed when the
  * mm is released
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 18/29] drm/amdkfd: update process interrupt handling for debug events
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (15 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-10-31 16:23 ` [PATCH 19/29] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
                   ` (11 subsequent siblings)
  28 siblings, 0 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

The debugger must be notified by any debugger subscribed exception
that comes from hardware interrupts.

If a debugger session exits, any exceptions it subscribed to may still
have interrupts in the interrupt ring buffer or KGD/KFD pipeline.
To prevent a new session from inheriting stale interrupts, when a new
queue is created, open an interrupt drain and allow the IH ring to drain
from a timestamped checkpoint.  Then inject a custom IV so that once
the custom IV is picked up by the KFD, it's safe to close the drain
and proceed with queue creation.

The drain must also happen on debug disable, as SW interrupts may still
be processed.  Drain at this time and clear all exception status.

The debugger may also not be attached or subscribed to certain
exceptions, so forward those directly to the runtime.

GFX10 also requires its own IV processing, hence the creation of
kfd_int_process_v10.c.  This is because the IVs from SQ interrupts are
packed into a new contiguous format, unlike GFX9.  To make this clear,
a separate interrupt handling code file was created.

v2: fix interrupt drain on debug disable.
fix interrupt drain on queue create during -ERESTARTSYS.
fix up macro naming for ECODE parsing.
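
To make the bit layout concrete, a standalone sketch of decoding the
debugger's s_sendmsg payload with the GFX10 masks added below (the
sample value is chosen purely for illustration):

#include <stdint.h>
#include <stdio.h>

#define DEBUG_DOORBELL_MASK	0x0003ff	/* context_id0[9:0]   */
#define DEBUG_TRAP_CODE_MASK	0x07fc00	/* context_id0[18:10] */
#define DEBUG_TRAP_CODE_SHIFT	10

int main(void)
{
	uint32_t ctxid0 = (3u << DEBUG_TRAP_CODE_SHIFT) | 42u;

	/* prints: doorbell_id 42, trap_code 3 */
	printf("doorbell_id %u, trap_code %u\n",
	       ctxid0 & DEBUG_DOORBELL_MASK,
	       (ctxid0 & DEBUG_TRAP_CODE_MASK) >> DEBUG_TRAP_CODE_SHIFT);
	return 0;
}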

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  16 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |   2 +
 drivers/gpu/drm/amd/amdkfd/Makefile           |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        |  85 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |   6 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |   4 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  | 405 ++++++++++++++++++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  98 ++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  12 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  47 ++
 .../amd/amdkfd/kfd_process_queue_manager.c    |   4 +
 11 files changed, 670 insertions(+), 10 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0561812aa0a4..bcd9cd990334 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -758,6 +758,22 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
 	amdgpu_umc_poison_handler(adev, &err_data, reset);
 }
 
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+					uint32_t *payload)
+{
+	int ret;
+
+	/* Device or IH ring is not ready so bail. */
+	ret = amdgpu_ih_wait_on_checkpoint_process_ts(adev, &adev->irq.ih);
+	if (ret)
+		return ret;
+
+	/* Send payload to fence KFD interrupts */
+	amdgpu_amdkfd_interrupt(adev, payload);
+
+	return 0;
+}
+
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
 {
 	if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 647220a8762d..5ba68ec08199 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -236,6 +236,8 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
 					    struct amdgpu_device *src,
 					    bool is_min);
 int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_min);
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+					uint32_t *payload);
 
 /* Read user wptr from a specified user address space with page fault
  * disabled. The memory must be pinned and mapped to the hardware when
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 747754428073..2ec8f27c5366 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_events.o \
 		$(AMDKFD_PATH)/cik_event_interrupt.o \
 		$(AMDKFD_PATH)/kfd_int_process_v9.o \
+		$(AMDKFD_PATH)/kfd_int_process_v10.o \
 		$(AMDKFD_PATH)/kfd_int_process_v11.o \
 		$(AMDKFD_PATH)/kfd_smi_events.o \
 		$(AMDKFD_PATH)/kfd_crat.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index d4f87f2adada..3d304e8c286e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,65 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
 	return is_subscribed;
 }
 
+/* set pending event queue entry from ring entry  */
+bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
+				   unsigned int pasid,
+				   uint32_t doorbell_id,
+				   uint64_t trap_mask,
+				   void *exception_data,
+				   size_t exception_data_size)
+{
+	struct kfd_process *p;
+	bool signaled_to_debugger_or_runtime = false;
+
+	p = kfd_lookup_process_by_pasid(pasid);
+
+	if (!p)
+		return false;
+
+	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
+					exception_data, exception_data_size)) {
+		struct process_queue_manager *pqm;
+		struct process_queue_node *pqn;
+
+		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
+				p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
+			mutex_lock(&p->mutex);
+
+			pqm = &p->pqm;
+			list_for_each_entry(pqn, &pqm->queues,
+							process_queue_list) {
+
+				if (!(pqn->q && pqn->q->device == dev &&
+						pqn->q->doorbell_id == doorbell_id))
+					continue;
+
+				kfd_send_exception_to_runtime(p,
+						pqn->q->properties.queue_id,
+						trap_mask);
+
+				signaled_to_debugger_or_runtime = true;
+
+				break;
+			}
+
+			mutex_unlock(&p->mutex);
+		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
+			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
+							exception_data);
+
+			signaled_to_debugger_or_runtime = true;
+		}
+	} else {
+		signaled_to_debugger_or_runtime = true;
+	}
+
+	kfd_unref_process(p);
+
+	return signaled_to_debugger_or_runtime;
+}
+
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
 					unsigned int queue_id,
@@ -215,6 +274,31 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 	}
 }
 
+static void kfd_dbg_clean_exception_status(struct kfd_process *target)
+{
+	struct process_queue_manager *pqm;
+	struct process_queue_node *pqn;
+	int i;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		kfd_process_drain_interrupts(pdd);
+
+		pdd->exception_status = 0;
+	}
+
+	pqm = &target->pqm;
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		if (!pqn->q)
+			continue;
+
+		pqn->q->properties.exception_status = 0;
+	}
+
+	target->exception_status = 0;
+}
+
 int kfd_dbg_trap_disable(struct kfd_process *target)
 {
 	/*
@@ -235,6 +319,7 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
 	}
 
 	target->debug_trap_enabled = false;
+	kfd_dbg_clean_exception_status(target);
 	kfd_unref_process(target);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index e31c9bb0e848..5270d5749828 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -30,6 +30,12 @@ void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
 					bool stall);
 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
 int kfd_dbg_trap_activate(struct kfd_process *target);
+bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
+				   unsigned int pasid,
+				   uint32_t doorbell_id,
+				   uint64_t trap_mask,
+				   void *exception_data,
+				   size_t exception_data_size);
 bool kfd_dbg_ev_raise(uint64_t event_mask,
 			struct kfd_process *process, struct kfd_dev *dev,
 			unsigned int source_id, bool use_worker,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index be7a0b5a2dbc..b0ee8b2a4cb9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -135,6 +135,8 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
 	case IP_VERSION(9, 4, 0): /* VEGA20 */
 	case IP_VERSION(9, 4, 1): /* ARCTURUS */
 	case IP_VERSION(9, 4, 2): /* ALDEBARAN */
+		kfd->device_info.event_interrupt_class = &event_interrupt_class_v9;
+		break;
 	case IP_VERSION(10, 3, 1): /* VANGOGH */
 	case IP_VERSION(10, 3, 3): /* YELLOW_CARP */
 	case IP_VERSION(10, 3, 6): /* GC 10.3.6 */
@@ -148,7 +150,7 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
 	case IP_VERSION(10, 3, 2): /* NAVY_FLOUNDER */
 	case IP_VERSION(10, 3, 4): /* DIMGREY_CAVEFISH */
 	case IP_VERSION(10, 3, 5): /* BEIGE_GOBY */
-		kfd->device_info.event_interrupt_class = &event_interrupt_class_v9;
+		kfd->device_info.event_interrupt_class = &event_interrupt_class_v10;
 		break;
 	case IP_VERSION(11, 0, 0):
 	case IP_VERSION(11, 0, 1):
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
new file mode 100644
index 000000000000..e1c0bf313237
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfd_events.h"
+#include "kfd_debug.h"
+#include "soc15_int.h"
+#include "kfd_device_queue_manager.h"
+
+/*
+ * GFX10 SQ Interrupts
+ *
+ * There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit
+ * packet to the Interrupt Handler:
+ * Auto - Generated by the SQG (various cmd overflows, timestamps etc)
+ * Wave - Generated by S_SENDMSG through a shader program
+ * Error - HW generated errors (Illegal instructions, Memviols, EDC etc)
+ *
+ * The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus
+ * 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such:
+ *
+ * - context_id1[7:6]
+ * Encoding type (0 = Auto, 1 = Wave, 2 = Error)
+ *
+ * - context_id0[24]
+ * PRIV bit indicates that Wave S_SEND or error occurred within trap
+ *
+ * - context_id0[22:0]
+ * 23-bit data with the following layout per encoding type:
+ * Auto - only context_id0[8:0] is used, which reports various interrupts
+ * generated by SQG.  The rest is 0.
+ * Wave - user data sent from m0 via S_SENDMSG
+ * Error - Error type (context_id0[22:19]), Error Details (rest of bits)
+ *
+ * The other context_id bits show coordinates (SE/SH/CU/SIMD/WGP) for wave
+ * S_SENDMSG and Errors.  These are 0 for Auto.
+ */
+
+enum SQ_INTERRUPT_WORD_ENCODING {
+	SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
+	SQ_INTERRUPT_WORD_ENCODING_INST,
+	SQ_INTERRUPT_WORD_ENCODING_ERROR,
+};
+
+enum SQ_INTERRUPT_ERROR_TYPE {
+	SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0,
+	SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST,
+	SQ_INTERRUPT_ERROR_TYPE_MEMVIOL,
+	SQ_INTERRUPT_ERROR_TYPE_EDC_FED,
+};
+
+/* SQ_INTERRUPT_WORD_AUTO_CTXID */
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE__SHIFT 0
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT__SHIFT 1
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL__SHIFT 2
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL__SHIFT 3
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR__SHIFT 7
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__SE_ID__SHIFT 4
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING__SHIFT 6
+
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_MASK 0x00000001
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT_MASK 0x00000002
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL_MASK 0x00000004
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL_MASK 0x00000008
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR_MASK 0x00000080
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__SE_ID_MASK 0x030
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING_MASK 0x0c0
+
+/* SQ_INTERRUPT_WORD_WAVE_CTXID */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA__SHIFT 0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID__SHIFT 23
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV__SHIFT 24
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID__SHIFT 25
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SIMD_ID__SHIFT 30
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID__SHIFT 0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SE_ID__SHIFT 4
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING__SHIFT 6
+
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA_MASK 0x000007fffff
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID_MASK 0x0000800000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK 0x00001000000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID_MASK 0x0003e000000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SIMD_ID_MASK 0x000c0000000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID_MASK 0x00f
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SE_ID_MASK 0x030
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING_MASK 0x0c0
+
+#define KFD_CTXID0__ERR_TYPE_MASK 0x780000
+#define KFD_CTXID0__ERR_TYPE__SHIFT 19
+
+/* GFX10 SQ interrupt ENC type bit (context_id1[7:6]) for wave s_sendmsg */
+#define KFD_CONTEXT_ID1_ENC_TYPE_WAVE_MASK	0x40
+/* GFX10 SQ interrupt PRIV bit (context_id0[24]) for s_sendmsg inside trap */
+#define KFD_CONTEXT_ID0_PRIV_MASK		0x1000000
+/*
+ * The debugger will send user data(m0) with PRIV=1 to indicate it requires
+ * notification from the KFD with the following queue id (DOORBELL_ID) and
+ * trap code (TRAP_CODE).
+ */
+#define KFD_CONTEXT_ID0_DEBUG_DOORBELL_MASK	0x0003ff
+#define KFD_CONTEXT_ID0_DEBUG_TRAP_CODE_SHIFT	10
+#define KFD_CONTEXT_ID0_DEBUG_TRAP_CODE_MASK	0x07fc00
+#define KFD_DEBUG_DOORBELL_ID(ctxid0)	((ctxid0) &	\
+				KFD_CONTEXT_ID0_DEBUG_DOORBELL_MASK)
+#define KFD_DEBUG_TRAP_CODE(ctxid0)	(((ctxid0) &	\
+				KFD_CONTEXT_ID0_DEBUG_TRAP_CODE_MASK)	\
+				>> KFD_CONTEXT_ID0_DEBUG_TRAP_CODE_SHIFT)
+#define KFD_DEBUG_CP_BAD_OP_ECODE_MASK		0x3fffc00
+#define KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT		10
+#define KFD_DEBUG_CP_BAD_OP_ECODE(ctxid0) (((ctxid0) &			\
+				KFD_DEBUG_CP_BAD_OP_ECODE_MASK)		\
+				>> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT)
+
+static void event_interrupt_poison_consumption(struct kfd_dev *dev,
+				uint16_t pasid, uint16_t client_id)
+{
+	int old_poison, ret = -EINVAL;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+	if (!p)
+		return;
+
+	/* all queues of a process will be unmapped at once */
+	old_poison = atomic_cmpxchg(&p->poison, 0, 1);
+	kfd_unref_process(p);
+	if (old_poison)
+		return;
+
+	switch (client_id) {
+	case SOC15_IH_CLIENTID_SE0SH:
+	case SOC15_IH_CLIENTID_SE1SH:
+	case SOC15_IH_CLIENTID_SE2SH:
+	case SOC15_IH_CLIENTID_SE3SH:
+	case SOC15_IH_CLIENTID_UTCL2:
+		ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+		break;
+	case SOC15_IH_CLIENTID_SDMA0:
+	case SOC15_IH_CLIENTID_SDMA1:
+	case SOC15_IH_CLIENTID_SDMA2:
+	case SOC15_IH_CLIENTID_SDMA3:
+	case SOC15_IH_CLIENTID_SDMA4:
+		break;
+	default:
+		break;
+	}
+
+	kfd_signal_poison_consumed_event(dev, pasid);
+
+	/* If resetting the queue succeeds, do page retirement without gpu
+	 * reset; if it fails, fall back to the gpu reset solution.
+	 */
+	if (!ret) {
+		dev_warn(dev->adev->dev,
+			"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
+			client_id);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+	} else {
+		dev_warn(dev->adev->dev,
+			"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
+			client_id);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+	}
+}
+
+static bool event_interrupt_isr_v10(struct kfd_dev *dev,
+					const uint32_t *ih_ring_entry,
+					uint32_t *patched_ihre,
+					bool *patched_flag)
+{
+	uint16_t source_id, client_id, pasid, vmid;
+	const uint32_t *data = ih_ring_entry;
+
+	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+
+	/* Only handle interrupts from KFD VMIDs */
+	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+	if (!KFD_IRQ_IS_FENCE(client_id, source_id) &&
+	   (vmid < dev->vm_info.first_vmid_kfd ||
+	    vmid > dev->vm_info.last_vmid_kfd))
+		return false;
+
+	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+
+	/* Only handle clients we care about */
+	if (client_id != SOC15_IH_CLIENTID_GRBM_CP &&
+	    client_id != SOC15_IH_CLIENTID_SDMA0 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA1 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA2 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA3 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA4 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA5 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA6 &&
+	    client_id != SOC15_IH_CLIENTID_SDMA7 &&
+	    client_id != SOC15_IH_CLIENTID_VMC &&
+	    client_id != SOC15_IH_CLIENTID_VMC1 &&
+	    client_id != SOC15_IH_CLIENTID_UTCL2 &&
+	    client_id != SOC15_IH_CLIENTID_SE0SH &&
+	    client_id != SOC15_IH_CLIENTID_SE1SH &&
+	    client_id != SOC15_IH_CLIENTID_SE2SH &&
+	    client_id != SOC15_IH_CLIENTID_SE3SH)
+		return false;
+
+	pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
+		 client_id, source_id, vmid, pasid);
+	pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+		 data[0], data[1], data[2], data[3],
+		 data[4], data[5], data[6], data[7]);
+
+	/* If there is no valid PASID, it's likely a bug */
+	if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
+		return false;
+
+	/* Interrupt types we care about: various signals and faults.
+	 * They will be forwarded to a work queue (see below).
+	 */
+	return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
+		source_id == SOC15_INTSRC_SDMA_TRAP ||
+		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
+		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+		client_id == SOC15_IH_CLIENTID_VMC ||
+		client_id == SOC15_IH_CLIENTID_VMC1 ||
+		client_id == SOC15_IH_CLIENTID_UTCL2 ||
+		KFD_IRQ_IS_FENCE(client_id, source_id);
+}
+
+static void event_interrupt_wq_v10(struct kfd_dev *dev,
+					const uint32_t *ih_ring_entry)
+{
+	uint16_t source_id, client_id, pasid, vmid;
+	uint32_t context_id0, context_id1;
+	uint32_t encoding, sq_intr_err_type;
+
+	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+	context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+	context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry);
+
+	if (client_id == SOC15_IH_CLIENTID_GRBM_CP ||
+	    client_id == SOC15_IH_CLIENTID_SE0SH ||
+	    client_id == SOC15_IH_CLIENTID_SE1SH ||
+	    client_id == SOC15_IH_CLIENTID_SE2SH ||
+	    client_id == SOC15_IH_CLIENTID_SE3SH) {
+		if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
+			kfd_signal_event_interrupt(pasid, context_id0, 32);
+		else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) {
+			encoding = REG_GET_FIELD(context_id1,
+						SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING);
+			switch (encoding) {
+			case SQ_INTERRUPT_WORD_ENCODING_AUTO:
+				pr_debug(
+					"sq_intr: auto, se %d, ttrace %d, wlt %d, ttrace_buf0_full %d, ttrace_buf1_full %d, ttrace_utc_err %d\n",
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_AUTO_CTXID1,
+							SE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							THREAD_TRACE),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							WLT),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							THREAD_TRACE_BUF0_FULL),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							THREAD_TRACE_BUF1_FULL),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+							THREAD_TRACE_UTC_ERROR));
+				break;
+			case SQ_INTERRUPT_WORD_ENCODING_INST:
+				pr_debug("sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+							SE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							DATA),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							SA_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							PRIV),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							WAVE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							SIMD_ID),
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+							WGP_ID));
+				if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK) {
+					if (kfd_set_dbg_ev_from_interrupt(dev, pasid,
+							KFD_DEBUG_DOORBELL_ID(context_id0),
+							KFD_DEBUG_TRAP_CODE(context_id0),
+							NULL, 0))
+						return;
+				}
+				break;
+			case SQ_INTERRUPT_WORD_ENCODING_ERROR:
+				sq_intr_err_type = REG_GET_FIELD(context_id0, KFD_CTXID0,
+								ERR_TYPE);
+				pr_warn("sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n",
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+							SE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							DATA),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							SA_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							PRIV),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							WAVE_ID),
+					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+							SIMD_ID),
+					REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+							WGP_ID),
+					sq_intr_err_type);
+				if (sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
+					sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
+					event_interrupt_poison_consumption(dev, pasid, source_id);
+					return;
+				}
+				break;
+			default:
+				break;
+			}
+			kfd_signal_event_interrupt(pasid, context_id0 & 0x7fffff, 23);
+		} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+			kfd_set_dbg_ev_from_interrupt(dev, pasid,
+				KFD_DEBUG_DOORBELL_ID(context_id0),
+				KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
+				NULL,
+				0);
+		}
+	} else if (client_id == SOC15_IH_CLIENTID_SDMA0 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA1 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA2 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA3 ||
+		   (client_id == SOC15_IH_CLIENTID_SDMA3_Sienna_Cichlid &&
+		    KFD_GC_VERSION(dev) == IP_VERSION(10, 3, 0)) ||
+		   client_id == SOC15_IH_CLIENTID_SDMA4 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA5 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA6 ||
+		   client_id == SOC15_IH_CLIENTID_SDMA7) {
+		if (source_id == SOC15_INTSRC_SDMA_TRAP) {
+			kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
+		} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
+			event_interrupt_poison_consumption(dev, pasid, source_id);
+			return;
+		}
+	} else if (client_id == SOC15_IH_CLIENTID_VMC ||
+		   client_id == SOC15_IH_CLIENTID_VMC1 ||
+		   client_id == SOC15_IH_CLIENTID_UTCL2) {
+		struct kfd_vm_fault_info info = {0};
+		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+		struct kfd_hsa_memory_exception_data exception_data;
+
+		if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+				amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+			event_interrupt_poison_consumption(dev, pasid, client_id);
+			return;
+		}
+
+		info.vmid = vmid;
+		info.mc_id = client_id;
+		info.page_addr = ih_ring_entry[4] |
+			(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
+		info.prot_valid = ring_id & 0x08;
+		info.prot_read  = ring_id & 0x10;
+		info.prot_write = ring_id & 0x20;
+
+		memset(&exception_data, 0, sizeof(exception_data));
+		exception_data.gpu_id = dev->id;
+		exception_data.va = (info.page_addr) << PAGE_SHIFT;
+		exception_data.failure.NotPresent = info.prot_valid ? 1 : 0;
+		exception_data.failure.NoExecute = info.prot_exec ? 1 : 0;
+		exception_data.failure.ReadOnly = info.prot_write ? 1 : 0;
+		exception_data.failure.imprecise = 0;
+
+		kfd_set_dbg_ev_from_interrupt(dev,
+						pasid,
+						-1,
+						KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
+						&exception_data,
+						sizeof(exception_data));
+	} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
+		kfd_process_close_interrupt_drain(pasid);
+	}
+}
+
+const struct kfd_event_interrupt_class event_interrupt_class_v10 = {
+	.interrupt_isr = event_interrupt_isr_v10,
+	.interrupt_wq = event_interrupt_wq_v10,
+};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e092563f22de..c68611857629 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -23,10 +23,40 @@
 
 #include "kfd_priv.h"
 #include "kfd_events.h"
+#include "kfd_debug.h"
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_smi_events.h"
 
+/*
+ * GFX9 SQ Interrupts
+ *
+ * There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit
+ * packet to the Interrupt Handler:
+ * Auto - Generated by the SQG (various cmd overflows, timestamps etc)
+ * Wave - Generated by S_SENDMSG through a shader program
+ * Error - HW generated errors (Illegal instructions, Memviols, EDC etc)
+ *
+ * The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus
+ * 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such:
+ *
+ * - context_id0[27:26]
+ * Encoding type (0 = Auto, 1 = Wave, 2 = Error)
+ *
+ * - context_id0[13]
+ * PRIV bit indicates that Wave S_SEND or error occurred within trap
+ *
+ * - {context_id1[7:0],context_id0[31:28],context_id0[11:0]}
+ * 24-bit data with the following layout per encoding type:
+ * Auto - only context_id0[8:0] is used, which reports various interrupts
+ * generated by SQG.  The rest is 0.
+ * Wave - user data sent from m0 via S_SENDMSG
+ * Error - Error type (context_id1[7:4]), Error Details (rest of bits)
+ *
+ * The other context_id bits show coordinates (SE/SH/CU/SIMD/WAVE) for wave
+ * S_SENDMSG and Errors.  These are 0 for Auto.
+ */
+
 enum SQ_INTERRUPT_WORD_ENCODING {
 	SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
 	SQ_INTERRUPT_WORD_ENCODING_INST,
@@ -84,12 +114,32 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 #define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK 0x03000000
 #define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK 0x0c000000
 
+/* GFX9 SQ interrupt 24-bit data from context_id<0,1> */
 #define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1)                             \
 	((ctx0 & 0xfff) | ((ctx0 >> 16) & 0xf000) | ((ctx1 << 16) & 0xff0000))
 
 #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
 
+/*
+ * The debugger will send user data (m0) with PRIV=1 to indicate it requires
+ * notification from the KFD with the following queue id (DOORBELL_ID) and
+ * trap code (TRAP_CODE).
+ */
+#define KFD_INT_DATA_DEBUG_DOORBELL_MASK	0x0003ff
+#define KFD_INT_DATA_DEBUG_TRAP_CODE_SHIFT	10
+#define KFD_INT_DATA_DEBUG_TRAP_CODE_MASK	0x07fc00
+#define KFD_DEBUG_DOORBELL_ID(sq_int_data)	((sq_int_data) &	\
+				KFD_INT_DATA_DEBUG_DOORBELL_MASK)
+#define KFD_DEBUG_TRAP_CODE(sq_int_data)	(((sq_int_data) &	\
+				KFD_INT_DATA_DEBUG_TRAP_CODE_MASK)	\
+				>> KFD_INT_DATA_DEBUG_TRAP_CODE_SHIFT)
+#define KFD_DEBUG_CP_BAD_OP_ECODE_MASK		0x3fffc00
+#define KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT		10
+#define KFD_DEBUG_CP_BAD_OP_ECODE(ctxid0)	(((ctxid0) &		\
+				KFD_DEBUG_CP_BAD_OP_ECODE_MASK)		\
+				>> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT)
+
 static void event_interrupt_poison_consumption_v9(struct kfd_dev *dev,
 				uint16_t pasid, uint16_t client_id)
 {
@@ -168,14 +218,16 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 	uint16_t source_id, client_id, pasid, vmid;
 	const uint32_t *data = ih_ring_entry;
 
+	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+
 	/* Only handle interrupts from KFD VMIDs */
 	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
-	if (vmid < dev->vm_info.first_vmid_kfd ||
-	    vmid > dev->vm_info.last_vmid_kfd)
+	if (!KFD_IRQ_IS_FENCE(client_id, source_id) &&
+	   (vmid < dev->vm_info.first_vmid_kfd ||
+	    vmid > dev->vm_info.last_vmid_kfd))
 		return false;
 
-	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
-	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
 	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
 
 	/* Only handle clients we care about */
@@ -194,7 +246,8 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 	    client_id != SOC15_IH_CLIENTID_SE0SH &&
 	    client_id != SOC15_IH_CLIENTID_SE1SH &&
 	    client_id != SOC15_IH_CLIENTID_SE2SH &&
-	    client_id != SOC15_IH_CLIENTID_SE3SH)
+	    client_id != SOC15_IH_CLIENTID_SE3SH &&
+	    !KFD_IRQ_IS_FENCE(client_id, source_id))
 		return false;
 
 	/* This is a known issue for gfx9. Under non HWS, pasid is not set
@@ -247,6 +300,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 		source_id == SOC15_INTSRC_SDMA_ECC ||
 		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
 		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+		KFD_IRQ_IS_FENCE(client_id, source_id) ||
 		((client_id == SOC15_IH_CLIENTID_VMC ||
 		client_id == SOC15_IH_CLIENTID_VMC1 ||
 		client_id == SOC15_IH_CLIENTID_UTCL2) &&
@@ -302,6 +356,13 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID),
 					REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID),
 					sq_int_data);
+				if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK) {
+					if (kfd_set_dbg_ev_from_interrupt(dev, pasid,
+							KFD_DEBUG_DOORBELL_ID(sq_int_data),
+							KFD_DEBUG_TRAP_CODE(sq_int_data),
+							NULL, 0))
+						return;
+				}
 				break;
 			case SQ_INTERRUPT_WORD_ENCODING_ERROR:
 				sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE);
@@ -324,8 +385,12 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 				break;
 			}
 			kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24);
-		} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
-			kfd_signal_hw_exception_event(pasid);
+		} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+			kfd_set_dbg_ev_from_interrupt(dev, pasid,
+				KFD_DEBUG_DOORBELL_ID(context_id0),
+				KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
+				NULL, 0);
+		}
 	} else if (client_id == SOC15_IH_CLIENTID_SDMA0 ||
 		   client_id == SOC15_IH_CLIENTID_SDMA1 ||
 		   client_id == SOC15_IH_CLIENTID_SDMA2 ||
@@ -345,6 +410,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		   client_id == SOC15_IH_CLIENTID_UTCL2) {
 		struct kfd_vm_fault_info info = {0};
 		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+		struct kfd_hsa_memory_exception_data exception_data;
 
 		if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
 		    amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
@@ -360,9 +426,23 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		info.prot_read  = ring_id & 0x10;
 		info.prot_write = ring_id & 0x20;
 
+		memset(&exception_data, 0, sizeof(exception_data));
+		exception_data.gpu_id = dev->id;
+		exception_data.va = (info.page_addr) << PAGE_SHIFT;
+		exception_data.failure.NotPresent = info.prot_valid ? 1 : 0;
+		exception_data.failure.NoExecute = info.prot_exec ? 1 : 0;
+		exception_data.failure.ReadOnly = info.prot_write ? 1 : 0;
+		exception_data.failure.imprecise = 0;
+
+		kfd_set_dbg_ev_from_interrupt(dev,
+						pasid,
+						-1,
+						KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
+						&exception_data,
+						sizeof(exception_data));
 		kfd_smi_event_update_vmfault(dev, pasid);
-		kfd_dqm_evict_pasid(dev->dqm, pasid);
-		kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
+	} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
+		kfd_process_close_interrupt_drain(pasid);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 82b28588ab72..bd3d8a0b61b7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -926,6 +926,10 @@ struct kfd_process {
 	uint64_t exception_enable_mask;
 	uint64_t exception_status;
 
+	/* Used to drain stale interrupts */
+	wait_queue_head_t wait_irq_drain;
+	bool irq_drain_is_open;
+
 	/* shared virtual memory registered by this process */
 	struct svm_range_list svms;
 
@@ -1088,12 +1092,19 @@ int kfd_numa_node_to_apic_id(int numa_node_id);
 void kfd_double_confirm_iommu_support(struct kfd_dev *gpu);
 
 /* Interrupts */
+#define	KFD_IRQ_FENCE_CLIENTID	0xff
+#define	KFD_IRQ_FENCE_SOURCEID	0xff
+#define	KFD_IRQ_IS_FENCE(client, source)				\
+				((client) == KFD_IRQ_FENCE_CLIENTID &&	\
+				(source) == KFD_IRQ_FENCE_SOURCEID)
 int kfd_interrupt_init(struct kfd_dev *dev);
 void kfd_interrupt_exit(struct kfd_dev *dev);
 bool enqueue_ih_ring_entry(struct kfd_dev *kfd,	const void *ih_ring_entry);
 bool interrupt_is_wanted(struct kfd_dev *dev,
 				const uint32_t *ih_ring_entry,
 				uint32_t *patched_ihre, bool *flag);
+int kfd_process_drain_interrupts(struct kfd_process_device *pdd);
+void kfd_process_close_interrupt_drain(unsigned int pasid);
 
 /* amdkfd Apertures */
 int kfd_init_apertures(struct kfd_process *process);
@@ -1365,6 +1376,7 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
 /* Events */
 extern const struct kfd_event_interrupt_class event_interrupt_class_cik;
 extern const struct kfd_event_interrupt_class event_interrupt_class_v9;
+extern const struct kfd_event_interrupt_class event_interrupt_class_v10;
 extern const struct kfd_event_interrupt_class event_interrupt_class_v11;
 
 extern const struct kfd_device_global_init_class device_global_init_class_cik;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d62e0c62df76..340e7beee8d7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -856,6 +856,8 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
 		kfd_procfs_add_sysfs_stats(process);
 		kfd_procfs_add_sysfs_files(process);
 		kfd_procfs_add_sysfs_counters(process);
+
+		init_waitqueue_head(&process->wait_irq_drain);
 	}
 out:
 	if (!IS_ERR(process))
@@ -2063,6 +2065,51 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
 	}
 }
 
+/* assumes caller holds process lock. */
+int kfd_process_drain_interrupts(struct kfd_process_device *pdd)
+{
+	uint32_t irq_drain_fence[8];
+	int r = 0;
+
+	if (!KFD_IS_SOC15(pdd->dev))
+		return 0;
+
+	pdd->process->irq_drain_is_open = true;
+
+	memset(irq_drain_fence, 0, sizeof(irq_drain_fence));
+	irq_drain_fence[0] = (KFD_IRQ_FENCE_SOURCEID << 8) |
+							KFD_IRQ_FENCE_CLIENTID;
+	irq_drain_fence[3] = pdd->process->pasid;
+
+	/* ensure stale irqs are scheduled as KFD interrupts, then send the drain fence. */
+	if (amdgpu_amdkfd_send_close_event_drain_irq(pdd->dev->adev,
+							irq_drain_fence)) {
+		pdd->process->irq_drain_is_open = false;
+		return 0;
+	}
+
+	r = wait_event_interruptible(pdd->process->wait_irq_drain,
+				!READ_ONCE(pdd->process->irq_drain_is_open));
+	if (r)
+		pdd->process->irq_drain_is_open = false;
+
+	return r;
+}
+
+void kfd_process_close_interrupt_drain(unsigned int pasid)
+{
+	struct kfd_process *p;
+
+	p = kfd_lookup_process_by_pasid(pasid);
+
+	if (!p)
+		return;
+
+	WRITE_ONCE(p->irq_drain_is_open, false);
+	wake_up_all(&p->wait_irq_drain);
+	kfd_unref_process(p);
+}
+
 struct send_exception_work_handler_workarea {
 	struct work_struct work;
 	struct kfd_process *p;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 5137476ec18e..15db83c9a585 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -330,6 +330,10 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 		kq->queue->properties.queue_id = *qid;
 		pqn->kq = kq;
 		pqn->q = NULL;
+		retval = kfd_process_drain_interrupts(pdd);
+		if (retval)
+			break;
+
 		retval = dev->dqm->ops.create_kernel_queue(dev->dqm,
 							kq, &pdd->qpd);
 		break;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 19/29] drm/amdkfd: add debug set exceptions enabled operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (16 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 18/29] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-24 21:24   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 20/29] drm/amdkfd: add debug wave launch override operation Jonathan Kim
                   ` (10 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

The debugger subscribes to notification for requested exceptions on attach.
Allow the debugger to change its subscription later on.
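
For illustration, a minimal user-space sketch of updating the
subscription (assuming the uapi names introduced in patch 1; target_pid
and kfd_fd are hypothetical, error handling trimmed):

	struct kfd_ioctl_dbg_trap_args args = {0};

	args.pid = target_pid;	/* pid of the debugged process */
	args.op = KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED;
	args.set_exceptions_enabled.exception_mask =
		KFD_EC_MASK(EC_QUEUE_NEW) |
		KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);

	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) == -1)
		perror("KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED");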

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 36 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  2 ++
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 27cd5af72521..61612b9bdf8c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2887,6 +2887,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				args->send_runtime_event.exception_mask);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
+		kfd_dbg_set_enabled_debug_exception_mask(target,
+				args->set_exceptions_enabled.exception_mask);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
 	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 3d304e8c286e..594ccca25cae 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -441,3 +441,39 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 
 	return r;
 }
+
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+					uint64_t exception_set_mask)
+{
+	uint64_t found_mask = 0;
+	struct process_queue_manager *pqm;
+	struct process_queue_node *pqn;
+	static const char write_data = '.';
+	loff_t pos = 0;
+	int i;
+
+	mutex_lock(&target->event_mutex);
+
+	found_mask |= target->exception_status;
+
+	pqm = &target->pqm;
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		if (!pqn->q)
+			continue;
+
+		found_mask |= pqn->q->properties.exception_status;
+	}
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		found_mask |= pdd->exception_status;
+	}
+
+	if (exception_set_mask & found_mask)
+		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);
+
+	target->exception_enable_mask = exception_set_mask;
+
+	mutex_unlock(&target->event_mutex);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 5270d5749828..837e09491a76 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -58,6 +58,8 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 
 void debug_event_write_work_handler(struct work_struct *work);
 
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+					uint64_t exception_set_mask);
 /*
  * If GFX off is enabled, chips that do not support RLC restore for the debug
  * registers will disable GFX off temporarily for the entire debug session.
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 20/29] drm/amdkfd: add debug wave launch override operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (17 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 19/29] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-29 22:37   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 21/29] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
                   ` (9 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

This operation allows the debugger to override the enabled HW
exceptions on the device.

On debug devices that only support the debugging of a single process,
the HW exceptions are global and set through the SPI_GDBG_TRAP_MASK
register.
Because they are global, only address watch exceptions are allowed to
be enabled.  In other words, the debugger must preserve all non-address
watch exception states in normal mode operation, which bars a full
replacement override or a non-address watch override request.

For multi-process debugging, all HW exception overrides are per-VMID so
all exceptions can be overridden or fully replaced.

In order for the debugger to know what is permissible, return the
supported override mask to the debugger along with the previously
enabled overrides.
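
As a sketch, the merge each kgd_*_set_wave_launch_trap_override helper
below performs reduces to (values are made up for illustration):

	uint32_t prev_en = 0x009;	/* previously enabled exceptions */
	uint32_t request = 0x10c;	/* bits the debugger asks to change */
	uint32_t bits    = 0x104;	/* new values for the requested bits */

	/* requested bits take the new value, the rest keep prev_en */
	uint32_t excp_en = (bits & request) | (prev_en & ~request);
	/* excp_en == 0x105: bit 3 cleared, bits 2 and 8 set, bit 0 kept */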

v2: switch unsupported override mode return from EPERM to EINVAL to
support unique EPERM on PTRACE failure.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 47 ++++++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 55 ++++++++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    | 10 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  5 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 55 ++++++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 10 +++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 65 +++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  6 ++
 10 files changed, 261 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index c9629fc5460c..a5003f6f05bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -25,6 +25,7 @@
 #include "amdgpu_amdkfd_gfx_v9.h"
 #include "gc/gc_9_4_2_offset.h"
 #include "gc/gc_9_4_2_sh_mask.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 /* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
 static uint32_t kgd_aldebaran_enable_debug_trap(struct amdgpu_device *adev,
@@ -54,6 +55,50 @@ static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
 	return data;
 }
 
+static int kgd_aldebaran_validate_trap_override_request(struct amdgpu_device *adev,
+							uint32_t trap_override,
+							uint32_t *trap_mask_supported)
+{
+	*trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
+				KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
+				KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
+				KFD_DBG_TRAP_MASK_FP_OVERFLOW |
+				KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
+				KFD_DBG_TRAP_MASK_FP_INEXACT |
+				KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
+				KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
+				KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;
+
+	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
+			trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					uint32_t vmid,
+					uint32_t trap_override,
+					uint32_t trap_mask_bits,
+					uint32_t trap_mask_request,
+					uint32_t *trap_mask_prev,
+					uint32_t kfd_dbg_trap_cntl_prev)
+
+{
+	uint32_t data = 0;
+
+	*trap_mask_prev = REG_GET_FIELD(kfd_dbg_trap_cntl_prev, SPI_GDBG_PER_VMID_CNTL, EXCP_EN);
+	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
+		(*trap_mask_prev & ~trap_mask_request);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, trap_mask_bits);
+	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, trap_override);
+
+	return data;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -73,6 +118,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
 	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 60a204f767ba..b3682758184f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -397,6 +397,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 				kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
 	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
+	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 2491402afd58..32a6e5fbeacd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -31,6 +31,7 @@
 #include "v10_structs.h"
 #include "nv.h"
 #include "nvd.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 enum hqd_dequeue_request_type {
 	NO_ACTION = 0,
@@ -801,6 +802,58 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
 	return 0;
 }
 
+int kgd_gfx_v10_validate_trap_override_request(struct amdgpu_device *adev,
+					      uint32_t trap_override,
+					      uint32_t *trap_mask_supported)
+{
+	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;
+
+	/* The SPI_GDBG_TRAP_MASK register is global and affects all
+	 * processes. Only allow OR-ing the address-watch bit, since
+	 * this only affects processes under the debugger. Other bits
+	 * should stay 0 to avoid the debugger interfering with other
+	 * processes.
+	 */
+	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
+		return -EINVAL;
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					      uint32_t vmid,
+					      uint32_t trap_override,
+					      uint32_t trap_mask_bits,
+					      uint32_t trap_mask_request,
+					      uint32_t *trap_mask_prev,
+					      uint32_t kfd_dbg_trap_cntl_prev)
+{
+	uint32_t data, wave_cntl_prev;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
+	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);
+
+	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
+		(*trap_mask_prev & ~trap_mask_request);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
+
+	/* We need to preserve wave launch mode stall settings. */
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -886,6 +939,8 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.set_vm_context_page_table_base = set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
+	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
 	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.program_trap_handler_settings = program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
index 0abc1e805180..85c929fc2926 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -26,6 +26,16 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
 					bool keep_trap_enabled,
 					uint32_t vmid);
+int kgd_gfx_v10_validate_trap_override_request(struct amdgpu_device *adev,
+					     uint32_t trap_override,
+					     uint32_t *trap_mask_supported);
+uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					     uint32_t vmid,
+					     uint32_t trap_override,
+					     uint32_t trap_mask_bits,
+					     uint32_t trap_mask_request,
+					     uint32_t *trap_mask_prev,
+					     uint32_t kfd_dbg_trap_cntl_prev);
 void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index c57f2a6b6e23..ae3ead207df4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -673,5 +673,8 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
-	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
+	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
+	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override
+
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 673c99c5523d..cb0044bbfae5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -38,6 +38,7 @@
 #include "soc15d.h"
 #include "gfx_v9_0.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 enum hqd_dequeue_request_type {
 	NO_ACTION = 0,
@@ -724,6 +725,58 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 	return 0;
 }
 
+int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
+					uint32_t trap_override,
+					uint32_t *trap_mask_supported)
+{
+	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;
+
+	/* The SPI_GDBG_TRAP_MASK register is global and affects all
+	 * processes. Only allow OR-ing the address-watch bit, since
+	 * this only affects processes under the debugger. Other bits
+	 * should stay 0 to avoid the debugger interfering with other
+	 * processes.
+	 */
+	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
+		return -EINVAL;
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					     uint32_t vmid,
+					     uint32_t trap_override,
+					     uint32_t trap_mask_bits,
+					     uint32_t trap_mask_request,
+					     uint32_t *trap_mask_prev,
+					     uint32_t kfd_dbg_cntl_prev)
+{
+	uint32_t data, wave_cntl_prev;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
+	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);
+
+	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
+		(*trap_mask_prev & ~trap_mask_request);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
+
+	/* We need to preserve wave launch mode stall settings. */
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -992,6 +1045,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
 	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
+	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
+	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index c0866497cb5c..47cff392b434 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -64,6 +64,16 @@ uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 					bool keep_trap_enabled,
 					uint32_t vmid);
+int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
+					     uint32_t trap_override,
+					     uint32_t *trap_mask_supported);
+uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
+					     uint32_t vmid,
+					     uint32_t trap_override,
+					     uint32_t trap_mask_bits,
+					     uint32_t trap_mask_request,
+					     uint32_t *trap_mask_prev,
+					     uint32_t kfd_dbg_trap_cntl_prev);
 void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 61612b9bdf8c..1f0ee2413b13 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2891,6 +2891,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				args->set_exceptions_enabled.exception_mask);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
+		r = kfd_dbg_trap_set_wave_launch_override(target,
+				args->launch_override.override_mode,
+				args->launch_override.enable_mask,
+				args->launch_override.support_request_mask,
+				&args->launch_override.enable_mask,
+				&args->launch_override.support_request_mask);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
 	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
 	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 594ccca25cae..8add359d1cb9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -442,6 +442,71 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 	return r;
 }
 
+static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
+						uint32_t trap_override,
+						uint32_t trap_mask_request,
+						uint32_t *trap_mask_supported)
+{
+	int i = 0;
+
+	*trap_mask_supported = 0xffffffff;
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
+								pdd->dev->adev,
+								trap_override,
+								trap_mask_supported);
+
+		if (err)
+			return err;
+	}
+
+	if (trap_mask_request & ~*trap_mask_supported)
+		return -EACCES;
+
+	return 0;
+}
+
+int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
+					uint32_t trap_override,
+					uint32_t trap_mask_bits,
+					uint32_t trap_mask_request,
+					uint32_t *trap_mask_prev,
+					uint32_t *trap_mask_supported)
+{
+	int r = 0, i;
+
+	r = kfd_dbg_validate_trap_override_request(target,
+						trap_override,
+						trap_mask_request,
+						trap_mask_supported);
+
+	if (r)
+		return r;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
+				pdd->dev->adev,
+				pdd->dev->vm_info.last_vmid_kfd,
+				trap_override,
+				trap_mask_bits,
+				trap_mask_request,
+				trap_mask_prev,
+				pdd->spi_dbg_override);
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+		r = debug_refresh_runlist(pdd->dev->dqm);
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 837e09491a76..b54a50a5d310 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -45,6 +45,12 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
 			uint32_t *runtime_info_size);
+int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
+					uint32_t trap_override,
+					uint32_t trap_mask_bits,
+					uint32_t trap_mask_request,
+					uint32_t *trap_mask_prev,
+					uint32_t *trap_mask_supported);
 
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 21/29] drm/amdkfd: add debug wave launch mode operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (18 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 20/29] drm/amdkfd: add debug wave launch override operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-12-01  0:02   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 22/29] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
                   ` (8 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Allow the debugger to set wave behaviour to either operate normally,
halt at launch, trap on every instruction, terminate immediately or
stall on allocation.
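
A minimal user-space sketch of selecting a mode (assuming the uapi names
from patch 1; target_pid and kfd_fd are hypothetical):

	struct kfd_ioctl_dbg_trap_args args = {0};

	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE;
	args.launch_mode.launch_mode = KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT;

	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) == -1)
		perror("KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE");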

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 18 ++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  1 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 27 +++++++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  3 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  3 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 34 +++++++++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 27 ++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  5 ++-
 10 files changed, 119 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index a5003f6f05bf..91c7fdee883e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -99,6 +99,23 @@ static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device
 	return data;
 }
 
+static uint32_t kgd_aldebaran_set_wave_launch_mode(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid)
+{
+	uint32_t data = 0;
+	bool is_stall_mode = wave_launch_mode == KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT;
+
+	if (is_stall_mode)
+		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, STALL_VMID,
+									1);
+	else
+		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE,
+							wave_launch_mode);
+
+	return data;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -120,6 +137,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
 	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index b3682758184f..10470f4a4eaf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -399,6 +399,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 32a6e5fbeacd..66a83e6fb9e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -854,6 +854,32 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
 	return 0;
 }
 
+uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid)
+{
+	uint32_t data = 0;
+	bool is_stall_mode = wave_launch_mode == KFD_DBG_TRAP_WAVE_LAUNCH_MODE_STALL;
+	bool is_mode_set = wave_launch_mode && !is_stall_mode;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+			VMID_MASK, is_mode_set ? 1 << vmid : 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+			MODE, is_mode_set ? wave_launch_mode : 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
+
+	if (!is_stall_mode)
+		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -941,6 +967,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
 	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.program_trap_handler_settings = program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
index 85c929fc2926..34c04a2bb83b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -36,6 +36,9 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
 					     uint32_t trap_mask_request,
 					     uint32_t *trap_mask_prev,
 					     uint32_t kfd_dbg_trap_cntl_prev);
+uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
+					 uint8_t wave_launch_mode,
+					 uint32_t vmid);
 void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index ae3ead207df4..8627c5458973 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -675,6 +675,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
-	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override
+	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode
 
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index cb0044bbfae5..3bba7ca21926 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -777,6 +777,39 @@ uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
 	return 0;
 }
 
+uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid)
+{
+	uint32_t data = 0;
+	bool is_stall_mode = wave_launch_mode ==
+				KFD_DBG_TRAP_WAVE_LAUNCH_MODE_STALL;
+	bool is_mode_set = wave_launch_mode && !is_stall_mode;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+		VMID_MASK, is_mode_set ? 1 << vmid : 0);
+	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+		MODE, is_mode_set ? wave_launch_mode : 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
+
+	/* Although pre-GFX9.4.1 stalls globally, the per-VMID stall for
+	 * GFX9.4.1 effectively does the same thing as a global STALL_RA since
+	 * all other VMID allocations are backlogged behind the stalled VMID.
+	 *
+	 * Use with caution.
+	 */
+	if (!is_stall_mode)
+		kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
 /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -1047,6 +1080,7 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
+	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index 47cff392b434..2a2ab42037e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -67,6 +67,9 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
 int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
 					     uint32_t trap_override,
 					     uint32_t *trap_mask_supported);
+uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
+					uint8_t wave_launch_mode,
+					uint32_t vmid);
 uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
 					     uint32_t vmid,
 					     uint32_t trap_override,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 1f0ee2413b13..63665279ce4d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2899,6 +2899,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				&args->launch_override.support_request_mask);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
+		r = kfd_dbg_trap_set_wave_launch_mode(target,
+				args->launch_mode.launch_mode);
+		break;
 	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
 	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
 	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 8add359d1cb9..210851f2cdb3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -239,8 +239,10 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 {
 	int i, count = 0;
 
-	if (!unwind)
+	if (!unwind) {
 		cancel_work_sync(&target->debug_event_workarea);
+		kfd_dbg_trap_set_wave_launch_mode(target, 0);
+	}
 
 	for (i = 0; i < target->n_pdds; i++) {
 		struct kfd_process_device *pdd = target->pdds[i];
@@ -507,6 +509,29 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
 	return r;
 }
 
+int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
+					uint8_t wave_launch_mode)
+{
+	int r = 0, i;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
+				pdd->dev->adev,
+				wave_launch_mode,
+				pdd->dev->vm_info.last_vmid_kfd);
+		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+		r = debug_refresh_runlist(pdd->dev->dqm);
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index b54a50a5d310..ca3ab1f01985 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -25,9 +25,6 @@
 
 #include "kfd_priv.h"
 
-void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
-					uint32_t vmid,
-					bool stall);
 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
 int kfd_dbg_trap_activate(struct kfd_process *target);
 bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
@@ -51,6 +48,8 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
 					uint32_t trap_mask_request,
 					uint32_t *trap_mask_prev,
 					uint32_t *trap_mask_supported);
+int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
+					uint8_t wave_launch_mode);
 
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 22/29] drm/amdkfd: add debug suspend and resume process queues operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (19 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 21/29] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-29 23:55   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 23/29] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
                   ` (7 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

In order to inspect waves from the saved context at any point during a
debug session, the debugger must be able to preempt queues to trigger
context save by suspending them.

On queue suspend, the KFD will copy the context save header information
so that the debugger knows the correct size of the saved context to
crawl. The debugger must then also be allowed to resume suspended queues.

A newly created queue cannot be suspended because queue ids are
recycled after destruction, so the debugger needs to know when this has
occurred.  Query functions that clear a given queue's new-queue status
will be added later.

To preserve its saved context during debugger inspection, a queue
cannot be destroyed while it is suspended.  Have queue destruction block
while a queue is suspended and unblock when it is resumed.  Likewise, if
a queue is about to be destroyed, it cannot be suspended.

Return the number of queues successfully suspended or resumed along with
a per-queue status array, where the upper bits of each entry show that
the request was invalid (new/destroyed queue suspend request, missing
queue) or that an error occurred (HWS in a fatal state, so it cannot
suspend or resume queues).
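
A user-space sketch of a suspend request and of decoding the returned
per-queue status (assuming the uapi mask names used in the code below;
target_pid, kfd_fd, the queue ids and the grace period are illustrative):

	uint32_t qids[2] = { qid_a, qid_b };
	struct kfd_ioctl_dbg_trap_args args = {0};
	int i, n;

	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
	args.suspend_queues.num_queues = 2;
	args.suspend_queues.grace_period = 0;
	args.suspend_queues.exception_mask = 0;
	args.suspend_queues.queue_array_ptr = (uint64_t)(uintptr_t)qids;

	n = ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args); /* queues suspended */

	for (i = 0; i < 2; i++) {
		if (qids[i] & KFD_DBG_QUEUE_ERROR_MASK)
			fprintf(stderr, "queue %d: HWS error\n", i);
		else if (qids[i] & KFD_DBG_QUEUE_INVALID_MASK)
			fprintf(stderr, "queue %d: new/destroyed/missing\n", i);
	}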

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  12 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        |   7 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 401 +++++++++++++++++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  11 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  10 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  14 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   5 +-
 7 files changed, 454 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 63665279ce4d..ec26c51177f9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 	pr_debug("Write ptr address   == 0x%016llX\n",
 			args->write_pointer_address);
 
+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, NULL, 0);
 	return 0;
 
 err_create_queue:
@@ -2903,7 +2904,18 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				args->launch_mode.launch_mode);
 		break;
 	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+		r = suspend_queues(target,
+				args->suspend_queues.num_queues,
+				args->suspend_queues.grace_period,
+				args->suspend_queues.exception_mask,
+				(uint32_t *)args->suspend_queues.queue_array_ptr);
+
+		break;
 	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+		r = resume_queues(target, false,
+				args->resume_queues.num_queues,
+				(uint32_t *)args->resume_queues.queue_array_ptr);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
 	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
 	case KFD_IOC_DBG_TRAP_SET_FLAGS:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 210851f2cdb3..afa56aad316b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -274,6 +274,13 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 
 		count++;
 	}
+
+	if (!unwind) {
+		int resume_count = resume_queues(target, true, 0, NULL);
+
+		if (resume_count)
+			pr_debug("Resumed %d queues\n", resume_count);
+	}
 }
 
 static void kfd_dbg_clean_exception_status(struct kfd_process *target)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index bf4787b4dc6c..589efbefc8dc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -921,6 +921,79 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
 	return retval;
 }
 
+/* suspend_single_queue does not lock the dqm like the
+ * evict_process_queues_cpsch or evict_process_queues_nocpsch. You should
+ * lock the dqm before calling, and unlock after calling.
+ *
+ * The reason we don't lock the dqm is because this function may be
+ * called on multiple queues in a loop, so rather than locking/unlocking
+ * multiple times, we will just keep the dqm locked for all of the calls.
+ */
+static int suspend_single_queue(struct device_queue_manager *dqm,
+				      struct kfd_process_device *pdd,
+				      struct queue *q)
+{
+	bool is_new;
+
+	if (q->properties.is_suspended)
+		return 0;
+
+	pr_debug("Suspending PASID %u queue [%i]\n",
+			pdd->process->pasid,
+			q->properties.queue_id);
+
+	is_new = q->properties.exception_status & KFD_EC_MASK(EC_QUEUE_NEW);
+
+	if (is_new || q->properties.is_being_destroyed) {
+		pr_debug("Suspend: skip %s queue id %i\n",
+				is_new ? "new" : "destroyed",
+				q->properties.queue_id);
+		return -EBUSY;
+	}
+
+	q->properties.is_suspended = true;
+	if (q->properties.is_active) {
+		decrement_queue_count(dqm, &pdd->qpd, q);
+		q->properties.is_active = false;
+	}
+
+	return 0;
+}
+
+/* resume_single_queue does not lock the dqm like the functions
+ * restore_process_queues_cpsch or restore_process_queues_nocpsch. You should
+ * lock the dqm before calling, and unlock after calling.
+ *
+ * The reason we don't lock the dqm is because this function may be
+ * called on multiple queues in a loop, so rather than locking/unlocking
+ * multiple times, we will just keep the dqm locked for all of the calls.
+ */
+static void resume_single_queue(struct device_queue_manager *dqm,
+				      struct qcm_process_device *qpd,
+				      struct queue *q)
+{
+	struct kfd_process_device *pdd;
+	uint64_t pd_base;
+
+	if (!q->properties.is_suspended)
+		return;
+
+	pdd = qpd_to_pdd(qpd);
+	/* Retrieve PD base */
+	pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
+
+	pr_debug("Restoring from suspend PASID %u queue [%i]\n",
+			    pdd->process->pasid,
+			    q->properties.queue_id);
+
+	q->properties.is_suspended = false;
+
+	if (QUEUE_IS_ACTIVE(q->properties)) {
+		q->properties.is_active = true;
+		increment_queue_count(dqm, qpd, q);
+	}
+}
+
 static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
@@ -1885,6 +1958,31 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 	return map_queues_cpsch(dqm);
 }
 
+static int wait_on_destroy_queue(struct device_queue_manager *dqm,
+				 struct queue *q)
+{
+	struct kfd_process_device *pdd = kfd_get_process_device_data(q->device,
+								q->process);
+	int ret = 0;
+
+	if (pdd->qpd.is_debug)
+		return ret;
+
+	q->properties.is_being_destroyed = true;
+
+	if (pdd->process->debug_trap_enabled && q->properties.is_suspended) {
+		dqm_unlock(dqm);
+		mutex_unlock(&q->process->mutex);
+		ret = wait_event_interruptible(dqm->destroy_wait,
+						!q->properties.is_suspended);
+
+		mutex_lock(&q->process->mutex);
+		dqm_lock(dqm);
+	}
+
+	return ret;
+}
+
 static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
@@ -1904,11 +2002,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 				q->properties.queue_id);
 	}
 
-	retval = 0;
-
 	/* remove queue from list to prevent rescheduling after preemption */
 	dqm_lock(dqm);
 
+	retval = wait_on_destroy_queue(dqm, q);
+
+	if (retval) {
+		dqm_unlock(dqm);
+		return retval;
+	}
+
 	if (qpd->is_debug) {
 		/*
 		 * error, currently we do not allow to destroy a queue
@@ -1954,7 +2057,17 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	dqm_unlock(dqm);
 
-	/* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */
+	/*
+	 * Do free_mqd and delete raise event after dqm_unlock(dqm) to avoid
+	 * circular locking
+	 */
+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE),
+			qpd->pqm->process,
+			q->device,
+			-1,
+			false,
+			NULL,
+			0);
 	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 
 	return retval;
@@ -2418,8 +2531,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		goto out_free;
 	}
 
-	if (!dqm->ops.initialize(dqm))
+	if (!dqm->ops.initialize(dqm)) {
+		init_waitqueue_head(&dqm->destroy_wait);
 		return dqm;
+	}
 
 out_free:
 	kfree(dqm);
@@ -2557,6 +2672,284 @@ int release_debug_trap_vmid(struct device_queue_manager *dqm,
 	return r;
 }
 
+#define QUEUE_NOT_FOUND		-1
+/* invalidate queue operation in array */
+static void q_array_invalidate(uint32_t num_queues, uint32_t *queue_ids)
+{
+	int i;
+
+	for (i = 0; i < num_queues; i++)
+		queue_ids[i] |= KFD_DBG_QUEUE_INVALID_MASK;
+}
+
+/* find queue index in array */
+static int q_array_get_index(unsigned int queue_id,
+		uint32_t num_queues,
+		uint32_t *queue_ids)
+{
+	int i;
+
+	for (i = 0; i < num_queues; i++)
+		if (queue_id == (queue_ids[i] & ~KFD_DBG_QUEUE_INVALID_MASK))
+			return i;
+
+	return QUEUE_NOT_FOUND;
+}
+
+struct copy_context_work_handler_workarea {
+	struct work_struct copy_context_work;
+	struct kfd_process *p;
+};
+
+static void copy_context_work_handler(struct work_struct *work)
+{
+	struct copy_context_work_handler_workarea *workarea;
+	struct mqd_manager *mqd_mgr;
+	struct queue *q;
+	struct mm_struct *mm;
+	struct kfd_process *p;
+	uint32_t tmp_ctl_stack_used_size, tmp_save_area_used_size;
+	int i;
+
+	workarea = container_of(work,
+			struct copy_context_work_handler_workarea,
+			copy_context_work);
+
+	p = workarea->p;
+	mm = get_task_mm(p->lead_thread);
+
+	if (!mm)
+		return;
+
+	kthread_use_mm(mm);
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		struct device_queue_manager *dqm = pdd->dev->dqm;
+		struct qcm_process_device *qpd = &pdd->qpd;
+
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP];
+
+			/* We ignore the return value from get_wave_state
+			 * because
+			 * i) right now, it always returns 0, and
+			 * ii) if we hit an error, we would continue to the
+			 *      next queue anyway.
+			 */
+			mqd_mgr->get_wave_state(mqd_mgr,
+					q->mqd,
+					(void __user *)q->properties.ctx_save_restore_area_address,
+					&tmp_ctl_stack_used_size,
+					&tmp_save_area_used_size);
+		}
+	}
+	kthread_unuse_mm(mm);
+	mmput(mm);
+}
+
+static uint32_t *get_queue_ids(uint32_t num_queues, uint32_t *usr_queue_id_array)
+{
+	size_t array_size = num_queues * sizeof(uint32_t);
+	uint32_t *queue_ids = NULL;
+
+	if (!usr_queue_id_array)
+		return NULL;
+
+	queue_ids = kzalloc(array_size, GFP_KERNEL);
+	if (!queue_ids)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(queue_ids, usr_queue_id_array, array_size)) {
+		kfree(queue_ids);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return queue_ids;
+}
+
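+/*
+ * Resume the requested queues (or all queues when resume_all_queues is set)
+ * for a process and return the number of queues successfully resumed.
+ * Requested ids are flagged in place in the user array on return:
+ * KFD_DBG_QUEUE_INVALID_MASK if the id was not found,
+ * KFD_DBG_QUEUE_ERROR_MASK if the queue failed to resume.
+ */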
+int resume_queues(struct kfd_process *p,
+		bool resume_all_queues,
+		uint32_t num_queues,
+		uint32_t *usr_queue_id_array)
+{
+	uint32_t *queue_ids = get_queue_ids(num_queues, usr_queue_id_array);
+	int total_resumed = 0;
+	int i;
+
+	if (IS_ERR(queue_ids))
+		return PTR_ERR(queue_ids);
+
+	/* mask all queues as invalid.  unmask per successful request */
+	if (!resume_all_queues && queue_ids)
+		q_array_invalidate(num_queues, queue_ids);
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		struct device_queue_manager *dqm = pdd->dev->dqm;
+		struct qcm_process_device *qpd = &pdd->qpd;
+		struct queue *q;
+		int r, per_device_resumed = 0;
+
+		dqm_lock(dqm);
+
+		/* unmask queues that resume or already resumed as valid */
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			int q_idx = QUEUE_NOT_FOUND;
+
+			if (queue_ids)
+				q_idx = q_array_get_index(
+						q->properties.queue_id,
+						num_queues,
+						queue_ids);
+
+			if (resume_all_queues || q_idx != QUEUE_NOT_FOUND) {
+				resume_single_queue(dqm, &pdd->qpd, q);
+				/* q_idx may be QUEUE_NOT_FOUND when resuming
+				 * all queues with a partial id array
+				 */
+				if (q_idx != QUEUE_NOT_FOUND)
+					queue_ids[q_idx] &=
+						~KFD_DBG_QUEUE_INVALID_MASK;
+				per_device_resumed++;
+			}
+		}
+
+		if (!per_device_resumed) {
+			dqm_unlock(dqm);
+			continue;
+		}
+
+		r = execute_queues_cpsch(dqm,
+					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
+					0,
+					USE_DEFAULT_GRACE_PERIOD);
+		if (r) {
+			pr_err("Failed to resume process queues\n");
+			if (!resume_all_queues) {
+				list_for_each_entry(q, &qpd->queues_list, list) {
+					int q_idx = q_array_get_index(
+							q->properties.queue_id,
+							num_queues,
+							queue_ids);
+
+					/* mask queue as error on resume fail */
+					if (q_idx != QUEUE_NOT_FOUND)
+						queue_ids[q_idx] |=
+							KFD_DBG_QUEUE_ERROR_MASK;
+				}
+			}
+		} else {
+			wake_up_all(&dqm->destroy_wait);
+			total_resumed += per_device_resumed;
+		}
+
+		dqm_unlock(dqm);
+	}
+
+	if (queue_ids && copy_to_user((void __user *)usr_queue_id_array,
+			queue_ids, num_queues * sizeof(uint32_t)))
+		pr_err("copy_to_user failed on queue resume\n");
+
+	kfree(queue_ids);
+
+	return total_resumed;
+}
+
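+/*
+ * Suspend the requested queues for a process, then snapshot each suspended
+ * queue's wave state into its user-mode context save area via a one-shot
+ * worker.  Returns the number of queues successfully suspended; requested
+ * ids are flagged in the user array as in resume_queues() above.
+ */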
+int suspend_queues(struct kfd_process *p,
+			uint32_t num_queues,
+			uint32_t grace_period,
+			uint64_t exception_clear_mask,
+			uint32_t *usr_queue_id_array)
+{
+	uint32_t *queue_ids = get_queue_ids(num_queues, usr_queue_id_array);
+	int total_suspended = 0;
+	int i;
+
+	if (IS_ERR(queue_ids))
+		return PTR_ERR(queue_ids);
+
+	/* mask all queues as invalid.  unmask on successful request */
+	q_array_invalidate(num_queues, queue_ids);
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		struct device_queue_manager *dqm = pdd->dev->dqm;
+		struct qcm_process_device *qpd = &pdd->qpd;
+		struct queue *q;
+		int r, per_device_suspended = 0;
+
+		mutex_lock(&p->event_mutex);
+		dqm_lock(dqm);
+
+		/* unmask queues that suspend or already suspended */
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			int q_idx = q_array_get_index(q->properties.queue_id,
+							num_queues,
+							queue_ids);
+
+			if (q_idx != QUEUE_NOT_FOUND &&
+					!suspend_single_queue(dqm, pdd, q)) {
+				queue_ids[q_idx] &=
+					~KFD_DBG_QUEUE_INVALID_MASK;
+				per_device_suspended++;
+			}
+		}
+
+		if (!per_device_suspended) {
+			dqm_unlock(dqm);
+			mutex_unlock(&p->event_mutex);
+			continue;
+		}
+
+		r = execute_queues_cpsch(dqm,
+			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
+			grace_period);
+
+		if (r)
+			pr_err("Failed to suspend process queues.\n");
+		else
+			total_suspended += per_device_suspended;
+
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			int q_idx = q_array_get_index(q->properties.queue_id,
+						num_queues, queue_ids);
+
+			if (q_idx == QUEUE_NOT_FOUND)
+				continue;
+
+			/* mask queue as error on suspend fail */
+			if (r)
+				queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
+			else if (exception_clear_mask)
+				q->properties.exception_status &=
+							~exception_clear_mask;
+		}
+
+		dqm_unlock(dqm);
+		mutex_unlock(&p->event_mutex);
+		amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
+	}
+
+	if (total_suspended) {
+		struct copy_context_work_handler_workarea copy_context_worker;
+
+		INIT_WORK_ONSTACK(
+				&copy_context_worker.copy_context_work,
+				copy_context_work_handler);
+
+		copy_context_worker.p = p;
+
+		schedule_work(&copy_context_worker.copy_context_work);
+
+		flush_work(&copy_context_worker.copy_context_work);
+		destroy_work_on_stack(&copy_context_worker.copy_context_work);
+	}
+
+	if (copy_to_user((void __user *)usr_queue_id_array, queue_ids,
+			num_queues * sizeof(uint32_t)))
+		pr_err("copy_to_user failed on queue suspend\n");
+
+	kfree(queue_ids);
+
+	return total_suspended;
+}
+
 int debug_lock_and_unmap(struct device_queue_manager *dqm)
 {
 	int r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index bef3be84c5cc..12643528684c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -259,6 +259,8 @@ struct device_queue_manager {
 	struct kfd_mem_obj	hiq_sdma_mqd;
 	bool			sched_running;
 	uint32_t		wait_times;
+
+	wait_queue_head_t	destroy_wait;
 };
 
 void device_queue_manager_init_cik(
@@ -286,6 +288,15 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
 int release_debug_trap_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
+int suspend_queues(struct kfd_process *p,
+			uint32_t num_queues,
+			uint32_t grace_period,
+			uint64_t exception_clear_mask,
+			uint32_t *usr_queue_id_array);
+int resume_queues(struct kfd_process *p,
+		bool resume_all_queues,
+		uint32_t num_queues,
+		uint32_t *usr_queue_id_array);
 int debug_lock_and_unmap(struct device_queue_manager *dqm);
 int debug_map_and_unlock(struct device_queue_manager *dqm);
 int debug_refresh_runlist(struct device_queue_manager *dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
index cb484ace17de..d74862755213 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
@@ -237,6 +237,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 			  u32 *save_area_used_size)
 {
 	struct v10_compute_mqd *m;
+	struct kfd_context_save_area_header header;
 
 	m = get_mqd(mqd);
 
@@ -255,6 +256,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 	 * accessible to user mode
 	 */
 
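+	/* On gfx10 the control stack lives in the user-mode context save
+	 * area already (see above), so only the header is copied here.
+	 */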
+	header.control_stack_size = *ctl_stack_used_size;
+	header.wave_state_size = *save_area_used_size;
+
+	header.wave_state_offset = m->cp_hqd_wg_state_offset;
+	header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
+
+	if (copy_to_user(ctl_stack, &header, sizeof(header)))
+		return -EFAULT;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 86f1cf090246..f05a2bed655a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -289,6 +289,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 			  u32 *save_area_used_size)
 {
 	struct v9_mqd *m;
+	struct kfd_context_save_area_header header;
 
 	/* Control stack is located one page after MQD. */
 	void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
@@ -300,7 +301,18 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 	*save_area_used_size = m->cp_hqd_wg_state_offset -
 		m->cp_hqd_cntl_stack_size;
 
-	if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
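+	/* Copy a fixed-size header to the start of the user buffer, then the
+	 * used portion of the control stack at its offset in that buffer.
+	 */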
+	header.control_stack_size = *ctl_stack_used_size;
+	header.wave_state_size = *save_area_used_size;
+
+	header.wave_state_offset = m->cp_hqd_wg_state_offset;
+	header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
+
+	if (copy_to_user(ctl_stack, &header, sizeof(header)))
+		return -EFAULT;
+
+	if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset,
+				mqd_ctl_stack + m->cp_hqd_cntl_stack_offset,
+				*ctl_stack_used_size))
 		return -EFAULT;
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index bd3d8a0b61b7..3d529c7499f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -477,6 +477,8 @@ struct queue_properties {
 	uint32_t doorbell_off;
 	bool is_interop;
 	bool is_evicted;
+	bool is_suspended;
+	bool is_being_destroyed;
 	bool is_active;
 	bool is_gws;
 	/* Not relevant for user mode queues in cp scheduling */
@@ -499,7 +501,8 @@ struct queue_properties {
 #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
 			    (q).queue_address != 0 &&	\
 			    (q).queue_percent > 0 &&	\
-			    !(q).is_evicted)
+			    !(q).is_evicted &&		\
+			    !(q).is_suspended)
 
 enum mqd_update_flag {
 	UPDATE_FLAG_CU_MASK = 0,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 23/29] drm/amdkfd: add debug set and clear address watch points operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (20 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 22/29] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-30  0:34   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 24/29] drm/amdkfd: add debug set flags operation Jonathan Kim
                   ` (6 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Shader read, write and atomic memory operations can be reported to the
debugger as address watch exceptions.

Allow the debugger to set a watch point on a particular memory
address per device.

Note that only 4 watch points exist per device to date, so have
the KFD keep track of which watch points are allocated or free.

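As an illustration, a debugger would program and release a watch point
roughly as follows (a minimal userspace sketch against this interface;
the fd, pid and address/mask/mode values are assumptions, not part of
this patch):

	struct kfd_ioctl_dbg_trap_args args = {0};

	args.pid = target_pid;	/* process being debugged */
	args.op = KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH;
	args.set_node_address_watch.gpu_id = gpu_id;
	args.set_node_address_watch.address = watch_addr;
	args.set_node_address_watch.mask = watch_mask;
	args.set_node_address_watch.mode = watch_mode;

	if (!ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) {
		/* the KFD returns the allocated watch point id */
		uint32_t id = args.set_node_address_watch.id;

		args.op = KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH;
		args.clear_node_address_watch.gpu_id = gpu_id;
		args.clear_node_address_watch.id = id;
		ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
	}
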
v2: change dev_id arg to gpu_id for consistency

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |   2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  78 +++++++++++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |   8 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   5 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 128 +++++++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   8 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  24 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 130 ++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |   8 +-
 drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c  |   7 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   9 +-
 12 files changed, 405 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 91c7fdee883e..8f9b613e3152 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -138,6 +138,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
 	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
 	.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v9_set_address_watch,
+	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 10470f4a4eaf..5d6bd23a8cc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -400,6 +400,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
 	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
 	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v9_set_address_watch,
+	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 66a83e6fb9e5..ec48677772f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -880,6 +880,82 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
 	return 0;
 }
 
+#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
+uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid)
+{
+	uint32_t watch_address_high;
+	uint32_t watch_address_low;
+	uint32_t watch_address_cntl;
+
+	watch_address_cntl = 0;
+
+	watch_address_low = lower_32_bits(watch_address);
+	watch_address_high = upper_32_bits(watch_address) & 0xffff;
+
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VMID,
+			debug_vmid);
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MODE,
+			watch_mode);
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MASK,
+			watch_address_mask >> 7);
+
+	/* Turning off this watch point until we set all the registers */
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VALID,
+			0);
+
+	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_cntl);
+
+	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_high);
+
+	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_low);
+
+	/* Enable the watch point */
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VALID,
+			1);
+
+	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_cntl);
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id)
+{
+	uint32_t watch_address_cntl;
+
+	watch_address_cntl = 0;
+
+	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_cntl);
+
+	return 0;
+}
+
 /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -968,6 +1044,8 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
 	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
 	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v10_set_address_watch,
+	.clear_address_watch = kgd_gfx_v10_clear_address_watch,
 	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
 	.program_trap_handler_settings = program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
index 34c04a2bb83b..334ff16e25db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
@@ -39,6 +39,14 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
 uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
 					 uint8_t wave_launch_mode,
 					 uint32_t vmid);
+uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid);
+uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id);
 void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
index 8627c5458973..ee36ba045dcf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
@@ -676,6 +676,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
 	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
 	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
-	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode
-
+	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v10_set_address_watch,
+	.clear_address_watch = kgd_gfx_v10_clear_address_watch
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 3bba7ca21926..98355a21740b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -810,6 +810,132 @@ uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
 	return 0;
 }
 
+#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
+static uint32_t kgd_gfx_set_multi_process_address_watch(
+					struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode)
+{
+	uint32_t watch_address_high;
+	uint32_t watch_address_low;
+	uint32_t watch_address_cntl;
+
+	watch_address_cntl = 0;
+	watch_address_low = lower_32_bits(watch_address);
+	watch_address_high = upper_32_bits(watch_address) & 0xffff;
+
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MODE,
+			watch_mode);
+
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MASK,
+			watch_address_mask >> 6);
+
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VALID,
+			1);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_high);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_low);
+
+	return watch_address_cntl;
+}
+
+uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid)
+{
+	uint32_t watch_address_high;
+	uint32_t watch_address_low;
+	uint32_t watch_address_cntl;
+
+	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+		return kgd_gfx_set_multi_process_address_watch(adev,
+							watch_address,
+							watch_address_mask,
+							watch_id,
+							watch_mode);
+
+	watch_address_cntl = 0;
+
+	watch_address_low = lower_32_bits(watch_address);
+	watch_address_high = upper_32_bits(watch_address) & 0xffff;
+
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VMID,
+			debug_vmid);
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MODE,
+			watch_mode);
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			MASK,
+			watch_address_mask >> 6);
+
+	/* Turning off this watch point until we set all the registers */
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VALID,
+			0);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_cntl);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_high);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_low);
+
+	/* Enable the watch point */
+	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+			TCP_WATCH0_CNTL,
+			VALID,
+			1);
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_cntl);
+
+	return 0;
+}
+
+uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id)
+{
+	uint32_t watch_address_cntl;
+
+	if (adev->asic_type == CHIP_ALDEBARAN)
+		return 0;
+
+	watch_address_cntl = 0;
+
+	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
+			(watch_id * TCP_WATCH_STRIDE)),
+			watch_address_cntl);
+
+	return 0;
+}
+
 /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
@@ -1081,6 +1207,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
 	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
 	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
 	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
+	.set_address_watch = kgd_gfx_v9_set_address_watch,
+	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
 	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
 	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
 	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index 2a2ab42037e4..ba52b61b68c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -77,6 +77,14 @@ uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
 					     uint32_t trap_mask_request,
 					     uint32_t *trap_mask_prev,
 					     uint32_t kfd_dbg_trap_cntl_prev);
+uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t watch_id,
+					uint32_t watch_mode,
+					uint32_t debug_vmid);
+uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
+					uint32_t watch_id);
 void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
 void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
 					       uint32_t wait_times,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index ec26c51177f9..9b2ea6e9e078 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2797,6 +2797,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 	struct task_struct *thread = NULL;
 	struct pid *pid = NULL;
 	struct kfd_process *target = NULL;
+	struct kfd_process_device *pdd = NULL;
 	int r = 0;
 
 	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
@@ -2864,6 +2865,20 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		goto unlock_out;
 	}
 
+	if (args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
+			args->op == KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH) {
+		int user_gpu_id = kfd_process_get_user_gpu_id(target,
+				args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ?
+					args->set_node_address_watch.gpu_id :
+					args->clear_node_address_watch.gpu_id);
+
+		pdd = kfd_process_device_data_by_id(target, user_gpu_id);
+		if (user_gpu_id == -EINVAL || !pdd) {
+			r = -ENODEV;
+			goto unlock_out;
+		}
+	}
+
 	switch (args->op) {
 	case KFD_IOC_DBG_TRAP_ENABLE:
 		if (target != p)
@@ -2917,7 +2932,16 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				(uint32_t *)args->resume_queues.queue_array_ptr);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
+		r = kfd_dbg_trap_set_dev_address_watch(pdd,
+				args->set_node_address_watch.address,
+				args->set_node_address_watch.mask,
+				&args->set_node_address_watch.id,
+				args->set_node_address_watch.mode);
+		break;
 	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
+		r = kfd_dbg_trap_clear_dev_address_watch(pdd,
+				args->clear_node_address_watch.id);
+		break;
 	case KFD_IOC_DBG_TRAP_SET_FLAGS:
 	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index afa56aad316b..68bc1d5bfd05 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -24,6 +24,14 @@
 #include "kfd_device_queue_manager.h"
 #include <linux/file.h>
 
+/*
+ * The spinlock protects the per device dev->alloc_watch_ids for multi-process access.
+ * The per-process per-device pdd->alloc_watch_ids is protected by the debug IOCTL
+ * process mutex.
+ */
+#define MAX_WATCH_ADDRESSES	4
+static DEFINE_SPINLOCK(watch_points_lock);
+
 void debug_event_write_work_handler(struct work_struct *work)
 {
 	struct kfd_process *process;
@@ -227,6 +235,127 @@ int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 	return 0;
 }
 
+#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
+static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
+{
+	int i;
+
+	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
+
+	spin_lock(&watch_points_lock);
+
+	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
+		/* device watchpoint in use so skip */
+		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
+			continue;
+
+		pdd->alloc_watch_ids |= 0x1 << i;
+		pdd->dev->alloc_watch_ids |= 0x1 << i;
+		*watch_id = i;
+		spin_unlock(&watch_points_lock);
+		return 0;
+	}
+
+	spin_unlock(&watch_points_lock);
+
+	return -ENOMEM;
+}
+
+static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
+{
+	spin_lock(&watch_points_lock);
+
+	/* process owns device watch point so safe to clear */
+	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
+		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
+		pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
+	}
+
+	spin_unlock(&watch_points_lock);
+}
+
+static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
+{
+	bool owns_watch_id = false;
+
+	spin_lock(&watch_points_lock);
+	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES && ((pdd->alloc_watch_ids >> watch_id) & 0x1);
+
+	spin_unlock(&watch_points_lock);
+
+	return owns_watch_id;
+}
+
+int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
+					uint32_t watch_id)
+{
+	int r;
+
+	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
+		return -EINVAL;
+
+	r = debug_lock_and_unmap(pdd->dev->dqm);
+	if (r)
+		return r;
+
+	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
+							pdd->dev->adev,
+							watch_id);
+	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+	r = debug_map_and_unlock(pdd->dev->dqm);
+
+	kfd_dbg_clear_dev_watch_id(pdd, watch_id);
+
+	return r;
+}
+
+int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t *watch_id,
+					uint32_t watch_mode)
+{
+	int r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
+
+	if (r)
+		return r;
+
+	r = debug_lock_and_unmap(pdd->dev->dqm);
+	if (r) {
+		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
+		return r;
+	}
+
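+	/* Keep GFXOFF disabled while programming the watch registers so the
+	 * GC block stays powered up for the register writes.
+	 */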
+	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+	pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
+				pdd->dev->adev,
+				watch_address,
+				watch_address_mask,
+				*watch_id,
+				watch_mode,
+				pdd->dev->vm_info.last_vmid_kfd);
+	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+	r = debug_map_and_unlock(pdd->dev->dqm);
+	/* HWS is broken so no point in HW rollback but release the watchpoint anyway */
+	if (r)
+		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
+
+	return 0;
+}
+
+static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
+{
+	int i, j;
+
+	for (i = 0; i < target->n_pdds; i++)
+		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
+			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
+}
+
 /* kfd_dbg_trap_deactivate:
  *	target: target process
  *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
@@ -241,6 +370,7 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 
 	if (!unwind) {
 		cancel_work_sync(&target->debug_event_workarea);
+		kfd_dbg_clear_process_address_watch(target);
 		kfd_dbg_trap_set_wave_launch_mode(target, 0);
 	}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index ca3ab1f01985..ad677e67e7eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -50,7 +50,13 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
 					uint32_t *trap_mask_supported);
 int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
 					uint8_t wave_launch_mode);
-
+int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
+					uint32_t watch_id);
+int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
+					uint64_t watch_address,
+					uint32_t watch_address_mask,
+					uint32_t *watch_id,
+					uint32_t watch_mode);
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
 					unsigned int queue_id,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 8aebe408c544..733987de595a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -395,6 +395,8 @@ int kfd_init_apertures(struct kfd_process *process)
 			pdd->gpuvm_base = pdd->gpuvm_limit = 0;
 			pdd->scratch_base = pdd->scratch_limit = 0;
 		} else {
+			int num_watchpoints = pdd->dev->device_info.num_of_watch_points;
+
 			switch (dev->adev->asic_type) {
 			case CHIP_KAVERI:
 			case CHIP_HAWAII:
@@ -424,6 +426,11 @@ int kfd_init_apertures(struct kfd_process *process)
 				pdd->qpd.cwsr_base = SVM_CWSR_BASE;
 				pdd->qpd.ib_base = SVM_IB_BASE;
 			}
+
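+			/* The process-wide watch point limit is the smallest
+			 * count among all of its devices.
+			 */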
+			process->max_watch_points =
+				!process->max_watch_points ? num_watchpoints :
+						min(num_watchpoints, process->max_watch_points);
+
 		}
 
 		dev_dbg(kfd_device, "node id %u\n", id);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 3d529c7499f8..aee4fe20e676 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -348,6 +348,9 @@ struct kfd_dev {
 
 	/* HMM page migration MEMORY_DEVICE_PRIVATE mapping */
 	struct dev_pagemap pgmap;
+
+	/* Track per device allocated watch points */
+	uint32_t alloc_watch_ids;
 };
 
 enum kfd_mempool {
@@ -796,6 +799,7 @@ struct kfd_process_device {
 	uint32_t spi_dbg_override;
 	uint32_t spi_dbg_launch_mode;
 	uint32_t watch_points[4];
+	uint32_t alloc_watch_ids;
 
 	/*
 	 * If this process has been checkpointed before, then the user
@@ -907,6 +911,10 @@ struct kfd_process {
 	/* per-process-per device debug event fd file */
 	struct file *dbg_ev_file;
 
+	/* Allocated debug watch point IDs bitmask */
+	uint32_t allocated_debug_watch_point_bitmask;
+	int max_watch_points;
+
 	/* If the process is a kfd debugger, we need to know so we can clean
 	 * up at exit time.  If a process enables debugging on itself, it does
 	 * its own clean-up, so we don't set the flag here.  We track this by
@@ -952,7 +960,6 @@ struct kfd_process {
 	struct semaphore runtime_enable_sema;
 	bool is_runtime_retry;
 	struct kfd_runtime_info runtime_info;
-
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 24/29] drm/amdkfd: add debug set flags operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (21 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 23/29] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-30  0:39   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 25/29] drm/amdkfd: add debug query event operation Jonathan Kim
                   ` (5 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Allow the debugger to set the single memory operation and single ALU
operation flags.

Some exceptions are imprecise (memory violations, address watch) in the
sense that a trap occurs only when the exception interrupt occurs and
not at the non-halting faulty instruction.  Trap temporaries 0 & 1 save
the program counter address, which means that these values will not point
to the faulty instruction address but to whenever the interrupt was
raised.

Setting the Single Memory Operations flag will inject an automatic wait
on every memory operation instruction forcing imprecise memory exceptions
to become precise at the cost of performance.  This setting is not
permitted on debug devices that support only a global setting of this
option.

Likewise, Single ALU Operations will force in-order ALU operations.
Although this is available on current hardware, it's not required so it
will be treated as a NOP.

Return the previous set flags to the debugger as well.

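For illustration, a minimal userspace sketch (field names follow this
series; the fd and pid handling are assumptions):

	struct kfd_ioctl_dbg_trap_args args = {0};

	args.pid = target_pid;	/* process being debugged */
	args.op = KFD_IOC_DBG_TRAP_SET_FLAGS;
	args.set_flags.flags = KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;

	if (!ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
		prev_flags = args.set_flags.flags;	/* flags set previously */
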
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 35 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  1 +
 3 files changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 9b2ea6e9e078..200e11f02382 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2943,6 +2943,8 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				args->clear_node_address_watch.id);
 		break;
 	case KFD_IOC_DBG_TRAP_SET_FLAGS:
+		r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
+		break;
 	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 68bc1d5bfd05..1f4d3fa0278e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -23,6 +23,7 @@
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
 #include <linux/file.h>
+#include <uapi/linux/kfd_ioctl.h>
 
 /*
  * The spinlock protects the per device dev->alloc_watch_ids for multi-process access.
@@ -355,6 +356,37 @@ static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
 			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
 }
 
+int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
+{
+	uint32_t prev_flags = target->dbg_flags;
+	int i, r = 0;
+
+	for (i = 0; i < target->n_pdds; i++) {
+		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
+			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
+			*flags = prev_flags;
+			return -EACCES;
+		}
+	}
+
+	target->dbg_flags = *flags;
+	*flags = prev_flags;
+	for (i = 0; i < target->n_pdds; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+
+		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+			continue;
+
+		r = debug_refresh_runlist(target->pdds[i]->dev->dqm);
+		if (r) {
+			target->dbg_flags = prev_flags;
+			break;
+		}
+	}
+
+	return r;
+}
 
 /* kfd_dbg_trap_deactivate:
  *	target: target process
@@ -369,9 +401,12 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
 	int i, count = 0;
 
 	if (!unwind) {
+		uint32_t flags = 0;
+
 		cancel_work_sync(&target->debug_event_workarea);
 		kfd_dbg_clear_process_address_watch(target);
 		kfd_dbg_trap_set_wave_launch_mode(target, 0);
+
+		kfd_dbg_trap_set_flags(target, &flags);
 	}
 
 	for (i = 0; i < target->n_pdds; i++) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index ad677e67e7eb..12b80b6c96d0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -57,6 +57,7 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
 					uint32_t watch_address_mask,
 					uint32_t *watch_id,
 					uint32_t watch_mode);
+int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
 					unsigned int queue_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 25/29] drm/amdkfd: add debug query event operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (22 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 24/29] drm/amdkfd: add debug set flags operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-30  0:44   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 26/29] drm/amdkfd: add debug query exception info operation Jonathan Kim
                   ` (4 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Allow the debugger to query a single queue, device or process exception
in a FIFO manner.
The KFD should also return the GPU or Queue id of the exception.
The debugger also has the option of clearing exceptions after
being queried.

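A debugger event loop would drain pending exceptions roughly as follows
(a sketch; the clear mask value and pid handling are assumptions):

	struct kfd_ioctl_dbg_trap_args args = {0};
	uint64_t clear_mask = KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE);

	args.pid = target_pid;	/* process being debugged */
	args.op = KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT;
	args.query_debug_event.exception_mask = clear_mask;	/* clear on query */

	while (!ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) {
		/* queue events report gpu_id + queue_id, device events
		 * report gpu_id, process events report neither
		 */
		handle_event(args.query_debug_event.gpu_id,
			     args.query_debug_event.queue_id,
			     args.query_debug_event.exception_mask);
		args.query_debug_event.exception_mask = clear_mask;
	}
	/* the ioctl fails with EAGAIN once no exceptions are pending */
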
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  6 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 64 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 75 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 200e11f02382..b918213a0087 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2946,6 +2946,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
 		break;
 	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
+		r = kfd_dbg_ev_query_debug_event(target,
+				&args->query_debug_event.queue_id,
+				&args->query_debug_event.gpu_id,
+				args->query_debug_event.exception_mask,
+				&args->query_debug_event.exception_mask);
+		break;
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 1f4d3fa0278e..6985a53b83e9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -33,6 +33,70 @@
 #define MAX_WATCH_ADDRESSES	4
 static DEFINE_SPINLOCK(watch_points_lock);
 
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+		      unsigned int *queue_id,
+		      unsigned int *gpu_id,
+		      uint64_t exception_clear_mask,
+		      uint64_t *event_status)
+{
+	struct process_queue_manager *pqm;
+	struct process_queue_node *pqn;
+	int i;
+
+	if (!(process && process->debug_trap_enabled))
+		return -ENODATA;
+
+	mutex_lock(&process->event_mutex);
+	*event_status = 0;
+	*queue_id = 0;
+	*gpu_id = 0;
+
+	/* find and report queue events */
+	pqm = &process->pqm;
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		uint64_t tmp = process->exception_enable_mask;
+
+		if (!pqn->q)
+			continue;
+
+		tmp &= pqn->q->properties.exception_status;
+
+		if (!tmp)
+			continue;
+
+		*event_status = pqn->q->properties.exception_status;
+		*queue_id = pqn->q->properties.queue_id;
+		*gpu_id = pqn->q->device->id;
+		pqn->q->properties.exception_status &= ~exception_clear_mask;
+		goto out;
+	}
+
+	/* find and report device events */
+	for (i = 0; i < process->n_pdds; i++) {
+		struct kfd_process_device *pdd = process->pdds[i];
+		uint64_t tmp = process->exception_enable_mask
+						& pdd->exception_status;
+
+		if (!tmp)
+			continue;
+
+		*event_status = pdd->exception_status;
+		*gpu_id = pdd->dev->id;
+		pdd->exception_status &= ~exception_clear_mask;
+		goto out;
+	}
+
+	/* report process events */
+	if (process->exception_enable_mask & process->exception_status) {
+		*event_status = process->exception_status;
+		process->exception_status &= ~exception_clear_mask;
+	}
+
+out:
+	mutex_unlock(&process->event_mutex);
+	return *event_status ? 0 : -EAGAIN;
+}
+
 void debug_event_write_work_handler(struct work_struct *work)
 {
 	struct kfd_process *process;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 12b80b6c96d0..c64ffd3efc46 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -27,6 +27,11 @@
 
 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
 int kfd_dbg_trap_activate(struct kfd_process *target);
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+			unsigned int *queue_id,
+			unsigned int *gpu_id,
+			uint64_t exception_clear_mask,
+			uint64_t *event_status);
 bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
 				   unsigned int pasid,
 				   uint32_t doorbell_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 26/29] drm/amdkfd: add debug query exception info operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (23 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 25/29] drm/amdkfd: add debug query event operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-30  0:50   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 27/29] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
                   ` (3 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Allow the debugger to query additional info based on an exception code.
For device exceptions, it's currently only memory violation information.
For process exceptions, it's currently only runtime information.
Queue exceptions only report the queue exception status.

The debugger has the option of clearing the target exception on query.

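For example, runtime information for a process exception could be
queried with (a sketch; the fd, pid and buffer handling are
assumptions):

	struct kfd_ioctl_dbg_trap_args args = {0};
	struct kfd_runtime_info info = {0};

	args.pid = target_pid;	/* process being debugged */
	args.op = KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO;
	args.query_exception_info.exception_code = EC_PROCESS_RUNTIME;
	args.query_exception_info.clear_exception = 0;
	args.query_exception_info.info_ptr = (uintptr_t)&info;
	args.query_exception_info.info_size = sizeof(info);

	if (!ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
		/* info_size now holds the size the KFD actually reported */
		use_runtime_info(&info, args.query_exception_info.info_size);
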
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 120 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   6 ++
 3 files changed, 133 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index b918213a0087..2c8f107237ee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2953,6 +2953,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				&args->query_debug_event.exception_mask);
 		break;
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
+		r = kfd_dbg_trap_query_exception_info(target,
+				args->query_exception_info.source_id,
+				args->query_exception_info.exception_code,
+				args->query_exception_info.clear_exception,
+				(void __user *)args->query_exception_info.info_ptr,
+				&args->query_exception_info.info_size);
+		break;
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
 		pr_warn("Debug op %i not supported yet\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 6985a53b83e9..a05fe32eac0e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -768,6 +768,126 @@ int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
 	return r;
 }
 
+int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
+		uint32_t source_id,
+		uint32_t exception_code,
+		bool clear_exception,
+		void __user *info,
+		uint32_t *info_size)
+{
+	bool found = false;
+	int r = 0;
+	uint32_t copy_size, actual_info_size = 0;
+	uint64_t *exception_status_ptr = NULL;
+
+	if (!target)
+		return -EINVAL;
+
+	if (!info || !info_size)
+		return -EINVAL;
+
+	mutex_lock(&target->event_mutex);
+
+	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
+		/* Per queue exceptions */
+		struct queue *queue = NULL;
+		int i;
+
+		for (i = 0; i < target->n_pdds; i++) {
+			struct kfd_process_device *pdd = target->pdds[i];
+			struct qcm_process_device *qpd = &pdd->qpd;
+
+			list_for_each_entry(queue, &qpd->queues_list, list) {
+				if (!found && queue->properties.queue_id == source_id) {
+					found = true;
+					break;
+				}
+			}
+			if (found)
+				break;
+		}
+
+		if (!found) {
+			r = -EINVAL;
+			goto out;
+		}
+
+		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
+			r = -ENODATA;
+			goto out;
+		}
+		exception_status_ptr = &queue->properties.exception_status;
+	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
+		/* Per device exceptions */
+		struct kfd_process_device *pdd = NULL;
+		int i;
+
+		for (i = 0; i < target->n_pdds; i++) {
+			pdd = target->pdds[i];
+			if (pdd->dev->id == source_id) {
+				found = true;
+				break;
+			}
+		}
+
+		if (!found) {
+			r = -EINVAL;
+			goto out;
+		}
+
+		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
+			r = -ENODATA;
+			goto out;
+		}
+
+		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
+			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);
+
+			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
+				r = -EFAULT;
+				goto out;
+			}
+			actual_info_size = pdd->vm_fault_exc_data_size;
+			if (clear_exception) {
+				kfree(pdd->vm_fault_exc_data);
+				pdd->vm_fault_exc_data = NULL;
+				pdd->vm_fault_exc_data_size = 0;
+			}
+		}
+		exception_status_ptr = &pdd->exception_status;
+	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
+		/* Per process exceptions */
+		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
+			r = -ENODATA;
+			goto out;
+		}
+
+		if (exception_code == EC_PROCESS_RUNTIME) {
+			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));
+
+			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
+				r = -EFAULT;
+				goto out;
+			}
+
+			actual_info_size = sizeof(target->runtime_info);
+		}
+
+		exception_status_ptr = &target->exception_status;
+	} else {
+		pr_debug("Bad exception type [%i]\n", exception_code);
+		r = -EINVAL;
+		goto out;
+	}
+
+	*info_size = actual_info_size;
+	if (clear_exception)
+		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
+out:
+	mutex_unlock(&target->event_mutex);
+	return r;
+}
+
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index c64ffd3efc46..58a5f14d1258 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -63,6 +63,12 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
 					uint32_t *watch_id,
 					uint32_t watch_mode);
 int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
+int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
+		uint32_t source_id,
+		uint32_t exception_code,
+		bool clear_exception,
+		void __user *info,
+		uint32_t *info_size);
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 					unsigned int dev_id,
 					unsigned int queue_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 27/29] drm/amdkfd: add debug queue snapshot operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (24 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 26/29] drm/amdkfd: add debug query exception info operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-11-30 23:55   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 28/29] drm/amdkfd: add debug device " Jonathan Kim
                   ` (2 subsequent siblings)
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Allow the debugger to get a snapshot of a specified number of queues
containing various queue property information that is copied to the
debugger.

Since the debugger doesn't know how many queues exist at any given time,
allow the debugger to pass the requested number of snapshots as 0 to get
back the actual number of queues, which it can then use in a subsequent
snapshot request.

To prevent future ABI breakage, pass in the requested entry_size.
The KFD will return its own entry_size so that the debugger can still
log the information in a core dump on a sizing failure.

Also allow the debugger to clear exceptions when doing a snapshot.

v2: change buf_size arg to num_queues for clarity.
fix minimum entry size calculation.

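The two-call pattern looks roughly like this from the debugger side (a
sketch; the fd and pid handling are assumptions and error handling is
omitted):

	struct kfd_ioctl_dbg_trap_args args = {0};
	struct kfd_queue_snapshot_entry *entries;
	uint32_t n;

	args.pid = target_pid;	/* process being debugged */
	args.op = KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT;
	args.queue_snapshot.entry_size = sizeof(*entries);
	args.queue_snapshot.num_queues = 0;	/* query the count only */
	ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);

	n = args.queue_snapshot.num_queues;	/* actual number of queues */
	entries = calloc(n, sizeof(*entries));
	args.queue_snapshot.snapshot_buf_ptr = (uintptr_t)entries;
	args.queue_snapshot.num_queues = n;
	ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
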
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 41 +++++++++++++++++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  4 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +++
 .../amd/amdkfd/kfd_process_queue_manager.c    | 40 ++++++++++++++++++
 5 files changed, 96 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 2c8f107237ee..cea393350980 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2961,6 +2961,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				&args->query_exception_info.info_size);
 		break;
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
+		r = pqm_get_queue_snapshot(&target->pqm,
+				args->queue_snapshot.exception_mask,
+				(void __user *)args->queue_snapshot.snapshot_buf_ptr,
+				&args->queue_snapshot.num_queues,
+				&args->queue_snapshot.entry_size);
+		break;
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
 		pr_warn("Debug op %i not supported yet\n", args->op);
 		r = -EACCES;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 589efbefc8dc..51f8c5676c56 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2950,6 +2950,47 @@ int suspend_queues(struct kfd_process *p,
 	return total_suspended;
 }
 
+static uint32_t set_queue_type_for_user(struct queue_properties *q_props)
+{
+	switch (q_props->type) {
+	case KFD_QUEUE_TYPE_COMPUTE:
+		return q_props->format == KFD_QUEUE_FORMAT_PM4
+					? KFD_IOC_QUEUE_TYPE_COMPUTE
+					: KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
+	case KFD_QUEUE_TYPE_SDMA:
+		return KFD_IOC_QUEUE_TYPE_SDMA;
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
+		return KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
+	default:
+		WARN_ONCE(true, "queue type not recognized!");
+		return 0xffffffff;
+	}
+}
+
+void set_queue_snapshot_entry(struct device_queue_manager *dqm,
+			      struct queue *q,
+			      uint64_t exception_clear_mask,
+			      struct kfd_queue_snapshot_entry *qss_entry)
+{
+	dqm_lock(dqm);
+
+	qss_entry->ring_base_address = q->properties.queue_address;
+	qss_entry->write_pointer_address = (uint64_t)q->properties.write_ptr;
+	qss_entry->read_pointer_address = (uint64_t)q->properties.read_ptr;
+	qss_entry->ctx_save_restore_address =
+				q->properties.ctx_save_restore_area_address;
+	qss_entry->ctx_save_restore_area_size =
+				q->properties.ctx_save_restore_area_size;
+	qss_entry->exception_status = q->properties.exception_status;
+	qss_entry->queue_id = q->properties.queue_id;
+	qss_entry->gpu_id = q->device->id;
+	qss_entry->ring_size = (uint32_t)q->properties.queue_size;
+	qss_entry->queue_type = set_queue_type_for_user(&q->properties);
+	q->properties.exception_status &= ~exception_clear_mask;
+
+	dqm_unlock(dqm);
+}
+
 int debug_lock_and_unmap(struct device_queue_manager *dqm)
 {
 	int r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 12643528684c..094705b932fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -297,6 +297,10 @@ int resume_queues(struct kfd_process *p,
 		bool resume_all_queues,
 		uint32_t num_queues,
 		uint32_t *usr_queue_id_array);
+void set_queue_snapshot_entry(struct device_queue_manager *dqm,
+			      struct queue *q,
+			      uint64_t exception_clear_mask,
+			      struct kfd_queue_snapshot_entry *qss_entry);
 int debug_lock_and_unmap(struct device_queue_manager *dqm);
 int debug_map_and_unlock(struct device_queue_manager *dqm);
 int debug_refresh_runlist(struct device_queue_manager *dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index aee4fe20e676..ebd701143981 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1302,6 +1302,11 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
 		       void __user *ctl_stack,
 		       u32 *ctl_stack_used_size,
 		       u32 *save_area_used_size);
+int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
+			   uint64_t exception_clear_mask,
+			   struct kfd_queue_snapshot_entry __user *buf,
+			   int *num_qss_entries,
+			   uint32_t *entry_size);
 
 int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
 			      uint64_t fence_value,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 15db83c9a585..30df1046c30b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -569,6 +569,46 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
 						       save_area_used_size);
 }
 
+int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
+			   uint64_t exception_clear_mask,
+			   struct kfd_queue_snapshot_entry __user *buf,
+			   int *num_qss_entries,
+			   uint32_t *entry_size)
+{
+	struct process_queue_node *pqn;
+	uint32_t tmp_entry_size = *entry_size, tmp_qss_entries = *num_qss_entries;
+	int r = 0;
+
+	*num_qss_entries = 0;
+	if (!(*entry_size))
+		return -EINVAL;
+
+	*entry_size = min_t(size_t, *entry_size, sizeof(struct kfd_queue_snapshot_entry));
+	mutex_lock(&pqm->process->event_mutex);
+
+	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+		if (!pqn->q)
+			continue;
+
+		if (*num_qss_entries < tmp_qss_entries) {
+			struct kfd_queue_snapshot_entry src = {0};
+
+			set_queue_snapshot_entry(pqn->q->device->dqm,
+					pqn->q, exception_clear_mask, &src);
+
+			if (copy_to_user(buf, &src, *entry_size)) {
+				r = -EFAULT;
+				break;
+			}
+			/* tmp_entry_size is in bytes, not entries */
+			buf = (void __user *)buf + tmp_entry_size;
+		}
+		*num_qss_entries += 1;
+	}
+
+	mutex_unlock(&pqm->process->event_mutex);
+	return r;
+}
+
 static int get_queue_data_sizes(struct kfd_process_device *pdd,
 				struct queue *q,
 				uint32_t *mqd_size,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 28/29] drm/amdkfd: add debug device snapshot operation
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (25 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 27/29] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-12-01  0:00   ` Felix Kuehling
  2022-10-31 16:23 ` [PATCH 29/29] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
  2022-11-22 23:05 ` [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Felix Kuehling
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Similar to the queue snapshot, return an array of device information
using the same entry_size check-and-return convention.
Unlike queue snapshots, the debugger needs to pass the correct number of
devices that exist.  If it fails to do so, the KFD will return the
number of actual devices so that the debugger can make a subsequent
successful call.
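
For illustration, the debugger side can drive this with a simple retry
loop (hypothetical userspace sketch; only the uapi names come from
kfd_ioctl.h, error handling omitted):

	struct kfd_ioctl_dbg_trap_args args = {0};
	struct kfd_dbg_device_info_entry *entries;
	uint32_t n = 1;	/* initial guess at the device count */

retry:
	entries = calloc(n, sizeof(*entries));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT;
	args.device_snapshot.snapshot_buf_ptr = (uint64_t)(uintptr_t)entries;
	args.device_snapshot.num_devices = n;
	args.device_snapshot.entry_size = sizeof(*entries);
	ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
	if (args.device_snapshot.num_devices > n) {
		/* KFD reported the actual device count; retry with it */
		free(entries);
		n = args.device_snapshot.num_devices;
		goto retry;
	}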

v2: change buf_size arg to num_devices for more clarity.
expand device entry new members on copy.
fix minimum entry size calculation for queue and device snapshot.
change device snapshot implementation to match queue snapshot
implementation.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  7 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 67 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index cea393350980..115a80686f7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2968,8 +2968,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				&args->queue_snapshot.entry_size);
 		break;
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
-		pr_warn("Debug op %i not supported yet\n", args->op);
-		r = -EACCES;
+		r = kfd_dbg_trap_device_snapshot(target,
+				args->device_snapshot.exception_mask,
+				(void __user *)args->device_snapshot.snapshot_buf_ptr,
+				&args->device_snapshot.num_devices,
+				&args->device_snapshot.entry_size);
 		break;
 	default:
 		pr_err("Invalid option: %i\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index a05fe32eac0e..8d22a27cc062 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -22,6 +22,7 @@
 
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_topology.h"
 #include <linux/file.h>
 #include <uapi/linux/kfd_ioctl.h>
 
@@ -888,6 +889,72 @@ int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
 	return r;
 }
 
+int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
+		uint64_t exception_clear_mask,
+		void __user *user_info,
+		uint32_t *number_of_device_infos,
+		uint32_t *entry_size)
+{
+	struct kfd_dbg_device_info_entry device_info = {0};
+	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
+	int i, r = 0;
+
+	if (!(target && user_info && number_of_device_infos && entry_size))
+		return -EINVAL;
+
+	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
+	*number_of_device_infos = target->n_pdds;
+	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
+
+	if (!tmp_num_devices)
+		return 0;
+
+	mutex_lock(&target->event_mutex);
+
+	/* Run over all pdd of the process */
+	for (i = 0; i < tmp_num_devices; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
+
+		device_info.gpu_id = pdd->dev->id;
+		device_info.exception_status = pdd->exception_status;
+		device_info.lds_base = pdd->lds_base;
+		device_info.lds_limit = pdd->lds_limit;
+		device_info.scratch_base = pdd->scratch_base;
+		device_info.scratch_limit = pdd->scratch_limit;
+		device_info.gpuvm_base = pdd->gpuvm_base;
+		device_info.gpuvm_limit = pdd->gpuvm_limit;
+		device_info.location_id = topo_dev->node_props.location_id;
+		device_info.vendor_id = topo_dev->node_props.vendor_id;
+		device_info.device_id = topo_dev->node_props.device_id;
+		device_info.fw_version = pdd->dev->mec_fw_version;
+		device_info.gfx_target_version =
+			topo_dev->node_props.gfx_target_version;
+		device_info.simd_count = topo_dev->node_props.simd_count;
+		device_info.max_waves_per_simd =
+			topo_dev->node_props.max_waves_per_simd;
+		device_info.array_count = topo_dev->node_props.array_count;
+		device_info.simd_arrays_per_engine =
+			topo_dev->node_props.simd_arrays_per_engine;
+		device_info.capability = topo_dev->node_props.capability;
+		device_info.debug_prop = topo_dev->node_props.debug_prop;
+
+		if (exception_clear_mask)
+			pdd->exception_status &= ~exception_clear_mask;
+
+		if (copy_to_user(user_info, &device_info, *entry_size)) {
+			r = -EFAULT;
+			break;
+		}
+
+		user_info += tmp_entry_size;
+	}
+
+	mutex_unlock(&target->event_mutex);
+
+	return r;
+}
+
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 58a5f14d1258..d8c0c54fffa3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -80,6 +80,11 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 }
 
 void debug_event_write_work_handler(struct work_struct *work);
+int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
+		uint64_t exception_clear_mask,
+		void __user *user_info,
+		uint32_t *number_of_device_infos,
+		uint32_t *entry_size);
 
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH 29/29] drm/amdkfd: bump kfd ioctl minor version for debug api availability
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (26 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 28/29] drm/amdkfd: add debug device " Jonathan Kim
@ 2022-10-31 16:23 ` Jonathan Kim
  2022-12-01  0:00   ` Felix Kuehling
  2022-11-22 23:05 ` [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Felix Kuehling
  28 siblings, 1 reply; 63+ messages in thread
From: Jonathan Kim @ 2022-10-31 16:23 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling

Bump the minor version to declare debugging capability is now
available.
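
Userspace can gate use of the new debug ioctls on this, e.g.
(hypothetical sketch using the existing get-version ioctl; kfd_fd and
debug_api_available are made up):

	struct kfd_ioctl_get_version_args ver = {0};

	if (!ioctl(kfd_fd, AMDKFD_IOC_GET_VERSION, &ver) &&
	    ver.minor_version >= 12)
		debug_api_available = true;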

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 -
 include/uapi/linux/kfd_ioctl.h           | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 115a80686f7a..2f7d8b230527 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2891,7 +2891,6 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		if (!r)
 			target->exception_enable_mask = args->enable.exception_mask;
 
-		pr_warn("Debug functions limited\n");
 		break;
 	case KFD_IOC_DBG_TRAP_DISABLE:
 		r = kfd_dbg_trap_disable(target);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index bedf1b823f57..fe5acee2684d 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -37,9 +37,10 @@
  * - 1.9 - Add available memory ioctl
  * - 1.10 - Add SMI profiler event log
  * - 1.11 - Add unified memory for ctx save/restore area
+ * - 1.12 - Add debugger API
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 11
+#define KFD_IOCTL_MINOR_VERSION 12
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface
  2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
                   ` (27 preceding siblings ...)
  2022-10-31 16:23 ` [PATCH 29/29] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
@ 2022-11-22 23:05 ` Felix Kuehling
  2022-11-23 20:45   ` Kim, Jonathan
  28 siblings, 1 reply; 63+ messages in thread
From: Felix Kuehling @ 2022-11-22 23:05 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Introduce the GPU debug operations interface.
>
> For ROCm-GDB to extend the GNU Debugger's ability to inspect the AMD GPU
> instruction set, provide the necessary interface to allow the debugger
> to HW debug-mode set and query exceptions per HSA queue, process or
> device.
>
> The runtime_enable interface coordinates exception handling with the
> HSA runtime.
>
> Usage is available in the kern docs at uapi/linux/kfd_ioctl.h.
>
> v2: add more documentation on semantics and error returns.
> expand kfd_dbg_device_info_entry with new fields.
> update device_snapshot semantics to match queue snapshot semantics

This looks really good. I have 3 more nit-picks inline. Other than that, 
this patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>

Do we have a debugger branch that uses the API yet? We should make this 
public in order to complete this upstream code review.


>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 48 ++
> include/uapi/linux/kfd_ioctl.h | 655 ++++++++++++++++++++++-
> 2 files changed, 702 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 5feaba6a77de..11a960c83fb2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2644,6 +2644,48 @@ static int kfd_ioctl_criu(struct file *filep, 
> struct kfd_process *p, void *data)
> return ret;
> }
> +static int kfd_ioctl_runtime_enable(struct file *filep, struct 
> kfd_process *p, void *data)
> +{
> + return 0;
> +}
> +
> +static int kfd_ioctl_set_debug_trap(struct file *filep, struct 
> kfd_process *p, void *data)
> +{
> + struct kfd_ioctl_dbg_trap_args *args = data;
> + int r = 0;
> +
> + if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> + pr_err("Debugging does not support sched_policy %i", sched_policy);
> + return -EINVAL;
> + }
> +
> + switch (args->op) {
> + case KFD_IOC_DBG_TRAP_ENABLE:
> + case KFD_IOC_DBG_TRAP_DISABLE:
> + case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
> + case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
> + case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> + case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
> + case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
> + case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
> + case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
> + case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
> + case KFD_IOC_DBG_TRAP_SET_FLAGS:
> + case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
> + case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> + case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> + case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> + pr_warn("Debugging not supported yet\n");
> + r = -EACCES;
> + break;
> + default:
> + pr_err("Invalid option: %i\n", args->op);
> + r = -EINVAL;
> + }
> +
> + return r;
> +}
> +
> #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
> [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
> .cmd_drv = 0, .name = #ioctl}
> @@ -2753,6 +2795,12 @@ static const struct amdkfd_ioctl_desc 
> amdkfd_ioctls[] = {
> AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
> kfd_ioctl_get_available_memory, 0),
> +
> + AMDKFD_IOCTL_DEF(AMDKFD_IOC_RUNTIME_ENABLE,
> + kfd_ioctl_runtime_enable, 0),
> +
> + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
> + kfd_ioctl_set_debug_trap, 0),
> };
> #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/include/uapi/linux/kfd_ioctl.h 
> b/include/uapi/linux/kfd_ioctl.h
> index 42b60198b6c5..bedf1b823f57 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -109,6 +109,28 @@ struct kfd_ioctl_get_available_memory_args {
> __u32 pad;
> };
> +struct kfd_dbg_device_info_entry {
> + __u64 exception_status;
> + __u64 lds_base;
> + __u64 lds_limit;
> + __u64 scratch_base;
> + __u64 scratch_limit;
> + __u64 gpuvm_base;
> + __u64 gpuvm_limit;
> + __u32 gpu_id;
> + __u32 location_id;
> + __u32 vendor_id;
> + __u32 device_id;
> + __u32 fw_version;
> + __u32 gfx_target_version;
> + __u32 simd_count;
> + __u32 max_waves_per_simd;
> + __u32 array_count;
> + __u32 simd_arrays_per_engine;
> + __u32 capability;
> + __u32 debug_prop;
> +};
> +
> /* For kfd_ioctl_set_memory_policy_args.default_policy and 
> alternate_policy */
> #define KFD_IOC_CACHE_POLICY_COHERENT 0
> #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
> @@ -766,6 +788,631 @@ struct kfd_ioctl_set_xnack_mode_args {
> __s32 xnack_enabled;
> };
> +/* Wave launch override modes */
> +enum kfd_dbg_trap_override_mode {
> + KFD_DBG_TRAP_OVERRIDE_OR = 0,
> + KFD_DBG_TRAP_OVERRIDE_REPLACE = 1
> +};
> +
> +/* Wave launch overrides */
> +enum kfd_dbg_trap_mask {
> + KFD_DBG_TRAP_MASK_FP_INVALID = 1,
> + KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL = 2,
> + KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO = 4,
> + KFD_DBG_TRAP_MASK_FP_OVERFLOW = 8,
> + KFD_DBG_TRAP_MASK_FP_UNDERFLOW = 16,
> + KFD_DBG_TRAP_MASK_FP_INEXACT = 32,
> + KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO = 64,
> + KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH = 128,
> + KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION = 256
> +};
> +
> +/* Wave launch modes */
> +enum kfd_dbg_trap_wave_launch_mode {
> + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL = 0,
> + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT = 1,
> + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_KILL = 2,
> + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG = 3,
> + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_STALL = 4
> +};
> +
> +/* Address watch modes */
> +enum kfd_dbg_trap_address_watch_mode {
> + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_READ = 0,
> + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_NONREAD = 1,
> + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ATOMIC = 2,
> + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ALL = 3
> +};
> +
> +/* Additional wave settings */
> +enum kfd_dbg_trap_flags {
> + KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1,
> + KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP = 2
> +};
> +
> +/* Trap exceptions */
> +enum kfd_dbg_trap_exception_code {
> + EC_NONE = 0,
> + /* per queue */
> + EC_QUEUE_WAVE_ABORT = 1,
> + EC_QUEUE_WAVE_TRAP = 2,
> + EC_QUEUE_WAVE_MATH_ERROR = 3,
> + EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION = 4,
> + EC_QUEUE_WAVE_MEMORY_VIOLATION = 5,
> + EC_QUEUE_WAVE_APERTURE_VIOLATION = 6,
> + EC_QUEUE_PACKET_DISPATCH_DIM_INVALID = 16,
> + EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID = 17,
> + EC_QUEUE_PACKET_DISPATCH_CODE_INVALID = 18,
> + EC_QUEUE_PACKET_RESERVED = 19,
> + EC_QUEUE_PACKET_UNSUPPORTED = 20,
> + EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
> + EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
> + EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
> + EC_QUEUE_PREEMPTION_ERROR = 30,
> + EC_QUEUE_NEW = 31,
> + /* per device */
> + EC_DEVICE_QUEUE_DELETE = 32,
> + EC_DEVICE_MEMORY_VIOLATION = 33,
> + EC_DEVICE_RAS_ERROR = 34,
> + EC_DEVICE_FATAL_HALT = 35,
> + EC_DEVICE_NEW = 36,
> + /* per process */
> + EC_PROCESS_RUNTIME = 48,
> + EC_PROCESS_DEVICE_REMOVE = 49,
> + EC_MAX
> +};
> +
> +/* Mask generated by ecode in kfd_dbg_trap_exception_code */
> +#define KFD_EC_MASK(ecode) (1ULL << (ecode - 1))
> +
> +/* Masks for exception code type checks below */
> +#define KFD_EC_MASK_QUEUE (KFD_EC_MASK(EC_QUEUE_WAVE_ABORT) | \
> + KFD_EC_MASK(EC_QUEUE_WAVE_TRAP) | \
> + KFD_EC_MASK(EC_QUEUE_WAVE_MATH_ERROR) | \
> + KFD_EC_MASK(EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION) | \
> + KFD_EC_MASK(EC_QUEUE_WAVE_MEMORY_VIOLATION) | \
> + KFD_EC_MASK(EC_QUEUE_WAVE_APERTURE_VIOLATION) | \
> + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) | \
> + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) | \
> + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) | \
> + KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) | \
> + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) | \
> + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \
> + KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) | \
> + KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) | \
> + KFD_EC_MASK(EC_QUEUE_NEW))
> +#define KFD_EC_MASK_DEVICE (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) | \
> + KFD_EC_MASK(EC_DEVICE_RAS_ERROR) | \
> + KFD_EC_MASK(EC_DEVICE_FATAL_HALT) | \
> + KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION) | \
> + KFD_EC_MASK(EC_DEVICE_NEW))
> +#define KFD_EC_MASK_PROCESS (KFD_EC_MASK(EC_PROCESS_RUNTIME) | \
> + KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE))
> +
> +/* Checks for exception code types for KFD search */
> +#define KFD_DBG_EC_TYPE_IS_QUEUE(ecode) \
> + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_QUEUE))
> +#define KFD_DBG_EC_TYPE_IS_DEVICE(ecode) \
> + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_DEVICE))
> +#define KFD_DBG_EC_TYPE_IS_PROCESS(ecode) \
> + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_PROCESS))
> +
> +
> +/* Runtime enable states */
> +enum kfd_dbg_runtime_state {
> + DEBUG_RUNTIME_STATE_DISABLED = 0,
> + DEBUG_RUNTIME_STATE_ENABLED = 1,
> + DEBUG_RUNTIME_STATE_ENABLED_BUSY = 2,
> + DEBUG_RUNTIME_STATE_ENABLED_ERROR = 3
> +};
> +
> +/* Runtime enable status */
> +struct kfd_runtime_info {
> + __u64 r_debug;
> + __u32 runtime_state;
> + __u32 ttmp_setup;
> +};
> +
> +/* Enable modes for runtime enable */
> +#define KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK 1
> +#define KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK 2
> +
> +/**
> + * kfd_ioctl_runtime_enable_args - Arguments for runtime enable
> + *
> + * Coordinates debug exception signalling and debug device enablement 
> with runtime.
> + *
> + * @r_debug - pointer to user struct for sharing information between
> + * ROCr and the debugger
> + * @mode_mask - mask to set mode
> + * KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK - enable runtime for 
> debugging, otherwise disable
> + * KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK - enable trap temporary 
> setup (ignore on disable)
> + *
> + * Return - 0 on SUCCESS.
> + * - EBUSY if runtime enable call already pending.
> + * - EEXIST if user queues already active prior to call.
> + * If process is debug enabled, runtime enable will enable debug 
> devices and
> + * wait for debugger process to send runtime exception EC_PROCESS_RUNTIME
> + * to unblock - see kfd_ioctl_dbg_trap_args.
> + *
> + */
> +struct kfd_ioctl_runtime_enable_args {
> + __u64 r_debug;
> + __u32 mode_mask;
> +};
> +
> +/* Queue information */
> +struct kfd_queue_snapshot_entry {
> + __u64 exception_status;
> + __u64 ring_base_address;
> + __u64 write_pointer_address;
> + __u64 read_pointer_address;
> + __u64 ctx_save_restore_address;
> + __u32 queue_id;
> + __u32 gpu_id;
> + __u32 ring_size;
> + __u32 queue_type;
> + __u32 ctx_save_restore_area_size;
> + __u32 reserved;
> +};
> +
> +/* Queue status return for suspend/resume */
> +#define KFD_DBG_QUEUE_ERROR_BIT 30
> +#define KFD_DBG_QUEUE_INVALID_BIT 31
> +#define KFD_DBG_QUEUE_ERROR_MASK (1 << KFD_DBG_QUEUE_ERROR_BIT)
> +#define KFD_DBG_QUEUE_INVALID_MASK (1 << KFD_DBG_QUEUE_INVALID_BIT)
> +
> +/* Context save area header information */
> +struct kfd_context_save_area_header {
> + __u32 control_stack_offset;
> + __u32 control_stack_size;
> + __u32 wave_state_offset;
> + __u32 wave_state_size;
> + __u32 debug_offset;
> + __u32 debug_size;
> + __u64 err_payload_addr;
> + __u32 err_event_id;
> + __u32 reserved1;
> +};
> +
> +/*
> + * Debug operations
> + *
> + * For specifics on usage and return values, see documentation per 
> operation
> + * below. Otherwise, generic error returns apply:
> + * - ESRCH if the process to debug does not exist.
> + *
> + * - EINVAL (with KFD_IOC_DBG_TRAP_ENABLE exempt) if operation
> + * KFD_IOC_DBG_TRAP_ENABLE has not succeeded prior.
> + * Also returns this error if GPU hardware scheduling is not supported.
> + *
> + * - EPERM (with KFD_IOC_DBG_TRAP_DISABLE exempt) if target process 
> is not
> + * PTRACE_ATTACHED. KFD_IOC_DBG_TRAP_DISABLE is exempt to allow
> + * clean up of debug mode as long as process is debug enabled.
> + *
> + * - EACCES if any DBG_HW_OP (debug hardware operation) is requested when
> + * AMDKFD_IOC_RUNTIME_ENABLE has not succeeded prior.
> + *
> + * - ENODEV if any GPU does not support debugging on a DBG_HW_OP call.
> + *
> + * - Other errors may be returned when a DBG_HW_OP occurs while the GPU
> + * is in a fatal state.
> + *
> + */
> +enum kfd_dbg_trap_operations {
> + KFD_IOC_DBG_TRAP_ENABLE = 0,
> + KFD_IOC_DBG_TRAP_DISABLE = 1,
> + KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT = 2,
> + KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED = 3,
> + KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE = 4, /* DBG_HW_OP */
> + KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE = 5, /* DBG_HW_OP */
> + KFD_IOC_DBG_TRAP_SUSPEND_QUEUES = 6, /* DBG_HW_OP */
> + KFD_IOC_DBG_TRAP_RESUME_QUEUES = 7, /* DBG_HW_OP */
> + KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH = 8, /* DBG_HW_OP */
> + KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH = 9, /* DBG_HW_OP */
> + KFD_IOC_DBG_TRAP_SET_FLAGS = 10,
> + KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT = 11,
> + KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO = 12,
> + KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT = 13,
> + KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT = 14
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_enable_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_ENABLE.
> + *
> + * Enables debug session for target process. Call @op 
> KFD_IOC_DBG_TRAP_DISABLE in
> + * kfd_ioctl_dbg_trap_args to disable debug session.
> + *
> + * @exception_mask (IN) - exceptions to raise to the debugger
> + * @rinfo_ptr (IN) - pointer to runtime info buffer (see 
> kfd_runtime_info)
> + * @rinfo_size (IN/OUT) - size of runtime info buffer in bytes
> + * @dbg_fd (IN) - fd the KFD will use to notify the debugger of raised
> + * exceptions set in exception_mask.
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + * Copies KFD saved kfd_runtime_info to @rinfo_ptr on enable.
> + * Size of kfd_runtime saved by the KFD returned to @rinfo_size.
> + * - EBADF if KFD cannot get a reference to dbg_fd.
> + * - EFAULT if KFD cannot copy runtime info to rinfo_ptr.
> + * - EINVAL if target process is already debug enabled.
> + *
> + */
> +struct kfd_ioctl_dbg_trap_enable_args {
> + __u64 exception_mask;
> + __u64 rinfo_ptr;
> + __u32 rinfo_size;
> + __u32 dbg_fd;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_send_runtime_event_args
> + *
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT.
> + * Raises exceptions to runtime.
> + *
> + * @exception_mask (IN) - exceptions to raise to runtime
> + * @gpu_id (IN) - target device id
> + * @queue_id (IN) - target queue id
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + * - ENODEV if gpu_id not found.
> + * If exception_mask contains EC_PROCESS_RUNTIME, unblocks pending
> + * AMDKFD_IOC_RUNTIME_ENABLE call - see kfd_ioctl_runtime_enable_args.
> + * All other exceptions are raised to runtime through err_payload_addr.
> + * See kfd_context_save_area_header.
> + */
> +struct kfd_ioctl_dbg_trap_send_runtime_event_args {
> + __u64 exception_mask;
> + __u32 gpu_id;
> + __u32 queue_id;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_exceptions_enabled_args
> + *
> + * Arguments for KFD_IOC_SET_EXCEPTIONS_ENABLED
> + * Set new exceptions to be raised to the debugger.
> + *
> + * @exception_mask (IN) - new exceptions to raise the debugger
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + */
> +struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args {
> + __u64 exception_mask;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_wave_launch_override_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE
> + * Enable HW exceptions to raise trap.
> + *
> + * @override_mode (IN) - see kfd_dbg_trap_override_mode
> + * @enable_mask (IN/OUT) - reference kfd_dbg_trap_mask.
> + * IN is the override modes requested to be enabled.
> + * OUT is referenced in Return below.
> + * @support_request_mask (IN/OUT) - reference kfd_dbg_trap_mask.
> + * IN is the override modes requested for support check.
> + * OUT is referenced in Return below.
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + * Previous enablement is returned in @enable_mask.
> + * Actual override support is returned in @support_request_mask.
> + * - EINVAL if override mode is not supported.
> + * - EACCES if trap support requested is not actually supported.
> + * i.e. enable_mask (IN) is not a subset of support_request_mask (OUT).
> + * Otherwise it is considered a generic error (see 
> kfd_dbg_trap_operations).
> + */
> +struct kfd_ioctl_dbg_trap_set_wave_launch_override_args {
> + __u32 override_mode;
> + __u32 enable_mask;
> + __u32 support_request_mask;
> + __u32 pad;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_wave_launch_mode_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE
> + * Set wave launch mode.
> + *
> + * @launch_mode (IN) - see kfd_dbg_trap_wave_launch_mode
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + */
> +struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args {
> + __u32 launch_mode;
> + __u32 pad;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_suspend_queues_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_SUSPEND_QUEUES
> + * Suspend queues.
> + *
> + * @exception_mask (IN) - raised exceptions to clear
> + * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per 
> queue id)
> + * to suspend
> + * @num_queues (IN) - number of queues to suspend in @queue_array_ptr
> + * @grace_period (IN) - wave time allowance before preemption
> + * per 1K GPU clock cycle unit
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - Number of queues suspended on SUCCESS.
> + * KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK masked
> + * for each queue id in @queue_array_ptr array reports unsuccessful
> + * suspend reason.
> + * KFD_DBG_QUEUE_ERROR_MASK = HW failure.
> + * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist, is new or
> + * is being destroyed.
> + * Destruction of a suspended queue is blocked until the queue is
> + * resumed. This allows the debugger to access queue information and
> + * its context save area without running into a race condition on
> + * queue destruction.
> + * Automatically copies per queue context save area header information
> + * into the save area base
> + * (see kfd_queue_snapshot_entry and kfd_context_save_area_header).

The last two paragraphs would make more sense as a description above the 
Return statement.
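
As an aside, the status masks described above are consumed on the
debugger side roughly like this (hypothetical sketch;
report_hw_failure() is made up):

	/* queue_ids[] is the array passed via @queue_array_ptr; KFD ORs
	 * the failure bits into each entry on return
	 */
	for (i = 0; i < num_queues; i++) {
		if (queue_ids[i] & KFD_DBG_QUEUE_INVALID_MASK)
			continue;	/* gone, new or being destroyed */
		if (queue_ids[i] & KFD_DBG_QUEUE_ERROR_MASK)
			report_hw_failure(queue_ids[i] &
					  ~KFD_DBG_QUEUE_ERROR_MASK);
	}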


> + */
> +struct kfd_ioctl_dbg_trap_suspend_queues_args {
> + __u64 exception_mask;
> + __u64 queue_array_ptr;
> + __u32 num_queues;
> + __u32 grace_period;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_resume_queues_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_RESUME_QUEUES
> + * Resume queues.
> + *
> + * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per 
> queue id)
> + * to resume
> + * @num_queues (IN) - number of queues to resume in @queue_array_ptr
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - Number of queues resumed on SUCCESS.
> + * KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK mask
> + * for each queue id in @queue_array_ptr array reports unsuccessful
> + * resume reason.
> + * KFD_DBG_QUEUE_ERROR_MASK = HW failure.
> + * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist.
> + */
> +struct kfd_ioctl_dbg_trap_resume_queues_args {
> + __u64 queue_array_ptr;
> + __u32 num_queues;
> + __u32 pad;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_node_address_watch_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH
> + * Sets address watch for device.
> + *
> + * @address (IN) - watch address to set
> + * @mode (IN) - see kfd_dbg_trap_address_watch_mode
> + * @mask (IN) - watch address mask
> + * @gpu_id (IN) - target gpu to set watch point
> + * @id (OUT) - watch id allocated
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + * Allocated watch ID returned to @id.
> + * - ENODEV if gpu_id not found.
> + * - ENOMEM if watch IDs cannot be allocated
> + */
> +struct kfd_ioctl_dbg_trap_set_node_address_watch_args {
> + __u64 address;
> + __u32 mode;
> + __u32 mask;
> + __u32 gpu_id;
> + __u32 id;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_clear_node_address_watch_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH
> + * Clear address watch for device.
> + *
> + * @gpu_id (IN) - target device to clear watch point
> + * @id (IN) - allocated watch id to clear
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + * - ENODEV if gpu_id not found.
> + * - EINVAL if watch ID has not been allocated.
> + */
> +struct kfd_ioctl_dbg_trap_clear_node_address_watch_args {
> + __u32 gpu_id;
> + __u32 id;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_set_flags_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_SET_FLAGS
> + * Sets flags for wave behaviour.
> + *
> + * @flags (IN/OUT) - IN = flags to enable, OUT = flags previously enabled
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + * - EACCES if any debug device does not allow flag options.
> + */
> +struct kfd_ioctl_dbg_trap_set_flags_args {
> + __u32 flags;
> + __u32 pad;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_query_debug_event_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT
> + *
> + * Find one or more raised exceptions. This function can return multiple
> + * exceptions from a single queue or a single device with one call. 
> To find
> + * all raised exceptions, this function must be called repeatedly 
> until it
> + * returns -EAGAIN. Returned exceptions can optionally be cleared by
> + * setting the corresponding bit in the @exception_mask input parameter.
> + * However, clearing an exception prevents retrieving further information
> + * about it with KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO.
> + *
> + * @exception_mask (IN/OUT) - exception to clear (IN) and raised (OUT)
> + * @gpu_id (OUT) - gpu id of exceptions raised
> + * @queue_id (OUT) - queue id of exceptions raised
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on raised exception found
> + * Raised exceptions found are returned in @exception mask
> + * with reported source id returned in @gpu_id or @queue_id.
> + * - EAGAIN if no raised exception has been found
> + */
> +struct kfd_ioctl_dbg_trap_query_debug_event_args {
> + __u64 exception_mask;
> + __u32 gpu_id;
> + __u32 queue_id;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_query_exception_info_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO
> + * Get additional info on raised exception.
> + *
> + * @info_ptr (IN) - pointer to exception info buffer to copy to
> + * @info_size (IN/OUT) - exception info buffer size (bytes)
> + * @source_id (IN) - target gpu or queue id
> + * @exception_code (IN) - target exception
> + * @clear_exception (IN) - clear raised @exception_code exception
> + * (0 = false, 1 = true)
> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + * If @exception_code is EC_DEVICE_MEMORY_VIOLATION, copy @info_size(OUT)
> + * bytes of memory exception data to @info_ptr.
> + * If @exception_code is EC_PROCESS_RUNTIME, copy saved
> + * kfd_runtime_info to @info_ptr.
> + * Actual required @info_ptr size (bytes) is returned in @info_size.
> + */
> +struct kfd_ioctl_dbg_trap_query_exception_info_args {
> + __u64 info_ptr;
> + __u32 info_size;
> + __u32 source_id;
> + __u32 exception_code;
> + __u32 clear_exception;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_get_queue_snapshot_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT
> + * Get queue information.
> + *
> + * @exception_mask (IN) - exceptions raised to clear
> + * @snapshot_buf_ptr (IN) - queue snapshot entry buffer (see 
> kfd_queue_snapshot_entry)
> + * @num_queues (IN/OUT) - number of queue snapshot entries
> + * The debugger specifies the size of the array allocated in @num_queues.
> + * KFD returns the number of queues that actually existed. If this is
> + * larger than the size specified by the debugger, KFD will not overflow
> + * the array allocated by the debugger.
> + *
> + * @entry_size (IN/OUT) - size per entry in bytes
> + * The debugger specifies sizeof(struct kfd_queue_snapshot_entry) in
> + * @entry_size. KFD returns the number of bytes actually populated per
> + * entry. The debugger should use the KFD_IOCTL_MINOR_VERSION to determine
> + * which fields in struct kfd_queue_snapshot_entry are valid. This allows
> + * growing the ABI in a backwards compatible manner.

It's worth mentioning that the @entry_size(in) is also used as stride if 
it is larger than the actual kfd_queue_snapshot_entry.
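
I.e. the copy loop then effectively does (simplified pseudo-C):

	copy_size = min(entry_size_in, sizeof(struct kfd_queue_snapshot_entry));
	for_each_queue(q) {
		copy_to_user(buf, &entry, copy_size);
		buf += entry_size_in;	/* the caller's entry_size is the stride */
	}

so a newer debugger with a larger entry struct still gets correctly
strided entries back from an older KFD.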


> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + * Copies @num_queues(IN) queue snapshot entries of size @entry_size(IN)
> + * into @snapshot_buf_ptr if @num_queues(IN) > 0.
> + * Otherwise return @num_queues(OUT) queue snapshot entries that exist.
> + */
> +struct kfd_ioctl_dbg_trap_queue_snapshot_args {
> + __u64 exception_mask;
> + __u64 snapshot_buf_ptr;
> + __u32 num_queues;
> + __u32 entry_size;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_get_device_snapshot_args
> + *
> + * Arguments for KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT
> + * Get device information.
> + *
> + * @exception_mask (IN) - exceptions raised to clear
> + * @snapshot_buf_ptr (IN) - pointer to snapshot buffer (see 
> kfd_dbg_device_info_entry)
> + * @num_devices (IN/OUT) - number of debug devices to snapshot
> + * The debugger specifies the size of the array allocated in 
> @num_devices.
> + * KFD returns the number of devices that actually existed. If this is
> + * larger than the size specified by the debugger, KFD will not overflow
> + * the array allocated by the debugger.
> + *
> + * @entry_size (IN/OUT) - size per entry in bytes
> + * The debugger specifies sizeof(struct kfd_dbg_device_info_entry) in
> + * @entry_size. KFD returns the number of bytes actually populated. The
> + * debugger should use KFD_IOCTL_MINOR_VERSION to determine which fields
> + * in struct kfd_dbg_device_info_entry are valid. This allows growing the
> + * ABI in a backwards compatible manner.

It's worth mentioning that the @entry_size(in) is also used as stride if 
it is larger than the actual kfd_dbg_device_info_entry.

Regards,
   Felix


> + *
> + * Generic errors apply (see kfd_dbg_trap_operations).
> + * Return - 0 on SUCCESS.
> + * Copies @num_devices(IN) device snapshot entries of size 
> @entry_size(IN)
> + * into @snapshot_buf_ptr if @num_devices(IN) > 0.
> + * Otherwise return @num_devices(OUT) device snapshot entries that exist.
> + */
> +struct kfd_ioctl_dbg_trap_device_snapshot_args {
> + __u64 exception_mask;
> + __u64 snapshot_buf_ptr;
> + __u32 num_devices;
> + __u32 entry_size;
> +};
> +
> +/**
> + * kfd_ioctl_dbg_trap_args
> + *
> + * Arguments to debug target process.
> + *
> + * @pid - target process to debug
> + * @op - debug operation (see kfd_dbg_trap_operations)
> + *
> + * @op determines which union struct args to use.
> + * Refer to kern docs for each kfd_ioctl_dbg_trap_*_args struct.
> + */
> +struct kfd_ioctl_dbg_trap_args {
> + __u32 pid;
> + __u32 op;
> +
> + union {
> + struct kfd_ioctl_dbg_trap_enable_args enable;
> + struct kfd_ioctl_dbg_trap_send_runtime_event_args send_runtime_event;
> + struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args 
> set_exceptions_enabled;
> + struct kfd_ioctl_dbg_trap_set_wave_launch_override_args launch_override;
> + struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args launch_mode;
> + struct kfd_ioctl_dbg_trap_suspend_queues_args suspend_queues;
> + struct kfd_ioctl_dbg_trap_resume_queues_args resume_queues;
> + struct kfd_ioctl_dbg_trap_set_node_address_watch_args 
> set_node_address_watch;
> + struct kfd_ioctl_dbg_trap_clear_node_address_watch_args 
> clear_node_address_watch;
> + struct kfd_ioctl_dbg_trap_set_flags_args set_flags;
> + struct kfd_ioctl_dbg_trap_query_debug_event_args query_debug_event;
> + struct kfd_ioctl_dbg_trap_query_exception_info_args 
> query_exception_info;
> + struct kfd_ioctl_dbg_trap_queue_snapshot_args queue_snapshot;
> + struct kfd_ioctl_dbg_trap_device_snapshot_args device_snapshot;
> + };
> +};
> +
> #define AMDKFD_IOCTL_BASE 'K'
> #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr)
> #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type)
> @@ -877,7 +1524,13 @@ struct kfd_ioctl_set_xnack_mode_args {
> #define AMDKFD_IOC_AVAILABLE_MEMORY \
> AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args)
> +#define AMDKFD_IOC_RUNTIME_ENABLE \
> + AMDKFD_IOWR(0x24, struct kfd_ioctl_runtime_enable_args)
> +
> +#define AMDKFD_IOC_DBG_TRAP \
> + AMDKFD_IOWR(0x25, struct kfd_ioctl_dbg_trap_args)
> +
> #define AMDKFD_COMMAND_START 0x01
> -#define AMDKFD_COMMAND_END 0x24
> +#define AMDKFD_COMMAND_END 0x26
> #endif

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 02/29] drm/amdkfd: display debug capabilities
  2022-10-31 16:23 ` [PATCH 02/29] drm/amdkfd: display debug capabilities Jonathan Kim
@ 2022-11-22 23:08   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-22 23:08 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Expose debug capabilities in the KFD topology node's HSA capabilities and
> debug properties flags.
>
> Ensure correct capabilities are exposed based on firmware support.
>
> Flag definitions can be referenced in uapi/linux/kfd_sysfs.h.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 88 +++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.h |  6 ++
>   include/uapi/linux/kfd_sysfs.h            | 15 ++++
>   3 files changed, 104 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 3f0a4a415907..cd5933a594de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -551,6 +551,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
>   				      dev->gpu->mec_fw_version);
>   		sysfs_show_32bit_prop(buffer, offs, "capability",
>   				      dev->node_props.capability);
> +		sysfs_show_64bit_prop(buffer, offs, "debug_prop",
> +				      dev->node_props.debug_prop);
>   		sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version",
>   				      dev->gpu->sdma_fw_version);
>   		sysfs_show_64bit_prop(buffer, offs, "unique_id",
> @@ -1593,6 +1595,84 @@ static int kfd_dev_create_p2p_links(void)
>   	return ret;
>   }
>   
> +static void kfd_topology_set_dbg_firmware_support(struct kfd_topology_device *dev)
> +{
> +	bool firmware_supported = true;
> +
> +	/*
> +	 * Note: Any unlisted devices here are assumed to support exception handling.
> +	 * Add additional checks here as needed.
> +	 */
> +	switch (KFD_GC_VERSION(dev->gpu)) {
> +	case IP_VERSION(9, 0, 1): /* Vega10 */
> +		firmware_supported = dev->gpu->mec_fw_version >= 459 + 32768;
> +		break;
> +	case IP_VERSION(9, 1, 0): /* Raven */
> +	case IP_VERSION(9, 2, 1): /* Vega12 */
> +	case IP_VERSION(9, 2, 2): /* Raven */
> +	case IP_VERSION(9, 3, 0): /* Renoir */
> +	case IP_VERSION(9, 4, 0): /* Vega20 */
> +		firmware_supported = dev->gpu->mec_fw_version >= 459;
> +		break;
> +	case IP_VERSION(9, 4, 1): /* Arcturus */
> +		firmware_supported = dev->gpu->mec_fw_version >= 60;
> +		break;
> +	case IP_VERSION(9, 4, 2): /* Aldebaran */
> +		firmware_supported = dev->gpu->mec_fw_version >= 51;
> +		break;
> +	case IP_VERSION(10, 1, 10): /* Navi10 */
> +	case IP_VERSION(10, 1, 2): /* Navi12 */
> +	case IP_VERSION(10, 1, 1): /* Navi14 */
> +		firmware_supported = dev->gpu->mec_fw_version >= 144;
> +		break;
> +	case IP_VERSION(10, 3, 0): /* Sienna Cichlid */
> +	case IP_VERSION(10, 3, 2): /* Navy Flounder */
> +	case IP_VERSION(10, 3, 1): /* Van Gogh */
> +	case IP_VERSION(10, 3, 4): /* Dimgrey Cavefish */
> +	case IP_VERSION(10, 3, 5): /* Beige Goby */
> +		firmware_supported = dev->gpu->mec_fw_version >= 89;
> +		break;
> +	case IP_VERSION(10, 1, 3): /* Cyan Skillfish */
> +	case IP_VERSION(10, 3, 3): /* Yellow Carp */
> +		firmware_supported = false;
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	if (firmware_supported)
> +		dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED;
> +}
> +
> +static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
> +{
> +	dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
> +				HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
> +				HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
> +
> +	dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_SUPPORT |
> +			HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED |
> +			HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED;
> +
> +	if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) {
> +		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 |
> +						HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
> +
> +		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 4, 2))
> +			dev->node_props.debug_prop |=
> +				HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
> +		else
> +			dev->node_props.capability |=
> +				HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
> +	} else {
> +		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 |
> +					HSA_DBG_WATCH_ADDR_MASK_HI_BIT |
> +					HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
> +	}
> +
> +	kfd_topology_set_dbg_firmware_support(dev);
> +}
> +
>   int kfd_topology_add_device(struct kfd_dev *gpu)
>   {
>   	uint32_t gpu_id;
> @@ -1737,13 +1817,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>   			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
>   		break;
>   	default:
> -		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 0, 1))
> -			dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
> -				HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
> -				HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
> -		else
> +		if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 0, 1))
>   			WARN(1, "Unexpected ASIC family %u",
>   			     dev->gpu->adev->asic_type);
> +		else
> +			kfd_topology_set_capabilities(dev);
>   	}
>   
>   	/*
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> index 9f6c949186c1..c089c26a0e77 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> @@ -31,6 +31,11 @@
>   
>   #define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 32
>   
> +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9	6
> +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10	7
> +#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT  \
> +			(29 << HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT)
> +
>   struct kfd_node_properties {
>   	uint64_t hive_id;
>   	uint32_t cpu_cores_count;
> @@ -42,6 +47,7 @@ struct kfd_node_properties {
>   	uint32_t cpu_core_id_base;
>   	uint32_t simd_id_base;
>   	uint32_t capability;
> +	uint64_t debug_prop;
>   	uint32_t max_waves_per_simd;
>   	uint32_t lds_size_in_kb;
>   	uint32_t gds_size_in_kb;
> diff --git a/include/uapi/linux/kfd_sysfs.h b/include/uapi/linux/kfd_sysfs.h
> index 3e330f368917..a51b7331e0b4 100644
> --- a/include/uapi/linux/kfd_sysfs.h
> +++ b/include/uapi/linux/kfd_sysfs.h
> @@ -43,6 +43,11 @@
>   #define HSA_CAP_DOORBELL_TYPE_2_0		0x2
>   #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP		0x00004000
>   
> +#define HSA_CAP_TRAP_DEBUG_SUPPORT              0x00008000
> +#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED  0x00010000
> +#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED           0x00020000
> +#define HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED  0x00040000
> +
>   /* Old buggy user mode depends on this being 0 */
>   #define HSA_CAP_RESERVED_WAS_SRAM_EDCSUPPORTED	0x00080000
>   
> @@ -53,8 +58,18 @@
>   #define HSA_CAP_SRAM_EDCSUPPORTED		0x04000000
>   #define HSA_CAP_SVMAPI_SUPPORTED		0x08000000
>   #define HSA_CAP_FLAGS_COHERENTHOSTACCESS	0x10000000
> +#define HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED   0x20000000
>   #define HSA_CAP_RESERVED			0xe00f8000
>   
> +/* debug_prop bits in node properties */
> +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK     0x0000000f
> +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_SHIFT    0
> +#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_MASK     0x000003f0
> +#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT    4
> +#define HSA_DBG_DISPATCH_INFO_ALWAYS_VALID      0x00000400
> +#define HSA_DBG_WATCHPOINTS_EXCLUSIVE           0x00000800
> +#define HSA_DBG_RESERVED                0xfffffffffffff000ull
> +
>   /* Heap types in memory properties */
>   #define HSA_MEM_HEAP_TYPE_SYSTEM	0
>   #define HSA_MEM_HEAP_TYPE_FB_PUBLIC	1

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 03/29] drm/amdkfd: prepare per-process debug enable and disable
  2022-10-31 16:23 ` [PATCH 03/29] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
@ 2022-11-22 23:31   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-22 23:31 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> The ROCm debugger will attach to a process to debug by PTRACE and will
> expect the KFD to prepare a process for the target PID, whether the
> target PID has opened the KFD device or not.
>
> This patch is to explicitly handle this requirement.  Further HW mode
> setting and runtime coordination requirements will be handled in
> following patches.
>
> In the case where the target process has not opened the KFD device,
> a new KFD process must be created for the target PID.
> In this case, neither the debugger nor the target process will have
> acquired any VMs, so handle process restoration to account for this
> correctly.
>
> To coordinate with HSA runtime, the debugger must be aware of the target
> process' runtime enablement status and will copy the runtime status
> information into the debugged KFD process for later query.
>
> On enablement, the debugger will subscribe to a set of exceptions where
> each exception event will notify the debugger through a pollable FIFO
> file descriptor that the debugger provides to the KFD to manage.
> Some events will be raised synchronously while others are scheduled,
> which is why a debug_event_workarea worker is initialized.
>
> Finally on process termination of either the debugger or the target,
> debugging must be disabled if it has not been done so.
>
> v2: relax debug trap disable and PTRACE ATTACH requirement.

One potential bug and one nit-pick inline ...


>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/Makefile           |  3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 88 +++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 91 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        | 33 +++++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 24 ++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 34 ++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 65 +++++++++----
>   7 files changed, 309 insertions(+), 29 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index e758c2a24cd0..747754428073 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -55,7 +55,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>   		$(AMDKFD_PATH)/kfd_int_process_v9.o \
>   		$(AMDKFD_PATH)/kfd_int_process_v11.o \
>   		$(AMDKFD_PATH)/kfd_smi_events.o \
> -		$(AMDKFD_PATH)/kfd_crat.o
> +		$(AMDKFD_PATH)/kfd_crat.o \
> +		$(AMDKFD_PATH)/kfd_debug.o
>   
>   ifneq ($(CONFIG_AMD_IOMMU_V2),)
>   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 11a960c83fb2..d550dbe570fb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -44,6 +44,7 @@
>   #include "amdgpu_amdkfd.h"
>   #include "kfd_smi_events.h"
>   #include "amdgpu_dma_buf.h"
> +#include "kfd_debug.h"
>   
>   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>   static int kfd_open(struct inode *, struct file *);
> @@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file *filep)
>   		return -EPERM;
>   	}
>   
> -	process = kfd_create_process(filep);
> +	process = kfd_create_process(current);
>   	if (IS_ERR(process))
>   		return PTR_ERR(process);
>   
> +	if (kfd_process_init_cwsr_apu(process, filep)) {
> +		kfd_unref_process(process);
> +		return -EFAULT;
> +	}
> +
>   	if (kfd_is_locked()) {
>   		dev_dbg(kfd_device, "kfd is locked!\n"
>   				"process %d unreferenced", process->pasid);
> @@ -2652,6 +2658,9 @@ static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, v
>   static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
>   {
>   	struct kfd_ioctl_dbg_trap_args *args = data;
> +	struct task_struct *thread = NULL;
> +	struct pid *pid = NULL;
> +	struct kfd_process *target = NULL;
>   	int r = 0;
>   
>   	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> @@ -2659,9 +2668,71 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		return -EINVAL;
>   	}
>   
> +	pid = find_get_pid(args->pid);
> +	if (!pid) {
> +		pr_debug("Cannot find pid info for %i\n", args->pid);
> +		r = -ESRCH;
> +		goto out;
> +	}
> +
> +	thread = get_pid_task(pid, PIDTYPE_PID);
> +
> +	if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
> +		bool create_process;
> +
> +		rcu_read_lock();
> +		create_process = thread && thread != current && ptrace_parent(thread) == current;
> +		rcu_read_unlock();
> +
> +		target = create_process ? kfd_create_process(thread) :
> +					kfd_lookup_process_by_pid(pid);
> +	} else {
> +		target = kfd_lookup_process_by_pid(pid);
> +	}
> +
> +	if (!target) {
> +		pr_debug("Cannot find process PID %i to debug\n", args->pid);
> +		r = -ESRCH;
> +		goto out;
> +	}
> +
> +	/* Check if target is still PTRACED. */
> +	rcu_read_lock();
> +	if (target != p && args->op == KFD_IOC_DBG_TRAP_DISABLE
> +				&& ptrace_parent(target->lead_thread) != current) {

Should this say args->op != KFD_IOC_DBG_TRAP_DISABLE? I think that's the 
only op that would be allowed when the process is not ptrace attached.
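
I.e. something like (untested):

	if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
				&& ptrace_parent(target->lead_thread) != current) {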


> +		pr_err("PID %i is not PTRACED and cannot be debugged\n", args->pid);
> +		r = -EPERM;
> +	}
> +	rcu_read_unlock();
> +
> +	if (r)
> +		goto out;
> +
> +	mutex_lock(&target->mutex);
> +
> +	if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target->debug_trap_enabled) {
> +		pr_err("PID %i not debug enabled for op %i\n", args->pid, args->op);
> +		r = -EINVAL;
> +		goto unlock_out;
> +	}
> +
>   	switch (args->op) {
>   	case KFD_IOC_DBG_TRAP_ENABLE:
> +		if (target != p)
> +			target->debugger_process = p;
> +
> +		r = kfd_dbg_trap_enable(target,
> +					args->enable.dbg_fd,
> +					(void __user *)args->enable.rinfo_ptr,
> +					&args->enable.rinfo_size);
> +		if (!r)
> +			target->exception_enable_mask = args->enable.exception_mask;
> +
> +		pr_warn("Debug functions limited\n");
> +		break;
>   	case KFD_IOC_DBG_TRAP_DISABLE:
> +		r = kfd_dbg_trap_disable(target);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
>   	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> @@ -2675,7 +2746,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> -		pr_warn("Debugging not supported yet\n");
> +		pr_warn("Debug op %i not supported yet\n", args->op);
>   		r = -EACCES;
>   		break;
>   	default:
> @@ -2683,6 +2754,19 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		r = -EINVAL;
>   	}
>   
> +unlock_out:
> +	mutex_unlock(&target->mutex);
> +
> +out:
> +	if (thread)
> +		put_task_struct(thread);
> +
> +	if (pid)
> +		put_pid(pid);
> +
> +	if (target)
> +		kfd_unref_process(target);
> +
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> new file mode 100644
> index 000000000000..f967f89903f7
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -0,0 +1,91 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include "kfd_debug.h"
> +#include <linux/file.h>
> +
> +void debug_event_write_work_handler(struct work_struct *work)
> +{
> +	struct kfd_process *process;
> +
> +	static const char write_data = '.';
> +	loff_t pos = 0;
> +
> +	process = container_of(work,
> +			struct kfd_process,
> +			debug_event_workarea);
> +
> +	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
> +}
> +
> +int kfd_dbg_trap_disable(struct kfd_process *target)
> +{
> +	fput(target->dbg_ev_file);
> +	target->dbg_ev_file = NULL;
> +
> +	if (target->debugger_process) {
> +		atomic_dec(&target->debugger_process->debugged_process_count);
> +		target->debugger_process = NULL;
> +	}
> +
> +	target->debug_trap_enabled = false;
> +	kfd_unref_process(target);
> +
> +	return 0;
> +}
> +
> +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> +			void __user *runtime_info, uint32_t *runtime_size)
> +{
> +	struct file *f;
> +	uint32_t copy_size;
> +	int r = 0;
> +
> +	if (target->debug_trap_enabled)
> +		return -EINVAL;
> +
> +	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
> +
> +	f = fget(fd);
> +	if (!f) {
> +		pr_err("Failed to get file for (%i)\n", fd);
> +		return -EBADF;
> +	}
> +
> +	target->dbg_ev_file = f;
> +
> +	/* We already hold the process reference but hold another one for the
> +	 * debug session.
> +	 */
> +	kref_get(&target->ref);
> +	target->debug_trap_enabled = true;
> +
> +	if (target->debugger_process)
> +		atomic_inc(&target->debugger_process->debugged_process_count);
> +
> +	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
> +		r = -EFAULT;
> +
> +	*runtime_size = sizeof(target->runtime_info);
> +
> +	return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> new file mode 100644
> index 000000000000..b2217eb1399c
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -0,0 +1,33 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
> +#define KFD_DEBUG_EVENTS_H_INCLUDED
> +
> +#include "kfd_priv.h"
> +
> +int kfd_dbg_trap_disable(struct kfd_process *target);
> +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> +			void __user *runtime_info,
> +			uint32_t *runtime_info_size);
> +void debug_event_write_work_handler(struct work_struct *work);
> +#endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ecb4c3abc629..faa5d8c666ee 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   		goto out;
>   
>   	pdd = qpd_to_pdd(qpd);
> +
> +	/* The debugger creates processes that temporarily have not acquired
> +	 * all VMs for all devices and have no VMs themselves.
> +	 * Skip queue eviction on process eviction.
> +	 */
> +	if (!pdd->drm_priv)
> +		goto out;
> +
>   	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
>   			    pdd->process->pasid);
>   
> @@ -1100,13 +1108,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   {
>   	struct queue *q;
>   	struct kfd_process_device *pdd;
> -	uint64_t pd_base;
>   	uint64_t eviction_duration;
>   	int retval = 0;
>   
>   	pdd = qpd_to_pdd(qpd);
> -	/* Retrieve PD base */
> -	pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
>   
>   	dqm_lock(dqm);
>   	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
> @@ -1116,12 +1121,19 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   		goto out;
>   	}
>   
> +	/* The debugger creates processes that temporarily have not acquired
> +	 * all VMs for all devices and have no VMs themselves.
> +	 * Skip queue restore on process restore.
> +	 */
> +	if (!pdd->drm_priv)
> +		goto vm_not_acquired;
> +
>   	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
>   			    pdd->process->pasid);
>   
>   	/* Update PD Base in QPD */
> -	qpd->page_table_base = pd_base;
> -	pr_debug("Updated PD address to 0x%llx\n", pd_base);
> +	qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
> +	pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
>   
>   	/* activate all active queues on the qpd */
>   	list_for_each_entry(q, &qpd->queues_list, list) {
> @@ -1147,6 +1159,8 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   	qpd->evicted = 0;
>   	eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
>   	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
> +vm_not_acquired:
> +	qpd->evicted = 0;

qpd->evicted = 0 is duplicated here. It's only needed in one place.
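
Something like this (untested) would keep the single assignment, with the
normal path falling through to the label:

	eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
vm_not_acquired:
	qpd->evicted = 0;
out:
	dqm_unlock(dqm);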

Regards,
   Felix


>   out:
>   	dqm_unlock(dqm);
>   	return retval;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index bf610e3b683b..3ea61fa1db52 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -886,19 +886,48 @@ struct kfd_process {
>   	 */
>   	unsigned long last_restore_timestamp;
>   
> +	/* Indicates the device process is debug-attached with a reserved vmid. */
> +	bool debug_trap_enabled;
> +
> +	/* per-process, per-device debug event fd file */
> +	struct file *dbg_ev_file;
> +
> +	/* If the process is a kfd debugger, we need to know so we can clean
> +	 * up at exit time.  If a process enables debugging on itself, it does
> +	 * its own clean-up, so we don't set the flag here.  We track this by
> +	 * counting the number of processes this process is debugging.
> +	 */
> +	atomic_t debugged_process_count;
> +
> +	/* If the process is being debugged, this is the debugger process */
> +	struct kfd_process *debugger_process;
> +
>   	/* Kobj for our procfs */
>   	struct kobject *kobj;
>   	struct kobject *kobj_queues;
>   	struct attribute attr_pasid;
>   
> +	/* Keep track of CWSR init */
> +	bool has_cwsr;
> +
> +	/* Exception code enable mask and status */
> +	uint64_t exception_enable_mask;
> +
>   	/* shared virtual memory registered by this process */
>   	struct svm_range_list svms;
>   
>   	bool xnack_enabled;
>   
> +	/* Work area for debugger event writer worker. */
> +	struct work_struct debug_event_workarea;
> +
>   	atomic_t poison;
>   	/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
>   	bool queues_paused;
> +
> +	/* Tracks runtime enable status */
> +	struct kfd_runtime_info runtime_info;
> +
>   };
>   
>   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> @@ -928,7 +957,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
>   
>   int kfd_process_create_wq(void);
>   void kfd_process_destroy_wq(void);
> -struct kfd_process *kfd_create_process(struct file *filep);
> +struct kfd_process *kfd_create_process(struct task_struct *thread);
>   struct kfd_process *kfd_get_process(const struct task_struct *task);
>   struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
>   struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
> @@ -1055,6 +1084,9 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>   				  uint64_t tba_addr,
>   				  uint64_t tma_addr);
>   
> +/* CWSR initialization */
> +int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
> +
>   /* CRIU */
>   /*
>    * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 951b63677248..56ad38fcd26e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -44,6 +44,7 @@ struct mm_struct;
>   #include "kfd_iommu.h"
>   #include "kfd_svm.h"
>   #include "kfd_smi_events.h"
> +#include "kfd_debug.h"
>   
>   /*
>    * List of struct kfd_process (field kfd_process).
> @@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct task_struct *thread,
>   					bool ref);
>   static void kfd_process_ref_release(struct kref *ref);
>   static struct kfd_process *create_process(const struct task_struct *thread);
> -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
>   
>   static void evict_process_worker(struct work_struct *work);
>   static void restore_process_worker(struct work_struct *work);
> @@ -798,18 +798,19 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
>   	kfd_process_free_gpuvm(qpd->ib_mem, pdd, qpd->ib_kaddr);
>   }
>   
> -struct kfd_process *kfd_create_process(struct file *filep)
> +struct kfd_process *kfd_create_process(struct task_struct *thread)
>   {
>   	struct kfd_process *process;
> -	struct task_struct *thread = current;
>   	int ret;
>   
> -	if (!thread->mm)
> +	if (!(thread->mm && mmget_not_zero(thread->mm)))
>   		return ERR_PTR(-EINVAL);
>   
>   	/* Only the pthreads threading model is supported. */
> -	if (thread->group_leader->mm != thread->mm)
> +	if (thread->group_leader->mm != thread->mm) {
> +		mmput(thread->mm);
>   		return ERR_PTR(-EINVAL);
> +	}
>   
>   	/*
>   	 * take kfd processes mutex before starting of process creation
> @@ -827,10 +828,6 @@ struct kfd_process *kfd_create_process(struct file *filep)
>   		if (IS_ERR(process))
>   			goto out;
>   
> -		ret = kfd_process_init_cwsr_apu(process, filep);
> -		if (ret)
> -			goto out_destroy;
> -
>   		if (!procfs.kobj)
>   			goto out;
>   
> @@ -864,16 +861,9 @@ struct kfd_process *kfd_create_process(struct file *filep)
>   	if (!IS_ERR(process))
>   		kref_get(&process->ref);
>   	mutex_unlock(&kfd_processes_mutex);
> +	mmput(thread->mm);
>   
>   	return process;
> -
> -out_destroy:
> -	hash_del_rcu(&process->kfd_processes);
> -	mutex_unlock(&kfd_processes_mutex);
> -	synchronize_srcu(&kfd_processes_srcu);
> -	/* kfd_process_free_notifier will trigger the cleanup */
> -	mmu_notifier_put(&process->mmu_notifier);
> -	return ERR_PTR(ret);
>   }
>   
>   struct kfd_process *kfd_get_process(const struct task_struct *thread)
> @@ -1188,6 +1178,28 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>   	cancel_delayed_work_sync(&p->eviction_work);
>   	cancel_delayed_work_sync(&p->restore_work);
>   
> +	if (p->debug_trap_enabled)
> +		kfd_dbg_trap_disable(p);
> +
> +	if (atomic_read(&p->debugged_process_count) > 0) {
> +		struct kfd_process *target;
> +		unsigned int temp;
> +		int idx = srcu_read_lock(&kfd_processes_srcu);
> +
> +		hash_for_each_rcu(kfd_processes_table, temp, target, kfd_processes) {
> +			if (target->debugger_process && target->debugger_process == p) {
> +				mutex_lock_nested(&target->mutex, 1);
> +				if (target->debug_trap_enabled)
> +					kfd_dbg_trap_disable(target);
> +				mutex_unlock(&target->mutex);
> +				if (atomic_read(&p->debugged_process_count) == 0)
> +					break;
> +			}
> +		}
> +
> +		srcu_read_unlock(&kfd_processes_srcu, idx);
> +	}
> +
>   	/* Indicate to other users that MM is no longer valid */
>   	p->mm = NULL;
>   
> @@ -1200,11 +1212,14 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
>   	.free_notifier = kfd_process_free_notifier,
>   };
>   
> -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> +int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
>   {
>   	unsigned long  offset;
>   	int i;
>   
> +	if (p->has_cwsr)
> +		return 0;
> +
>   	for (i = 0; i < p->n_pdds; i++) {
>   		struct kfd_dev *dev = p->pdds[i]->dev;
>   		struct qcm_process_device *qpd = &p->pdds[i]->qpd;
> @@ -1233,6 +1248,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
>   			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
>   	}
>   
> +	p->has_cwsr = true;
> +
>   	return 0;
>   }
>   
> @@ -1375,6 +1392,10 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>   	if (err)
>   		goto err_event_init;
>   	process->is_32bit_user_mode = in_compat_syscall();
> +	process->debug_trap_enabled = false;
> +	process->debugger_process = NULL;
> +	process->exception_enable_mask = 0;
> +	atomic_set(&process->debugged_process_count, 0);
>   
>   	process->pasid = kfd_pasid_alloc();
>   	if (process->pasid == 0) {
> @@ -1422,6 +1443,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>   	kfd_unref_process(process);
>   	get_task_struct(process->lead_thread);
>   
> +	INIT_WORK(&process->debug_event_workarea, debug_event_write_work_handler);
> +
>   	return process;
>   
>   err_register_notifier:
> @@ -1894,8 +1917,10 @@ static void restore_process_worker(struct work_struct *work)
>   	 */
>   
>   	p->last_restore_timestamp = get_jiffies_64();
> -	ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> -						     &p->ef);
> +	/* VMs may not have been acquired yet during debugging. */
> +	if (p->kgd_process_info)
> +		ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> +							     &p->ef);
>   	if (ret) {
>   		pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
>   			 p->pasid, PROCESS_BACK_OFF_TIME_MS);

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization
  2022-10-31 16:23 ` [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
@ 2022-11-22 23:38   ` Felix Kuehling
  2022-11-23 20:53     ` Kim, Jonathan
  2022-12-01  0:18     ` Felix Kuehling
  2022-12-01  0:23   ` Felix Kuehling
  1 sibling, 2 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-22 23:38 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Add missing debug trap registers references and initialize all debug
> registers on boot by clearing the hardware exception overrides and the
> wave allocation ID index.
>
> For debug devices that only support single process debugging, enable
> trap temporary setup by default.
>
> Debug devices that support multi-process debugging require trap
> temporary setup to be disabled by default in order to satisfy microbench
> performance when in non-debug mode.

Where is this done? I don't think it's in the MQD setup because that 
happens unconditionally on all GPUs.


>
> The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
> waves onto dispatch during compute context inspection.
> In order to correctly this up, set the special reserved CP bit by default
> whenever the MQD is initialized.

There is a word missing here. "In order to correctly _set_ this up ..."?

This patch covers GFXv9 and 10. Will GFXv11 be handled separately?

Regards,
   Felix


>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 26 +++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 30 ++++++++
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
>   .../include/asic_reg/gc/gc_10_1_0_offset.h    | 14 ++++
>   .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++++++++++++++++++
>   .../include/asic_reg/gc/gc_10_3_0_offset.h    | 10 +++
>   .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
>   8 files changed, 163 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index af94ac580d3e..d49aff0b4ba3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4904,6 +4904,29 @@ static u32 gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
>   
>   #define DEFAULT_SH_MEM_BASES	(0x6000)
>   
> +static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
> +				uint32_t first_vmid,
> +				uint32_t last_vmid)
> +{
> +	uint32_t data;
> +	uint32_t trap_config_vmid_mask = 0;
> +	int i;
> +
> +	/* Calculate trap config vmid mask */
> +	for (i = first_vmid; i < last_vmid; i++)
> +		trap_config_vmid_mask |= (1 << i);
> +
> +	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> +			VMID_SEL, trap_config_vmid_mask);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> +			TRAP_EN, 1);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> +}
> +
>   static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>   {
>   	int i;
> @@ -4935,6 +4958,9 @@ static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>   		WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
>   		WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
>   	}
> +
> +	gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
> +					AMDGPU_NUM_VMID);
>   }
>   
>   static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 0320be4a5fc6..a0e5ad342f13 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2337,6 +2337,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
>   	adev->gfx.config.num_rbs = hweight32(active_rbs);
>   }
>   
> +static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
> +				uint32_t first_vmid,
> +				uint32_t last_vmid)
> +{
> +	uint32_t data;
> +	uint32_t trap_config_vmid_mask = 0;
> +	int i;
> +
> +	/* Calculate trap config vmid mask */
> +	for (i = first_vmid; i < last_vmid; i++)
> +		trap_config_vmid_mask |= (1 << i);
> +
> +	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> +			VMID_SEL, trap_config_vmid_mask);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> +			TRAP_EN, 1);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> +}
> +
>   #define DEFAULT_SH_MEM_BASES	(0x6000)
>   static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
>   {
> @@ -4609,6 +4632,13 @@ static int gfx_v9_0_late_init(void *handle)
>   	if (r)
>   		return r;
>   
> +	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
> +		gfx_v9_4_2_debug_trap_config_init(adev,
> +			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
> +	else
> +		gfx_v9_0_debug_trap_config_init(adev,
> +			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> index d3e2b6a599a4..cb484ace17de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> @@ -117,6 +117,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
>   			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
>   			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
>   
> +	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> +	 * DISPATCH_PTR.  This is required for the kfd debugger
> +	 */
> +	m->cp_hqd_hq_scheduler0 = 1 << 14;
> +
>   	if (q->format == KFD_QUEUE_FORMAT_AQL) {
>   		m->cp_hqd_aql_control =
>   			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> index 0778e587a2d6..86f1cf090246 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> @@ -164,6 +164,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
>   			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
>   			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
>   
> +	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> +	 * DISPATCH_PTR.  This is required for the kfd debugger
> +	 */
> +	m->cp_hqd_hq_status0 = 1 << 14;
> +
>   	if (q->format == KFD_QUEUE_FORMAT_AQL) {
>   		m->cp_hqd_aql_control =
>   			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> index 18d34bbceebe..7d384f86bd67 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> @@ -5190,6 +5190,20 @@
>   #define mmSPI_WCL_PIPE_PERCENT_CS6_BASE_IDX                                                            0
>   #define mmSPI_WCL_PIPE_PERCENT_CS7                                                                     0x1f70
>   #define mmSPI_WCL_PIPE_PERCENT_CS7_BASE_IDX                                                            0
> +#define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
> +#define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
> +#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
> +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
> +#define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
> +#define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
> +#define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
> +#define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
> +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
> +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
> +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
>   #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
>   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
>   #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> index 4127896ffcdf..08772ba845b0 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> @@ -19646,6 +19646,75 @@
>   //SPI_WCL_PIPE_PERCENT_CS7
>   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE__SHIFT                                                                0x0
>   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE_MASK                                                                  0x7FL
> +//SPI_GDBG_WAVE_CNTL
> +#define SPI_GDBG_WAVE_CNTL__STALL_RA__SHIFT                                                                   0x0
> +#define SPI_GDBG_WAVE_CNTL__STALL_VMID__SHIFT                                                                 0x1
> +#define SPI_GDBG_WAVE_CNTL__STALL_RA_MASK                                                                     0x00000001L
> +#define SPI_GDBG_WAVE_CNTL__STALL_VMID_MASK                                                                   0x0001FFFEL
> +//SPI_GDBG_TRAP_CONFIG
> +#define SPI_GDBG_TRAP_CONFIG__ME_SEL__SHIFT                                                                   0x0
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL__SHIFT                                                                 0x2
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL__SHIFT                                                                0x4
> +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH__SHIFT                                                                 0x7
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH__SHIFT                                                               0x8
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH__SHIFT                                                              0x9
> +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN__SHIFT                                                                  0xf
> +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL__SHIFT                                                                 0x10
> +#define SPI_GDBG_TRAP_CONFIG__ME_SEL_MASK                                                                     0x00000003L
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL_MASK                                                                   0x0000000CL
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL_MASK                                                                  0x00000070L
> +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH_MASK                                                                   0x00000080L
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH_MASK                                                                 0x00000100L
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH_MASK                                                                0x00000200L
> +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN_MASK                                                                    0x00008000L
> +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL_MASK                                                                   0xFFFF0000L
> +//SPI_GDBG_TRAP_MASK
> +#define SPI_GDBG_TRAP_MASK__EXCP_EN__SHIFT                                                                    0x0
> +#define SPI_GDBG_TRAP_MASK__REPLACE__SHIFT                                                                    0x9
> +#define SPI_GDBG_TRAP_MASK__EXCP_EN_MASK                                                                      0x01FFL
> +#define SPI_GDBG_TRAP_MASK__REPLACE_MASK                                                                      0x0200L
> +//SPI_GDBG_WAVE_CNTL2
> +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK__SHIFT                                                                 0x0
> +#define SPI_GDBG_WAVE_CNTL2__MODE__SHIFT                                                                      0x10
> +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK_MASK                                                                   0x0000FFFFL
> +#define SPI_GDBG_WAVE_CNTL2__MODE_MASK                                                                        0x00030000L
> +//SPI_GDBG_WAVE_CNTL3
> +#define SPI_GDBG_WAVE_CNTL3__STALL_PS__SHIFT                                                                  0x0
> +#define SPI_GDBG_WAVE_CNTL3__STALL_VS__SHIFT                                                                  0x1
> +#define SPI_GDBG_WAVE_CNTL3__STALL_GS__SHIFT                                                                  0x2
> +#define SPI_GDBG_WAVE_CNTL3__STALL_HS__SHIFT                                                                  0x3
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG__SHIFT                                                                 0x4
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0__SHIFT                                                                 0x5
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1__SHIFT                                                                 0x6
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2__SHIFT                                                                 0x7
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3__SHIFT                                                                 0x8
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4__SHIFT                                                                 0x9
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5__SHIFT                                                                 0xa
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6__SHIFT                                                                 0xb
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7__SHIFT                                                                 0xc
> +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION__SHIFT                                                            0xd
> +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT__SHIFT                                                                0x1c
> +#define SPI_GDBG_WAVE_CNTL3__STALL_PS_MASK                                                                    0x00000001L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_VS_MASK                                                                    0x00000002L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_GS_MASK                                                                    0x00000004L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_HS_MASK                                                                    0x00000008L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG_MASK                                                                   0x00000010L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0_MASK                                                                   0x00000020L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1_MASK                                                                   0x00000040L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2_MASK                                                                   0x00000080L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3_MASK                                                                   0x00000100L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4_MASK                                                                   0x00000200L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5_MASK                                                                   0x00000400L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6_MASK                                                                   0x00000800L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7_MASK                                                                   0x00001000L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION_MASK                                                              0x0FFFE000L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT_MASK                                                                  0x10000000L
> +//SPI_GDBG_TRAP_DATA0
> +#define SPI_GDBG_TRAP_DATA0__DATA__SHIFT                                                                      0x0
> +#define SPI_GDBG_TRAP_DATA0__DATA_MASK                                                                        0xFFFFFFFFL
> +//SPI_GDBG_TRAP_DATA1
> +#define SPI_GDBG_TRAP_DATA1__DATA__SHIFT                                                                      0x0
> +#define SPI_GDBG_TRAP_DATA1__DATA_MASK                                                                        0xFFFFFFFFL
>   //SPI_COMPUTE_QUEUE_RESET
>   #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                                                                 0x0
>   #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                                                                   0x01L
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> index 3973110f149c..d09f1a06f4bf 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> @@ -26,6 +26,8 @@
>   #define mmSQ_DEBUG_STS_GLOBAL_BASE_IDX                                                                 0
>   #define mmSQ_DEBUG_STS_GLOBAL2                                                                         0x10B0
>   #define mmSQ_DEBUG_STS_GLOBAL2_BASE_IDX                                                                0
> +#define mmSQ_DEBUG                                                                                     0x10B1
> +#define mmSQ_DEBUG_BASE_IDX                                                                            0
>   
>   // addressBlock: gc_sdma0_sdma0dec
>   // base address: 0x4980
> @@ -4849,10 +4851,18 @@
>   #define mmSPI_WCL_PIPE_PERCENT_CS3_BASE_IDX                                                            0
>   #define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
>   #define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
> +#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
> +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
>   #define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
>   #define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
>   #define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
>   #define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
> +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
> +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
> +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
>   #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
>   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
>   #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> index d4e8ff22ecb8..fc85aee010fe 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> @@ -47853,6 +47853,10 @@
>   
>   
>   // addressBlock: sqind
> +//SQ_DEBUG
> +#define SQ_DEBUG__SINGLE_MEMOP_MASK 0x00000001L
> +#define SQ_DEBUG__SINGLE_MEMOP__SHIFT 0x00000000
> +
>   //SQ_DEBUG_STS_GLOBAL
>   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0_MASK 0x000000ffL
>   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0__SHIFT 0x00000000

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 06/29] drm/amdgpu: add gfx9 hw debug mode enable and disable calls
  2022-10-31 16:23 ` [PATCH 06/29] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
@ 2022-11-22 23:50   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-22 23:50 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Implement the per-device calls to enable or disable HW debug mode for
> GFX9 prior to GFX9.4.1.
>
> GFX9.4.1 and onward will require their own enable/disable sequence as
> follow on patches.
>
> When hardware debug mode setting is requested, waves will inherit
> these settings in the Shader Processor Input's (SPI) Sequencer Global
> Block (SQG). This means that the KGD must drain all waves from the SPI
> into SQG (approximately 96 SPI clock cycles) prior to debug mode setting
> to ensure that the order of operations that the debugger expects with
> regards to debug mode setting transaction requests and wave inheritance
> of that mode is upheld.
>
> Also ensure that exception overrides are reset to their original state
> prior to debug enable or disable.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 100 ++++++++++++++++--
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   9 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |   3 +
>   3 files changed, 102 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 81e3b528bbc9..e1aac6f6d369 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -46,14 +46,14 @@ enum hqd_dequeue_request_type {
>   	SAVE_WAVES
>   };
>   
> -static void lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
> +static void kgd_gfx_v9_lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
>   			uint32_t queue, uint32_t vmid)

What's the reason for renaming these functions? It seems unnecessary and 
unrelated to the rest of the patch.


>   {
>   	mutex_lock(&adev->srbm_mutex);
>   	soc15_grbm_select(adev, mec, pipe, queue, vmid);
>   }
>   
> -static void unlock_srbm(struct amdgpu_device *adev)
> +static void kgd_gfx_v9_unlock_srbm(struct amdgpu_device *adev)
>   {
>   	soc15_grbm_select(adev, 0, 0, 0, 0);
>   	mutex_unlock(&adev->srbm_mutex);
> @@ -65,7 +65,7 @@ static void acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id,
>   	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
>   	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
>   
> -	lock_srbm(adev, mec, pipe, queue_id, 0);
> +	kgd_gfx_v9_lock_srbm(adev, mec, pipe, queue_id, 0);
>   }
>   
>   static uint64_t get_queue_mask(struct amdgpu_device *adev,
> @@ -79,7 +79,7 @@ static uint64_t get_queue_mask(struct amdgpu_device *adev,
>   
>   static void release_queue(struct amdgpu_device *adev)
>   {
> -	unlock_srbm(adev);
> +	kgd_gfx_v9_unlock_srbm(adev);
>   }
>   
>   void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid,
> @@ -88,13 +88,13 @@ void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmi
>   					uint32_t sh_mem_ape1_limit,
>   					uint32_t sh_mem_bases)
>   {
> -	lock_srbm(adev, 0, 0, 0, vmid);
> +	kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid);
>   
>   	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
>   	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
>   	/* APE1 no longer exists on GFX9 */
>   
> -	unlock_srbm(adev);
> +	kgd_gfx_v9_unlock_srbm(adev);
>   }
>   
>   int kgd_gfx_v9_set_pasid_vmid_mapping(struct amdgpu_device *adev, u32 pasid,
> @@ -164,13 +164,13 @@ int kgd_gfx_v9_init_interrupts(struct amdgpu_device *adev, uint32_t pipe_id)
>   	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
>   	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
>   
> -	lock_srbm(adev, mec, pipe, 0, 0);
> +	kgd_gfx_v9_lock_srbm(adev, mec, pipe, 0, 0);
>   
>   	WREG32_SOC15(GC, 0, mmCPC_INT_CNTL,
>   		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
>   		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);
>   
> -	unlock_srbm(adev);
> +	kgd_gfx_v9_unlock_srbm(adev);
>   
>   	return 0;
>   }
> @@ -646,6 +646,84 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +/*
> + * GFX9 helper for wave launch stall requirements on debug trap setting.
> + *
> + * vmid:
> + *   Target VMID to stall/unstall.
> + *
> + * stall:
> + *   0-unstall wave launch (enable), 1-stall wave launch (disable).
> + *   After wavefront launch has been stalled, allocated waves must drain from
> + *   SPI in order for debug trap settings to take effect on those waves.
> + *   This is roughly a ~96 clock cycle wait on SPI where a read on
> + *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
> + *   KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads required.
> + *
> + *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
> + *   because GFX9.4.1 cannot support multi-process debugging due to trap
> + *   configuration and masking being limited to global scope.  Always assume
> + *   single process conditions.
> +
> + */
> +#define KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY	3
> +void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> +					uint32_t vmid,
> +					bool stall)
> +{
> +	int i;
> +	uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +
> +	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1))
> +		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
> +							stall ? 1 << vmid : 0);
> +	else
> +		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA,
> +							stall ? 1 : 0);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
> +
> +	if (!stall)
> +		return;
> +
> +	for (i = 0; i < KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
> +		RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +}
> +
> +uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
> +				bool restore_dbg_registers,
> +				uint32_t vmid)
> +{
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
> +					bool keep_trap_enabled,
> +					uint32_t vmid)
> +{
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}

The enable and disable functions do exactly the same thing. And they 
ignore the restore_dbg_registers and keep_trap_enabled arguments. Maybe 
add a comment why that is.
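
If they are intentionally identical on GFX9.0, a shared helper would make
that explicit. Untested sketch (the helper name is made up):

static uint32_t kgd_gfx_v9_clear_trap_mask_stalled(struct amdgpu_device *adev,
						   uint32_t vmid)
{
	mutex_lock(&adev->grbm_idx_mutex);

	/* stall wave launch and drain SPI so the cleared mask takes effect */
	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

Then enable/disable become one-line wrappers that can document why
restore_dbg_registers and keep_trap_enabled are unused on this ASIC.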

Regards,
   Felix


> +
>   void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
>   			uint32_t vmid, uint64_t page_table_base)
>   {
> @@ -833,7 +911,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
>   void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
>                           uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
>   {
> -	lock_srbm(adev, 0, 0, 0, vmid);
> +	kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid);
>   
>   	/*
>   	 * Program TBA registers
> @@ -851,7 +929,7 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
>   	WREG32_SOC15(GC, 0, mmSQ_SHADER_TMA_HI,
>   			upper_32_bits(tma_addr >> 8));
>   
> -	unlock_srbm(adev);
> +	kgd_gfx_v9_unlock_srbm(adev);
>   }
>   
>   const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
> @@ -871,6 +949,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.get_atc_vmid_pasid_mapping_info =
>   			kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
> +	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
> +	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index c7ed3bc9053c..d39256162616 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -58,3 +58,12 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
>   		int *pasid_wave_cnt, int *max_waves_per_cu);
>   void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
>   		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
> +void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> +					uint32_t vmid,
> +					bool stall);
> +uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
> +				      bool restore_dbg_registers,
> +				      uint32_t vmid);
> +uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
> +					bool keep_trap_enabled,
> +					uint32_t vmid);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index b2217eb1399c..8aa7a3ad4e97 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -25,6 +25,9 @@
>   
>   #include "kfd_priv.h"
>   
> +void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> +					uint32_t vmid,
> +					bool stall);
>   int kfd_dbg_trap_disable(struct kfd_process *target);
>   int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   			void __user *runtime_info,

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
  2022-10-31 16:23 ` [PATCH 07/29] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
@ 2022-11-22 23:59   ` Felix Kuehling
  2022-11-24 14:58     ` Kim, Jonathan
  0 siblings, 1 reply; 63+ messages in thread
From: Felix Kuehling @ 2022-11-22 23:59 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> On GFX9.4.1, the implicit wait count instruction on s_barrier is
> disabled by default in the driver during normal operation for
> performance requirements.
>
> There is a hardware bug in GFX9.4.1 where if the implicit wait count
> instruction after an s_barrier instruction is disabled, any wave that
> hits an exception may step over the s_barrier when returning from the
> trap handler with the barrier logic having no ability to be
> aware of this, thereby causing other waves to wait at the barrier
> indefinitely resulting in a shader hang.  This bug has been corrected
> for GFX9.4.2 and onward.
>
> Since the debugger subscribes to hardware exceptions, in order to avoid
> this bug, the debugger must enable implicit wait count on s_barrier
> for a debug session and disable it on detach.
>
> In order to change this setting in the device-global SQ_CONFIG
> register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
> will either dispatch work through the compute ring buffers used for
> image post processing or through the hardware scheduler by the KFD.
>
> Have the KGD suspend and drain the compute ring buffer, then suspend the
> hardware scheduler and block any future KFD process job requests before
> changing the implicit wait count setting.  Once set, resume all work.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   3 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 105 +++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   4 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   2 +-
>   4 files changed, 110 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0e6ddf05c23c..9f2499f52d2c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1034,6 +1034,9 @@ struct amdgpu_device {
>   	struct pci_saved_state          *pci_state;
>   	pci_channel_state_t		pci_channel_state;
>   
> +	/* Track auto wait count on s_barrier settings */
> +	bool				barrier_has_auto_waitcnt;
> +
>   	struct amdgpu_reset_control     *reset_cntl;
>   	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 4191af5a3f13..13f02a0aa828 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -26,6 +26,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu_amdkfd_arcturus.h"
> +#include "amdgpu_reset.h"
>   #include "sdma0/sdma0_4_2_2_offset.h"
>   #include "sdma0/sdma0_4_2_2_sh_mask.h"
>   #include "sdma1/sdma1_4_2_2_offset.h"
> @@ -48,6 +49,8 @@
>   #include "amdgpu_amdkfd_gfx_v9.h"
>   #include "gfxhub_v1_0.h"
>   #include "mmhub_v9_4.h"
> +#include "gc/gc_9_0_offset.h"
> +#include "gc/gc_9_0_sh_mask.h"
>   
>   #define HQD_N_REGS 56
>   #define DUMP_REG(addr) do {				\
> @@ -276,6 +279,104 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
>   	return 0;
>   }
>   
> +/*
> + * Helper used to suspend/resume gfx pipe for image post process work to set
> + * barrier behaviour.
> + */
> +static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool suspend)
> +{
> +	int i, r = 0;
> +
> +	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> +		struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
> +
> +		if (!(ring && ring->sched.thread))
> +			continue;
> +
> +		/* stop scheduler and drain ring. */
> +		if (suspend) {
> +			drm_sched_stop(&ring->sched, NULL);
> +			r = amdgpu_fence_wait_empty(ring);
> +			if (r)
> +				goto out;
> +		} else {
> +			drm_sched_start(&ring->sched, false);
> +		}
> +	}
> +
> +out:
> +	/* return on resume or failure to drain rings. */
> +	if (!suspend || r)
> +		return r;
> +
> +	return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
> +}
> +
> +static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool enable_waitcnt)
> +{
> +	uint32_t data;
> +
> +	WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
> +
> +	if (!down_read_trylock(&adev->reset_domain->sem))
> +		return;
> +
> +	amdgpu_amdkfd_suspend(adev, false);
> +
> +	if (suspend_resume_compute_scheduler(adev, true))
> +		goto out;
> +
> +	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
> +	data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
> +						enable_waitcnt ? 0 : 1);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
> +
> +out:
> +	suspend_resume_compute_scheduler(adev, false);
> +
> +	amdgpu_amdkfd_resume(adev, false);
> +
> +	up_read(&adev->reset_domain->sem);
> +}
> +
> +static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device *adev,
> +				bool restore_dbg_registers,
> +				uint32_t vmid)
> +{
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	set_barrier_auto_waitcnt(adev, true);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
> +static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device *adev,
> +					bool keep_trap_enabled,
> +					uint32_t vmid)
> +{
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	set_barrier_auto_waitcnt(adev, false);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
>   const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>   	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -294,6 +395,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
>   	.set_vm_context_page_table_base =
>   				kgd_gfx_v9_set_vm_context_page_table_base,
> +	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
> +	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> -	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
> +	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index a0e5ad342f13..8ed1b5d255f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2424,8 +2424,8 @@ static void gfx_v9_0_init_sq_config(struct amdgpu_device *adev)
>   	switch (adev->ip_versions[GC_HWIP][0]) {
>   	case IP_VERSION(9, 4, 1):
>   		tmp = RREG32_SOC15(GC, 0, mmSQ_CONFIG);
> -		tmp = REG_SET_FIELD(tmp, SQ_CONFIG,
> -					DISABLE_BARRIER_WAITCNT, 1);
> +		tmp = REG_SET_FIELD(tmp, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
> +				READ_ONCE(adev->barrier_has_auto_waitcnt) ? 0 : 1);
>   		WREG32_SOC15(GC, 0, mmSQ_CONFIG, tmp);
>   		break;
>   	default:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 56ad38fcd26e..efb81ccef8f5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1946,7 +1946,7 @@ void kfd_suspend_all_processes(void)
>   	WARN(debug_evictions, "Evicting all processes");
>   	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>   		cancel_delayed_work_sync(&p->eviction_work);
> -		cancel_delayed_work_sync(&p->restore_work);
> +		flush_delayed_work(&p->restore_work);

This looks like a sneak bug fix. Should this be a separate patch 
independent of this patch series?

Regards,
   Felix


>   
>   		if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>   			pr_err("Failed to suspend process 0x%x\n", p->pasid);

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 10/29] drm/amdgpu: add configurable grace period for unmap queues
  2022-10-31 16:23 ` [PATCH 10/29] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
@ 2022-11-23  0:21   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-23  0:21 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx

On 2022-10-31 12:23, Jonathan Kim wrote:
> The HWS schedule allows a grace period for wave completion prior to
> preemption but the debugger requires good performance since it preempts
> on every HW debug mode setting transaction request.
>
> For good performance, allow immediate preemption by setting the grace
> period to 0.

This is less about performance than about latency. The grace period is 
there to optimize performance in normal operation by avoiding CWSR for 
waves that would drain quickly on their own. But the debugger doesn't 
care about performance and it doesn't want to let waves drain. It wants 
low latency to preempt queues as soon as possible so it can inspect 
their state.

Regards,
   Felix

>
> Note that setting the preemption grace period to 0 will result in an
> infinite grace period being set due to a CP FW bug, so set it to 1 for now.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 43 ++++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  6 ++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 43 ++++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  9 ++-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 61 ++++++++++++-----
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 +
>   .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 32 +++++++++
>   .../drm/amd/amdkfd/kfd_packet_manager_v9.c    | 39 +++++++++++
>   .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   | 65 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 ++
>   13 files changed, 291 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 42491a31f352..c9629fc5460c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -73,5 +73,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
>   	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
> +	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 13f02a0aa828..60a204f767ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -397,6 +397,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   				kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
>   	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
> +	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index c09b45de02d0..2491402afd58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -801,6 +801,47 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +/* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
> + * The values read are:
> + *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> + *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
> + *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
> + *     gws_wait_time            -- Wait Count for Global Wave Syncs.
> + *     que_sleep_wait_time      -- Wait Count for Dequeue Retry.
> + *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
> + *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
> + *     deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
> + */
> +void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev,
> +					uint32_t *wait_times)
> +
> +{
> +	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
> +}
> +
> +void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
> +						uint32_t wait_times,
> +						uint32_t grace_period,
> +						uint32_t *reg_offset,
> +						uint32_t *reg_data)
> +{
> +	*reg_data = wait_times;
> +
> +	/*
> +	 * The CP cannot handle a grace period input of 0, which would result
> +	 * in an infinite grace period being set, so clamp it to 1 instead.
> +	 */
> +	if (grace_period == 0)
> +		grace_period = 1;
> +
> +	*reg_data = REG_SET_FIELD(*reg_data,
> +			CP_IQ_WAIT_TIME2,
> +			SCH_WAVE,
> +			grace_period);
> +
> +	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
> +}
> +
>   static void program_trap_handler_settings(struct amdgpu_device *adev,
>   		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
>   {
> @@ -845,5 +886,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
> +	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.program_trap_handler_settings = program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> index 370d6c312981..0abc1e805180 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -26,3 +26,9 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
>   					bool keep_trap_enabled,
>   					uint32_t vmid);
> +void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
> +void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
> +					       uint32_t wait_times,
> +					       uint32_t grace_period,
> +					       uint32_t *reg_offset,
> +					       uint32_t *reg_data);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index 73e3b9ae1fb0..c57f2a6b6e23 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -670,6 +670,8 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info_v10_3,
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3,
>   	.program_trap_handler_settings = program_trap_handler_settings_v10_3,
> +	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index e1aac6f6d369..673c99c5523d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -724,6 +724,24 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +/* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
> + * The values read are:
> + *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> + *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
> + *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
> + *     gws_wait_time            -- Wait Count for Global Wave Syncs.
> + *     que_sleep_wait_time      -- Wait Count for Queue Sleep.
> + *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
> + *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
> + *     deq_retry_wait_time      -- Wait Count for Dequeue Retry.
> + */
> +void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev,
> +					uint32_t *wait_times)
> +{
> +	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
> +}
> +
>   void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
>   			uint32_t vmid, uint64_t page_table_base)
>   {
> @@ -908,6 +926,29 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
>   				adev->gfx.cu_info.max_waves_per_simd;
>   }
>   
> +void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
> +		uint32_t wait_times,
> +		uint32_t grace_period,
> +		uint32_t *reg_offset,
> +		uint32_t *reg_data)
> +{
> +	*reg_data = wait_times;
> +
> +	/*
> +	 * The CP cannot handle a grace period input of 0, which would result
> +	 * in an infinite grace period being set, so clamp it to 1 instead.
> +	 */
> +	if (grace_period == 0)
> +		grace_period = 1;
> +
> +	*reg_data = REG_SET_FIELD(*reg_data,
> +			CP_IQ_WAIT_TIME2,
> +			SCH_WAVE,
> +			grace_period);
> +
> +	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
> +}
> +
>   void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
>                           uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
>   {
> @@ -951,6 +992,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
> +	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> +	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index d39256162616..c0866497cb5c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -20,8 +20,6 @@
>    * OTHER DEALINGS IN THE SOFTWARE.
>    */
>   
> -
> -
>   void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid,
>   		uint32_t sh_mem_config,
>   		uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
> @@ -51,7 +49,6 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
>   					uint32_t sq_cmd);
>   bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
>   					uint8_t vmid, uint16_t *p_pasid);
> -
>   void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
>   			uint32_t vmid, uint64_t page_table_base);
>   void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
> @@ -67,3 +64,9 @@ uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   					bool keep_trap_enabled,
>   					uint32_t vmid);
> +void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
> +void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
> +					       uint32_t wait_times,
> +					       uint32_t grace_period,
> +					       uint32_t *reg_offset,
> +					       uint32_t *reg_data);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index faa5d8c666ee..fbcf4ee07936 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -46,10 +46,13 @@ static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
>   
>   static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
> -				uint32_t filter_param);
> +				uint32_t filter_param,
> +				uint32_t grace_period);
>   static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
> -				uint32_t filter_param, bool reset);
> +				uint32_t filter_param,
> +				uint32_t grace_period,
> +				bool reset);
>   
>   static int map_queues_cpsch(struct device_queue_manager *dqm);
>   
> @@ -839,7 +842,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
>   	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
>   		if (!dqm->dev->shared_resources.enable_mes)
>   			retval = unmap_queues_cpsch(dqm,
> -						    KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false);
> +						    KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
>   		else if (prev_active)
>   			retval = remove_queue_mes(dqm, q, &pdd->qpd);
>   
> @@ -1015,7 +1018,8 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   		retval = execute_queues_cpsch(dqm,
>   					      qpd->is_debug ?
>   					      KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
> -					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> +					      USE_DEFAULT_GRACE_PERIOD);
>   
>   out:
>   	dqm_unlock(dqm);
> @@ -1155,7 +1159,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   	}
>   	if (!dqm->dev->shared_resources.enable_mes)
>   		retval = execute_queues_cpsch(dqm,
> -					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
>   	qpd->evicted = 0;
>   	eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
>   	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
> @@ -1492,6 +1496,9 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
>   
>   	init_sdma_bitmaps(dqm);
>   
> +	if (dqm->dev->kfd2kgd->get_iq_wait_times)
> +		dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev,
> +					&dqm->wait_times);
>   	return 0;
>   }
>   
> @@ -1531,7 +1538,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	dqm->is_resetting = false;
>   	dqm->sched_running = true;
>   	if (!dqm->dev->shared_resources.enable_mes)
> -		execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +		execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
>   	dqm_unlock(dqm);
>   
>   	return 0;
> @@ -1556,7 +1563,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>   
>   	if (!dqm->is_hws_hang) {
>   		if (!dqm->dev->shared_resources.enable_mes)
> -			unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
> +			unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
>   		else
>   			remove_all_queues_mes(dqm);
>   	}
> @@ -1598,7 +1605,8 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
>   	list_add(&kq->list, &qpd->priv_queue_list);
>   	increment_queue_count(dqm, qpd, kq->queue);
>   	qpd->is_debug = true;
> -	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> +			USE_DEFAULT_GRACE_PERIOD);
>   	dqm_unlock(dqm);
>   
>   	return 0;
> @@ -1612,7 +1620,8 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
>   	list_del(&kq->list);
>   	decrement_queue_count(dqm, qpd, kq->queue);
>   	qpd->is_debug = false;
> -	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
> +			USE_DEFAULT_GRACE_PERIOD);
>   	/*
>   	 * Unconditionally decrement this counter, regardless of the queue's
>   	 * type.
> @@ -1689,7 +1698,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   
>   		if (!dqm->dev->shared_resources.enable_mes)
>   			retval = execute_queues_cpsch(dqm,
> -					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
>   		else
>   			retval = add_queue_mes(dqm, q, qpd);
>   		if (retval)
> @@ -1778,7 +1787,9 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>   /* dqm->lock mutex has to be locked before calling this function */
>   static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
> -				uint32_t filter_param, bool reset)
> +				uint32_t filter_param,
> +				uint32_t grace_period,
> +				bool reset)
>   {
>   	int retval = 0;
>   	struct mqd_manager *mqd_mgr;
> @@ -1790,6 +1801,12 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   	if (!dqm->active_runlist)
>   		return retval;
>   
> +	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
> +		retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
> +		if (retval)
> +			return retval;
> +	}
> +
>   	retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset);
>   	if (retval)
>   		return retval;
> @@ -1822,6 +1839,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   		return -ETIME;
>   	}
>   
> +	/* We need to reset the grace period value for this device */
> +	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
> +		if (pm_update_grace_period(&dqm->packet_mgr,
> +					USE_DEFAULT_GRACE_PERIOD))
> +			pr_err("Failed to reset grace period\n");
> +	}
> +
>   	pm_release_ib(&dqm->packet_mgr);
>   	dqm->active_runlist = false;
>   
> @@ -1837,7 +1861,7 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
>   	dqm_lock(dqm);
>   
>   	retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
> -			pasid, true);
> +			pasid, USE_DEFAULT_GRACE_PERIOD, true);
>   
>   	dqm_unlock(dqm);
>   	return retval;
> @@ -1846,13 +1870,14 @@ static int reset_queues_cpsch(struct device_queue_manager *dqm,
>   /* dqm->lock mutex has to be locked before calling this function */
>   static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
> -				uint32_t filter_param)
> +				uint32_t filter_param,
> +				uint32_t grace_period)
>   {
>   	int retval;
>   
>   	if (dqm->is_hws_hang)
>   		return -EIO;
> -	retval = unmap_queues_cpsch(dqm, filter, filter_param, false);
> +	retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false);
>   	if (retval)
>   		return retval;
>   
> @@ -1910,7 +1935,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   		if (!dqm->dev->shared_resources.enable_mes) {
>   			decrement_queue_count(dqm, qpd, q);
>   			retval = execute_queues_cpsch(dqm,
> -						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> +						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> +						      USE_DEFAULT_GRACE_PERIOD);
>   			if (retval == -ETIME)
>   				qpd->reset_wavefronts = true;
>   		} else {
> @@ -2195,7 +2221,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	}
>   
>   	if (!dqm->dev->shared_resources.enable_mes)
> -		retval = execute_queues_cpsch(dqm, filter, 0);
> +		retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD);
>   
>   	if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
>   		pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
> @@ -2539,7 +2565,8 @@ int dqm_debugfs_hang_hws(struct device_queue_manager *dqm)
>   		return r;
>   	}
>   	dqm->active_runlist = true;
> -	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
> +				0, USE_DEFAULT_GRACE_PERIOD);
>   	dqm_unlock(dqm);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index a537b9ef3e16..fb48b124161f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -37,6 +37,7 @@
>   
>   #define KFD_MES_PROCESS_QUANTUM		100000
>   #define KFD_MES_GANG_QUANTUM		10000
> +#define USE_DEFAULT_GRACE_PERIOD 0xffffffff
>   
>   struct device_process_node {
>   	struct qcm_process_device *qpd;
> @@ -256,6 +257,7 @@ struct device_queue_manager {
>   	struct work_struct	hw_exception_work;
>   	struct kfd_mem_obj	hiq_sdma_mqd;
>   	bool			sched_running;
> +	uint32_t		wait_times;
>   };
>   
>   void device_queue_manager_init_cik(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index ed02b6d8bf63..c57f9a46dfcc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -369,6 +369,38 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
>   	return retval;
>   }
>   
> +int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period)
> +{
> +	int retval = 0;
> +	uint32_t *buffer, size;
> +
> +	size = pm->pmf->set_grace_period_size;
> +
> +	mutex_lock(&pm->lock);
> +
> +	if (size) {
> +		kq_acquire_packet_buffer(pm->priv_queue,
> +			size / sizeof(uint32_t),
> +			(unsigned int **)&buffer);
> +
> +		if (!buffer) {
> +			pr_err("Failed to allocate buffer on kernel queue\n");
> +			retval = -ENOMEM;
> +			goto out;
> +		}
> +
> +		retval = pm->pmf->set_grace_period(pm, buffer, grace_period);
> +		if (!retval)
> +			kq_submit_packet(pm->priv_queue);
> +		else
> +			kq_rollback_packet(pm->priv_queue);
> +	}
> +
> +out:
> +	mutex_unlock(&pm->lock);
> +	return retval;
> +}
> +
>   int pm_send_unmap_queue(struct packet_manager *pm,
>   			enum kfd_unmap_queues_filter filter,
>   			uint32_t filter_param, bool reset)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> index 18250845a989..f0cdc8695b8c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> @@ -251,6 +251,41 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
>   	return 0;
>   }
>   
> +static int pm_set_grace_period_v9(struct packet_manager *pm,
> +		uint32_t *buffer,
> +		uint32_t grace_period)
> +{
> +	struct pm4_mec_write_data_mmio *packet;
> +	uint32_t reg_offset = 0;
> +	uint32_t reg_data = 0;
> +
> +	pm->dqm->dev->kfd2kgd->build_grace_period_packet_info(
> +			pm->dqm->dev->adev,
> +			pm->dqm->wait_times,
> +			grace_period,
> +			&reg_offset,
> +			&reg_data);
> +
> +	if (grace_period == USE_DEFAULT_GRACE_PERIOD)
> +		reg_data = pm->dqm->wait_times;
> +
> +	packet = (struct pm4_mec_write_data_mmio *)buffer;
> +	memset(buffer, 0, sizeof(struct pm4_mec_write_data_mmio));
> +
> +	packet->header.u32All = pm_build_pm4_header(IT_WRITE_DATA,
> +					sizeof(struct pm4_mec_write_data_mmio));
> +
> +	packet->bitfields2.dst_sel  = dst_sel___write_data__mem_mapped_register;
> +	packet->bitfields2.addr_incr =
> +			addr_incr___write_data__do_not_increment_address;
> +
> +	packet->bitfields3.dst_mmreg_addr = reg_offset;
> +
> +	packet->data = reg_data;
> +
> +	return 0;
> +}
> +
>   static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
>   			enum kfd_unmap_queues_filter filter,
>   			uint32_t filter_param, bool reset)
> @@ -333,6 +368,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
>   	.set_resources		= pm_set_resources_v9,
>   	.map_queues		= pm_map_queues_v9,
>   	.unmap_queues		= pm_unmap_queues_v9,
> +	.set_grace_period       = pm_set_grace_period_v9,
>   	.query_status		= pm_query_status_v9,
>   	.release_mem		= NULL,
>   	.map_process_size	= sizeof(struct pm4_mes_map_process),
> @@ -340,6 +376,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
>   	.set_resources_size	= sizeof(struct pm4_mes_set_resources),
>   	.map_queues_size	= sizeof(struct pm4_mes_map_queues),
>   	.unmap_queues_size	= sizeof(struct pm4_mes_unmap_queues),
> +	.set_grace_period_size  = sizeof(struct pm4_mec_write_data_mmio),
>   	.query_status_size	= sizeof(struct pm4_mes_query_status),
>   	.release_mem_size	= 0,
>   };
> @@ -350,6 +387,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
>   	.set_resources		= pm_set_resources_v9,
>   	.map_queues		= pm_map_queues_v9,
>   	.unmap_queues		= pm_unmap_queues_v9,
> +	.set_grace_period       = pm_set_grace_period_v9,
>   	.query_status		= pm_query_status_v9,
>   	.release_mem		= NULL,
>   	.map_process_size	= sizeof(struct pm4_mes_map_process_aldebaran),
> @@ -357,6 +395,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
>   	.set_resources_size	= sizeof(struct pm4_mes_set_resources),
>   	.map_queues_size	= sizeof(struct pm4_mes_map_queues),
>   	.unmap_queues_size	= sizeof(struct pm4_mes_unmap_queues),
> +	.set_grace_period_size  = sizeof(struct pm4_mec_write_data_mmio),
>   	.query_status_size	= sizeof(struct pm4_mes_query_status),
>   	.release_mem_size	= 0,
>   };
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
> index a666710ed403..795001c947e1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
> @@ -583,6 +583,71 @@ struct pm4_mec_release_mem {
>   
>   #endif
>   
> +#ifndef PM4_MEC_WRITE_DATA_DEFINED
> +#define PM4_MEC_WRITE_DATA_DEFINED
> +
> +enum WRITE_DATA_dst_sel_enum {
> +	dst_sel___write_data__mem_mapped_register = 0,
> +	dst_sel___write_data__tc_l2 = 2,
> +	dst_sel___write_data__gds = 3,
> +	dst_sel___write_data__memory = 5,
> +	dst_sel___write_data__memory_mapped_adc_persistent_state = 6,
> +};
> +
> +enum WRITE_DATA_addr_incr_enum {
> +	addr_incr___write_data__increment_address = 0,
> +	addr_incr___write_data__do_not_increment_address = 1
> +};
> +
> +enum WRITE_DATA_wr_confirm_enum {
> +	wr_confirm___write_data__do_not_wait_for_write_confirmation = 0,
> +	wr_confirm___write_data__wait_for_write_confirmation = 1
> +};
> +
> +enum WRITE_DATA_cache_policy_enum {
> +	cache_policy___write_data__lru = 0,
> +	cache_policy___write_data__stream = 1
> +};
> +
> +
> +struct pm4_mec_write_data_mmio {
> +	union {
> +		union PM4_MES_TYPE_3_HEADER header;     /*header */
> +		unsigned int ordinal1;
> +	};
> +
> +	union {
> +		struct {
> +			unsigned int reserved1:8;
> +			unsigned int dst_sel:4;
> +			unsigned int reserved2:4;
> +			unsigned int addr_incr:1;
> +			unsigned int reserved3:2;
> +			unsigned int resume_vf:1;
> +			unsigned int wr_confirm:1;
> +			unsigned int reserved4:4;
> +			unsigned int cache_policy:2;
> +			unsigned int reserved5:5;
> +		} bitfields2;
> +		unsigned int ordinal2;
> +	};
> +
> +	union {
> +		struct {
> +			unsigned int dst_mmreg_addr:18;
> +			unsigned int reserved6:14;
> +		} bitfields3;
> +		unsigned int ordinal3;
> +	};
> +
> +	uint32_t reserved7;
> +
> +	uint32_t data;
> +};
> +
> +#endif
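
(Size check, for reference: the packet is five dwords -- header, ordinal2,
ordinal3, reserved7 and data -- so sizeof(struct pm4_mec_write_data_mmio) is
20 bytes, which is the value wired into .set_grace_period_size above and
encoded into the header's count field by pm_build_pm4_header().)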
> +
>   enum {
>   	CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
>   };
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 3ea61fa1db52..a851f814bc9d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1300,6 +1300,8 @@ struct packet_manager_funcs {
>   	int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer,
>   			enum kfd_unmap_queues_filter mode,
>   			uint32_t filter_param, bool reset);
> +	int (*set_grace_period)(struct packet_manager *pm, uint32_t *buffer,
> +			uint32_t grace_period);
>   	int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
>   			uint64_t fence_address,	uint64_t fence_value);
>   	int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
> @@ -1310,6 +1312,7 @@ struct packet_manager_funcs {
>   	int set_resources_size;
>   	int map_queues_size;
>   	int unmap_queues_size;
> +	int set_grace_period_size;
>   	int query_status_size;
>   	int release_mem_size;
>   };
> @@ -1332,6 +1335,8 @@ int pm_send_unmap_queue(struct packet_manager *pm,
>   
>   void pm_release_ib(struct packet_manager *pm);
>   
> +int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period);
> +
>   /* Following PM funcs can be shared among VI and AI */
>   unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
>   

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA
  2022-10-31 16:23 ` [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA Jonathan Kim
@ 2022-11-23  0:44   ` Felix Kuehling
  2022-11-24 14:51     ` Kim, Jonathan
  0 siblings, 1 reply; 63+ messages in thread
From: Felix Kuehling @ 2022-11-23  0:44 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> From: Jay Cornwall <jay.cornwall@amd.com>
>
> Trap handler behavior will differ when a debugger is attached.
>
> Make the debug trap flag available in the trap handler TMA.
> Update it when the debug trap ioctl is invoked.
>
> v3: Rebase for upstream
>
> v2:
> Add missing debug flag setup on APUs
>
> Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |  4 ++++
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16 ++++++++++++++++
>   3 files changed, 22 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index ae6e701a2656..d4f87f2adada 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -193,6 +193,8 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>   		if (unwind && count == unwind_count)
>   			break;
>   
> +		kfd_process_set_trap_debug_flag(&pdd->qpd, false);
> +
>   		/* GFX off is already disabled by debug activate if not RLC restore supported. */
>   		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
>   			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> @@ -278,6 +280,8 @@ int kfd_dbg_trap_activate(struct kfd_process *target)
>   		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
>   			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
>   
> +		kfd_process_set_trap_debug_flag(&pdd->qpd, true);
> +		
>   		r = debug_refresh_runlist(pdd->dev->dqm);
>   		if (r) {
>   			target->runtime_info.runtime_state =
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 9690a2adb9ed..82b28588ab72 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1101,6 +1101,8 @@ int kfd_init_apertures(struct kfd_process *process);
>   void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>   				  uint64_t tba_addr,
>   				  uint64_t tma_addr);
> +void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
> +				     bool enabled);
>   
>   /* CWSR initialization */
>   int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 59c4c38833b6..d62e0c62df76 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1252,6 +1252,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
>   
>   		memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
>   
> +		kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
> +
>   		qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
>   		pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
>   			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> @@ -1288,6 +1290,9 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>   
>   	memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
>   
> +	kfd_process_set_trap_debug_flag(&pdd->qpd,
> +					pdd->process->debug_trap_enabled);
> +
>   	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
>   	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
>   		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> @@ -1374,6 +1379,17 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
>   	return true;
>   }
>   
> +void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
> +				     bool enabled)
> +{
> +	/* If TMA doesn't exist then flag will be set during allocation. */

I would expect a change to the TMA allocation function, but that isn't 
in this patch?

Regards,
   Felix

> +	if (qpd->cwsr_kaddr) {
> +		uint64_t *tma =
> +			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> +		tma[2] = enabled;
> +	}
> +}
> +
>   /*
>    * On return the kfd_process is fully operational and will be freed when the
>    * mm is released

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 16/29] drm/amdkfd: add runtime enable operation
  2022-10-31 16:23 ` [PATCH 16/29] drm/amdkfd: add runtime enable operation Jonathan Kim
@ 2022-11-23  0:52   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-23  0:52 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> This operation coordinates the debugger with the target HSA runtime
> process.
>
> The main motive for this coordination is the CP performance overhead

I wouldn't call that the main motivation. The main motivation for 
synchronizing runtime enable with the debugger is allowing two different 
use cases:

  * Attaching the debugger to a running process (when the runtime is
    already initialized)
  * Attaching the debugger on process creation and waiting for runtime
    initialization so that all queue creations can be intercepted.

That just happens to be a good place to enable ttmps as well.

Regards,
   Felix


> when enabling trap temporaries via SPI_GDBG_PER_VMID_CNTL.Trap_en.
> This overhead is unacceptable for microbench performance in normal mode
> for certain customers.
>
> ROCr allows the user to bypass trap temporary setup through the
> HSA_ENABLE_DEBUG environment variable.  As a result, the debugger has
> to consider two scenarios.
>
> For the first scenario, if the runtime enable of the target has already
> occurred prior to the debugger attaching, then the debugger will go ahead
> and set up the trap temporaries whether runtime has requested them or not.
> The debugger will be able to query the runtime status on attach.
>
> For the second scenario where the debugger spawns the target process,
> it will have to wait for ROCr's runtime enable request from the target.
> The runtime enable request will be able to see that its process has been
> debug attached.  It then enables the trap temporaries since it now
> knows it's in debug mode, raises an EC_PROCESS_RUNTIME signal to the
> debugger, then waits for the debugger's response. Once the debugger has
> received the runtime signal, it will wake the target process.
>
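For illustration, the runtime side of this handshake reduces to a single
blocking ioctl. A minimal user-space sketch, assuming the patched uapi header
from patch 1 is installed (kfd_fd and r_debug are placeholders supplied by
ROCr):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* Blocks until the debugger acknowledges EC_PROCESS_RUNTIME, or returns
 * immediately when no debugger is attached. */
static int rocr_runtime_enable(int kfd_fd, uint64_t r_debug,
			       int want_ttmp_setup)
{
	struct kfd_ioctl_runtime_enable_args args = {
		.r_debug = r_debug,
		.mode_mask = KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK |
			     (want_ttmp_setup ?
			      KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK : 0),
	};

	return ioctl(kfd_fd, AMDKFD_IOC_RUNTIME_ENABLE, &args);
}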
> In addition, there is a restriction that must be enforced with runtime
> enable and HW debug mode setting.
> The debugger must first ensure that HW debug mode has been enabled
> before permitting HW debug mode operations.
>
> With single process debug devices, allowing the debugger to set debug
> HW modes prior to trap activation means that debug HW mode setting can
> occur before the KFD has reserved the debug VMID (0xf) from the hardware
> scheduler's VMID allocation resource pool.  This can result in the
> hardware scheduler assigning VMID 0xf to a non-debugged process and
> having that process inherit debug HW mode settings intended for the
> debugged target process instead, which is both incorrect and potentially
> fatal for normal mode operation.
>
> With multi process debug devices, allowing the debugger to set debug
> HW modes prior to trap activation means that non-debugged processes
> migrating to a new VMID could inherit unintended debug settings.
>
> All debug operations that touch HW settings must therefore require trap
> activation, which is triggered by both debug attach and runtime
> enablement (target has KFD opened and is ready to dispatch work).
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 144 ++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |   4 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   2 +
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   1 +
>   4 files changed, 148 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 4b4c4200d8fb..27cd5af72521 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2655,11 +2655,141 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
>   	return ret;
>   }
>   
> -static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
> +static int runtime_enable(struct kfd_process *p, uint64_t r_debug,
> +			bool enable_ttmp_setup)
>   {
> +	int i = 0, ret = 0;
> +
> +	if (p->is_runtime_retry)
> +		goto retry;
> +
> +	if (p->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
> +		return -EBUSY;
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +
> +		if (pdd->qpd.queue_count)
> +			return -EEXIST;
> +	}
> +
> +	p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
> +	p->runtime_info.r_debug = r_debug;
> +	p->runtime_info.ttmp_setup = enable_ttmp_setup;
> +
> +	if (p->runtime_info.ttmp_setup) {
> +		for (i = 0; i < p->n_pdds; i++) {
> +			struct kfd_process_device *pdd = p->pdds[i];
> +
> +			if (!kfd_dbg_is_rlc_restore_supported(pdd->dev)) {
> +				amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +				pdd->dev->kfd2kgd->enable_debug_trap(
> +						pdd->dev->adev,
> +						true,
> +						pdd->dev->vm_info.last_vmid_kfd);
> +			}
> +
> +			if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
> +				pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
> +						pdd->dev->adev,
> +						false,
> +						pdd->dev->vm_info.last_vmid_kfd);
> +
> +				debug_refresh_runlist(pdd->dev->dqm);
> +			}
> +		}
> +	}
> +
> +retry:
> +	if (p->debug_trap_enabled) {
> +		if (!p->is_runtime_retry) {
> +			kfd_dbg_trap_activate(p);
> +			kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
> +					p, NULL, 0, false, NULL, 0);
> +		}
> +
> +		mutex_unlock(&p->mutex);
> +		ret = down_interruptible(&p->runtime_enable_sema);
> +		mutex_lock(&p->mutex);
> +
> +		p->is_runtime_retry = !!ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static int runtime_disable(struct kfd_process *p)
> +{
> +	int i = 0, ret;
> +	bool was_enabled = p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED;
> +
> +	p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_DISABLED;
> +	p->runtime_info.r_debug = 0;
> +
> +	if (p->debug_trap_enabled) {
> +		if (was_enabled)
> +			kfd_dbg_trap_deactivate(p, false, 0);
> +
> +		if (!p->is_runtime_retry)
> +			kfd_dbg_ev_raise(KFD_EC_MASK(EC_PROCESS_RUNTIME),
> +					p, NULL, 0, false, NULL, 0);
> +
> +		mutex_unlock(&p->mutex);
> +		ret = down_interruptible(&p->runtime_enable_sema);
> +		mutex_lock(&p->mutex);
> +
> +		p->is_runtime_retry = !!ret;
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (was_enabled && p->runtime_info.ttmp_setup) {
> +		for (i = 0; i < p->n_pdds; i++) {
> +			struct kfd_process_device *pdd = p->pdds[i];
> +
> +			if (!kfd_dbg_is_rlc_restore_supported(pdd->dev))
> +				amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +		}
> +	}
> +
> +	p->runtime_info.ttmp_setup = false;
> +
> +	/* disable DISPATCH_PTR save */
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +
> +		if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
> +			pdd->spi_dbg_override =
> +					pdd->dev->kfd2kgd->disable_debug_trap(
> +					pdd->dev->adev,
> +					false,
> +					pdd->dev->vm_info.last_vmid_kfd);
> +
> +			debug_refresh_runlist(pdd->dev->dqm);
> +		}
> +	}
> +
>   	return 0;
>   }
>   
> +static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
> +{
> +	struct kfd_ioctl_runtime_enable_args *args = data;
> +	int r;
> +
> +	mutex_lock(&p->mutex);
> +
> +	if (args->mode_mask & KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK)
> +		r = runtime_enable(p, args->r_debug,
> +				!!(args->mode_mask & KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK));
> +	else
> +		r = runtime_disable(p);
> +
> +	mutex_unlock(&p->mutex);
> +
> +	return r;
> +}
> +
>   static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
>   {
>   	struct kfd_ioctl_dbg_trap_args *args = data;
> @@ -2721,6 +2851,18 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		goto unlock_out;
>   	}
>   
> +	if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_ENABLED &&
> +			(args->op == KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE ||
> +			 args->op == KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE ||
> +			 args->op == KFD_IOC_DBG_TRAP_SUSPEND_QUEUES ||
> +			 args->op == KFD_IOC_DBG_TRAP_RESUME_QUEUES ||
> +			 args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
> +			 args->op == KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH ||
> +			 args->op == KFD_IOC_DBG_TRAP_SET_FLAGS)) {
> +		r = -EPERM;
> +		goto unlock_out;
> +	}
> +
>   	switch (args->op) {
>   	case KFD_IOC_DBG_TRAP_ENABLE:
>   		if (target != p)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 87a23b1d4d49..ae6e701a2656 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -176,7 +176,7 @@ int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>    *				to unwind
>    *		else: ignored
>    */
> -static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
> +void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
>   {
>   	int i, count = 0;
>   
> @@ -238,7 +238,7 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
>   	return 0;
>   }
>   
> -static int kfd_dbg_trap_activate(struct kfd_process *target)
> +int kfd_dbg_trap_activate(struct kfd_process *target)
>   {
>   	int i, r = 0, unwind_count = 0;
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 8aa52cc3af17..e31c9bb0e848 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -28,6 +28,8 @@
>   void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
>   					uint32_t vmid,
>   					bool stall);
> +void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
> +int kfd_dbg_trap_activate(struct kfd_process *target);
>   bool kfd_dbg_ev_raise(uint64_t event_mask,
>   			struct kfd_process *process, struct kfd_dev *dev,
>   			unsigned int source_id, bool use_worker,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index b69f2f94a50e..9690a2adb9ed 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -943,6 +943,7 @@ struct kfd_process {
>   
>   	/* Tracks runtime enable status */
>   	struct semaphore runtime_enable_sema;
> +	bool is_runtime_retry;
>   	struct kfd_runtime_info runtime_info;
>   
>   };

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface
  2022-11-22 23:05 ` [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Felix Kuehling
@ 2022-11-23 20:45   ` Kim, Jonathan
  0 siblings, 0 replies; 63+ messages in thread
From: Kim, Jonathan @ 2022-11-23 20:45 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 31670 bytes --]

[Public]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: November 22, 2022 6:05 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface
>
>
> On 2022-10-31 12:23, Jonathan Kim wrote:
> > Introduce the GPU debug operations interface.
> >
> > For ROCm-GDB to extend the GNU Debugger's ability to inspect the AMD GPU
> > instruction set, provide the necessary interface to allow the debugger
> > to HW debug-mode set and query exceptions per HSA queue, process or
> > device.
> >
> > The runtime_enable interface coordinates exception handling with the
> > HSA runtime.
> >
> > Usage is available in the kern docs at uapi/linux/kfd_ioctl.h.
> >
> > v2: add more documentation on semantics and error returns.
> > expand kfd_dbg_device_info_entry with new fields.
> > update device_snapshot semantics to match queue snapshot semantics
>
> This looks really good. I have 3 more nit-picks inline. Other than that,
> this patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>
> Do we have a debugger branch that uses the API yet? We should make this
> public in order to complete this upstream code review.

Thanks for the review.  I've given the ROCm GDB maintainers a heads-up to expect to sync with this API version soon, so hopefully they can surface one shortly.

Thanks,

Jon

>
>
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> > ---
> > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 48 ++
> > include/uapi/linux/kfd_ioctl.h | 655 ++++++++++++++++++++++-
> > 2 files changed, 702 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index 5feaba6a77de..11a960c83fb2 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -2644,6 +2644,48 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
> > return ret;
> > }
> > +static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
> > +{
> > + return 0;
> > +}
> > +
> > +static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
> > +{
> > + struct kfd_ioctl_dbg_trap_args *args = data;
> > + int r = 0;
> > +
> > + if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> > + pr_err("Debugging does not support sched_policy %i", sched_policy);
> > + return -EINVAL;
> > + }
> > +
> > + switch (args->op) {
> > + case KFD_IOC_DBG_TRAP_ENABLE:
> > + case KFD_IOC_DBG_TRAP_DISABLE:
> > + case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
> > + case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
> > + case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> > + case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
> > + case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
> > + case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
> > + case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
> > + case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
> > + case KFD_IOC_DBG_TRAP_SET_FLAGS:
> > + case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
> > + case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> > + case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> > + case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> > + pr_warn("Debugging not supported yet\n");
> > + r = -EACCES;
> > + break;
> > + default:
> > + pr_err("Invalid option: %i\n", args->op);
> > + r = -EINVAL;
> > + }
> > +
> > + return r;
> > +}
> > +
> > #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
> > [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
> > .cmd_drv = 0, .name = #ioctl}
> > @@ -2753,6 +2795,12 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
> > AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
> > kfd_ioctl_get_available_memory, 0),
> > +
> > + AMDKFD_IOCTL_DEF(AMDKFD_IOC_RUNTIME_ENABLE,
> > + kfd_ioctl_runtime_enable, 0),
> > +
> > + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
> > + kfd_ioctl_set_debug_trap, 0),
> > };
> > #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
> > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> > index 42b60198b6c5..bedf1b823f57 100644
> > --- a/include/uapi/linux/kfd_ioctl.h
> > +++ b/include/uapi/linux/kfd_ioctl.h
> > @@ -109,6 +109,28 @@ struct kfd_ioctl_get_available_memory_args {
> > __u32 pad;
> > };
> > +struct kfd_dbg_device_info_entry {
> > + __u64 exception_status;
> > + __u64 lds_base;
> > + __u64 lds_limit;
> > + __u64 scratch_base;
> > + __u64 scratch_limit;
> > + __u64 gpuvm_base;
> > + __u64 gpuvm_limit;
> > + __u32 gpu_id;
> > + __u32 location_id;
> > + __u32 vendor_id;
> > + __u32 device_id;
> > + __u32 fw_version;
> > + __u32 gfx_target_version;
> > + __u32 simd_count;
> > + __u32 max_waves_per_simd;
> > + __u32 array_count;
> > + __u32 simd_arrays_per_engine;
> > + __u32 capability;
> > + __u32 debug_prop;
> > +};
> > +
> > /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
> > #define KFD_IOC_CACHE_POLICY_COHERENT 0
> > #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
> > @@ -766,6 +788,631 @@ struct kfd_ioctl_set_xnack_mode_args {
> > __s32 xnack_enabled;
> > };
> > +/* Wave launch override modes */
> > +enum kfd_dbg_trap_override_mode {
> > + KFD_DBG_TRAP_OVERRIDE_OR = 0,
> > + KFD_DBG_TRAP_OVERRIDE_REPLACE = 1
> > +};
> > +
> > +/* Wave launch overrides */
> > +enum kfd_dbg_trap_mask {
> > + KFD_DBG_TRAP_MASK_FP_INVALID = 1,
> > + KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL = 2,
> > + KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO = 4,
> > + KFD_DBG_TRAP_MASK_FP_OVERFLOW = 8,
> > + KFD_DBG_TRAP_MASK_FP_UNDERFLOW = 16,
> > + KFD_DBG_TRAP_MASK_FP_INEXACT = 32,
> > + KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO = 64,
> > + KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH = 128,
> > + KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION = 256
> > +};
> > +
> > +/* Wave launch modes */
> > +enum kfd_dbg_trap_wave_launch_mode {
> > + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL = 0,
> > + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT = 1,
> > + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_KILL = 2,
> > + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG = 3,
> > + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_STALL = 4
> > +};
> > +
> > +/* Address watch modes */
> > +enum kfd_dbg_trap_address_watch_mode {
> > + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_READ = 0,
> > + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_NONREAD = 1,
> > + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ATOMIC = 2,
> > + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ALL = 3
> > +};
> > +
> > +/* Additional wave settings */
> > +enum kfd_dbg_trap_flags {
> > + KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1,
> > + KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP = 2
> > +};
> > +
> > +/* Trap exceptions */
> > +enum kfd_dbg_trap_exception_code {
> > + EC_NONE = 0,
> > + /* per queue */
> > + EC_QUEUE_WAVE_ABORT = 1,
> > + EC_QUEUE_WAVE_TRAP = 2,
> > + EC_QUEUE_WAVE_MATH_ERROR = 3,
> > + EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION = 4,
> > + EC_QUEUE_WAVE_MEMORY_VIOLATION = 5,
> > + EC_QUEUE_WAVE_APERTURE_VIOLATION = 6,
> > + EC_QUEUE_PACKET_DISPATCH_DIM_INVALID = 16,
> > + EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID = 17,
> > + EC_QUEUE_PACKET_DISPATCH_CODE_INVALID = 18,
> > + EC_QUEUE_PACKET_RESERVED = 19,
> > + EC_QUEUE_PACKET_UNSUPPORTED = 20,
> > + EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
> > + EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
> > + EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
> > + EC_QUEUE_PREEMPTION_ERROR = 30,
> > + EC_QUEUE_NEW = 31,
> > + /* per device */
> > + EC_DEVICE_QUEUE_DELETE = 32,
> > + EC_DEVICE_MEMORY_VIOLATION = 33,
> > + EC_DEVICE_RAS_ERROR = 34,
> > + EC_DEVICE_FATAL_HALT = 35,
> > + EC_DEVICE_NEW = 36,
> > + /* per process */
> > + EC_PROCESS_RUNTIME = 48,
> > + EC_PROCESS_DEVICE_REMOVE = 49,
> > + EC_MAX
> > +};
> > +
> > +/* Mask generated by ecode in kfd_dbg_trap_exception_code */
> > +#define KFD_EC_MASK(ecode) (1ULL << (ecode - 1))
> > +
> > +/* Masks for exception code type checks below */
> > +#define KFD_EC_MASK_QUEUE (KFD_EC_MASK(EC_QUEUE_WAVE_ABORT) | \
> > + KFD_EC_MASK(EC_QUEUE_WAVE_TRAP) | \
> > + KFD_EC_MASK(EC_QUEUE_WAVE_MATH_ERROR) | \
> > + KFD_EC_MASK(EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION) | \
> > + KFD_EC_MASK(EC_QUEUE_WAVE_MEMORY_VIOLATION) | \
> > + KFD_EC_MASK(EC_QUEUE_WAVE_APERTURE_VIOLATION) | \
> > + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) | \
> > + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) | \
> > + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) | \
> > + KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) | \
> > + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) | \
> > + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \
> > + KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) | \
> > + KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) | \
> > + KFD_EC_MASK(EC_QUEUE_NEW))
> > +#define KFD_EC_MASK_DEVICE (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) | \
> > + KFD_EC_MASK(EC_DEVICE_RAS_ERROR) | \
> > + KFD_EC_MASK(EC_DEVICE_FATAL_HALT) | \
> > + KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION) | \
> > + KFD_EC_MASK(EC_DEVICE_NEW))
> > +#define KFD_EC_MASK_PROCESS (KFD_EC_MASK(EC_PROCESS_RUNTIME) | \
> > + KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE))
> > +
> > +/* Checks for exception code types for KFD search */
> > +#define KFD_DBG_EC_TYPE_IS_QUEUE(ecode) \
> > + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_QUEUE))
> > +#define KFD_DBG_EC_TYPE_IS_DEVICE(ecode) \
> > + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_DEVICE))
> > +#define KFD_DBG_EC_TYPE_IS_PROCESS(ecode) \
> > + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_PROCESS))
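
(As a quick worked example of how these compose: EC_QUEUE_WAVE_TRAP is 2, so
KFD_EC_MASK(EC_QUEUE_WAVE_TRAP) is 1ULL << 1, which is set in
KFD_EC_MASK_QUEUE and in neither of the other masks; hence
KFD_DBG_EC_TYPE_IS_QUEUE(EC_QUEUE_WAVE_TRAP) evaluates to 1 while the DEVICE
and PROCESS checks evaluate to 0.)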
> > +
> > +
> > +/* Runtime enable states */
> > +enum kfd_dbg_runtime_state {
> > + DEBUG_RUNTIME_STATE_DISABLED = 0,
> > + DEBUG_RUNTIME_STATE_ENABLED = 1,
> > + DEBUG_RUNTIME_STATE_ENABLED_BUSY = 2,
> > + DEBUG_RUNTIME_STATE_ENABLED_ERROR = 3
> > +};
> > +
> > +/* Runtime enable status */
> > +struct kfd_runtime_info {
> > + __u64 r_debug;
> > + __u32 runtime_state;
> > + __u32 ttmp_setup;
> > +};
> > +
> > +/* Enable modes for runtime enable */
> > +#define KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK 1
> > +#define KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK 2
> > +
> > +/**
> > + * kfd_ioctl_runtime_enable_args - Arguments for runtime enable
> > + *
> > + * Coordinates debug exception signalling and debug device enablement with runtime.
> > + *
> > + * @r_debug - pointer to user struct for sharing information between ROCr and the debugger
> > + * @mode_mask - mask to set mode
> > + * KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK - enable runtime for debugging, otherwise disable
> > + * KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK - enable trap temporary setup (ignore on disable)
> > + *
> > + * Return - 0 on SUCCESS.
> > + * - EBUSY if runtime enable call already pending.
> > + * - EEXIST if user queues already active prior to call.
> > + * If process is debug enabled, runtime enable will enable debug devices and
> > + * wait for debugger process to send runtime exception EC_PROCESS_RUNTIME
> > + * to unblock - see kfd_ioctl_dbg_trap_args.
> > + *
> > + */
> > +struct kfd_ioctl_runtime_enable_args {
> > + __u64 r_debug;
> > + __u32 mode_mask;
> > +};
> > +
> > +/* Queue information */
> > +struct kfd_queue_snapshot_entry {
> > + __u64 exception_status;
> > + __u64 ring_base_address;
> > + __u64 write_pointer_address;
> > + __u64 read_pointer_address;
> > + __u64 ctx_save_restore_address;
> > + __u32 queue_id;
> > + __u32 gpu_id;
> > + __u32 ring_size;
> > + __u32 queue_type;
> > + __u32 ctx_save_restore_area_size;
> > + __u32 reserved;
> > +};
> > +
> > +/* Queue status return for suspend/resume */
> > +#define KFD_DBG_QUEUE_ERROR_BIT 30
> > +#define KFD_DBG_QUEUE_INVALID_BIT 31
> > +#define KFD_DBG_QUEUE_ERROR_MASK (1 << KFD_DBG_QUEUE_ERROR_BIT)
> > +#define KFD_DBG_QUEUE_INVALID_MASK (1 << KFD_DBG_QUEUE_INVALID_BIT)
> > +
> > +/* Context save area header information */
> > +struct kfd_context_save_area_header {
> > + __u32 control_stack_offset;
> > + __u32 control_stack_size;
> > + __u32 wave_state_offset;
> > + __u32 wave_state_size;
> > + __u32 debug_offset;
> > + __u32 debug_size;
> > + __u64 err_payload_addr;
> > + __u32 err_event_id;
> > + __u32 reserved1;
> > +};
> > +
> > +/*
> > + * Debug operations
> > + *
> > + * For specifics on usage and return values, see documentation per operation
> > + * below. Otherwise, generic error returns apply:
> > + * - ESRCH if the process to debug does not exist.
> > + *
> > + * - EINVAL (with KFD_IOC_DBG_TRAP_ENABLE exempt) if operation
> > + * KFD_IOC_DBG_TRAP_ENABLE has not succeeded prior.
> > + * Also returns this error if GPU hardware scheduling is not supported.
> > + *
> > + * - EPERM (with KFD_IOC_DBG_TRAP_DISABLE exempt) if target process is not
> > + * PTRACE_ATTACHED. KFD_IOC_DBG_TRAP_DISABLE is exempt to allow
> > + * clean up of debug mode as long as process is debug enabled.
> > + *
> > + * - EACCES if any DBG_HW_OP (debug hardware operation) is requested when
> > + * AMDKFD_IOC_RUNTIME_ENABLE has not succeeded prior.
> > + *
> > + * - ENODEV if any GPU does not support debugging on a DBG_HW_OP call.
> > + *
> > + * - Other errors may be returned when a DBG_HW_OP occurs while the GPU
> > + * is in a fatal state.
> > + *
> > + */
> > +enum kfd_dbg_trap_operations {
> > + KFD_IOC_DBG_TRAP_ENABLE = 0,
> > + KFD_IOC_DBG_TRAP_DISABLE = 1,
> > + KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT = 2,
> > + KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED = 3,
> > + KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE = 4, /* DBG_HW_OP */
> > + KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE = 5, /* DBG_HW_OP */
> > + KFD_IOC_DBG_TRAP_SUSPEND_QUEUES = 6, /* DBG_HW_OP */
> > + KFD_IOC_DBG_TRAP_RESUME_QUEUES = 7, /* DBG_HW_OP */
> > + KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH = 8, /* DBG_HW_OP */
> > + KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH = 9, /* DBG_HW_OP */
> > + KFD_IOC_DBG_TRAP_SET_FLAGS = 10,
> > + KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT = 11,
> > + KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO = 12,
> > + KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT = 13,
> > + KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT = 14
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_enable_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_ENABLE.
> > + *
> > + * Enables debug session for target process. Call @op KFD_IOC_DBG_TRAP_DISABLE in
> > + * kfd_ioctl_dbg_trap_args to disable debug session.
> > + *
> > + * @exception_mask (IN) - exceptions to raise to the debugger
> > + * @rinfo_ptr (IN) - pointer to runtime info buffer (see kfd_runtime_info)
> > + * @rinfo_size (IN/OUT) - size of runtime info buffer in bytes
> > + * @dbg_fd (IN) - fd the KFD will notify the debugger with of raised
> > + * exceptions set in exception_mask.
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + * Copies KFD saved kfd_runtime_info to @rinfo_ptr on enable.
> > + * Size of kfd_runtime saved by the KFD returned to @rinfo_size.
> > + * - EBADF if KFD cannot get a reference to dbg_fd.
> > + * - EFAULT if KFD cannot copy runtime info to rinfo_ptr.
> > + * - EINVAL if target process is already debug enabled.
> > + *
> > + */
> > +struct kfd_ioctl_dbg_trap_enable_args {
> > + __u64 exception_mask;
> > + __u64 rinfo_ptr;
> > + __u32 rinfo_size;
> > + __u32 dbg_fd;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_send_runtime_event_args
> > + *
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT.
> > + * Raises exceptions to runtime.
> > + *
> > + * @exception_mask (IN) - exceptions to raise to runtime
> > + * @gpu_id (IN) - target device id
> > + * @queue_id (IN) - target queue id
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + * - ENODEV if gpu_id not found.
> > + * If exception_mask contains EC_PROCESS_RUNTIME, unblocks pending
> > + * AMDKFD_IOC_RUNTIME_ENABLE call - see
> kfd_ioctl_runtime_enable_args.
> > + * All other exceptions are raised to runtime through err_payload_addr.
> > + * See kfd_context_save_area_header.
> > + */
> > +struct kfd_ioctl_dbg_trap_send_runtime_event_args {
> > + __u64 exception_mask;
> > + __u32 gpu_id;
> > + __u32 queue_id;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_set_exceptions_enabled_args
> > + *
> > + * Arguments for KFD_IOC_SET_EXCEPTIONS_ENABLED
> > + * Set new exceptions to be raised to the debugger.
> > + *
> > + * @exception_mask (IN) - new exceptions to raise the debugger
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + */
> > +struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args {
> > + __u64 exception_mask;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_set_wave_launch_override_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE
> > + * Enable HW exceptions to raise trap.
> > + *
> > + * @override_mode (IN) - see kfd_dbg_trap_override_mode
> > + * @enable_mask (IN/OUT) - reference kfd_dbg_trap_mask.
> > + * IN is the override modes requested to be enabled.
> > + * OUT is referenced in Return below.
> > + * @support_request_mask (IN/OUT) - reference kfd_dbg_trap_mask.
> > + * IN is the override modes requested for support check.
> > + * OUT is referenced in Return below.
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + * Previous enablement is returned in @enable_mask.
> > + * Actual override support is returned in @support_request_mask.
> > + * - EINVAL if override mode is not supported.
> > + * - EACCES if trap support requested is not actually supported.
> > + * i.e. enable_mask (IN) is not a subset of support_request_mask (OUT).
> > + * Otherwise it is considered a generic error (see kfd_dbg_trap_operations).
> > + */
> > +struct kfd_ioctl_dbg_trap_set_wave_launch_override_args {
> > + __u32 override_mode;
> > + __u32 enable_mask;
> > + __u32 support_request_mask;
> > + __u32 pad;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_set_wave_launch_mode_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE
> > + * Set wave launch mode.
> > + *
> > + * @launch_mode (IN) - see kfd_dbg_trap_wave_launch_mode
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + */
> > +struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args {
> > + __u32 launch_mode;
> > + __u32 pad;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_suspend_queues_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_SUSPEND_QUEUES
> > + * Suspend queues.
> > + *
> > + * @exception_mask (IN) - raised exceptions to clear
> > + * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
> > + * to suspend
> > + * @num_queues (IN) - number of queues to suspend in @queue_array_ptr
> > + * @grace_period (IN) - wave time allowance before preemption
> > + * per 1K GPU clock cycle unit
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - Number of queues suspended on SUCCESS.
> > + * KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK masked
> > + * for each queue id in @queue_array_ptr array reports unsuccessful
> > + * suspend reason.
> > + * KFD_DBG_QUEUE_ERROR_MASK = HW failure.
> > + * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist, is new or
> > + * is being destroyed.
> > + * Destruction of a suspended queue is blocked until the queue is
> > + * resumed. This allows the debugger to access queue information and
> > + * its context save area without running into a race condition on
> > + * queue destruction.
> > + * Automatically copies per queue context save area header information
> > + * into the save area base
> > + * (see kfd_queue_snapshot_entry and kfd_context_save_area_header).
>
> The last two paragraphs would make more sense as a description above the
> Return statement.
>
>
> > + */
> > +struct kfd_ioctl_dbg_trap_suspend_queues_args {
> > + __u64 exception_mask;
> > + __u64 queue_array_ptr;
> > + __u32 num_queues;
> > + __u32 grace_period;
> > +};
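Rough sketch of the per-queue error check a debugger would do on return (queue ids, target_pid and kfd_fd are illustrative; masks per the text above):

uint32_t queue_ids[2] = { 3, 7 };              /* illustrative queue ids */
struct kfd_ioctl_dbg_trap_args args = {0};
int suspended, i;

args.pid = target_pid;
args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
args.suspend_queues.queue_array_ptr = (uint64_t)(uintptr_t)queue_ids;
args.suspend_queues.num_queues = 2;
args.suspend_queues.grace_period = 0;

suspended = ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
for (i = 0; i < 2; i++)
	if (queue_ids[i] & (KFD_DBG_QUEUE_ERROR_MASK | KFD_DBG_QUEUE_INVALID_MASK))
		; /* entry i was not suspended; the mask bit says why */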
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_resume_queues_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_RESUME_QUEUES
> > + * Resume queues.
> > + *
> > + * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
> > + * to resume
> > + * @num_queues (IN) - number of queues to resume in @queue_array_ptr
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - Number of queues resumed on SUCCESS.
> > + * KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK mask
> > + * for each queue id in @queue_array_ptr array reports unsuccessful
> > + * resume reason.
> > + * KFD_DBG_QUEUE_ERROR_MASK = HW failure.
> > + * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist.
> > + */
> > +struct kfd_ioctl_dbg_trap_resume_queues_args {
> > + __u64 queue_array_ptr;
> > + __u32 num_queues;
> > + __u32 pad;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_set_node_address_watch_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH
> > + * Sets address watch for device.
> > + *
> > + * @address (IN) - watch address to set
> > + * @mode (IN) - see kfd_dbg_trap_address_watch_mode
> > + * @mask (IN) - watch address mask
> > + * @gpu_id (IN) - target gpu to set watch point
> > + * @id (OUT) - watch id allocated
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + * Allocated watch ID returned to @id.
> > + * - ENODEV if gpu_id not found.
> > + * - ENOMEM if watch IDs cannot be allocated.
> > + */
> > +struct kfd_ioctl_dbg_trap_set_node_address_watch_args {
> > + __u64 address;
> > + __u32 mode;
> > + __u32 mask;
> > + __u32 gpu_id;
> > + __u32 id;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_clear_node_address_watch_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH
> > + * Clear address watch for device.
> > + *
> > + * @gpu_id (IN) - target device to clear watch point
> > + * @id (IN) - allocated watch id to clear
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + * - ENODEV if gpu_id not found.
> > + * - EINVAL if watch ID has not been allocated.
> > + */
> > +struct kfd_ioctl_dbg_trap_clear_node_address_watch_args {
> > + __u32 gpu_id;
> > + __u32 id;
> > +};
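Sketch of a watch point lifetime using both ops (watch_va, gpu_id and the 64-byte mask value are illustrative placeholders):

struct kfd_ioctl_dbg_trap_args args = {0};
uint32_t watch_id;

args.pid = target_pid;
args.op = KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH;
args.set_node_address_watch.address = watch_va;
args.set_node_address_watch.mode = 0;        /* per kfd_dbg_trap_address_watch_mode */
args.set_node_address_watch.mask = ~0x3fu;   /* e.g. watch a 64-byte range */
args.set_node_address_watch.gpu_id = gpu_id;
ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
watch_id = args.set_node_address_watch.id;   /* allocated by KFD */

args.op = KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH;
args.clear_node_address_watch.gpu_id = gpu_id;
args.clear_node_address_watch.id = watch_id;
ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);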
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_set_flags_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_SET_FLAGS
> > + * Sets flags for wave behaviour.
> > + *
> > + * @flags (IN/OUT) - IN = flags to enable, OUT = flags previously enabled
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + * - EACCES if any debug device does not allow flag options.
> > + */
> > +struct kfd_ioctl_dbg_trap_set_flags_args {
> > + __u32 flags;
> > + __u32 pad;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_query_debug_event_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT
> > + *
> > + * Find one or more raised exceptions. This function can return multiple
> > + * exceptions from a single queue or a single device with one call. To find
> > + * all raised exceptions, this function must be called repeatedly until it
> > + * returns -EAGAIN. Returned exceptions can optionally be cleared by
> > + * setting the corresponding bit in the @exception_mask input parameter.
> > + * However, clearing an exception prevents retrieving further information
> > + * about it with KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO.
> > + *
> > + * @exception_mask (IN/OUT) - exceptions to clear (IN) and raised (OUT)
> > + * @gpu_id (OUT) - gpu id of exceptions raised
> > + * @queue_id (OUT) - queue id of exceptions raised
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on raised exception found
> > + * Raised exceptions found are returned in @exception_mask
> > + * with reported source id returned in @gpu_id or @queue_id.
> > + * - EAGAIN if no raised exception has been found
> > + */
> > +struct kfd_ioctl_dbg_trap_query_debug_event_args {
> > + __u64 exception_mask;
> > + __u32 gpu_id;
> > + __u32 queue_id;
> > +};
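The repeat-until-EAGAIN drain described above would look roughly like this (handle_event is a placeholder; clearing as we go forfeits later QUERY_EXCEPTION_INFO on those bits):

for (;;) {
	struct kfd_ioctl_dbg_trap_args args = {0};

	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT;
	args.query_debug_event.exception_mask = ~0ULL; /* clear what we see */

	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) < 0)
		break;  /* EAGAIN: no raised exception left */

	/* exception_mask now holds the raised bits, gpu_id/queue_id the source */
	handle_event(args.query_debug_event.exception_mask,
		     args.query_debug_event.gpu_id,
		     args.query_debug_event.queue_id);
}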
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_query_exception_info_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO.
> > + * Get additional info on raised exception.
> > + *
> > + * @info_ptr (IN) - pointer to exception info buffer to copy to
> > + * @info_size (IN/OUT) - exception info buffer size (bytes)
> > + * @source_id (IN) - target gpu or queue id
> > + * @exception_code (IN) - target exception
> > + * @clear_exception (IN) - clear raised @exception_code exception
> > + * (0 = false, 1 = true)
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + * If @exception_code is EC_DEVICE_MEMORY_VIOLATION, copy @info_size(OUT)
> > + * bytes of memory exception data to @info_ptr.
> > + * If @exception_code is EC_PROCESS_RUNTIME, copy saved
> > + * kfd_runtime_info to @info_ptr.
> > + * Actual required @info_ptr size (bytes) is returned in @info_size.
> > + */
> > +struct kfd_ioctl_dbg_trap_query_exception_info_args {
> > + __u64 info_ptr;
> > + __u32 info_size;
> > + __u32 source_id;
> > + __u32 exception_code;
> > + __u32 clear_exception;
> > +};
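Rough size-negotiation sketch (info_buf, info_buf_size, gpu_or_queue_id and code are placeholders supplied by the caller):

struct kfd_ioctl_dbg_trap_args args = {0};

args.pid = target_pid;
args.op = KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO;
args.query_exception_info.info_ptr = (uint64_t)(uintptr_t)info_buf;
args.query_exception_info.info_size = info_buf_size;
args.query_exception_info.source_id = gpu_or_queue_id;
args.query_exception_info.exception_code = code;
args.query_exception_info.clear_exception = 0;  /* keep it pending */

ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
/* args.query_exception_info.info_size now holds the size actually required */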
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_get_queue_snapshot_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT
> > + * Get queue information.
> > + *
> > + * @exception_mask (IN) - exceptions raised to clear
> > + * @snapshot_buf_ptr (IN) - queue snapshot entry buffer (see kfd_queue_snapshot_entry)
> > + * @num_queues (IN/OUT) - number of queue snapshot entries
> > + * The debugger specifies the size of the array allocated in @num_queues.
> > + * KFD returns the number of queues that actually existed. If this is
> > + * larger than the size specified by the debugger, KFD will not overflow
> > + * the array allocated by the debugger.
> > + *
> > + * @entry_size (IN/OUT) - size per entry in bytes
> > + * The debugger specifies sizeof(struct kfd_queue_snapshot_entry) in
> > + * @entry_size. KFD returns the number of bytes actually populated per
> > + * entry. The debugger should use the KFD_IOCTL_MINOR_VERSION to determine
> > + * which fields in struct kfd_queue_snapshot_entry are valid. This allows
> > + * growing the ABI in a backwards compatible manner.
>
> It's worth mentioning that the @entry_size(in) is also used as stride if
> it is larger than the actual kfd_queue_snapshot_entry.
>
>
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + * Copies @num_queues(IN) queue snapshot entries of size @entry_size(IN)
> > + * into @snapshot_buf_ptr if @num_queues(IN) > 0.
> > + * Otherwise return @num_queues(OUT) queue snapshot entries that exist.
> > + */
> > +struct kfd_ioctl_dbg_trap_queue_snapshot_args {
> > + __u64 exception_mask;
> > + __u64 snapshot_buf_ptr;
> > + __u32 num_queues;
> > + __u32 entry_size;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_get_device_snapshot_args
> > + *
> > + * Arguments for KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT
> > + * Get device information.
> > + *
> > + * @exception_mask (IN) - exceptions raised to clear
> > + * @snapshot_buf_ptr (IN) - pointer to snapshot buffer (see kfd_dbg_device_info_entry)
> > + * @num_devices (IN/OUT) - number of debug devices to snapshot
> > + * The debugger specifies the size of the array allocated in @num_devices.
> > + * KFD returns the number of devices that actually existed. If this is
> > + * larger than the size specified by the debugger, KFD will not overflow
> > + * the array allocated by the debugger.
> > + *
> > + * @entry_size (IN/OUT) - size per entry in bytes
> > + * The debugger specifies sizeof(struct kfd_dbg_device_info_entry) in
> > + * @entry_size. KFD returns the number of bytes actually populated. The
> > + * debugger should use KFD_IOCTL_MINOR_VERSION to determine which fields
> > + * in struct kfd_dbg_device_info_entry are valid. This allows growing the
> > + * ABI in a backwards compatible manner.
>
> It's worth mentioning that the @entry_size(in) is also used as stride if
> it is larger than the actual kfd_dbg_device_info_entry.
>
> Regards,
>    Felix
>
>
> > + *
> > + * Generic errors apply (see kfd_dbg_trap_operations).
> > + * Return - 0 on SUCCESS.
> > + * Copies @num_devices(IN) device snapshot entries of size @entry_size(IN)
> > + * into @snapshot_buf_ptr if @num_devices(IN) > 0.
> > + * Otherwise return @num_devices(OUT) device snapshot entries that exist.
> > + */
> > +struct kfd_ioctl_dbg_trap_device_snapshot_args {
> > + __u64 exception_mask;
> > + __u64 snapshot_buf_ptr;
> > + __u32 num_devices;
> > + __u32 entry_size;
> > +};
> > +
> > +/**
> > + * kfd_ioctl_dbg_trap_args
> > + *
> > + * Arguments to debug target process.
> > + *
> > + * @pid - target process to debug
> > + * @op - debug operation (see kfd_dbg_trap_operations)
> > + *
> > + * @op determines which union struct args to use.
> > + * Refer to kern docs for each kfd_ioctl_dbg_trap_*_args struct.
> > + */
> > +struct kfd_ioctl_dbg_trap_args {
> > + __u32 pid;
> > + __u32 op;
> > +
> > + union {
> > + struct kfd_ioctl_dbg_trap_enable_args enable;
> > + struct kfd_ioctl_dbg_trap_send_runtime_event_args send_runtime_event;
> > + struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args set_exceptions_enabled;
> > + struct kfd_ioctl_dbg_trap_set_wave_launch_override_args launch_override;
> > + struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args launch_mode;
> > + struct kfd_ioctl_dbg_trap_suspend_queues_args suspend_queues;
> > + struct kfd_ioctl_dbg_trap_resume_queues_args resume_queues;
> > + struct kfd_ioctl_dbg_trap_set_node_address_watch_args set_node_address_watch;
> > + struct kfd_ioctl_dbg_trap_clear_node_address_watch_args clear_node_address_watch;
> > + struct kfd_ioctl_dbg_trap_set_flags_args set_flags;
> > + struct kfd_ioctl_dbg_trap_query_debug_event_args query_debug_event;
> > + struct kfd_ioctl_dbg_trap_query_exception_info_args query_exception_info;
> > + struct kfd_ioctl_dbg_trap_queue_snapshot_args queue_snapshot;
> > + struct kfd_ioctl_dbg_trap_device_snapshot_args device_snapshot;
> > + };
> > +};
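Typical dispatch through the union, sketched (the launch mode value is illustrative; perror needs stdio.h):

struct kfd_ioctl_dbg_trap_args args = {0};

args.pid = target_pid;                       /* process being debugged */
args.op = KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE;
args.launch_mode.launch_mode = 0;            /* per kfd_dbg_trap_wave_launch_mode */

if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
	perror("AMDKFD_IOC_DBG_TRAP");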
> > +
> > #define AMDKFD_IOCTL_BASE 'K'
> > #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr)
> > #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type)
> > @@ -877,7 +1524,13 @@ struct kfd_ioctl_set_xnack_mode_args {
> > #define AMDKFD_IOC_AVAILABLE_MEMORY \
> > AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args)
> > +#define AMDKFD_IOC_RUNTIME_ENABLE \
> > + AMDKFD_IOWR(0x24, struct kfd_ioctl_runtime_enable_args)
> > +
> > +#define AMDKFD_IOC_DBG_TRAP \
> > + AMDKFD_IOWR(0x25, struct kfd_ioctl_dbg_trap_args)
> > +
> > #define AMDKFD_COMMAND_START 0x01
> > -#define AMDKFD_COMMAND_END 0x24
> > +#define AMDKFD_COMMAND_END 0x26
> > #endif

[-- Attachment #2: winmail.dat --]
[-- Type: application/ms-tnef, Size: 26421 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization
  2022-11-22 23:38   ` Felix Kuehling
@ 2022-11-23 20:53     ` Kim, Jonathan
  2022-12-01  0:18     ` Felix Kuehling
  1 sibling, 0 replies; 63+ messages in thread
From: Kim, Jonathan @ 2022-11-23 20:53 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 16473 bytes --]

[Public]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: November 22, 2022 6:39 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver
> initialization
>
>
> On 2022-10-31 12:23, Jonathan Kim wrote:
> > Add missing debug trap registers references and initialize all debug
> > registers on boot by clearing the hardware exception overrides and the
> > wave allocation ID index.
> >
> > For debug devices that only support single process debugging, enable
> > trap temporary setup by default.
> >
> > Debug devices that support multi-process debugging require trap
> > temporary setup to be disabled by default in order to satisfy microbench
> > performance when in non-debug mode.
>
> Where is this done? I don't think it's in the MQD setup because that
> happens unconditionally on all GPUs.

Right, I forgot to update gfx_v9_4_2_debug_trap_config_init to clear TRAP_EN instead of setting it.
I'll fix that.
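
For the record, a worked example of what the *_debug_trap_config_init mask loop computes (first_kfd_vmid = 8 and AMDGPU_NUM_VMID = 16 assumed for illustration):

uint32_t trap_config_vmid_mask = 0;
int i;

for (i = 8; i < 16; i++)           /* first_vmid .. last_vmid - 1 */
	trap_config_vmid_mask |= 1u << i;
/* trap_config_vmid_mask == 0xff00: VMID_SEL covers only the KFD vmids */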

>
>
> >
> > The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
> > waves onto dispatch during compute context inspection.
> > In order to correctly this up, set the special reserved CP bit by default
> > whenever the MQD is initailized.
>
> There is a word missing here. "In order to correctly _set_ this up ..."?

Whoops.  Thanks.

>
> This patch covers GFXv9 and 10. Will GFXv11 be handled separately?

Ok.  I'll include GFX11 as well for the next round of reviews in this patch.

Thanks,

Jon

>
> Regards,
>    Felix
>
>
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 26 +++++++
> >   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 30 ++++++++
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
> >   .../include/asic_reg/gc/gc_10_1_0_offset.h    | 14 ++++
> >   .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++++++++++++++++++
> >   .../include/asic_reg/gc/gc_10_3_0_offset.h    | 10 +++
> >   .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
> >   8 files changed, 163 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index af94ac580d3e..d49aff0b4ba3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > @@ -4904,6 +4904,29 @@ static u32 gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
> >
> >   #define DEFAULT_SH_MEM_BASES      (0x6000)
> >
> > +static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
> > +                           uint32_t first_vmid,
> > +                           uint32_t last_vmid)
> > +{
> > +   uint32_t data;
> > +   uint32_t trap_config_vmid_mask = 0;
> > +   int i;
> > +
> > +   /* Calculate trap config vmid mask */
> > +   for (i = first_vmid; i < last_vmid; i++)
> > +           trap_config_vmid_mask |= (1 << i);
> > +
> > +   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> > +                   VMID_SEL, trap_config_vmid_mask);
> > +   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> > +                   TRAP_EN, 1);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> > +
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> > +}
> > +
> >   static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
> >   {
> >     int i;
> > @@ -4935,6 +4958,9 @@ static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
> >             WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
> >             WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
> >     }
> > +
> > +   gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
> > +                                   AMDGPU_NUM_VMID);
> >   }
> >
> >   static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 0320be4a5fc6..a0e5ad342f13 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -2337,6 +2337,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
> >     adev->gfx.config.num_rbs = hweight32(active_rbs);
> >   }
> >
> > +static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
> > +                           uint32_t first_vmid,
> > +                           uint32_t last_vmid)
> > +{
> > +   uint32_t data;
> > +   uint32_t trap_config_vmid_mask = 0;
> > +   int i;
> > +
> > +   /* Calculate trap config vmid mask */
> > +   for (i = first_vmid; i < last_vmid; i++)
> > +           trap_config_vmid_mask |= (1 << i);
> > +
> > +   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> > +                   VMID_SEL, trap_config_vmid_mask);
> > +   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> > +                   TRAP_EN, 1);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> > +
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> > +}
> > +
> >   #define DEFAULT_SH_MEM_BASES      (0x6000)
> >   static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
> >   {
> > @@ -4609,6 +4632,13 @@ static int gfx_v9_0_late_init(void *handle)
> >     if (r)
> >             return r;
> >
> > +   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
> > +           gfx_v9_4_2_debug_trap_config_init(adev,
> > +                   adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
> > +   else
> > +           gfx_v9_0_debug_trap_config_init(adev,
> > +                   adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
> > +
> >     return 0;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> > index d3e2b6a599a4..cb484ace17de 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> > @@ -117,6 +117,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
> >                     1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
> >                     1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
> >
> > +   /* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> > +    * DISPATCH_PTR.  This is required for the kfd debugger
> > +    */
> > +   m->cp_hqd_hq_scheduler0 = 1 << 14;
> > +
> >     if (q->format == KFD_QUEUE_FORMAT_AQL) {
> >             m->cp_hqd_aql_control =
> >                     1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > index 0778e587a2d6..86f1cf090246 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > @@ -164,6 +164,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
> >                     1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
> >                     1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
> >
> > +   /* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> > +    * DISPATCH_PTR.  This is required for the kfd debugger
> > +    */
> > +   m->cp_hqd_hq_status0 = 1 << 14;
> > +
> >     if (q->format == KFD_QUEUE_FORMAT_AQL) {
> >             m->cp_hqd_aql_control =
> >                     1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> > diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> > index 18d34bbceebe..7d384f86bd67 100644
> > --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> > +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> > @@ -5190,6 +5190,20 @@
> >   #define mmSPI_WCL_PIPE_PERCENT_CS6_BASE_IDX                   0
> >   #define mmSPI_WCL_PIPE_PERCENT_CS7                            0x1f70
> >   #define mmSPI_WCL_PIPE_PERCENT_CS7_BASE_IDX                   0
> > +#define mmSPI_GDBG_WAVE_CNTL                                   0x1f71
> > +#define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                          0
> > +#define mmSPI_GDBG_TRAP_CONFIG                                 0x1f72
> > +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                        0
> > +#define mmSPI_GDBG_TRAP_MASK                                   0x1f73
> > +#define mmSPI_GDBG_TRAP_MASK_BASE_IDX                          0
> > +#define mmSPI_GDBG_WAVE_CNTL2                                  0x1f74
> > +#define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                         0
> > +#define mmSPI_GDBG_WAVE_CNTL3                                  0x1f75
> > +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                         0
> > +#define mmSPI_GDBG_TRAP_DATA0                                  0x1f78
> > +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                         0
> > +#define mmSPI_GDBG_TRAP_DATA1                                  0x1f79
> > +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                         0
> >   #define mmSPI_COMPUTE_QUEUE_RESET                             0x1f7b
> >   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                    0
> >   #define mmSPI_RESOURCE_RESERVE_CU_0                           0x1f7c
> > diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> > index 4127896ffcdf..08772ba845b0 100644
> > --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> > +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> > @@ -19646,6 +19646,75 @@
> >   //SPI_WCL_PIPE_PERCENT_CS7
> >   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE__SHIFT                0x0
> >   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE_MASK                  0x7FL
> > +//SPI_GDBG_WAVE_CNTL
> > +#define SPI_GDBG_WAVE_CNTL__STALL_RA__SHIFT                    0x0
> > +#define SPI_GDBG_WAVE_CNTL__STALL_VMID__SHIFT                  0x1
> > +#define SPI_GDBG_WAVE_CNTL__STALL_RA_MASK                      0x00000001L
> > +#define SPI_GDBG_WAVE_CNTL__STALL_VMID_MASK                    0x0001FFFEL
> > +//SPI_GDBG_TRAP_CONFIG
> > +#define SPI_GDBG_TRAP_CONFIG__ME_SEL__SHIFT                    0x0
> > +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL__SHIFT                  0x2
> > +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL__SHIFT                 0x4
> > +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH__SHIFT                  0x7
> > +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH__SHIFT                0x8
> > +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH__SHIFT               0x9
> > +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN__SHIFT                   0xf
> > +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL__SHIFT                  0x10
> > +#define SPI_GDBG_TRAP_CONFIG__ME_SEL_MASK                      0x00000003L
> > +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL_MASK                    0x0000000CL
> > +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL_MASK                   0x00000070L
> > +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH_MASK                    0x00000080L
> > +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH_MASK                  0x00000100L
> > +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH_MASK                 0x00000200L
> > +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN_MASK                     0x00008000L
> > +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL_MASK                    0xFFFF0000L
> > +//SPI_GDBG_TRAP_MASK
> > +#define SPI_GDBG_TRAP_MASK__EXCP_EN__SHIFT                     0x0
> > +#define SPI_GDBG_TRAP_MASK__REPLACE__SHIFT                     0x9
> > +#define SPI_GDBG_TRAP_MASK__EXCP_EN_MASK                       0x01FFL
> > +#define SPI_GDBG_TRAP_MASK__REPLACE_MASK                       0x0200L
> > +//SPI_GDBG_WAVE_CNTL2
> > +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK__SHIFT                  0x0
> > +#define SPI_GDBG_WAVE_CNTL2__MODE__SHIFT                       0x10
> > +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK_MASK                    0x0000FFFFL
> > +#define SPI_GDBG_WAVE_CNTL2__MODE_MASK                         0x00030000L
> > +//SPI_GDBG_WAVE_CNTL3
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_PS__SHIFT                   0x0
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_VS__SHIFT                   0x1
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_GS__SHIFT                   0x2
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_HS__SHIFT                   0x3
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG__SHIFT                  0x4
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0__SHIFT                  0x5
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1__SHIFT                  0x6
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2__SHIFT                  0x7
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3__SHIFT                  0x8
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4__SHIFT                  0x9
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5__SHIFT                  0xa
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6__SHIFT                  0xb
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7__SHIFT                  0xc
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION__SHIFT             0xd
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT__SHIFT                 0x1c
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_PS_MASK                     0x00000001L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_VS_MASK                     0x00000002L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_GS_MASK                     0x00000004L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_HS_MASK                     0x00000008L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG_MASK                    0x00000010L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0_MASK                    0x00000020L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1_MASK                    0x00000040L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2_MASK                    0x00000080L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3_MASK                    0x00000100L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4_MASK                    0x00000200L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5_MASK                    0x00000400L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6_MASK                    0x00000800L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7_MASK                    0x00001000L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION_MASK               0x0FFFE000L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT_MASK                   0x10000000L
> > +//SPI_GDBG_TRAP_DATA0
> > +#define SPI_GDBG_TRAP_DATA0__DATA__SHIFT                       0x0
> > +#define SPI_GDBG_TRAP_DATA0__DATA_MASK                         0xFFFFFFFFL
> > +//SPI_GDBG_TRAP_DATA1
> > +#define SPI_GDBG_TRAP_DATA1__DATA__SHIFT                       0x0
> > +#define SPI_GDBG_TRAP_DATA1__DATA_MASK                         0xFFFFFFFFL
> >   //SPI_COMPUTE_QUEUE_RESET
> >   #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                 0x0
> >   #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                   0x01L
> > diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> > index 3973110f149c..d09f1a06f4bf 100644
> > --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> > +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> > @@ -26,6 +26,8 @@
> >   #define mmSQ_DEBUG_STS_GLOBAL_BASE_IDX                        0
> >   #define mmSQ_DEBUG_STS_GLOBAL2                                0x10B0
> >   #define mmSQ_DEBUG_STS_GLOBAL2_BASE_IDX                       0
> > +#define mmSQ_DEBUG                                             0x10B1
> > +#define mmSQ_DEBUG_BASE_IDX                                    0
> >
> >   // addressBlock: gc_sdma0_sdma0dec
> >   // base address: 0x4980
> > @@ -4849,10 +4851,18 @@
> >   #define mmSPI_WCL_PIPE_PERCENT_CS3_BASE_IDX                   0
> >   #define mmSPI_GDBG_WAVE_CNTL                                  0x1f71
> >   #define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                         0
> > +#define mmSPI_GDBG_TRAP_CONFIG                                 0x1f72
> > +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                        0
> >   #define mmSPI_GDBG_TRAP_MASK                                  0x1f73
> >   #define mmSPI_GDBG_TRAP_MASK_BASE_IDX                         0
> >   #define mmSPI_GDBG_WAVE_CNTL2                                 0x1f74
> >   #define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                        0
> > +#define mmSPI_GDBG_WAVE_CNTL3                                  0x1f75
> > +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                         0
> > +#define mmSPI_GDBG_TRAP_DATA0                                  0x1f78
> > +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                         0
> > +#define mmSPI_GDBG_TRAP_DATA1                                  0x1f79
> > +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                         0
> >   #define mmSPI_COMPUTE_QUEUE_RESET                             0x1f7b
> >   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                    0
> >   #define mmSPI_RESOURCE_RESERVE_CU_0                           0x1f7c
> > diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> > index d4e8ff22ecb8..fc85aee010fe 100644
> > --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> > +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> > @@ -47853,6 +47853,10 @@
> >
> >
> >   // addressBlock: sqind
> > +//SQ_DEBUG
> > +#define SQ_DEBUG__SINGLE_MEMOP_MASK 0x00000001L
> > +#define SQ_DEBUG__SINGLE_MEMOP__SHIFT 0x00000000
> > +
> >   //SQ_DEBUG_STS_GLOBAL
> >   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0_MASK            0x000000ffL
> >   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0__SHIFT          0x00000000

[-- Attachment #2: winmail.dat --]
[-- Type: application/ms-tnef, Size: 20864 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA
  2022-11-23  0:44   ` Felix Kuehling
@ 2022-11-24 14:51     ` Kim, Jonathan
  2022-11-24 16:23       ` Felix Kuehling
  0 siblings, 1 reply; 63+ messages in thread
From: Kim, Jonathan @ 2022-11-24 14:51 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 5645 bytes --]

[Public]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: November 22, 2022 7:45 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to
> TMA
>
>
> On 2022-10-31 12:23, Jonathan Kim wrote:
> > From: Jay Cornwall <jay.cornwall@amd.com>
> >
> > Trap handler behavior will differ when a debugger is attached.
> >
> > Make the debug trap flag available in the trap handler TMA.
> > Update it when the debug trap ioctl is invoked.
> >
> > v3: Rebase for upstream
> >
> > v2:
> > Add missing debug flag setup on APUs
> >
> > Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
> > Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> > Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |  4 ++++
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 ++
> >   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16 ++++++++++++++++
> >   3 files changed, 22 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > index ae6e701a2656..d4f87f2adada 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > @@ -193,6 +193,8 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
> >             if (unwind && count == unwind_count)
> >                     break;
> >
> > +           kfd_process_set_trap_debug_flag(&pdd->qpd, false);
> > +
> >             /* GFX off is already disabled by debug activate if not RLC restore supported. */
> >             if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> >                     amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> > @@ -278,6 +280,8 @@ int kfd_dbg_trap_activate(struct kfd_process *target)
> >             if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> >                     amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> >
> > +           kfd_process_set_trap_debug_flag(&pdd->qpd, true);
> > +
> >             r = debug_refresh_runlist(pdd->dev->dqm);
> >             if (r) {
> >                     target->runtime_info.runtime_state =
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index 9690a2adb9ed..82b28588ab72 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -1101,6 +1101,8 @@ int kfd_init_apertures(struct kfd_process *process);
> >   void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
> >                               uint64_t tba_addr,
> >                               uint64_t tma_addr);
> > +void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
> > +                                bool enabled);
> >
> >   /* CWSR initialization */
> >   int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > index 59c4c38833b6..d62e0c62df76 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > @@ -1252,6 +1252,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> >
> >             memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
> >
> > +           kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
> > +
> >             qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> >             pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> >                     qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> > @@ -1288,6 +1290,9 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> >
> >     memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
> >
> > +   kfd_process_set_trap_debug_flag(&pdd->qpd,
> > +                                   pdd->process->debug_trap_enabled);
> > +
> >     qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> >     pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> >              qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> > @@ -1374,6 +1379,17 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
> >     return true;
> >   }
> >
> > +void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
> > +                                bool enabled)
> > +{
> > +   /* If TMA doesn't exist then flag will be set during allocation. */
>
> I would expect a change to the TMA allocation function, but that isn't
> in this patch?

The TMA is allocated under kfd_process_init_cwsr_* and CWSR being enabled is a pre-condition for loading the 1st level trap handler.
The lack of context in the patch for those functions may be hiding that fact.
Is the placement of this comment misleading?  Maybe it should go in kfd_dbg_trap_activate when kfd_process_set_trap_debug_flag is called?
Or should it just be removed, since the combined calls within initialization of CWSR + debug enable seem complete for enablement?

Thanks,

Jon

>
> Regards,
>    Felix
>
> > +   if (qpd->cwsr_kaddr) {
> > +           uint64_t *tma =
> > +                   (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> > +           tma[2] = enabled;
> > +   }
> > +}
> > +
> >   /*
> >    * On return the kfd_process is fully operational and will be freed when the
> >    * mm is released

[-- Attachment #2: winmail.dat --]
[-- Type: application/ms-tnef, Size: 18065 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
  2022-11-22 23:59   ` Felix Kuehling
@ 2022-11-24 14:58     ` Kim, Jonathan
  2022-11-24 16:25       ` Felix Kuehling
  0 siblings, 1 reply; 63+ messages in thread
From: Kim, Jonathan @ 2022-11-24 14:58 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 10110 bytes --]

[AMD Official Use Only - General]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: November 22, 2022 6:59 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode
> enable and disable calls
>
>
> On 2022-10-31 12:23, Jonathan Kim wrote:
> > On GFX9.4.1, the implicit wait count instruction on s_barrier is
> > disabled by default in the driver during normal operation for
> > performance requirements.
> >
> > There is a hardware bug in GFX9.4.1 where if the implicit wait count
> > instruction after an s_barrier instruction is disabled, any wave that
> > hits an exception may step over the s_barrier when returning from the
> > trap handler with the barrier logic having no ability to be
> > aware of this, thereby causing other waves to wait at the barrier
> > indefinitely resulting in a shader hang.  This bug has been corrected
> > for GFX9.4.2 and onward.
> >
> > Since the debugger subscribes to hardware exceptions, in order to avoid
> > this bug, the debugger must enable implicit wait count on s_barrier
> > for a debug session and disable it on detach.
> >
> > In order to change this setting in the in the device global SQ_CONFIG
> > register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
> > will either dispatch work through the compute ring buffers used for
> > image post processing or through the hardware scheduler by the KFD.
> >
> > Have the KGD suspend and drain the compute ring buffer, then suspend the
> > hardware scheduler and block any future KFD process job requests before
> > changing the implicit wait count setting.  Once set, resume all work.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   3 +
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 105 +++++++++++++++++-
> >   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   4 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   2 +-
> >   4 files changed, 110 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 0e6ddf05c23c..9f2499f52d2c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -1034,6 +1034,9 @@ struct amdgpu_device {
> >     struct pci_saved_state          *pci_state;
> >     pci_channel_state_t             pci_channel_state;
> >
> > +   /* Track auto wait count on s_barrier settings */
> > +   bool                            barrier_has_auto_waitcnt;
> > +
> >     struct amdgpu_reset_control     *reset_cntl;
> >     uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > index 4191af5a3f13..13f02a0aa828 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > @@ -26,6 +26,7 @@
> >   #include "amdgpu.h"
> >   #include "amdgpu_amdkfd.h"
> >   #include "amdgpu_amdkfd_arcturus.h"
> > +#include "amdgpu_reset.h"
> >   #include "sdma0/sdma0_4_2_2_offset.h"
> >   #include "sdma0/sdma0_4_2_2_sh_mask.h"
> >   #include "sdma1/sdma1_4_2_2_offset.h"
> > @@ -48,6 +49,8 @@
> >   #include "amdgpu_amdkfd_gfx_v9.h"
> >   #include "gfxhub_v1_0.h"
> >   #include "mmhub_v9_4.h"
> > +#include "gc/gc_9_0_offset.h"
> > +#include "gc/gc_9_0_sh_mask.h"
> >
> >   #define HQD_N_REGS 56
> >   #define DUMP_REG(addr) do {                               \
> > @@ -276,6 +279,104 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
> >     return 0;
> >   }
> >
> > +/*
> > + * Helper used to suspend/resume gfx pipe for image post process work to set
> > + * barrier behaviour.
> > + */
> > +static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool suspend)
> > +{
> > +   int i, r = 0;
> > +
> > +   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> > +           struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
> > +
> > +           if (!(ring && ring->sched.thread))
> > +                   continue;
> > +
> > +           /* stop scheduler and drain ring. */
> > +           if (suspend) {
> > +                   drm_sched_stop(&ring->sched, NULL);
> > +                   r = amdgpu_fence_wait_empty(ring);
> > +                   if (r)
> > +                           goto out;
> > +           } else {
> > +                   drm_sched_start(&ring->sched, false);
> > +           }
> > +   }
> > +
> > +out:
> > +   /* return on resume or failure to drain rings. */
> > +   if (!suspend || r)
> > +           return r;
> > +
> > +   return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
> > +}
> > +
> > +static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool enable_waitcnt)
> > +{
> > +   uint32_t data;
> > +
> > +   WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
> > +
> > +   if (!down_read_trylock(&adev->reset_domain->sem))
> > +           return;
> > +
> > +   amdgpu_amdkfd_suspend(adev, false);
> > +
> > +   if (suspend_resume_compute_scheduler(adev, true))
> > +           goto out;
> > +
> > +   data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
> > +   data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
> > +                                           enable_waitcnt ? 0 : 1);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
> > +
> > +out:
> > +   suspend_resume_compute_scheduler(adev, false);
> > +
> > +   amdgpu_amdkfd_resume(adev, false);
> > +
> > +   up_read(&adev->reset_domain->sem);
> > +}
> > +
> > +static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device *adev,
> > +                           bool restore_dbg_registers,
> > +                           uint32_t vmid)
> > +{
> > +   mutex_lock(&adev->grbm_idx_mutex);
> > +
> > +   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> > +
> > +   set_barrier_auto_waitcnt(adev, true);
> > +
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> > +
> > +   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> > +
> > +   mutex_unlock(&adev->grbm_idx_mutex);
> > +
> > +   return 0;
> > +}
> > +
> > +static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device *adev,
> > +                                   bool keep_trap_enabled,
> > +                                   uint32_t vmid)
> > +{
> > +
> > +   mutex_lock(&adev->grbm_idx_mutex);
> > +
> > +   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> > +
> > +   set_barrier_auto_waitcnt(adev, false);
> > +
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> > +
> > +   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> > +
> > +   mutex_unlock(&adev->grbm_idx_mutex);
> > +
> > +   return 0;
> > +}
> >   const struct kfd2kgd_calls arcturus_kfd2kgd = {
> >     .program_sh_mem_settings =
> kgd_gfx_v9_program_sh_mem_settings,
> >     .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> >                     kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
> >     .set_vm_context_page_table_base =
> >                     kgd_gfx_v9_set_vm_context_page_table_base,
>       kgd_gfx_v9_set_vm_context_page_table_base,
> > +   .enable_debug_trap = kgd_arcturus_enable_debug_trap,
> > +   .disable_debug_trap = kgd_arcturus_disable_debug_trap,
> >     .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> > -   .program_trap_handler_settings =
> kgd_gfx_v9_program_trap_handler_settings
> > +   .program_trap_handler_settings =
> kgd_gfx_v9_program_trap_handler_settings,
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index a0e5ad342f13..8ed1b5d255f7 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -2424,8 +2424,8 @@ static void gfx_v9_0_init_sq_config(struct
> amdgpu_device *adev)
> >     switch (adev->ip_versions[GC_HWIP][0]) {
> >     case IP_VERSION(9, 4, 1):
> >             tmp = RREG32_SOC15(GC, 0, mmSQ_CONFIG);
> > -           tmp = REG_SET_FIELD(tmp, SQ_CONFIG,
> > -                                   DISABLE_BARRIER_WAITCNT, 1);
> > +           tmp = REG_SET_FIELD(tmp, SQ_CONFIG,
> DISABLE_BARRIER_WAITCNT,
> > +                           READ_ONCE(adev-
> >barrier_has_auto_waitcnt) ? 0 : 1);
> >             WREG32_SOC15(GC, 0, mmSQ_CONFIG, tmp);
> >             break;
> >     default:
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > index 56ad38fcd26e..efb81ccef8f5 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > @@ -1946,7 +1946,7 @@ void kfd_suspend_all_processes(void)
> >     WARN(debug_evictions, "Evicting all processes");
> >     hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> >             cancel_delayed_work_sync(&p->eviction_work);
> > -           cancel_delayed_work_sync(&p->restore_work);
> > +           flush_delayed_work(&p->restore_work);
>
> This looks like a sneak bug fix. Should this be a separate patch
> independent of this path series?

Ok.  That should probably be fixed in general.
Back-to-back KFD suspend/resume calls can result in asymmetrical evictions and restores if scheduled restores are cancelled on suspend.
The bug just happens to get surfaced for mGPU GFX9.4.1 debugging, because debug attach forces that scenario.
I can send this out as a separate fix that's not related to this series.

Thanks,

Jon


>
> Regards,
>    Felix
>
>
> >
> >             if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
> >                     pr_err("Failed to suspend process 0x%x\n", p->pasid);

[-- Attachment #2: winmail.dat --]
[-- Type: application/ms-tnef, Size: 20062 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA
  2022-11-24 14:51     ` Kim, Jonathan
@ 2022-11-24 16:23       ` Felix Kuehling
  2022-11-24 20:27         ` Kim, Jonathan
  0 siblings, 1 reply; 63+ messages in thread
From: Felix Kuehling @ 2022-11-24 16:23 UTC (permalink / raw)
  To: Kim, Jonathan, amd-gfx


Am 2022-11-24 um 09:51 schrieb Kim, Jonathan:
> [Public]
>
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Sent: November 22, 2022 7:45 PM
>> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to
>> TMA
>>
>>
>> On 2022-10-31 12:23, Jonathan Kim wrote:
>>> From: Jay Cornwall <jay.cornwall@amd.com>
>>>
>>> Trap handler behavior will differ when a debugger is attached.
>>>
>>> Make the debug trap flag available in the trap handler TMA.
>>> Update it when the debug trap ioctl is invoked.
>>>
>>> v3: Rebase for upstream
>>>
>>> v2:
>>> Add missing debug flag setup on APUs
>>>
>>> Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |  4 ++++
>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 ++
>>>    drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16 ++++++++++++++++
>>>    3 files changed, 22 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>> index ae6e701a2656..d4f87f2adada 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>> @@ -193,6 +193,8 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>>>              if (unwind && count == unwind_count)
>>>                      break;
>>>
>>> +           kfd_process_set_trap_debug_flag(&pdd->qpd, false);
>>> +
>>>              /* GFX off is already disabled by debug activate if not RLC restore supported. */
>>>              if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
>>>                      amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
>>> @@ -278,6 +280,8 @@ int kfd_dbg_trap_activate(struct kfd_process *target)
>>>              if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
>>>                      amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
>>>
>>> +           kfd_process_set_trap_debug_flag(&pdd->qpd, true);
>>> +
>>>              r = debug_refresh_runlist(pdd->dev->dqm);
>>>              if (r) {
>>>                      target->runtime_info.runtime_state =
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 9690a2adb9ed..82b28588ab72 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -1101,6 +1101,8 @@ int kfd_init_apertures(struct kfd_process *process);
>>>    void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>>>                                uint64_t tba_addr,
>>>                                uint64_t tma_addr);
>>> +void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>>> +                                bool enabled);
>>>
>>>    /* CWSR initialization */
>>>    int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 59c4c38833b6..d62e0c62df76 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -1252,6 +1252,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
>>>              memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
>>>
>>> +           kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
>>> +
>>>              qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
>>>              pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
>>>                      qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
>>> @@ -1288,6 +1290,9 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>>>      memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
>>>
>>> +   kfd_process_set_trap_debug_flag(&pdd->qpd,
>>> +                                   pdd->process->debug_trap_enabled);
>>> +
>>>      qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
>>>      pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
>>>               qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
>>> @@ -1374,6 +1379,17 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
>>>      return true;
>>>    }
>>>
>>> +void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>>> +                                bool enabled)
>>> +{
>>> +   /* If TMA doesn't exist then flag will be set during allocation. */
>> I would expect a change to the TMA allocation function, but that isn't
>> in this patch?
> The TMA is allocated under kfd_process_init_cwsr_* and CWSR enabled is a pre-condition for the 1st level trap handler loading.
> The lack of context in the patch for those functions may be hiding that fact.
> Is the placement of this comment misleading?  Maybe it should go in kfd_dbg_trap_activate when kfd_process_set_trap_debug_flag is called?
> Or should it just be removed since the combined calls within initialization of CWSR + debug enable seem complete for enablement?

I think the comment is fine. I was sort of expecting to see the 
corresponding change in the TMA allocation in the same patch. So my 
question is just lack of context. If that change in the TMA allocation 
got squashed into another patch in the series, maybe it would make sense 
to move it into this patch instead.

Regards,
   Felix


>
> Thanks,
>
> Jon
>
>> Regards,
>>     Felix
>>
>>> +   if (qpd->cwsr_kaddr) {
>>> +           uint64_t *tma =
>>> +                   (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
>>> +           tma[2] = enabled;
>>> +   }
>>> +}
>>> +
>>>    /*
>>>     * On return the kfd_process is fully operational and will be freed when the
>>>     * mm is released

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
  2022-11-24 14:58     ` Kim, Jonathan
@ 2022-11-24 16:25       ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-24 16:25 UTC (permalink / raw)
  To: Kim, Jonathan, amd-gfx

Am 2022-11-24 um 09:58 schrieb Kim, Jonathan:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Sent: November 22, 2022 6:59 PM
>> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode
>> enable and disable calls
>>
>>
>> On 2022-10-31 12:23, Jonathan Kim wrote:
>>> On GFX9.4.1, the implicit wait count instruction on s_barrier is
>>> disabled by default in the driver during normal operation for
>>> performance requirements.
>>>
>>> There is a hardware bug in GFX9.4.1 where if the implicit wait count
>>> instruction after an s_barrier instruction is disabled, any wave that
>>> hits an exception may step over the s_barrier when returning from the
>>> trap handler with the barrier logic having no ability to be
>>> aware of this, thereby causing other waves to wait at the barrier
>>> indefinitely resulting in a shader hang.  This bug has been corrected
>>> for GFX9.4.2 and onward.
>>>
>>> Since the debugger subscribes to hardware exceptions, in order to avoid
>>> this bug, the debugger must enable implicit wait count on s_barrier
>>> for a debug session and disable it on detach.
>>>
>>> In order to change this setting in the in the device global SQ_CONFIG
>>> register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
>>> will either dispatch work through the compute ring buffers used for
>>> image post processing or through the hardware scheduler by the KFD.
>>>
>>> Have the KGD suspend and drain the compute ring buffer, then suspend the
>>> hardware scheduler and block any future KFD process job requests before
>>> changing the implicit wait count setting.  Once set, resume all work.
>>>
>>> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   3 +
>>>    .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 105
>> +++++++++++++++++-
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   4 +-
>>>    drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   2 +-
>>>    4 files changed, 110 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 0e6ddf05c23c..9f2499f52d2c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1034,6 +1034,9 @@ struct amdgpu_device {
>>>      struct pci_saved_state          *pci_state;
>>>      pci_channel_state_t             pci_channel_state;
>>>
>>> +   /* Track auto wait count on s_barrier settings */
>>> +   bool                            barrier_has_auto_waitcnt;
>>> +
>>>      struct amdgpu_reset_control     *reset_cntl;
>>>      uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
>>> index 4191af5a3f13..13f02a0aa828 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
>>> @@ -26,6 +26,7 @@
>>>    #include "amdgpu.h"
>>>    #include "amdgpu_amdkfd.h"
>>>    #include "amdgpu_amdkfd_arcturus.h"
>>> +#include "amdgpu_reset.h"
>>>    #include "sdma0/sdma0_4_2_2_offset.h"
>>>    #include "sdma0/sdma0_4_2_2_sh_mask.h"
>>>    #include "sdma1/sdma1_4_2_2_offset.h"
>>> @@ -48,6 +49,8 @@
>>>    #include "amdgpu_amdkfd_gfx_v9.h"
>>>    #include "gfxhub_v1_0.h"
>>>    #include "mmhub_v9_4.h"
>>> +#include "gc/gc_9_0_offset.h"
>>> +#include "gc/gc_9_0_sh_mask.h"
>>>
>>>    #define HQD_N_REGS 56
>>>    #define DUMP_REG(addr) do {                               \
>>> @@ -276,6 +279,104 @@ int kgd_arcturus_hqd_sdma_destroy(struct
>> amdgpu_device *adev, void *mqd,
>>>      return 0;
>>>    }
>>>
>>> +/*
>>> + * Helper used to suspend/resume gfx pipe for image post process work
>> to set
>>> + * barrier behaviour.
>>> + */
>>> +static int suspend_resume_compute_scheduler(struct amdgpu_device
>> *adev, bool suspend)
>>> +{
>>> +   int i, r = 0;
>>> +
>>> +   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>>> +           struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
>>> +
>>> +           if (!(ring && ring->sched.thread))
>>> +                   continue;
>>> +
>>> +           /* stop scheduler and drain ring. */
>>> +           if (suspend) {
>>> +                   drm_sched_stop(&ring->sched, NULL);
>>> +                   r = amdgpu_fence_wait_empty(ring);
>>> +                   if (r)
>>> +                           goto out;
>>> +           } else {
>>> +                   drm_sched_start(&ring->sched, false);
>>> +           }
>>> +   }
>>> +
>>> +out:
>>> +   /* return on resume or failure to drain rings. */
>>> +   if (!suspend || r)
>>> +           return r;
>>> +
>>> +   return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
>>> +}
>>> +
>>> +static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool
>> enable_waitcnt)
>>> +{
>>> +   uint32_t data;
>>> +
>>> +   WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
>>> +
>>> +   if (!down_read_trylock(&adev->reset_domain->sem))
>>> +           return;
>>> +
>>> +   amdgpu_amdkfd_suspend(adev, false);
>>> +
>>> +   if (suspend_resume_compute_scheduler(adev, true))
>>> +           goto out;
>>> +
>>> +   data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
>>> +   data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
>>> +                                           enable_waitcnt ? 0 : 1);
>>> +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
>>> +
>>> +out:
>>> +   suspend_resume_compute_scheduler(adev, false);
>>> +
>>> +   amdgpu_amdkfd_resume(adev, false);
>>> +
>>> +   up_read(&adev->reset_domain->sem);
>>> +}
>>> +
>>> +static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device
>> *adev,
>>> +                           bool restore_dbg_registers,
>>> +                           uint32_t vmid)
>>> +{
>>> +   mutex_lock(&adev->grbm_idx_mutex);
>>> +
>>> +   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
>>> +
>>> +   set_barrier_auto_waitcnt(adev, true);
>>> +
>>> +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
>>> +
>>> +   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
>>> +
>>> +   mutex_unlock(&adev->grbm_idx_mutex);
>>> +
>>> +   return 0;
>>> +}
>>> +
>>> +static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device
>> *adev,
>>> +                                   bool keep_trap_enabled,
>>> +                                   uint32_t vmid)
>>> +{
>>> +
>>> +   mutex_lock(&adev->grbm_idx_mutex);
>>> +
>>> +   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
>>> +
>>> +   set_barrier_auto_waitcnt(adev, false);
>>> +
>>> +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
>>> +
>>> +   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
>>> +
>>> +   mutex_unlock(&adev->grbm_idx_mutex);
>>> +
>>> +   return 0;
>>> +}
>>>    const struct kfd2kgd_calls arcturus_kfd2kgd = {
>>>      .program_sh_mem_settings =
>> kgd_gfx_v9_program_sh_mem_settings,
>>>      .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
>>> @@ -294,6 +395,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>>>
>>        kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
>>>      .set_vm_context_page_table_base =
>>>
>>        kgd_gfx_v9_set_vm_context_page_table_base,
>>> +   .enable_debug_trap = kgd_arcturus_enable_debug_trap,
>>> +   .disable_debug_trap = kgd_arcturus_disable_debug_trap,
>>>      .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>>> -   .program_trap_handler_settings =
>> kgd_gfx_v9_program_trap_handler_settings
>>> +   .program_trap_handler_settings =
>> kgd_gfx_v9_program_trap_handler_settings,
>>>    };
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index a0e5ad342f13..8ed1b5d255f7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -2424,8 +2424,8 @@ static void gfx_v9_0_init_sq_config(struct
>> amdgpu_device *adev)
>>>      switch (adev->ip_versions[GC_HWIP][0]) {
>>>      case IP_VERSION(9, 4, 1):
>>>              tmp = RREG32_SOC15(GC, 0, mmSQ_CONFIG);
>>> -           tmp = REG_SET_FIELD(tmp, SQ_CONFIG,
>>> -                                   DISABLE_BARRIER_WAITCNT, 1);
>>> +           tmp = REG_SET_FIELD(tmp, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
>>> +                           READ_ONCE(adev->barrier_has_auto_waitcnt) ? 0 : 1);
>>>              WREG32_SOC15(GC, 0, mmSQ_CONFIG, tmp);
>>>              break;
>>>      default:
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 56ad38fcd26e..efb81ccef8f5 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -1946,7 +1946,7 @@ void kfd_suspend_all_processes(void)
>>>      WARN(debug_evictions, "Evicting all processes");
>>>      hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>>>              cancel_delayed_work_sync(&p->eviction_work);
>>> -           cancel_delayed_work_sync(&p->restore_work);
>>> +           flush_delayed_work(&p->restore_work);
>> This looks like a sneaked-in bug fix. Should this be a separate patch
>> independent of this patch series?
> Ok.  That should probably be fixed in general.
> Back-to-back KFD suspend/resume calls can result in asymmetrical evictions and restores if scheduled restores are cancelled on suspend.
> The bug just happens to get surfaced for mGPU GFX9.4.1 debugging, because debug attach forces that scenario.
> I can send this out as a separate fix that's not related to this series.

I agree.

Thanks,
   Felix


>
> Thanks,
>
> Jon
>
>
>> Regards,
>>     Felix
>>
>>
>>>              if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>>>                      pr_err("Failed to suspend process 0x%x\n", p->pasid);

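A minimal kernel-C sketch of the cancel-vs-flush distinction discussed above, using only the generic workqueue API (the demo_process type and demo_suspend wrapper are illustrative stand-ins, not the real kfd_process layout): cancel_delayed_work_sync() can drop a pending restore outright, leaving a prior eviction unpaired, while flush_delayed_work() forces a pending restore to run to completion before suspend proceeds, which is what keeps evictions and restores symmetrical.

#include <linux/workqueue.h>

/* Illustrative stand-in for the real process structure. */
struct demo_process {
	struct delayed_work eviction_work;
	struct delayed_work restore_work;
};

static void demo_suspend(struct demo_process *p)
{
	/* An eviction must not fire after suspend, so cancelling is correct. */
	cancel_delayed_work_sync(&p->eviction_work);

	/*
	 * A scheduled restore pairs with an earlier eviction. Cancelling it
	 * would leave the queues evicted with no restore ever running;
	 * flushing lets the pending restore finish first, so back-to-back
	 * suspend/resume cycles stay symmetrical.
	 */
	flush_delayed_work(&p->restore_work);
}
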
^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA
  2022-11-24 16:23       ` Felix Kuehling
@ 2022-11-24 20:27         ` Kim, Jonathan
  2022-11-25 16:53           ` Felix Kuehling
  0 siblings, 1 reply; 63+ messages in thread
From: Kim, Jonathan @ 2022-11-24 20:27 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[AMD Official Use Only - General]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: November 24, 2022 11:24 AM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to
> TMA
>
>
> On 2022-11-24 at 09:51, Kim, Jonathan wrote:
> > [Public]
> >
> >> -----Original Message-----
> >> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> >> Sent: November 22, 2022 7:45 PM
> >> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
> >> gfx@lists.freedesktop.org
> >> Subject: Re: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to
> >> TMA
> >>
> >>
> >> On 2022-10-31 12:23, Jonathan Kim wrote:
> >>> From: Jay Cornwall <jay.cornwall@amd.com>
> >>>
> >>> Trap handler behavior will differ when a debugger is attached.
> >>>
> >>> Make the debug trap flag available in the trap handler TMA.
> >>> Update it when the debug trap ioctl is invoked.
> >>>
> >>> v3: Rebase for upstream
> >>>
> >>> v2:
> >>> Add missing debug flag setup on APUs
> >>>
> >>> Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
> >>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> >>> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> >>> ---
> >>>    drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |  4 ++++
> >>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 ++
> >>>    drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16
> ++++++++++++++++
> >>>    3 files changed, 22 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> >> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> >>> index ae6e701a2656..d4f87f2adada 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> >>> @@ -193,6 +193,8 @@ void kfd_dbg_trap_deactivate(struct
> kfd_process
> >> *target, bool unwind, int unwind
> >>>              if (unwind && count == unwind_count)
> >>>                      break;
> >>>
> >>> +           kfd_process_set_trap_debug_flag(&pdd->qpd, false);
> >>> +
> >>>              /* GFX off is already disabled by debug activate if not RLC
> >> restore supported. */
> >>>              if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> >>>                      amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> >>> @@ -278,6 +280,8 @@ int kfd_dbg_trap_activate(struct kfd_process
> >> *target)
> >>>              if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> >>>                      amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> >>>
> >>> +           kfd_process_set_trap_debug_flag(&pdd->qpd, true);
> >>> +
> >>>              r = debug_refresh_runlist(pdd->dev->dqm);
> >>>              if (r) {
> >>>                      target->runtime_info.runtime_state =
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> index 9690a2adb9ed..82b28588ab72 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> @@ -1101,6 +1101,8 @@ int kfd_init_apertures(struct kfd_process
> >> *process);
> >>>    void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
> >>>                                uint64_t tba_addr,
> >>>                                uint64_t tma_addr);
> >>> +void kfd_process_set_trap_debug_flag(struct qcm_process_device
> >> *qpd,
> >>> +                                bool enabled);
> >>>
> >>>    /* CWSR initialization */
> >>>    int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file
> >> *filep);
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>> index 59c4c38833b6..d62e0c62df76 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>> @@ -1252,6 +1252,8 @@ int kfd_process_init_cwsr_apu(struct
> >> kfd_process *p, struct file *filep)
> >>>              memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
> >>>
> >>> +           kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
> >>> +
> >>>              qpd->tma_addr = qpd->tba_addr +
> >> KFD_CWSR_TMA_OFFSET;
> >>>              pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for
> >> pqm.\n",
> >>>                      qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> >>> @@ -1288,6 +1290,9 @@ static int
> >> kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> >>>      memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
> >>>
> >>> +   kfd_process_set_trap_debug_flag(&pdd->qpd,
> >>> +                                   pdd->process->debug_trap_enabled);
> >>> +
> >>>      qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> >>>      pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> >>>               qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> >>> @@ -1374,6 +1379,17 @@ bool kfd_process_xnack_mode(struct
> >> kfd_process *p, bool supported)
> >>>      return true;
> >>>    }
> >>>
> >>> +void kfd_process_set_trap_debug_flag(struct qcm_process_device
> >> *qpd,
> >>> +                                bool enabled)
> >>> +{
> >>> +   /* If TMA doesn't exist then flag will be set during allocation. */
> >> I would expect a change to the TMA allocation function, but that isn't
> >> in this patch?
> > The TMA is allocated under kfd_process_init_cwsr_* and CWSR enabled is
> > a pre-condition for the 1st level trap handler loading.
> > The lack of context in the patch for those functions may be hiding that fact.
> > Is the placement of this comment misleading?  Maybe it should go in
> > kfd_dbg_trap_activate when kfd_process_set_trap_debug_flag is called?
> > Or should it just be removed since the combined calls within initialization of
> > CWSR + debug enable seem complete for enablement?
>
> I think the comment is fine. I was sort of expecting to see the
> corresponding change in the TMA allocation in the same patch. So my
> question is just lack of context. If that change in the TMA allocation
> got squashed into another patch in the series, maybe it would make sense
> to move it into this patch instead.

The change to set the flag on TMA allocation is done in this patch, since kfd_process_set_trap_debug_flag is now called in kfd_process_init_cwsr_*.
To my knowledge, CWSR init and trap handler memory allocation should be atomic, and that has been upstreamed for a while.

Did you mean the user trap handler assignment?  That should be independent of flagging.

Thanks,

Jon


>
> Regards,
>    Felix
>
>
> >
> > Thanks,
> >
> > Jon
> >
> >> Regards,
> >>     Felix
> >>
> >>> +   if (qpd->cwsr_kaddr) {
> >>> +           uint64_t *tma =
> >>> +                   (uint64_t *)(qpd->cwsr_kaddr +
> >> KFD_CWSR_TMA_OFFSET);
> >>> +           tma[2] = enabled;
> >>> +   }
> >>> +}
> >>> +
> >>>    /*
> >>>     * On return the kfd_process is fully operational and will be freed when
> >> the
> >>>     * mm is released


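To make the ordering Jon describes explicit, here is a reduced sketch of the init path (identifiers follow the patch hunks quoted above; demo_init_cwsr is a hypothetical wrapper, and allocation/error handling are elided, so treat this as an illustration rather than the actual function body):

/* Reduced from kfd_process_init_cwsr_apu()/_dgpu() in the patch: the
 * TMA debug flag is seeded in the same path that sets up the CWSR
 * area, so allocation and flagging cannot be observed separately. */
static int demo_init_cwsr(struct qcm_process_device *qpd,
			  struct kfd_process *p)
{
	/* ... allocate and map qpd->cwsr_kaddr, copy the trap handler ... */

	/* Seed the TMA debug flag from the current debug-attach state. */
	kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);

	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
	return 0;
}
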
^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 19/29] drm/amdkfd: add debug set exceptions enabled operation
  2022-10-31 16:23 ` [PATCH 19/29] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
@ 2022-11-24 21:24   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-24 21:24 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx

On 2022-10-31 at 12:23, Jonathan Kim wrote:
> The debugger subscribes to notification for requested exceptions on attach.
> Allow the debugger to change its subscription later on.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  3 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 36 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  2 ++
>   3 files changed, 41 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 27cd5af72521..61612b9bdf8c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2887,6 +2887,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				args->send_runtime_event.exception_mask);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
> +		kfd_dbg_set_enabled_debug_exception_mask(target,
> +				args->set_exceptions_enabled.exception_mask);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
>   	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 3d304e8c286e..594ccca25cae 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -441,3 +441,39 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   
>   	return r;
>   }
> +
> +void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
> +					uint64_t exception_set_mask)
> +{
> +	uint64_t found_mask = 0;
> +	struct process_queue_manager *pqm;
> +	struct process_queue_node *pqn;
> +	static const char write_data = '.';
> +	loff_t pos = 0;
> +	int i;
> +
> +	mutex_lock(&target->event_mutex);
> +
> +	found_mask |= target->exception_status;
> +
> +	pqm = &target->pqm;
> +	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
> +		if (!pqn)
> +			continue;
> +
> +		found_mask |= pqn->q->properties.exception_status;
> +	}
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		found_mask |= pdd->exception_status;
> +	}
> +
> +	if (exception_set_mask & found_mask)
> +		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);
> +
> +	target->exception_enable_mask = exception_set_mask;
> +
> +	mutex_unlock(&target->event_mutex);
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 5270d5749828..837e09491a76 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -58,6 +58,8 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
>   
>   void debug_event_write_work_handler(struct work_struct *work);
>   
> +void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
> +					uint64_t exception_set_mask);
>   /*
>    * If GFX off is enabled, chips that do not support RLC restore for the debug
>    * registers will disable GFX off temporarily for the entire debug session.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA
  2022-11-24 20:27         ` Kim, Jonathan
@ 2022-11-25 16:53           ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-25 16:53 UTC (permalink / raw)
  To: Kim, Jonathan, amd-gfx


On 2022-11-24 15:27, Kim, Jonathan wrote:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Sent: November 24, 2022 11:24 AM
>> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to
>> TMA
>>
>>
>> On 2022-11-24 at 09:51, Kim, Jonathan wrote:
>>> [Public]
>>>
>>>> -----Original Message-----
>>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>>> Sent: November 22, 2022 7:45 PM
>>>> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to
>>>> TMA
>>>>
>>>>
>>>> On 2022-10-31 12:23, Jonathan Kim wrote:
>>>>> From: Jay Cornwall <jay.cornwall@amd.com>
>>>>>
>>>>> Trap handler behavior will differ when a debugger is attached.
>>>>>
>>>>> Make the debug trap flag available in the trap handler TMA.
>>>>> Update it when the debug trap ioctl is invoked.
>>>>>
>>>>> v3: Rebase for upstream
>>>>>
>>>>> v2:
>>>>> Add missing debug flag setup on APUs
>>>>>
>>>>> Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
>>>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |  4 ++++
>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 ++
>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16
>> ++++++++++++++++
>>>>>     3 files changed, 22 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>>>> index ae6e701a2656..d4f87f2adada 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>>>>> @@ -193,6 +193,8 @@ void kfd_dbg_trap_deactivate(struct
>> kfd_process
>>>> *target, bool unwind, int unwind
>>>>>               if (unwind && count == unwind_count)
>>>>>                       break;
>>>>>
>>>>> +           kfd_process_set_trap_debug_flag(&pdd->qpd, false);
>>>>> +
>>>>>               /* GFX off is already disabled by debug activate if not RLC
>>>> restore supported. */
>>>>>               if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
>>>>>                       amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
>>>>> @@ -278,6 +280,8 @@ int kfd_dbg_trap_activate(struct kfd_process
>>>> *target)
>>>>>               if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
>>>>>                       amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
>>>>>
>>>>> +           kfd_process_set_trap_debug_flag(&pdd->qpd, true);
>>>>> +
>>>>>               r = debug_refresh_runlist(pdd->dev->dqm);
>>>>>               if (r) {
>>>>>                       target->runtime_info.runtime_state =
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> index 9690a2adb9ed..82b28588ab72 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> @@ -1101,6 +1101,8 @@ int kfd_init_apertures(struct kfd_process
>>>> *process);
>>>>>     void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>>>>>                                 uint64_t tba_addr,
>>>>>                                 uint64_t tma_addr);
>>>>> +void kfd_process_set_trap_debug_flag(struct qcm_process_device
>>>> *qpd,
>>>>> +                                bool enabled);
>>>>>
>>>>>     /* CWSR initialization */
>>>>>     int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file
>>>> *filep);
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> index 59c4c38833b6..d62e0c62df76 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> @@ -1252,6 +1252,8 @@ int kfd_process_init_cwsr_apu(struct
>>>> kfd_process *p, struct file *filep)
>>>>>              memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
>>>>>
>>>>> +           kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
>>>>> +
>>>>>               qpd->tma_addr = qpd->tba_addr +
>>>> KFD_CWSR_TMA_OFFSET;
>>>>>               pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for
>>>> pqm.\n",
>>>>>                       qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
>>>>> @@ -1288,6 +1290,9 @@ static int
>>>> kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>>>>>       memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
>>>>>
>>>>> +   kfd_process_set_trap_debug_flag(&pdd->qpd,
>>>>> +                                   pdd->process->debug_trap_enabled);
>>>>> +
>>>>>       qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
>>>>>       pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
>>>>>                qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
>>>>> @@ -1374,6 +1379,17 @@ bool kfd_process_xnack_mode(struct
>>>> kfd_process *p, bool supported)
>>>>>       return true;
>>>>>     }
>>>>>
>>>>> +void kfd_process_set_trap_debug_flag(struct qcm_process_device
>>>> *qpd,
>>>>> +                                bool enabled)
>>>>> +{
>>>>> +   /* If TMA doesn't exist then flag will be set during allocation. */
>>>> I would expect a change to the TMA allocation function, but that isn't
>>>> in this patch?
>>> The TMA is allocated under kfd_process_init_cwsr_* and CWSR enabled is
>>> a pre-condition for the 1st level trap handler loading.
>>> The lack of context in the patch for those functions may be hiding that fact.
>>> Is the placement of this comment misleading?  Maybe it should go in
>>> kfd_dbg_trap_activate when kfd_process_set_trap_debug_flag is called?
>>> Or should it just be removed since the combined calls within initialization of
>>> CWSR + debug enable seem complete for enablement?
>>
>> I think the comment is fine. I was sort of expecting to see the
>> corresponding change in the TMA allocation in the same patch. So my
>> question is just lack of context. If that change in the TMA allocation
>> got squashed into another patch in the series, maybe it would make sense
>> to move it into this patch instead.
> The change to set the flag on TMA allocation is done in this patch, since kfd_process_set_trap_debug_flag is now called in kfd_process_init_cwsr_*.
> To my knowledge, CWSR init and trap handler memory allocation should be atomic, and that has been upstreamed for a while.
>
> Did you mean the user trap handler assignment?  That should be independent of flagging.

OK, now I see where my confusion came from. I missed that the same 
function was called from different places to enable the debug flag. The 
wording of the comment implied that this function is not used during TMA 
allocation and made me look for something accessing tma[2] directly 
somewhere else. The comment also talks only about setting the flag, 
while this function is also used to reset the flag. I think this comment 
would make more sense in kfd_dbg_trap_activate, where the context is 
"setting the flag when debugging is activated, potentially before the 
TMA is allocated".

Thanks,
   Felix


>
> Thanks,
>
> Jon
>
>
>> Regards,
>>     Felix
>>
>>
>>> Thanks,
>>>
>>> Jon
>>>
>>>> Regards,
>>>>      Felix
>>>>
>>>>> +   if (qpd->cwsr_kaddr) {
>>>>> +           uint64_t *tma =
>>>>> +                   (uint64_t *)(qpd->cwsr_kaddr +
>>>> KFD_CWSR_TMA_OFFSET);
>>>>> +           tma[2] = enabled;
>>>>> +   }
>>>>> +}
>>>>> +
>>>>>     /*
>>>>>      * On return the kfd_process is fully operational and will be freed when
>>>> the
>>>>>      * mm is released

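For readers tracking the tma[2] detail: the TMA is addressed as an array of 64-bit words starting at cwsr_kaddr + KFD_CWSR_TMA_OFFSET, so the debug flag sits in the third word (byte offset 16). A minimal sketch; the offset value and helper name are made up for illustration, and only the index-2 slot and the NULL-check ordering come from the patch:

#include <linux/types.h>

/* Illustrative value; the real code uses KFD_CWSR_TMA_OFFSET. */
#define DEMO_CWSR_TMA_OFFSET 0x2000

static void demo_set_trap_debug_flag(void *cwsr_kaddr, bool enabled)
{
	u64 *tma;

	/* TMA not mapped yet: the flag is seeded later at CWSR init. */
	if (!cwsr_kaddr)
		return;

	tma = (u64 *)((char *)cwsr_kaddr + DEMO_CWSR_TMA_OFFSET);
	tma[2] = enabled;	/* the word the trap handler reads */
}
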
^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 20/29] drm/amdkfd: add debug wave launch override operation
  2022-10-31 16:23 ` [PATCH 20/29] drm/amdkfd: add debug wave launch override operation Jonathan Kim
@ 2022-11-29 22:37   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-29 22:37 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx

On 2022-10-31 12:23, Jonathan Kim wrote:
> This operation allows the debugger to override the enabled HW
> exceptions on the device.
>
> On debug devices that only support the debugging of a single process,
> the HW exceptions are global and set through the SPI_GDBG_TRAP_MASK
> register.
> Because they are global, only address watch exceptions are allowed to
> be enabled.  In other words, the debugger must preserve all non-address
> watch exception states in normal mode operation by barring a full
> replacement override or a non-address watch override request.
>
> For multi-process debugging, all HW exception overrides are per-VMID so
> all exceptions can be overridden or fully replaced.
>
> In order for the debugger to know what is permissible, return the
> supported override mask back to the debugger along with the previously
> enabled overrides.
>
> v2: switch unsupported override mode return from EPERM to EINVAL to
> support unique EPERM on PTRACE failure.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 47 ++++++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 55 ++++++++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    | 10 +++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  5 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 55 ++++++++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 10 +++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  7 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 65 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  6 ++
>   10 files changed, 261 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index c9629fc5460c..a5003f6f05bf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -25,6 +25,7 @@
>   #include "amdgpu_amdkfd_gfx_v9.h"
>   #include "gc/gc_9_4_2_offset.h"
>   #include "gc/gc_9_4_2_sh_mask.h"
> +#include <uapi/linux/kfd_ioctl.h>
>   
>   /* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
>   static uint32_t kgd_aldebaran_enable_debug_trap(struct amdgpu_device *adev,
> @@ -54,6 +55,50 @@ static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
>   	return data;
>   }
>   
> +static int kgd_aldebaran_validate_trap_override_request(struct amdgpu_device *adev,
> +							uint32_t trap_override,
> +							uint32_t *trap_mask_supported)
> +{
> +	*trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
> +				KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
> +				KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
> +				KFD_DBG_TRAP_MASK_FP_OVERFLOW |
> +				KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
> +				KFD_DBG_TRAP_MASK_FP_INEXACT |
> +				KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
> +				KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
> +				KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;
> +
> +	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
> +			trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
> +		return -EPERM;
> +
> +	return 0;
> +}
> +
> +/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
> +static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					uint32_t vmid,
> +					uint32_t trap_override,
> +					uint32_t trap_mask_bits,
> +					uint32_t trap_mask_request,
> +					uint32_t *trap_mask_prev,
> +					uint32_t kfd_dbg_trap_cntl_prev)
> +
> +{
> +	uint32_t data = 0;
> +
> +	*trap_mask_prev = REG_GET_FIELD(kfd_dbg_trap_cntl_prev, SPI_GDBG_PER_VMID_CNTL, EXCP_EN);
> +	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
> +		(*trap_mask_prev & ~trap_mask_request);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, trap_mask_bits);
> +	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, trap_override);
> +
> +	return data;
> +}
> +
>   const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>   	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -73,6 +118,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
>   	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
> +	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 60a204f767ba..b3682758184f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -397,6 +397,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   				kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
>   	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
> +	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 2491402afd58..32a6e5fbeacd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -31,6 +31,7 @@
>   #include "v10_structs.h"
>   #include "nv.h"
>   #include "nvd.h"
> +#include <uapi/linux/kfd_ioctl.h>
>   
>   enum hqd_dequeue_request_type {
>   	NO_ACTION = 0,
> @@ -801,6 +802,58 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int kgd_gfx_v10_validate_trap_override_request(struct amdgpu_device *adev,
> +					      uint32_t trap_override,
> +					      uint32_t *trap_mask_supported)
> +{
> +	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;
> +
> +	/* The SPI_GDBG_TRAP_MASK register is global and affects all
> +	 * processes. Only allow OR-ing the address-watch bit, since
> +	 * this only affects processes under the debugger. Other bits
> +	 * should stay 0 to avoid the debugger interfering with other
> +	 * processes.
> +	 */
> +	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					      uint32_t vmid,
> +					      uint32_t trap_override,
> +					      uint32_t trap_mask_bits,
> +					      uint32_t trap_mask_request,
> +					      uint32_t *trap_mask_prev,
> +					      uint32_t kfd_dbg_trap_cntl_prev)
> +{
> +	uint32_t data, wave_cntl_prev;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
> +	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);
> +
> +	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
> +		(*trap_mask_prev & ~trap_mask_request);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
> +
> +	/* We need to preserve wave launch mode stall settings. */
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -886,6 +939,8 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.set_vm_context_page_table_base = set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
> +	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
>   	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.program_trap_handler_settings = program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> index 0abc1e805180..85c929fc2926 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -26,6 +26,16 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
>   					bool keep_trap_enabled,
>   					uint32_t vmid);
> +int kgd_gfx_v10_validate_trap_override_request(struct amdgpu_device *adev,
> +					     uint32_t trap_override,
> +					     uint32_t *trap_mask_supported);
> +uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					     uint32_t vmid,
> +					     uint32_t trap_override,
> +					     uint32_t trap_mask_bits,
> +					     uint32_t trap_mask_request,
> +					     uint32_t *trap_mask_prev,
> +					     uint32_t kfd_dbg_trap_cntl_prev);
>   void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index c57f2a6b6e23..ae3ead207df4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -673,5 +673,8 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
> -	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap
> +	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
> +	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override
> +
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 673c99c5523d..cb0044bbfae5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -38,6 +38,7 @@
>   #include "soc15d.h"
>   #include "gfx_v9_0.h"
>   #include "amdgpu_amdkfd_gfx_v9.h"
> +#include <uapi/linux/kfd_ioctl.h>
>   
>   enum hqd_dequeue_request_type {
>   	NO_ACTION = 0,
> @@ -724,6 +725,58 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
> +					uint32_t trap_override,
> +					uint32_t *trap_mask_supported)
> +{
> +	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;
> +
> +	/* The SPI_GDBG_TRAP_MASK register is global and affects all
> +	 * processes. Only allow OR-ing the address-watch bit, since
> +	 * this only affects processes under the debugger. Other bits
> +	 * should stay 0 to avoid the debugger interfering with other
> +	 * processes.
> +	 */
> +	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					     uint32_t vmid,
> +					     uint32_t trap_override,
> +					     uint32_t trap_mask_bits,
> +					     uint32_t trap_mask_request,
> +					     uint32_t *trap_mask_prev,
> +					     uint32_t kfd_dbg_cntl_prev)
> +{
> +	uint32_t data, wave_cntl_prev;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
> +	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);
> +
> +	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
> +		(*trap_mask_prev & ~trap_mask_request);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);
> +
> +	/* We need to preserve wave launch mode stall settings. */
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -992,6 +1045,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
>   	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
> +	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
> +	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index c0866497cb5c..47cff392b434 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -64,6 +64,16 @@ uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   					bool keep_trap_enabled,
>   					uint32_t vmid);
> +int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
> +					     uint32_t trap_override,
> +					     uint32_t *trap_mask_supported);
> +uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
> +					     uint32_t vmid,
> +					     uint32_t trap_override,
> +					     uint32_t trap_mask_bits,
> +					     uint32_t trap_mask_request,
> +					     uint32_t *trap_mask_prev,
> +					     uint32_t kfd_dbg_trap_cntl_prev);
>   void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 61612b9bdf8c..1f0ee2413b13 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2891,6 +2891,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				args->set_exceptions_enabled.exception_mask);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> +		r = kfd_dbg_trap_set_wave_launch_override(target,
> +				args->launch_override.override_mode,
> +				args->launch_override.enable_mask,
> +				args->launch_override.support_request_mask,
> +				&args->launch_override.enable_mask,
> +				&args->launch_override.support_request_mask);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
>   	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
>   	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 594ccca25cae..8add359d1cb9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -442,6 +442,71 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   	return r;
>   }
>   
> +static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
> +						uint32_t trap_override,
> +						uint32_t trap_mask_request,
> +						uint32_t *trap_mask_supported)
> +{
> +	int i = 0;
> +
> +	*trap_mask_supported = 0xffffffff;
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
> +								pdd->dev->adev,
> +								trap_override,
> +								trap_mask_supported);
> +
> +		if (err)
> +			return err;
> +	}
> +
> +	if (trap_mask_request & ~*trap_mask_supported)
> +		return -EACCES;
> +
> +	return 0;
> +}
> +
> +int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
> +					uint32_t trap_override,
> +					uint32_t trap_mask_bits,
> +					uint32_t trap_mask_request,
> +					uint32_t *trap_mask_prev,
> +					uint32_t *trap_mask_supported)
> +{
> +	int r = 0, i;
> +
> +	r = kfd_dbg_validate_trap_override_request(target,
> +						trap_override,
> +						trap_mask_request,
> +						trap_mask_supported);
> +
> +	if (r)
> +		return r;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
> +				pdd->dev->adev,
> +				pdd->dev->vm_info.last_vmid_kfd,
> +				trap_override,
> +				trap_mask_bits,
> +				trap_mask_request,
> +				trap_mask_prev,
> +				pdd->spi_dbg_override);
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +		r = debug_refresh_runlist(pdd->dev->dqm);
> +		if (r)
> +			break;
> +	}
> +
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 837e09491a76..b54a50a5d310 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -45,6 +45,12 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
>   int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   			void __user *runtime_info,
>   			uint32_t *runtime_info_size);
> +int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
> +					uint32_t trap_override,
> +					uint32_t trap_mask_bits,
> +					uint32_t trap_mask_request,
> +					uint32_t *trap_mask_prev,
> +					uint32_t *trap_mask_supported);
>   
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,

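Since both the gfx9 and gfx10 helpers above rely on the same masked-update expression, a worked example may help: bits named in trap_mask_request take their new value from trap_mask_bits, while every other bit keeps its previous hardware value. A standalone sketch with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t prev    = 0x0000000c; /* EXCP_EN bits currently set in HW */
	uint32_t request = 0x00000005; /* bits the debugger wants to control */
	uint32_t bits    = 0x00000001; /* requested new values for those bits */

	/* Same expression as the kgd_*_set_wave_launch_trap_override
	 * helpers: update only the requested bits, preserve the rest. */
	uint32_t next = (bits & request) | (prev & ~request);

	/* Prints 0x00000009: bit 0 set, bit 2 cleared, bit 3 preserved. */
	printf("next = 0x%08x\n", next);
	return 0;
}
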
^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 22/29] drm/amdkfd: add debug suspend and resume process queues operation
  2022-10-31 16:23 ` [PATCH 22/29] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
@ 2022-11-29 23:55   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-29 23:55 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> In order to inspect waves from the saved context at any point during a
> debug session, the debugger must be able to preempt queues to trigger
> context save by suspending them.
>
> On queue suspend, the KFD will copy the context save header information
> so that the debugger can correctly crawl the appropriate size of the saved
> context. The debugger must then also be allowed to resume suspended queues.
>
> A queue that is newly created cannot be suspended, because queue IDs are
> recycled after destruction, so the debugger needs to know when this has
> occurred.  Query functions will be added later that clear a given
> queue of its new-queue status.
>
> A queue cannot be destroyed while it is suspended, to preserve its saved
> context during debugger inspection.  Have queue destruction block while
> a queue is suspended and unblock when it is resumed.  Likewise, if a
> queue is about to be destroyed, it cannot be suspended.
>
> Return the number of queues successfully suspended or resumed along with
> a per-queue status array, where the upper bits of each entry show that
> the request was invalid (new/destroyed queue suspend request, missing
> queue) or that an error occurred (HWS in a fatal state, so it cannot
> suspend or resume queues).
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Some nit-picks inline. Other than that, this patch looks good to me.


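As background for the inline comments below, a sketch of the per-queue status convention from the commit message, seen from the debugger side (the mask values here are illustrative placeholders; the real KFD_DBG_QUEUE_INVALID_MASK and KFD_DBG_QUEUE_ERROR_MASK come from the driver headers):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative bit positions for the two status flags. */
#define DEMO_DBG_QUEUE_INVALID_MASK 0x80000000u
#define DEMO_DBG_QUEUE_ERROR_MASK   0x40000000u

/* Classify one entry of the queue array returned by a suspend/resume
 * request: the low bits carry the queue id, the upper bits report the
 * outcome (invalid: new/destroyed/missing queue; error: HWS failure). */
static bool demo_queue_op_succeeded(uint32_t entry, uint32_t *queue_id)
{
	*queue_id = entry & ~(DEMO_DBG_QUEUE_INVALID_MASK |
			      DEMO_DBG_QUEUE_ERROR_MASK);

	return !(entry & (DEMO_DBG_QUEUE_INVALID_MASK |
			  DEMO_DBG_QUEUE_ERROR_MASK));
}
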
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  12 +
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        |   7 +
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 401 +++++++++++++++++-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  11 +
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  10 +
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  14 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   5 +-
>   7 files changed, 454 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 63665279ce4d..ec26c51177f9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>   	pr_debug("Write ptr address   == 0x%016llX\n",
>   			args->write_pointer_address);
>   
> +	kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, NULL, 0);
>   	return 0;
>   
>   err_create_queue:
> @@ -2903,7 +2904,18 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				args->launch_mode.launch_mode);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
> +		r = suspend_queues(target,
> +				args->suspend_queues.num_queues,
> +				args->suspend_queues.grace_period,
> +				args->suspend_queues.exception_mask,
> +				(uint32_t *)args->suspend_queues.queue_array_ptr);
> +
> +		break;
>   	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
> +		r = resume_queues(target, false,
> +				args->resume_queues.num_queues,
> +				(uint32_t *)args->resume_queues.queue_array_ptr);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
>   	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
>   	case KFD_IOC_DBG_TRAP_SET_FLAGS:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 210851f2cdb3..afa56aad316b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -274,6 +274,13 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>   
>   		count++;
>   	}
> +
> +	if (!unwind) {
> +		int resume_count = resume_queues(target, true, 0, NULL);
> +
> +		if (resume_count)
> +			pr_debug("Resumed %d queues\n", resume_count);
> +	}
>   }
>   
>   static void kfd_dbg_clean_exception_status(struct kfd_process *target)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index bf4787b4dc6c..589efbefc8dc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -921,6 +921,79 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
>   	return retval;
>   }
>   
> +/* suspend_single_queue does not lock the dqm like the
> + * evict_process_queues_cpsch or evict_process_queues_nocpsch. You should
> + * lock the dqm before calling, and unlock after calling.
> + *
> + * The reason we don't lock the dqm is because this function may be
> + * called on multiple queues in a loop, so rather than locking/unlocking
> + * multiple times, we will just keep the dqm locked for all of the calls.
> + */
> +static int suspend_single_queue(struct device_queue_manager *dqm,
> +				      struct kfd_process_device *pdd,
> +				      struct queue *q)
> +{
> +	bool is_new;
> +
> +	if (q->properties.is_suspended)
> +		return 0;
> +
> +	pr_debug("Suspending PASID %u queue [%i]\n",
> +			pdd->process->pasid,
> +			q->properties.queue_id);
> +
> +	is_new = q->properties.exception_status & KFD_EC_MASK(EC_QUEUE_NEW);
> +
> +	if (is_new || q->properties.is_being_destroyed) {
> +		pr_debug("Suspend: skip %s queue id %i\n",
> +				is_new ? "new" : "destroyed",
> +				q->properties.queue_id);
> +		return -EBUSY;
> +	}
> +
> +	q->properties.is_suspended = true;
> +	if (q->properties.is_active) {
> +		decrement_queue_count(dqm, &pdd->qpd, q);
> +		q->properties.is_active = false;
> +	}
> +
> +	return 0;
> +}
> +
> +/* resume_single_queue does not lock the dqm like the functions
> + * restore_process_queues_cpsch or restore_process_queues_nocpsch. You should
> + * lock the dqm before calling, and unlock after calling.
> + *
> + * The reason we don't lock the dqm is because this function may be
> + * called on multiple queues in a loop, so rather than locking/unlocking
> + * multiple times, we will just keep the dqm locked for all of the calls.
> + */
> +static void resume_single_queue(struct device_queue_manager *dqm,
> +				      struct qcm_process_device *qpd,
> +				      struct queue *q)
> +{
> +	struct kfd_process_device *pdd;
> +	uint64_t pd_base;
> +
> +	if (!q->properties.is_suspended)
> +		return;
> +
> +	pdd = qpd_to_pdd(qpd);
> +	/* Retrieve PD base */
> +	pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);

This variable seems to be unused.


> +
> +	pr_debug("Restoring from suspend PASID %u queue [%i]\n",
> +			    pdd->process->pasid,
> +			    q->properties.queue_id);
> +
> +	q->properties.is_suspended = false;
> +
> +	if (QUEUE_IS_ACTIVE(q->properties)) {
> +		q->properties.is_active = true;
> +		increment_queue_count(dqm, qpd, q);
> +	}
> +}
> +
>   static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>   					struct qcm_process_device *qpd)
>   {
> @@ -1885,6 +1958,31 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   	return map_queues_cpsch(dqm);
>   }
>   
> +static int wait_on_destroy_queue(struct device_queue_manager *dqm,
> +				 struct queue *q)
> +{
> +	struct kfd_process_device *pdd = kfd_get_process_device_data(q->device,
> +								q->process);
> +	int ret = 0;
> +
> +	if (pdd->qpd.is_debug)
> +		return ret;
> +
> +	q->properties.is_being_destroyed = true;
> +
> +	if (pdd->process->debug_trap_enabled && q->properties.is_suspended) {
> +		dqm_unlock(dqm);
> +		mutex_unlock(&q->process->mutex);
> +		ret = wait_event_interruptible(dqm->destroy_wait,
> +						!q->properties.is_suspended);
> +
> +		mutex_lock(&q->process->mutex);
> +		dqm_lock(dqm);
> +	}
> +
> +	return ret;
> +}
> +
>   static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   				struct qcm_process_device *qpd,
>   				struct queue *q)
> @@ -1904,11 +2002,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   				q->properties.queue_id);
>   	}
>   
> -	retval = 0;
> -
>   	/* remove queue from list to prevent rescheduling after preemption */
>   	dqm_lock(dqm);
>   
> +	retval = wait_on_destroy_queue(dqm, q);
> +
> +	if (retval) {
> +		dqm_unlock(dqm);
> +		return retval;
> +	}
> +
>   	if (qpd->is_debug) {
>   		/*
>   		 * error, currently we do not allow to destroy a queue
> @@ -1954,7 +2057,17 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   
>   	dqm_unlock(dqm);
>   
> -	/* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */
> +	/*
> +	 * Do free_mqd and delete raise event after dqm_unlock(dqm) to avoid

I think this was meant to say "... raise delete event ...".


> +	 * circular locking
> +	 */
> +	kfd_dbg_ev_raise(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE),
> +			qpd->pqm->process,
> +			q->device,
> +			-1,
> +			false,
> +			NULL,
> +			0);

One line per parameter seems excessive here. The last 4 parameters are 
basically N/A. I think this is more readable:

+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE),
+			qpd->pqm->process, q->device,
+			-1, false, NULL, 0);


>   	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>   
>   	return retval;
> @@ -2418,8 +2531,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>   		goto out_free;
>   	}
>   
> -	if (!dqm->ops.initialize(dqm))
> +	if (!dqm->ops.initialize(dqm)) {
> +		init_waitqueue_head(&dqm->destroy_wait);
>   		return dqm;
> +	}
>   
>   out_free:
>   	kfree(dqm);
> @@ -2557,6 +2672,284 @@ int release_debug_trap_vmid(struct device_queue_manager *dqm,
>   	return r;
>   }
>   
> +#define QUEUE_NOT_FOUND		-1
> +/* invalidate queue operation in array */
> +static void q_array_invalidate(uint32_t num_queues, uint32_t *queue_ids)
> +{
> +	int i;
> +
> +	for (i = 0; i < num_queues; i++)
> +		queue_ids[i] |= KFD_DBG_QUEUE_INVALID_MASK;
> +}
> +
> +/* find queue index in array */
> +static int q_array_get_index(unsigned int queue_id,
> +		uint32_t num_queues,
> +		uint32_t *queue_ids)
> +{
> +	int i;
> +
> +	for (i = 0; i < num_queues; i++)
> +		if (queue_id == (queue_ids[i] & ~KFD_DBG_QUEUE_INVALID_MASK))
> +			return i;
> +
> +	return QUEUE_NOT_FOUND;
> +}
> +
> +struct copy_context_work_handler_workarea {
> +	struct work_struct copy_context_work;
> +	struct kfd_process *p;
> +};
> +
> +static void copy_context_work_handler(struct work_struct *work)
> +{
> +	struct copy_context_work_handler_workarea *workarea;
> +	struct mqd_manager *mqd_mgr;
> +	struct queue *q;
> +	struct mm_struct *mm;
> +	struct kfd_process *p;
> +	uint32_t tmp_ctl_stack_used_size, tmp_save_area_used_size;
> +	int i;
> +
> +	workarea = container_of(work,
> +			struct copy_context_work_handler_workarea,
> +			copy_context_work);
> +
> +	p = workarea->p;
> +	mm = get_task_mm(p->lead_thread);
> +
> +	if (!mm)
> +		return;
> +
> +	kthread_use_mm(mm);
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +		struct device_queue_manager *dqm = pdd->dev->dqm;
> +		struct qcm_process_device *qpd = &pdd->qpd;
> +
> +		list_for_each_entry(q, &qpd->queues_list, list) {
> +			mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP];
> +
> +			/* We ignore the return value from get_wave_state
> +			 * because
> +			 * i) right now, it always returns 0, and
> +			 * ii) if we hit an error, we would continue to the
> +			 *      next queue anyway.
> +			 */
> +			mqd_mgr->get_wave_state(mqd_mgr,
> +					q->mqd,
> +					(void __user *)	q->properties.ctx_save_restore_area_address,
> +					&tmp_ctl_stack_used_size,
> +					&tmp_save_area_used_size);
> +		}
> +	}
> +	kthread_unuse_mm(mm);
> +	mmput(mm);
> +}
> +
> +static uint32_t *get_queue_ids(uint32_t num_queues, uint32_t *usr_queue_id_array)
> +{
> +	size_t array_size = num_queues * sizeof(uint32_t);
> +	uint32_t *queue_ids = NULL;
> +
> +	if (!usr_queue_id_array)
> +		return NULL;
> +
> +	queue_ids = kzalloc(array_size, GFP_KERNEL);
> +	if (!queue_ids)
> +		return ERR_PTR(-ENOMEM);
> +
> +	if (copy_from_user(queue_ids, usr_queue_id_array, array_size))
> +		return ERR_PTR(-EFAULT);
> +
> +	return queue_ids;
> +}
> +
> +int resume_queues(struct kfd_process *p,
> +		bool resume_all_queues,
> +		uint32_t num_queues,
> +		uint32_t *usr_queue_id_array)
> +{
> +	uint32_t *queue_ids = get_queue_ids(num_queues, usr_queue_id_array);
> +	int total_resumed = 0;
> +	int i;
> +
> +	if (!resume_all_queues && IS_ERR(queue_ids))
> +		return PTR_ERR(queue_ids);
> +
> +	/* mask all queues as invalid.  unmask per successful request */
> +	if (!resume_all_queues)
> +		q_array_invalidate(num_queues, queue_ids);
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +		struct device_queue_manager *dqm = pdd->dev->dqm;
> +		struct qcm_process_device *qpd = &pdd->qpd;
> +		struct queue *q;
> +		int r, per_device_resumed = 0;
> +
> +		dqm_lock(dqm);
> +
> +		/* unmask queues that resume or already resumed as valid */
> +		list_for_each_entry(q, &qpd->queues_list, list) {
> +			int q_idx = QUEUE_NOT_FOUND;
> +
> +			if (queue_ids)
> +				q_idx = q_array_get_index(
> +						q->properties.queue_id,
> +						num_queues,
> +						queue_ids);
> +
> +			if (resume_all_queues || q_idx != QUEUE_NOT_FOUND) {
> +				resume_single_queue(dqm, &pdd->qpd, q);
> +				if (queue_ids)
> +					queue_ids[q_idx] &=
> +						~KFD_DBG_QUEUE_INVALID_MASK;
> +				per_device_resumed++;
> +			}
> +		}
> +
> +		if (!per_device_resumed) {
> +			dqm_unlock(dqm);
> +			continue;
> +		}
> +
> +		r = execute_queues_cpsch(dqm,
> +					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
> +					0,
> +					USE_DEFAULT_GRACE_PERIOD);
> +		if (r) {
> +			pr_err("Failed to resume process queues\n");
> +			if (!resume_all_queues) {
> +				list_for_each_entry(q, &qpd->queues_list, list) {
> +					int q_idx = q_array_get_index(
> +							q->properties.queue_id,
> +							num_queues,
> +							queue_ids);
> +
> +					/* mask queue as error on resume fail */
> +					if (q_idx != QUEUE_NOT_FOUND)
> +						queue_ids[q_idx] |=
> +							KFD_DBG_QUEUE_ERROR_MASK;
> +				}
> +			}
> +		} else {
> +			wake_up_all(&dqm->destroy_wait);
> +			total_resumed += per_device_resumed;
> +		}
> +
> +		dqm_unlock(dqm);
> +	}
> +
> +	if (copy_to_user((void __user *)usr_queue_id_array, queue_ids,
> +			num_queues * sizeof(uint32_t)))

I think we should skip this if resume_all_queues is true, because the 
queue array pointer is NULL in that case. I think we get away with it 
because num_queues is also 0 and copy_to_user becomes a NOP, but it 
seems a bit weird.

	if (!resume_all_queues && copy_to_user(...))

Also, if it's legal to call this from user mode with usr_queue_id_array 
== NULL, we should instead check

	if (usr_queue_id_array && copy_to_user(...))

Maybe we could just replace the resume_all_queues parameter with a check 
for !usr_queue_id_array.
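
I.e., an untested sketch with the parameter dropped:

	int resume_queues(struct kfd_process *p,
			uint32_t num_queues,
			uint32_t *usr_queue_id_array)
	{
		/* a NULL array means "resume all queues" */
		bool resume_all_queues = !usr_queue_id_array;
		...

	...
		if (!resume_all_queues &&
		    copy_to_user((void __user *)usr_queue_id_array, queue_ids,
				 num_queues * sizeof(uint32_t)))
			pr_err("copy_to_user failed on queue resume\n");

Callers that want to resume everything would then just pass NULL and 0.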

Regards,
   Felix


> +		pr_err("copy_to_user failed on queue resume\n");
> +
> +	kfree(queue_ids);
> +
> +	return total_resumed;
> +}
> +
> +int suspend_queues(struct kfd_process *p,
> +			uint32_t num_queues,
> +			uint32_t grace_period,
> +			uint64_t exception_clear_mask,
> +			uint32_t *usr_queue_id_array)
> +{
> +	uint32_t *queue_ids = get_queue_ids(num_queues, usr_queue_id_array);
> +	int total_suspended = 0;
> +	int i;
> +
> +	if (IS_ERR(queue_ids))
> +		return PTR_ERR(queue_ids);
> +
> +	/* mask all queues as invalid.  unmask on successful request */
> +	q_array_invalidate(num_queues, queue_ids);
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +		struct device_queue_manager *dqm = pdd->dev->dqm;
> +		struct qcm_process_device *qpd = &pdd->qpd;
> +		struct queue *q;
> +		int r, per_device_suspended = 0;
> +
> +		mutex_lock(&p->event_mutex);
> +		dqm_lock(dqm);
> +
> +		/* unmask queues that suspend or already suspended */
> +		list_for_each_entry(q, &qpd->queues_list, list) {
> +			int q_idx = q_array_get_index(q->properties.queue_id,
> +							num_queues,
> +							queue_ids);
> +
> +			if (q_idx != QUEUE_NOT_FOUND &&
> +					!suspend_single_queue(dqm, pdd, q)) {
> +				queue_ids[q_idx] &=
> +					~KFD_DBG_QUEUE_INVALID_MASK;
> +				per_device_suspended++;
> +			}
> +		}
> +
> +		if (!per_device_suspended) {
> +			dqm_unlock(dqm);
> +			mutex_unlock(&p->event_mutex);
> +			continue;
> +		}
> +
> +		r = execute_queues_cpsch(dqm,
> +			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> +			grace_period);
> +
> +		if (r)
> +			pr_err("Failed to suspend process queues.\n");
> +		else
> +			total_suspended += per_device_suspended;
> +
> +		list_for_each_entry(q, &qpd->queues_list, list) {
> +			int q_idx = q_array_get_index(q->properties.queue_id,
> +						num_queues, queue_ids);
> +
> +			if (q_idx == QUEUE_NOT_FOUND)
> +				continue;
> +
> +			/* mask queue as error on suspend fail */
> +			if (r)
> +				queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
> +			else if (exception_clear_mask)
> +				q->properties.exception_status &=
> +							~exception_clear_mask;
> +		}
> +
> +		dqm_unlock(dqm);
> +		mutex_unlock(&p->event_mutex);
> +		amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
> +	}
> +
> +	if (total_suspended) {
> +		struct copy_context_work_handler_workarea copy_context_worker;
> +
> +		INIT_WORK_ONSTACK(
> +				&copy_context_worker.copy_context_work,
> +				copy_context_work_handler);
> +
> +		copy_context_worker.p = p;
> +
> +		schedule_work(&copy_context_worker.copy_context_work);
> +
> +
> +		flush_work(&copy_context_worker.copy_context_work);
> +		destroy_work_on_stack(&copy_context_worker.copy_context_work);
> +	}
> +
> +	if (copy_to_user((void __user *)usr_queue_id_array, queue_ids,
> +			num_queues * sizeof(uint32_t)))
> +		pr_err("copy_to_user failed on queue suspend\n");
> +
> +	kfree(queue_ids);
> +
> +	return total_suspended;
> +}
> +
>   int debug_lock_and_unmap(struct device_queue_manager *dqm)
>   {
>   	int r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index bef3be84c5cc..12643528684c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -259,6 +259,8 @@ struct device_queue_manager {
>   	struct kfd_mem_obj	hiq_sdma_mqd;
>   	bool			sched_running;
>   	uint32_t		wait_times;
> +
> +	wait_queue_head_t	destroy_wait;
>   };
>   
>   void device_queue_manager_init_cik(
> @@ -286,6 +288,15 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
>   			struct qcm_process_device *qpd);
>   int release_debug_trap_vmid(struct device_queue_manager *dqm,
>   			struct qcm_process_device *qpd);
> +int suspend_queues(struct kfd_process *p,
> +			uint32_t num_queues,
> +			uint32_t grace_period,
> +			uint64_t exception_clear_mask,
> +			uint32_t *usr_queue_id_array);
> +int resume_queues(struct kfd_process *p,
> +		bool resume_all_queues,
> +		uint32_t num_queues,
> +		uint32_t *usr_queue_id_array);
>   int debug_lock_and_unmap(struct device_queue_manager *dqm);
>   int debug_map_and_unlock(struct device_queue_manager *dqm);
>   int debug_refresh_runlist(struct device_queue_manager *dqm);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> index cb484ace17de..d74862755213 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> @@ -237,6 +237,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
>   			  u32 *save_area_used_size)
>   {
>   	struct v10_compute_mqd *m;
> +	struct kfd_context_save_area_header header;
>   
>   	m = get_mqd(mqd);
>   
> @@ -255,6 +256,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
>   	 * accessible to user mode
>   	 */
>   
> +	header.control_stack_size = *ctl_stack_used_size;
> +	header.wave_state_size = *save_area_used_size;
> +
> +	header.wave_state_offset = m->cp_hqd_wg_state_offset;
> +	header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
> +
> +	if (copy_to_user(ctl_stack, &header, sizeof(header)))
> +		return -EFAULT;
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> index 86f1cf090246..f05a2bed655a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> @@ -289,6 +289,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
>   			  u32 *save_area_used_size)
>   {
>   	struct v9_mqd *m;
> +	struct kfd_context_save_area_header header;
>   
>   	/* Control stack is located one page after MQD. */
>   	void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
> @@ -300,7 +301,18 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
>   	*save_area_used_size = m->cp_hqd_wg_state_offset -
>   		m->cp_hqd_cntl_stack_size;
>   
> -	if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
> +	header.control_stack_size = *ctl_stack_used_size;
> +	header.wave_state_size = *save_area_used_size;
> +
> +	header.wave_state_offset = m->cp_hqd_wg_state_offset;
> +	header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
> +
> +	if (copy_to_user(ctl_stack, &header, sizeof(header)))
> +		return -EFAULT;
> +
> +	if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset,
> +				mqd_ctl_stack + m->cp_hqd_cntl_stack_offset,
> +				*ctl_stack_used_size))
>   		return -EFAULT;
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index bd3d8a0b61b7..3d529c7499f8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -477,6 +477,8 @@ struct queue_properties {
>   	uint32_t doorbell_off;
>   	bool is_interop;
>   	bool is_evicted;
> +	bool is_suspended;
> +	bool is_being_destroyed;
>   	bool is_active;
>   	bool is_gws;
>   	/* Not relevant for user mode queues in cp scheduling */
> @@ -499,7 +501,8 @@ struct queue_properties {
>   #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
>   			    (q).queue_address != 0 &&	\
>   			    (q).queue_percent > 0 &&	\
> -			    !(q).is_evicted)
> +			    !(q).is_evicted &&		\
> +			    !(q).is_suspended)
>   
>   enum mqd_update_flag {
>   	UPDATE_FLAG_CU_MASK = 0,


* Re: [PATCH 23/29] drm/amdkfd: add debug set and clear address watch points operation
  2022-10-31 16:23 ` [PATCH 23/29] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
@ 2022-11-30  0:34   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-30  0:34 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Shader read, write and atomic memory operations can be alerted to the
> debugger as an address watch exception.
>
> Allow the debugger to pass in a watch point to a particular memory
> address per device.
>
> Note that there exist only 4 watch points per device to date, so have
> the KFD keep track of which watch points are allocated.
>
> v2: change dev_id arg to gpu_id for consistency
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Nit-picks inline.


> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |   2 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   2 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  78 +++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |   8 ++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   5 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 128 +++++++++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   8 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  24 ++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 130 ++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |   8 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c  |   7 +
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   9 +-
>   12 files changed, 405 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 91c7fdee883e..8f9b613e3152 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -138,6 +138,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
>   	.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v9_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 10470f4a4eaf..5d6bd23a8cc1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -400,6 +400,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
>   	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v9_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 66a83e6fb9e5..ec48677772f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -880,6 +880,82 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
> +uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid)
> +{
> +	uint32_t watch_address_high;
> +	uint32_t watch_address_low;
> +	uint32_t watch_address_cntl;
> +
> +	watch_address_cntl = 0;
> +
> +	watch_address_low = lower_32_bits(watch_address);
> +	watch_address_high = upper_32_bits(watch_address) & 0xffff;
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VMID,
> +			debug_vmid);
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MODE,
> +			watch_mode);
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MASK,
> +			watch_address_mask >> 7);
> +
> +	/* Turning off this watch point until we set all the registers */
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			0);
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_high);
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_low);
> +
> +	/* Enable the watch point */
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			1);
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id)
> +{
> +	uint32_t watch_address_cntl;
> +
> +	watch_address_cntl = 0;
> +
> +	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	return 0;
> +}
> +
> +
>   /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -968,6 +1044,8 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
>   	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v10_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v10_clear_address_watch,
>   	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.program_trap_handler_settings = program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> index 34c04a2bb83b..334ff16e25db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -39,6 +39,14 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
>   					 uint8_t wave_launch_mode,
>   					 uint32_t vmid);
> +uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid);
> +uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id);
>   void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index 8627c5458973..ee36ba045dcf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -676,6 +676,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> -	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode
> -
> +	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v10_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v10_clear_address_watch
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 3bba7ca21926..98355a21740b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -810,6 +810,132 @@ uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
> +static uint32_t kgd_gfx_set_multi_process_address_watch(
> +					struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode)
> +{
> +	uint32_t watch_address_high;
> +	uint32_t watch_address_low;
> +	uint32_t watch_address_cntl;
> +
> +	watch_address_cntl = 0;
> +	watch_address_low = lower_32_bits(watch_address);
> +	watch_address_high = upper_32_bits(watch_address) & 0xffff;
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MODE,
> +			watch_mode);
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MASK,
> +			watch_address_mask >> 6);
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			1);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_high);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_low);
> +
> +	return watch_address_cntl;
> +}
> +
> +uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid)
> +{
> +	uint32_t watch_address_high;
> +	uint32_t watch_address_low;
> +	uint32_t watch_address_cntl;
> +
> +	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))

Shouldn't this be handled by putting a different function into 
aldebaran_kfd2kgd.set_address_watch?
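
Something like this in amdgpu_amdkfd_aldebaran.c (untested; assumes the
kgd_gfx_set_multi_process_address_watch() helper is exported from the
gfx_v9 file instead of being static):

	static uint32_t kgd_aldebaran_set_address_watch(struct amdgpu_device *adev,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t watch_id,
					uint32_t watch_mode,
					uint32_t debug_vmid)
	{
		/* watch points are multi-process on this ASIC, debug_vmid unused */
		return kgd_gfx_set_multi_process_address_watch(adev,
				watch_address, watch_address_mask,
				watch_id, watch_mode);
	}

Then aldebaran_kfd2kgd.set_address_watch points at this and the
IP-version check in kgd_gfx_v9_set_address_watch() can go away. Same
idea for clear_address_watch.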


> +		return kgd_gfx_set_multi_process_address_watch(adev,
> +							watch_address,
> +							watch_address_mask,
> +							watch_id,
> +							watch_mode);
> +
> +	watch_address_cntl = 0;
> +
> +	watch_address_low = lower_32_bits(watch_address);
> +	watch_address_high = upper_32_bits(watch_address) & 0xffff;
> +
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VMID,
> +			debug_vmid);
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MODE,
> +			watch_mode);
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			MASK,
> +			watch_address_mask >> 6);
> +
> +	/* Turning off this watch point until we set all the registers */
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			0);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_high);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_low);
> +
> +	/* Enable the watch point */
> +	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
> +			TCP_WATCH0_CNTL,
> +			VALID,
> +			1);
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	return 0;
> +}
> +
> +uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id)
> +{
> +	uint32_t watch_address_cntl;
> +
> +	if (adev->asic_type == CHIP_ALDEBARAN)

Same as above.


> +		return 0;
> +
> +	watch_address_cntl = 0;
> +
> +	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
> +			(watch_id * TCP_WATCH_STRIDE)),
> +			watch_address_cntl);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -1081,6 +1207,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
>   	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
> +	.set_address_watch = kgd_gfx_v9_set_address_watch,
> +	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index 2a2ab42037e4..ba52b61b68c5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -77,6 +77,14 @@ uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   					     uint32_t trap_mask_request,
>   					     uint32_t *trap_mask_prev,
>   					     uint32_t kfd_dbg_trap_cntl_prev);
> +uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid);
> +uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
> +					uint32_t watch_id);
>   void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index ec26c51177f9..9b2ea6e9e078 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2797,6 +2797,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   	struct task_struct *thread = NULL;
>   	struct pid *pid = NULL;
>   	struct kfd_process *target = NULL;
> +	struct kfd_process_device *pdd = NULL;
>   	int r = 0;
>   
>   	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> @@ -2864,6 +2865,20 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		goto unlock_out;
>   	}
>   
> +	if (args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
> +			args->op == KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH) {

Weird indentation.
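
The continuation should line up with the expression it continues:

	if (args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ||
	    args->op == KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH) {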


> +		int user_gpu_id = kfd_process_get_user_gpu_id(target,
> +				args->op == KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH ?
> +					args->set_node_address_watch.gpu_id :
> +					args->clear_node_address_watch.gpu_id);
> +
> +		pdd = kfd_process_device_data_by_id(target, user_gpu_id);
> +		if (user_gpu_id == -EINVAL || !pdd) {
> +			r = -ENODEV;
> +			goto unlock_out;
> +		}
> +	}
> +
>   	switch (args->op) {
>   	case KFD_IOC_DBG_TRAP_ENABLE:
>   		if (target != p)
> @@ -2917,7 +2932,16 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				(uint32_t *)args->resume_queues.queue_array_ptr);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
> +		r = kfd_dbg_trap_set_dev_address_watch(pdd,
> +				args->set_node_address_watch.address,
> +				args->set_node_address_watch.mask,
> +				&args->set_node_address_watch.id,
> +				args->set_node_address_watch.mode);
> +		break;
>   	case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
> +		r = kfd_dbg_trap_clear_dev_address_watch(pdd,
> +				args->clear_node_address_watch.id);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SET_FLAGS:
>   	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index afa56aad316b..68bc1d5bfd05 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -24,6 +24,14 @@
>   #include "kfd_device_queue_manager.h"
>   #include <linux/file.h>
>   
> +/*
> + * The spinlock protects the per device dev->alloc_watch_ids for multi-process access.
> + * The per-process per-device pdd->alloc_watch_ids is protected by the debug IOCTL
> + * process mutex.
> + */
> +#define MAX_WATCH_ADDRESSES	4
> +static DEFINE_SPINLOCK(watch_points_lock);

This spin lock seems to be a left-over from when we managed watch-points 
globally. Now that they are per device, I think this spinlock should be 
per-device as well, in struct kfd_dev.
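
Untested sketch; the lock name is just a suggestion (plus a
spin_lock_init() at device init time):

	/* in struct kfd_dev, next to alloc_watch_ids */
	spinlock_t watch_points_lock;

	/* kfd_dbg_get_dev_watch_id() and friends then take the per-device lock */
	spin_lock(&pdd->dev->watch_points_lock);
	...
	spin_unlock(&pdd->dev->watch_points_lock);

That also avoids serializing watch-point allocation across unrelated
devices.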


> +
>   void debug_event_write_work_handler(struct work_struct *work)
>   {
>   	struct kfd_process *process;
> @@ -227,6 +235,127 @@ int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   	return 0;
>   }
>   
> +#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
> +static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
> +{
> +	int i;
> +
> +	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
> +
> +	spin_lock(&watch_points_lock);
> +
> +	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
> +		/* device watchpoint in use so skip */
> +		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
> +			continue;
> +
> +		pdd->alloc_watch_ids |= 0x1 << i;
> +		pdd->dev->alloc_watch_ids |= 0x1 << i;
> +		*watch_id = i;
> +		spin_unlock(&watch_points_lock);
> +		return 0;
> +	}
> +
> +	spin_unlock(&watch_points_lock);
> +
> +	return -ENOMEM;
> +}
> +
> +static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
> +{
> +	spin_lock(&watch_points_lock);
> +
> +	/* process owns device watch point so safe to clear */
> +	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
> +		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
> +		pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
> +	}
> +
> +	spin_unlock(&watch_points_lock);
> +}
> +
> +static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
> +{
> +	bool owns_watch_id = false;
> +
> +	spin_lock(&watch_points_lock);
> +	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES && ((pdd->alloc_watch_ids >> watch_id) & 0x1);
> +
> +	spin_unlock(&watch_points_lock);
> +
> +	return owns_watch_id;
> +}
> +
> +int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
> +					uint32_t watch_id)
> +{
> +	int r;
> +
> +	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
> +		return -EINVAL;
> +
> +	r = debug_lock_and_unmap(pdd->dev->dqm);
> +	if (r)
> +		return r;
> +
> +	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
> +							pdd->dev->adev,
> +							watch_id);
> +	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +	r = debug_map_and_unlock(pdd->dev->dqm);
> +
> +	kfd_dbg_clear_dev_watch_id(pdd, watch_id);
> +
> +	return r;
> +}
> +
> +int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t *watch_id,
> +					uint32_t watch_mode)
> +{
> +	int r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
> +
> +	if (r)
> +		return r;
> +
> +	r = debug_lock_and_unmap(pdd->dev->dqm);
> +	if (r) {
> +		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
> +		return r;
> +	}
> +
> +	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +	pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
> +				pdd->dev->adev,
> +				watch_address,
> +				watch_address_mask,
> +				*watch_id,
> +				watch_mode,
> +				pdd->dev->vm_info.last_vmid_kfd);
> +	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +	r = debug_map_and_unlock(pdd->dev->dqm);
> +	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
> +	if (r)
> +		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
> +
> +	return 0;
> +}
> +
> +static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
> +{
> +	int i, j;
> +
> +	for (i = 0; i < target->n_pdds; i++)
> +		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
> +			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
> +}
> +
> +
>   /* kfd_dbg_trap_deactivate:
>    *	target: target process
>    *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
> @@ -241,6 +370,7 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>   
>   	if (!unwind) {
>   		cancel_work_sync(&target->debug_event_workarea);
> +		kfd_dbg_clear_process_address_watch(target);
>   		kfd_dbg_trap_set_wave_launch_mode(target, 0);
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index ca3ab1f01985..ad677e67e7eb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -50,7 +50,13 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
>   					uint32_t *trap_mask_supported);
>   int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
>   					uint8_t wave_launch_mode);
> -
> +int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
> +					uint32_t watch_id);
> +int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t *watch_id,
> +					uint32_t watch_mode);
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,
>   					unsigned int queue_id,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
> index 8aebe408c544..733987de595a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
> @@ -395,6 +395,8 @@ int kfd_init_apertures(struct kfd_process *process)
>   			pdd->gpuvm_base = pdd->gpuvm_limit = 0;
>   			pdd->scratch_base = pdd->scratch_limit = 0;
>   		} else {
> +			int num_watchpoints = pdd->dev->device_info.num_of_watch_points;
> +
>   			switch (dev->adev->asic_type) {
>   			case CHIP_KAVERI:
>   			case CHIP_HAWAII:
> @@ -424,6 +426,11 @@ int kfd_init_apertures(struct kfd_process *process)
>   				pdd->qpd.cwsr_base = SVM_CWSR_BASE;
>   				pdd->qpd.ib_base = SVM_IB_BASE;
>   			}
> +
> +			process->max_watch_points =
> +				!process->max_watch_points ? num_watchpoints :
> +						min(num_watchpoints, process->max_watch_points);
> +
>   		}
>   
>   		dev_dbg(kfd_device, "node id %u\n", id);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 3d529c7499f8..aee4fe20e676 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -348,6 +348,9 @@ struct kfd_dev {
>   
>   	/* HMM page migration MEMORY_DEVICE_PRIVATE mapping */
>   	struct dev_pagemap pgmap;
> +
> +	/* Track per device allocated watch points */
> +	uint32_t alloc_watch_ids;
>   };
>   
>   enum kfd_mempool {
> @@ -796,6 +799,7 @@ struct kfd_process_device {
>   	uint32_t spi_dbg_override;
>   	uint32_t spi_dbg_launch_mode;
>   	uint32_t watch_points[4];
> +	uint32_t alloc_watch_ids;
>   
>   	/*
>   	 * If this process has been checkpointed before, then the user
> @@ -907,6 +911,10 @@ struct kfd_process {
>   	/* per-process-per device debug event fd file */
>   	struct file *dbg_ev_file;
>   
> +	/* Allocated debug watch point IDs bitmask */
> +	uint32_t allocated_debug_watch_point_bitmask;
> +	int max_watch_points;

These two variables are unused.

Regards,
   Felix


> +
>   	/* If the process is a kfd debugger, we need to know so we can clean
>   	 * up at exit time.  If a process enables debugging on itself, it does
>   	 * its own clean-up, so we don't set the flag here.  We track this by
> @@ -952,7 +960,6 @@ struct kfd_process {
>   	struct semaphore runtime_enable_sema;
>   	bool is_runtime_retry;
>   	struct kfd_runtime_info runtime_info;
> -
>   };
>   
>   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */


* Re: [PATCH 24/29] drm/amdkfd: add debug set flags operation
  2022-10-31 16:23 ` [PATCH 24/29] drm/amdkfd: add debug set flags operation Jonathan Kim
@ 2022-11-30  0:39   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-30  0:39 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Allow the debugger to set single memory and single ALU operations.
>
> Some exceptions are imprecise (memory violations, address watch) in the
> sense that a trap occurs only when the exception interrupt occurs and
> not at the non-halting faulty instruction.  Trap temporaries 0 & 1 save
> the program counter address, which means that these values will not point
> to the faulty instruction address but to whenever the interrupt was
> raised.
>
> Setting the Single Memory Operations flag will inject an automatic wait
> on every memory operation instruction forcing imprecise memory exceptions
> to become precise at the cost of performance.  This setting is not
> permitted on debug devices that support only a global setting of this
> option.
>
> Likewise, Single ALU Operations will force in-order ALU operations.
> Although this is available on current hardware, it's not required, so it
> will be treated as a NOP.

Having a flag in the API that is just ignored is misleading. I think we 
should either remove it from the API for now, or at least make the 
function return an error if a debugger attempts to set the precise-ALU 
flag. This would be consistent with attempting to set a flag that is not 
supported on the HW.
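
E.g. something like this in kfd_dbg_trap_set_flags() (untested; the flag
name here is hypothetical, whatever the uapi calls the precise-ALU bit):

	/* don't silently ignore a flag we can't honour */
	if (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)
		return -EACCES;

That way the debugger learns right away that precise ALU exceptions are
not available, instead of finding out the hard way.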

Regards,
   Felix


>
> Return the previous set flags to the debugger as well.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 35 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  1 +
>   3 files changed, 38 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 9b2ea6e9e078..200e11f02382 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2943,6 +2943,8 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				args->clear_node_address_watch.id);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_FLAGS:
> +		r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
> +		break;
>   	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 68bc1d5bfd05..1f4d3fa0278e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -23,6 +23,7 @@
>   #include "kfd_debug.h"
>   #include "kfd_device_queue_manager.h"
>   #include <linux/file.h>
> +#include <uapi/linux/kfd_ioctl.h>
>   
>   /*
>    * The spinlock protects the per device dev->alloc_watch_ids for multi-process access.
> @@ -355,6 +356,37 @@ static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
>   			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
>   }
>   
> +int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
> +{
> +	uint32_t prev_flags = target->dbg_flags;
> +	int i, r = 0;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
> +			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
> +			*flags = prev_flags;
> +			return -EACCES;
> +		}
> +	}
> +
> +	target->dbg_flags = *flags;
> +	*flags = prev_flags;
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
> +			continue;
> +
> +		r = debug_refresh_runlist(target->pdds[i]->dev->dqm);
> +		if (r) {
> +			target->dbg_flags = prev_flags;
> +			break;
> +		}
> +	}
> +
> +	return r;
> +}
> +
>   
>   /* kfd_dbg_trap_deactivate:
>    *	target: target process
> @@ -369,9 +401,12 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>   	int i, count = 0;
>   
>   	if (!unwind) {
> +		uint32_t flags = 0;
>   		cancel_work_sync(&target->debug_event_workarea);
>   		kfd_dbg_clear_process_address_watch(target);
>   		kfd_dbg_trap_set_wave_launch_mode(target, 0);
> +
> +		kfd_dbg_trap_set_flags(target, &flags);
>   	}
>   
>   	for (i = 0; i < target->n_pdds; i++) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index ad677e67e7eb..12b80b6c96d0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -57,6 +57,7 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
>   					uint32_t watch_address_mask,
>   					uint32_t *watch_id,
>   					uint32_t watch_mode);
> +int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,
>   					unsigned int queue_id,


* Re: [PATCH 25/29] drm/amdkfd: add debug query event operation
  2022-10-31 16:23 ` [PATCH 25/29] drm/amdkfd: add debug query event operation Jonathan Kim
@ 2022-11-30  0:44   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-30  0:44 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Allow the debugger to query a single queue, device or process exception
> in a FIFO manner.

The implementation is not really FIFO because the order in which events 
are returned is independent of the order in which they were raised. Just 
remove the FIFO statement.

Other than that, this patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> The KFD should also return the GPU or Queue id of the exception.
> The debugger also has the option of clearing exceptions after
> being queried.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  6 +++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 64 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
>   3 files changed, 75 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 200e11f02382..b918213a0087 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2946,6 +2946,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
>   		break;
>   	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
> +		r = kfd_dbg_ev_query_debug_event(target,
> +				&args->query_debug_event.queue_id,
> +				&args->query_debug_event.gpu_id,
> +				args->query_debug_event.exception_mask,
> +				&args->query_debug_event.exception_mask);
> +		break;
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 1f4d3fa0278e..6985a53b83e9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -33,6 +33,70 @@
>   #define MAX_WATCH_ADDRESSES	4
>   static DEFINE_SPINLOCK(watch_points_lock);
>   
> +int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
> +		      unsigned int *queue_id,
> +		      unsigned int *gpu_id,
> +		      uint64_t exception_clear_mask,
> +		      uint64_t *event_status)
> +{
> +	struct process_queue_manager *pqm;
> +	struct process_queue_node *pqn;
> +	int i;
> +
> +	if (!(process && process->debug_trap_enabled))
> +		return -ENODATA;
> +
> +	mutex_lock(&process->event_mutex);
> +	*event_status = 0;
> +	*queue_id = 0;
> +	*gpu_id = 0;
> +
> +	/* find and report queue events */
> +	pqm = &process->pqm;
> +	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
> +		uint64_t tmp = process->exception_enable_mask;
> +
> +		if (!pqn->q)
> +			continue;
> +
> +		tmp &= pqn->q->properties.exception_status;
> +
> +		if (!tmp)
> +			continue;
> +
> +		*event_status = pqn->q->properties.exception_status;
> +		*queue_id = pqn->q->properties.queue_id;
> +		*gpu_id = pqn->q->device->id;
> +		pqn->q->properties.exception_status &= ~exception_clear_mask;
> +		goto out;
> +	}
> +
> +	/* find and report device events */
> +	for (i = 0; i < process->n_pdds; i++) {
> +		struct kfd_process_device *pdd = process->pdds[i];
> +		uint64_t tmp = process->exception_enable_mask
> +						& pdd->exception_status;
> +
> +		if (!tmp)
> +			continue;
> +
> +		*event_status = pdd->exception_status;
> +		*gpu_id = pdd->dev->id;
> +		pdd->exception_status &= ~exception_clear_mask;
> +		goto out;
> +	}
> +
> +	/* report process events */
> +	if (process->exception_enable_mask & process->exception_status) {
> +		*event_status = process->exception_status;
> +		process->exception_status &= ~exception_clear_mask;
> +	}
> +
> +out:
> +	mutex_unlock(&process->event_mutex);
> +	return *event_status ? 0 : -EAGAIN;
> +}
> +
>   void debug_event_write_work_handler(struct work_struct *work)
>   {
>   	struct kfd_process *process;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 12b80b6c96d0..c64ffd3efc46 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -27,6 +27,11 @@
>   
>   void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
>   int kfd_dbg_trap_activate(struct kfd_process *target);
> +int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
> +			unsigned int *queue_id,
> +			unsigned int *gpu_id,
> +			uint64_t exception_clear_mask,
> +			uint64_t *event_status);
>   bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
>   				   unsigned int pasid,
>   				   uint32_t doorbell_id,


* Re: [PATCH 26/29] drm/amdkfd: add debug query exception info operation
  2022-10-31 16:23 ` [PATCH 26/29] drm/amdkfd: add debug query exception info operation Jonathan Kim
@ 2022-11-30  0:50   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-11-30  0:50 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx

On 2022-10-31 12:23, Jonathan Kim wrote:
> Allow the debugger to query additional info based on an exception code.
> For device exceptions, it's currently only memory violation information.
> For process exceptions, it's currently only runtime information.
> Queue exceptions only report the queue exception status.
>
> The debugger has the option of clearing the target exception on query.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   7 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 120 +++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   6 ++
>   3 files changed, 133 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index b918213a0087..2c8f107237ee 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2953,6 +2953,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->query_debug_event.exception_mask);
>   		break;
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> +		r = kfd_dbg_trap_query_exception_info(target,
> +				args->query_exception_info.source_id,
> +				args->query_exception_info.exception_code,
> +				args->query_exception_info.clear_exception,
> +				(void __user *)args->query_exception_info.info_ptr,
> +				&args->query_exception_info.info_size);
> +		break;
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
>   		pr_warn("Debug op %i not supported yet\n", args->op);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 6985a53b83e9..a05fe32eac0e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -768,6 +768,126 @@ int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
>   	return r;
>   }
>   
> +int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
> +		uint32_t source_id,
> +		uint32_t exception_code,
> +		bool clear_exception,
> +		void __user *info,
> +		uint32_t *info_size)
> +{
> +	bool found = false;
> +	int r = 0;
> +	uint32_t copy_size, actual_info_size = 0;
> +	uint64_t *exception_status_ptr = NULL;
> +
> +	if (!target)
> +		return -EINVAL;
> +
> +	if (!info || !info_size)
> +		return -EINVAL;
> +
> +	mutex_lock(&target->event_mutex);
> +
> +	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
> +		/* Per queue exceptions */
> +		struct queue *queue = NULL;
> +		int i;
> +		
> +		for (i = 0; i < target->n_pdds; i++) {
> +			struct kfd_process_device *pdd = target->pdds[i];
> +			struct qcm_process_device *qpd = &pdd->qpd;
> +
> +			list_for_each_entry(queue, &qpd->queues_list, list) {
> +				if (!found && queue->properties.queue_id == source_id) {
> +					found = true;
> +					break;
> +				}
> +			}
> +			if (found)
> +				break;
> +		}
> +
> +		if (!found) {
> +			r = -EINVAL;
> +			goto out;
> +		}
> +
> +		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
> +			r = -ENODATA;
> +			goto out;
> +		}
> +		exception_status_ptr = &queue->properties.exception_status;
> +	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
> +		/* Per device exceptions */
> +		struct kfd_process_device *pdd = NULL;
> +		int i;
> +
> +		for (i = 0; i < target->n_pdds; i++) {
> +			pdd = target->pdds[i];
> +			if (pdd->dev->id == source_id) {
> +				found = true;
> +				break;
> +			}
> +		}
> +
> +		if (!found) {
> +			r = -EINVAL;
> +			goto out;
> +		}
> +
> +		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
> +			r = -ENODATA;
> +			goto out;
> +		}
> +
> +		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
> +			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);
> +
> +			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
> +				r = -EFAULT;
> +				goto out;
> +			}
> +			actual_info_size = pdd->vm_fault_exc_data_size;
> +			if (clear_exception) {
> +				kfree(pdd->vm_fault_exc_data);
> +				pdd->vm_fault_exc_data = NULL;
> +				pdd->vm_fault_exc_data_size = 0;
> +			}
> +		}
> +		exception_status_ptr = &pdd->exception_status;
> +	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
> +		/* Per process exceptions */
> +		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
> +			r = -ENODATA;
> +			goto out;
> +		}
> +
> +		if (exception_code == EC_PROCESS_RUNTIME) {
> +			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));
> +
> +			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
> +				r = -EFAULT;
> +				goto out;
> +			}
> +
> +			actual_info_size = sizeof(target->runtime_info);
> +		}
> +
> +		exception_status_ptr = &target->exception_status;
> +	} else {
> +		pr_debug("Bad exception type [%i]\n", exception_code);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	*info_size = actual_info_size;
> +	if (clear_exception)
> +		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
> +out:
> +	mutex_unlock(&target->event_mutex);
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index c64ffd3efc46..58a5f14d1258 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -63,6 +63,12 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
>   					uint32_t *watch_id,
>   					uint32_t watch_mode);
>   int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
> +int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
> +		uint32_t source_id,
> +		uint32_t exception_code,
> +		bool clear_exception,
> +		void __user *info,
> +		uint32_t *info_size);
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,
>   					unsigned int queue_id,


* Re: [PATCH 27/29] drm/amdkfd: add debug queue snapshot operation
  2022-10-31 16:23 ` [PATCH 27/29] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
@ 2022-11-30 23:55   ` Felix Kuehling
  2022-12-02 19:13     ` Kim, Jonathan
  0 siblings, 1 reply; 63+ messages in thread
From: Felix Kuehling @ 2022-11-30 23:55 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Allow the debugger to get a snapshot of a specified number of queues
> containing various queue property information that is copied to the
> debugger.
>
> Since the debugger doesn't know how many queues exist at any given time,
> allow the debugger to pass the requested number of snapshots as 0 to get
> the actual number of potential snapshots, which it can use to size a
> subsequent request for the actual information.
>
> To prevent future ABI breakage, pass in the requested entry_size.
> The KFD will return its own entry_size in case the debugger still wants
> to log the information in a core dump on sizing failure.
>
> Also allow the debugger to clear exceptions when doing a snapshot.
>
> v2: change buf_size arg to num_queues for clarity.
> fix minimum entry size calculation.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Two nit-picks inline.


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 41 +++++++++++++++++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  4 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +++
>   .../amd/amdkfd/kfd_process_queue_manager.c    | 40 ++++++++++++++++++
>   5 files changed, 96 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 2c8f107237ee..cea393350980 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2961,6 +2961,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->query_exception_info.info_size);
>   		break;
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> +		r = pqm_get_queue_snapshot(&target->pqm,
> +				args->queue_snapshot.exception_mask,
> +				(void __user *)args->queue_snapshot.snapshot_buf_ptr,
> +				&args->queue_snapshot.num_queues,
> +				&args->queue_snapshot.entry_size);
> +		break;
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
>   		pr_warn("Debug op %i not supported yet\n", args->op);
>   		r = -EACCES;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 589efbefc8dc..51f8c5676c56 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -2950,6 +2950,47 @@ int suspend_queues(struct kfd_process *p,
>   	return total_suspended;
>   }
>   
> +static uint32_t set_queue_type_for_user(struct queue_properties *q_props)
> +{
> +	switch (q_props->type) {
> +	case KFD_QUEUE_TYPE_COMPUTE:
> +		return q_props->format == KFD_QUEUE_FORMAT_PM4
> +					? KFD_IOC_QUEUE_TYPE_COMPUTE
> +					: KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
> +	case KFD_QUEUE_TYPE_SDMA:
> +		return KFD_IOC_QUEUE_TYPE_SDMA;
> +	case KFD_QUEUE_TYPE_SDMA_XGMI:
> +		return KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
> +	default:
> +		WARN_ONCE(true, "queue type not recognized!");
> +		return 0xffffffff;
> +	};
> +}
> +
> +void set_queue_snapshot_entry(struct device_queue_manager *dqm,
> +			      struct queue *q,
> +			      uint64_t exception_clear_mask,
> +			      struct kfd_queue_snapshot_entry *qss_entry)

The dqm parameter is not needed. The function can get this from 
q->device->dqm. It's also only needed for dqm locking. I'm not sure 
that's even necessary. Aren't the event_mutex and target process mutex 
held by the caller enough to protect the exception_status and other 
queue properties?
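
If the locking really is unnecessary, the signature could shrink to
(untested):

	void set_queue_snapshot_entry(struct queue *q,
				      uint64_t exception_clear_mask,
				      struct kfd_queue_snapshot_entry *qss_entry)

with pqm_get_queue_snapshot() passing pqn->q directly and the
dqm_lock/dqm_unlock pair dropped.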


> +{
> +	dqm_lock(dqm);
> +
> +	qss_entry->ring_base_address = q->properties.queue_address;
> +	qss_entry->write_pointer_address = (uint64_t)q->properties.write_ptr;
> +	qss_entry->read_pointer_address = (uint64_t)q->properties.read_ptr;
> +	qss_entry->ctx_save_restore_address =
> +				q->properties.ctx_save_restore_area_address;
> +	qss_entry->ctx_save_restore_area_size =
> +				q->properties.ctx_save_restore_area_size;
> +	qss_entry->exception_status = q->properties.exception_status;
> +	qss_entry->queue_id = q->properties.queue_id;
> +	qss_entry->gpu_id = q->device->id;
> +	qss_entry->ring_size = (uint32_t)q->properties.queue_size;
> +	qss_entry->queue_type = set_queue_type_for_user(&q->properties);
> +	q->properties.exception_status &= ~exception_clear_mask;
> +
> +	dqm_unlock(dqm);
> +}
> +
>   int debug_lock_and_unmap(struct device_queue_manager *dqm)
>   {
>   	int r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 12643528684c..094705b932fc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -297,6 +297,10 @@ int resume_queues(struct kfd_process *p,
>   		bool resume_all_queues,
>   		uint32_t num_queues,
>   		uint32_t *usr_queue_id_array);
> +void set_queue_snapshot_entry(struct device_queue_manager *dqm,
> +			      struct queue *q,
> +			      uint64_t exception_clear_mask,
> +			      struct kfd_queue_snapshot_entry *qss_entry);
>   int debug_lock_and_unmap(struct device_queue_manager *dqm);
>   int debug_map_and_unlock(struct device_queue_manager *dqm);
>   int debug_refresh_runlist(struct device_queue_manager *dqm);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index aee4fe20e676..ebd701143981 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1302,6 +1302,11 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
>   		       void __user *ctl_stack,
>   		       u32 *ctl_stack_used_size,
>   		       u32 *save_area_used_size);
> +int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
> +			   uint64_t exception_clear_mask,
> +			   struct kfd_queue_snapshot_entry __user *buf,
> +			   int *num_qss_entries,
> +			   uint32_t *entry_size);
>   
>   int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
>   			      uint64_t fence_value,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 15db83c9a585..30df1046c30b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -569,6 +569,46 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
>   						       save_area_used_size);
>   }
>   
> +int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
> +			   uint64_t exception_clear_mask,
> +			   struct kfd_queue_snapshot_entry __user *buf,
> +			   int *num_qss_entries,
> +			   uint32_t *entry_size)
> +{
> +	struct process_queue_node *pqn;
> +	uint32_t tmp_entry_size = *entry_size, tmp_qss_entries = *num_qss_entries;
> +	int r = 0;
> +
> +	*num_qss_entries = 0;
> +	if (!(*entry_size))
> +		return -EINVAL;
> +
> +	*entry_size = min_t(size_t, *entry_size, sizeof(struct kfd_queue_snapshot_entry));
> +	mutex_lock(&pqm->process->event_mutex);
> +
> +	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
> +		if (!pqn->q)
> +			continue;
> +
> +		if (*num_qss_entries < tmp_qss_entries) {
> +			struct kfd_queue_snapshot_entry src = {0};

It's safer to use memset here. This initialization may not initialize 
padding, so it doesn't guarantee that no uninitialized data leaks from 
kernel mode to user mode.
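
An untested sketch of the safer pattern:

	struct kfd_queue_snapshot_entry src;

	memset(&src, 0, sizeof(src));

memset() zeroes the whole object, padding bytes included, so the
subsequent copy_to_user() cannot leak stale kernel stack contents.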

Regards,
   Felix


> +
> +			set_queue_snapshot_entry(pqn->q->device->dqm,
> +					pqn->q, exception_clear_mask, &src);
> +
> +			if (copy_to_user(buf, &src, *entry_size)) {
> +				r = -EFAULT;
> +				break;
> +			}
> +			buf = (void __user *)buf + tmp_entry_size;
> +		}
> +		*num_qss_entries += 1;
> +	}
> +
> +	mutex_unlock(&pqm->process->event_mutex);
> +	return r;
> +}
> +
>   static int get_queue_data_sizes(struct kfd_process_device *pdd,
>   				struct queue *q,
>   				uint32_t *mqd_size,


* Re: [PATCH 28/29] drm/amdkfd: add debug device snapshot operation
  2022-10-31 16:23 ` [PATCH 28/29] drm/amdkfd: add debug device " Jonathan Kim
@ 2022-12-01  0:00   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-12-01  0:00 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx

On 2022-10-31 12:23, Jonathan Kim wrote:
> Similar to queue snapshot, return an array of device information using
> an entry_size check and return.
> Unlike queue snapshots, the debugger needs to pass the correct number of
> devices that exist.  If it fails to do so, the KFD will return the
> number of actual devices so that the debugger can make a subsequent
> successful call.
>
> v2: change buf_size arg to num_devices for more clarity.
> expand device entry new members on copy.
> fix minimum entry size calculation for queue and device snapshot.
> change device snapshot implementation to match queue snapshot
> implementation.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  7 ++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 67 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
>   3 files changed, 77 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index cea393350980..115a80686f7a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2968,8 +2968,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->queue_snapshot.entry_size);
>   		break;
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> -		pr_warn("Debug op %i not supported yet\n", args->op);
> -		r = -EACCES;
> +		r = kfd_dbg_trap_device_snapshot(target,
> +				args->device_snapshot.exception_mask,
> +				(void __user *)args->device_snapshot.snapshot_buf_ptr,
> +				&args->device_snapshot.num_devices,
> +				&args->device_snapshot.entry_size);
>   		break;
>   	default:
>   		pr_err("Invalid option: %i\n", args->op);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index a05fe32eac0e..8d22a27cc062 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -22,6 +22,7 @@
>   
>   #include "kfd_debug.h"
>   #include "kfd_device_queue_manager.h"
> +#include "kfd_topology.h"
>   #include <linux/file.h>
>   #include <uapi/linux/kfd_ioctl.h>
>   
> @@ -888,6 +889,72 @@ int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
>   	return r;
>   }
>   
> +int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
> +		uint64_t exception_clear_mask,
> +		void __user *user_info,
> +		uint32_t *number_of_device_infos,
> +		uint32_t *entry_size)
> +{
> +	struct kfd_dbg_device_info_entry device_info = {0};

Use memset. With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> +	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
> +	int i, r = 0;
> +
> +	if (!(target && user_info && number_of_device_infos && entry_size))
> +		return -EINVAL;
> +
> +	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
> +	*number_of_device_infos = target->n_pdds;
> +	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
> +
> +	if (!tmp_num_devices)
> +		return 0;
> +
> +	mutex_lock(&target->event_mutex);
> +
> +	/* Run over all pdd of the process */
> +	for (i = 0; i < tmp_num_devices; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
> +
> +		device_info.gpu_id = pdd->dev->id;
> +		device_info.exception_status = pdd->exception_status;
> +		device_info.lds_base = pdd->lds_base;
> +		device_info.lds_limit = pdd->lds_limit;
> +		device_info.scratch_base = pdd->scratch_base;
> +		device_info.scratch_limit = pdd->scratch_limit;
> +		device_info.gpuvm_base = pdd->gpuvm_base;
> +		device_info.gpuvm_limit = pdd->gpuvm_limit;
> +		device_info.location_id = topo_dev->node_props.location_id;
> +		device_info.vendor_id = topo_dev->node_props.vendor_id;
> +		device_info.device_id = topo_dev->node_props.device_id;
> +		device_info.fw_version = pdd->dev->mec_fw_version;
> +		device_info.gfx_target_version =
> +			topo_dev->node_props.gfx_target_version;
> +		device_info.simd_count = topo_dev->node_props.simd_count;
> +		device_info.max_waves_per_simd =
> +			topo_dev->node_props.max_waves_per_simd;
> +		device_info.array_count = topo_dev->node_props.array_count;
> +		device_info.simd_arrays_per_engine =
> +			topo_dev->node_props.simd_arrays_per_engine;
> +		device_info.capability = topo_dev->node_props.capability;
> +		device_info.debug_prop = topo_dev->node_props.debug_prop;
> +
> +		if (exception_clear_mask)
> +			pdd->exception_status &= ~exception_clear_mask;
> +
> +		if (copy_to_user(user_info, &device_info, *entry_size)) {
> +			r = -EFAULT;
> +			break;
> +		}
> +
> +		user_info += tmp_entry_size;
> +	}
> +
> +	mutex_unlock(&target->event_mutex);
> +
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 58a5f14d1258..d8c0c54fffa3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -80,6 +80,11 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
>   }
>   
>   void debug_event_write_work_handler(struct work_struct *work);
> +int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
> +		uint64_t exception_clear_mask,
> +		void __user *user_info,
> +		uint32_t *number_of_device_infos,
> +		uint32_t *entry_size);
>   
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask);


* Re: [PATCH 29/29] drm/amdkfd: bump kfd ioctl minor version for debug api availability
  2022-10-31 16:23 ` [PATCH 29/29] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
@ 2022-12-01  0:00   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-12-01  0:00 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx

On 2022-10-31 12:23, Jonathan Kim wrote:
> Bump the minor version to declare debugging capability is now
> available.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 -
>   include/uapi/linux/kfd_ioctl.h           | 3 ++-
>   2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 115a80686f7a..2f7d8b230527 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2891,7 +2891,6 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		if (!r)
>   			target->exception_enable_mask = args->enable.exception_mask;
>   
> -		pr_warn("Debug functions limited\n");
>   		break;
>   	case KFD_IOC_DBG_TRAP_DISABLE:
>   		r = kfd_dbg_trap_disable(target);
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index bedf1b823f57..fe5acee2684d 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -37,9 +37,10 @@
>    * - 1.9 - Add available memory ioctl
>    * - 1.10 - Add SMI profiler event log
>    * - 1.11 - Add unified memory for ctx save/restore area
> + * - 1.12 - Add debugger API
>    */
>   #define KFD_IOCTL_MAJOR_VERSION 1
> -#define KFD_IOCTL_MINOR_VERSION 11
> +#define KFD_IOCTL_MINOR_VERSION 12
>   
>   struct kfd_ioctl_get_version_args {
>   	__u32 major_version;	/* from KFD */


* Re: [PATCH 21/29] drm/amdkfd: add debug wave launch mode operation
  2022-10-31 16:23 ` [PATCH 21/29] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
@ 2022-12-01  0:02   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-12-01  0:02 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Allow the debugger to set wave behaviour to either operate normally,
> halt at launch, trap on every instruction, terminate immediately or
> stall on allocation.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 18 ++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  1 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 27 +++++++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  3 ++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  3 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 34 +++++++++++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  3 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  3 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 27 ++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  5 ++-
>   10 files changed, 119 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index a5003f6f05bf..91c7fdee883e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -99,6 +99,23 @@ static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device
>   	return data;
>   }
>   
> +static uint32_t kgd_aldebaran_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +	bool is_stall_mode = wave_launch_mode == KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT;
> +
> +	if (is_stall_mode)
> +		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, STALL_VMID,
> +									1);
> +	else
> +		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE,
> +							wave_launch_mode);
> +
> +	return data;
> +}
> +
>   const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>   	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -120,6 +137,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
>   	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index b3682758184f..10470f4a4eaf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -399,6 +399,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 32a6e5fbeacd..66a83e6fb9e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -854,6 +854,32 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +	bool is_stall_mode = wave_launch_mode == KFD_DBG_TRAP_WAVE_LAUNCH_MODE_STALL;
> +	bool is_mode_set = wave_launch_mode && !is_stall_mode;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +			VMID_MASK, is_mode_set ? 1 << vmid : 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +			MODE, is_mode_set ? wave_launch_mode : 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
> +
> +	if (!is_stall_mode)
> +		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -941,6 +967,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.program_trap_handler_settings = program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> index 85c929fc2926..34c04a2bb83b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -36,6 +36,9 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   					     uint32_t trap_mask_request,
>   					     uint32_t *trap_mask_prev,
>   					     uint32_t kfd_dbg_trap_cntl_prev);
> +uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
> +					 uint8_t wave_launch_mode,
> +					 uint32_t vmid);
>   void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index ae3ead207df4..8627c5458973 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -675,6 +675,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
> -	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override
> +	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode
>   
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index cb0044bbfae5..3bba7ca21926 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -777,6 +777,39 @@ uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +	bool is_stall_mode = wave_launch_mode ==
> +				KFD_DBG_TRAP_WAVE_LAUNCH_MODE_STALL;
> +	bool is_mode_set = wave_launch_mode && !is_stall_mode;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +		VMID_MASK, is_mode_set ? 1 << vmid : 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +		MODE, is_mode_set ? wave_launch_mode : 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
> +
> +	/* Although Pre-GFX9.4.1 stalls globally, the per-VMID stall for
> +	 * GFX9.4.1 effectively does the same thing as global STALL_RA as
> +	 * all other VMID allocations are back logged by the stalled VMID.
> +	 *
> +	 * Use with caution.
> +	 */

This is potentially problematic. Discussing this with the debugger team, 
it turned out that they're not actually using stall mode. So we should 
remove it from the code and the API.

Other than that, this patch looks good to me.

Regards,
   Felix


> +	if (!is_stall_mode)
> +		kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -1047,6 +1080,7 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index 47cff392b434..2a2ab42037e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -67,6 +67,9 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
>   					     uint32_t trap_override,
>   					     uint32_t *trap_mask_supported);
> +uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid);
>   uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   					     uint32_t vmid,
>   					     uint32_t trap_override,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 1f0ee2413b13..63665279ce4d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2899,6 +2899,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->launch_override.support_request_mask);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
> +		r = kfd_dbg_trap_set_wave_launch_mode(target,
> +				args->launch_mode.launch_mode);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
>   	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
>   	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 8add359d1cb9..210851f2cdb3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -239,8 +239,10 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>   {
>   	int i, count = 0;
>   
> -	if (!unwind)
> +	if (!unwind) {
>   		cancel_work_sync(&target->debug_event_workarea);
> +		kfd_dbg_trap_set_wave_launch_mode(target, 0);
> +	}
>   
>   	for (i = 0; i < target->n_pdds; i++) {
>   		struct kfd_process_device *pdd = target->pdds[i];
> @@ -507,6 +509,29 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
>   	return r;
>   }
>   
> +int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
> +					uint8_t wave_launch_mode)
> +{
> +	int r = 0, i;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
> +				pdd->dev->adev,
> +				wave_launch_mode,
> +				pdd->dev->vm_info.last_vmid_kfd);
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +		r = debug_refresh_runlist(pdd->dev->dqm);
> +		if (r)
> +			break;
> +	}
> +
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index b54a50a5d310..ca3ab1f01985 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -25,9 +25,6 @@
>   
>   #include "kfd_priv.h"
>   
> -void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> -					uint32_t vmid,
> -					bool stall);
>   void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
>   int kfd_dbg_trap_activate(struct kfd_process *target);
>   bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
> @@ -51,6 +48,8 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
>   					uint32_t trap_mask_request,
>   					uint32_t *trap_mask_prev,
>   					uint32_t *trap_mask_supported);
> +int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
> +					uint8_t wave_launch_mode);
>   
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,


* Re: [PATCH 04/29] drm/amdgpu: add kgd hw debug mode setting interface
  2022-10-31 16:23 ` [PATCH 04/29] drm/amdgpu: add kgd hw debug mode setting interface Jonathan Kim
@ 2022-12-01  0:08   ` Felix Kuehling
  0 siblings, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-12-01  0:08 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx

On 2022-10-31 12:23, Jonathan Kim wrote:
> Introduce the required KGD debug calls that will execute hardware debug
> mode setting.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   .../gpu/drm/amd/include/kgd_kfd_interface.h   | 34 +++++++++++++++++++
>   1 file changed, 34 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> index 5cb3e8634739..15e7a5c920a0 100644
> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> @@ -289,6 +289,40 @@ struct kfd2kgd_calls {
>   			uint32_t vmid, uint64_t page_table_base);
>   	uint32_t (*read_vmid_from_vmfault_reg)(struct amdgpu_device *adev);
>   
> +	uint32_t (*enable_debug_trap)(struct amdgpu_device *adev,
> +					bool restore_dbg_registers,
> +					uint32_t vmid);
> +	uint32_t (*disable_debug_trap)(struct amdgpu_device *adev,
> +					bool keep_trap_enabled,
> +					uint32_t vmid);
> +	int (*validate_trap_override_request)(struct amdgpu_device *adev,
> +					uint32_t trap_override,
> +					uint32_t *trap_mask_supported);
> +	uint32_t (*set_wave_launch_trap_override)(struct amdgpu_device *adev,
> +					     uint32_t vmid,
> +					     uint32_t trap_override,
> +					     uint32_t trap_mask_bits,
> +					     uint32_t trap_mask_request,
> +					     uint32_t *trap_mask_prev,
> +					     uint32_t kfd_dbg_trap_cntl_prev);
> +	uint32_t (*set_wave_launch_mode)(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid);
> +	uint32_t (*set_address_watch)(struct amdgpu_device *adev,
> +					uint64_t watch_address,
> +					uint32_t watch_address_mask,
> +					uint32_t watch_id,
> +					uint32_t watch_mode,
> +					uint32_t debug_vmid);
> +	uint32_t (*clear_address_watch)(struct amdgpu_device *adev,
> +			uint32_t watch_id);
> +	void (*get_iq_wait_times)(struct amdgpu_device *adev,
> +			uint32_t *wait_times);
> +	void (*build_grace_period_packet_info)(struct amdgpu_device *adev,
> +			uint32_t wait_times,
> +			uint32_t grace_period,
> +			uint32_t *reg_offset,
> +			uint32_t *reg_data);
>   	void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
>   			int *wave_cnt, int *max_waves_per_cu);
>   	void (*program_trap_handler_settings)(struct amdgpu_device *adev,


* Re: [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization
  2022-11-22 23:38   ` Felix Kuehling
  2022-11-23 20:53     ` Kim, Jonathan
@ 2022-12-01  0:18     ` Felix Kuehling
  1 sibling, 0 replies; 63+ messages in thread
From: Felix Kuehling @ 2022-12-01  0:18 UTC (permalink / raw)
  To: amd-gfx


On 2022-11-22 18:38, Felix Kuehling wrote:
>
> On 2022-10-31 12:23, Jonathan Kim wrote:
>> Add missing debug trap register references and initialize all debug
>> registers on boot by clearing the hardware exception overrides and the
>> wave allocation ID index.
>>
>> For debug devices that only support single process debugging, enable
>> trap temporary setup by default.
>>
>> Debug devices that support multi-process debugging require trap
>> temporary setup to be disabled by default in order to satisfy microbench
>> performance when in non-debug mode.
>
> Where is this done? I don't think it's in the MQD setup because that 
> happens unconditionally on all GPUs.

If I understand it correctly, it's done by actually enabling the debug 
trap in patch 9 (for Aldebaran). For single-process debug devices, the 
debug trap is always on, as per this patch.

Maybe just add a reference to the Aldebaran patch to make it clearer.

Regards,
   Felix


>
>
>>
>> The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
>> waves onto dispatch during compute context inspection.
>> In order to correctly this up, set the special reserved CP bit by 
>> default
>> whenever the MQD is initialized.
>
> There is a word missing here. "In order to correctly _set_ this up ..."?
>
> This patch covers GFXv9 and 10. Will GFXv11 be handled separately?
>
> Regards,
>   Felix
>
>
>>
>> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 26 +++++++
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 30 ++++++++
>>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
>>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
>>   .../include/asic_reg/gc/gc_10_1_0_offset.h    | 14 ++++
>>   .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++++++++++++++++++
>>   .../include/asic_reg/gc/gc_10_3_0_offset.h    | 10 +++
>>   .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
>>   8 files changed, 163 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index af94ac580d3e..d49aff0b4ba3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -4904,6 +4904,29 @@ static u32 
>> gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
>>     #define DEFAULT_SH_MEM_BASES    (0x6000)
>>   +static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device 
>> *adev,
>> +                uint32_t first_vmid,
>> +                uint32_t last_vmid)
>> +{
>> +    uint32_t data;
>> +    uint32_t trap_config_vmid_mask = 0;
>> +    int i;
>> +
>> +    /* Calculate trap config vmid mask */
>> +    for (i = first_vmid; i < last_vmid; i++)
>> +        trap_config_vmid_mask |= (1 << i);
>> +
>> +    data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
>> +            VMID_SEL, trap_config_vmid_mask);
>> +    data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
>> +            TRAP_EN, 1);
>> +    WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
>> +    WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
>> +
>> +    WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
>> +    WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
>> +}
>> +
>>   static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>>   {
>>       int i;
>> @@ -4935,6 +4958,9 @@ static void gfx_v10_0_init_compute_vmid(struct 
>> amdgpu_device *adev)
>>           WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
>>           WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
>>       }
>> +
>> +    gfx_v10_0_debug_trap_config_init(adev, 
>> adev->vm_manager.first_kfd_vmid,
>> +                    AMDGPU_NUM_VMID);
>>   }
>>     static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 0320be4a5fc6..a0e5ad342f13 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -2337,6 +2337,29 @@ static void gfx_v9_0_setup_rb(struct 
>> amdgpu_device *adev)
>>       adev->gfx.config.num_rbs = hweight32(active_rbs);
>>   }
>>   +static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device 
>> *adev,
>> +                uint32_t first_vmid,
>> +                uint32_t last_vmid)
>> +{
>> +    uint32_t data;
>> +    uint32_t trap_config_vmid_mask = 0;
>> +    int i;
>> +
>> +    /* Calculate trap config vmid mask */
>> +    for (i = first_vmid; i < last_vmid; i++)
>> +        trap_config_vmid_mask |= (1 << i);
>> +
>> +    data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
>> +            VMID_SEL, trap_config_vmid_mask);
>> +    data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
>> +            TRAP_EN, 1);
>> +    WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
>> +    WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
>> +
>> +    WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
>> +    WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
>> +}
>> +
>>   #define DEFAULT_SH_MEM_BASES    (0x6000)
>>   static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
>>   {
>> @@ -4609,6 +4632,13 @@ static int gfx_v9_0_late_init(void *handle)
>>       if (r)
>>           return r;
>>   +    if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
>> +        gfx_v9_4_2_debug_trap_config_init(adev,
>> +            adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
>> +    else
>> +        gfx_v9_0_debug_trap_config_init(adev,
>> +            adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
>> +
>>       return 0;
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
>> index d3e2b6a599a4..cb484ace17de 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
>> @@ -117,6 +117,11 @@ static void init_mqd(struct mqd_manager *mm, 
>> void **mqd,
>>               1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
>>               1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
>>   +    /* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
>> +     * DISPATCH_PTR.  This is required for the kfd debugger
>> +     */
>> +    m->cp_hqd_hq_scheduler0 = 1 << 14;
>> +
>>       if (q->format == KFD_QUEUE_FORMAT_AQL) {
>>           m->cp_hqd_aql_control =
>>               1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>> index 0778e587a2d6..86f1cf090246 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>> @@ -164,6 +164,11 @@ static void init_mqd(struct mqd_manager *mm, 
>> void **mqd,
>>               1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
>>               1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
>>   +    /* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
>> +     * DISPATCH_PTR.  This is required for the kfd debugger
>> +     */
>> +    m->cp_hqd_hq_status0 = 1 << 14;
>> +
>>       if (q->format == KFD_QUEUE_FORMAT_AQL) {
>>           m->cp_hqd_aql_control =
>>               1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
>> diff --git 
>> a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h 
>> b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
>> index 18d34bbceebe..7d384f86bd67 100644
>> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
>> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
>> @@ -5190,6 +5190,20 @@
>>   #define mmSPI_WCL_PIPE_PERCENT_CS6_BASE_IDX 0
>>   #define mmSPI_WCL_PIPE_PERCENT_CS7 0x1f70
>>   #define mmSPI_WCL_PIPE_PERCENT_CS7_BASE_IDX 0
>> +#define mmSPI_GDBG_WAVE_CNTL 0x1f71
>> +#define mmSPI_GDBG_WAVE_CNTL_BASE_IDX 0
>> +#define mmSPI_GDBG_TRAP_CONFIG 0x1f72
>> +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX 0
>> +#define mmSPI_GDBG_TRAP_MASK 0x1f73
>> +#define mmSPI_GDBG_TRAP_MASK_BASE_IDX 0
>> +#define mmSPI_GDBG_WAVE_CNTL2 0x1f74
>> +#define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX 0
>> +#define mmSPI_GDBG_WAVE_CNTL3 0x1f75
>> +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX 0
>> +#define mmSPI_GDBG_TRAP_DATA0 0x1f78
>> +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX 0
>> +#define mmSPI_GDBG_TRAP_DATA1 0x1f79
>> +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX 0
>>   #define mmSPI_COMPUTE_QUEUE_RESET 0x1f7b
>>   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX 0
>>   #define mmSPI_RESOURCE_RESERVE_CU_0 0x1f7c
>> diff --git 
>> a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h 
>> b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
>> index 4127896ffcdf..08772ba845b0 100644
>> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
>> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
>> @@ -19646,6 +19646,75 @@
>>   //SPI_WCL_PIPE_PERCENT_CS7
>>   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE__SHIFT 0x0
>>   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE_MASK 0x7FL
>> +//SPI_GDBG_WAVE_CNTL
>> +#define SPI_GDBG_WAVE_CNTL__STALL_RA__SHIFT 0x0
>> +#define SPI_GDBG_WAVE_CNTL__STALL_VMID__SHIFT 0x1
>> +#define SPI_GDBG_WAVE_CNTL__STALL_RA_MASK 0x00000001L
>> +#define SPI_GDBG_WAVE_CNTL__STALL_VMID_MASK 0x0001FFFEL
>> +//SPI_GDBG_TRAP_CONFIG
>> +#define SPI_GDBG_TRAP_CONFIG__ME_SEL__SHIFT 0x0
>> +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL__SHIFT 0x2
>> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL__SHIFT 0x4
>> +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH__SHIFT 0x7
>> +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH__SHIFT 0x8
>> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH__SHIFT 0x9
>> +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN__SHIFT 0xf
>> +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL__SHIFT 0x10
>> +#define SPI_GDBG_TRAP_CONFIG__ME_SEL_MASK 0x00000003L
>> +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL_MASK 0x0000000CL
>> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL_MASK 0x00000070L
>> +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH_MASK 0x00000080L
>> +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH_MASK 0x00000100L
>> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH_MASK 0x00000200L
>> +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN_MASK 0x00008000L
>> +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL_MASK 0xFFFF0000L
>> +//SPI_GDBG_TRAP_MASK
>> +#define SPI_GDBG_TRAP_MASK__EXCP_EN__SHIFT 0x0
>> +#define SPI_GDBG_TRAP_MASK__REPLACE__SHIFT 0x9
>> +#define SPI_GDBG_TRAP_MASK__EXCP_EN_MASK 0x01FFL
>> +#define SPI_GDBG_TRAP_MASK__REPLACE_MASK 0x0200L
>> +//SPI_GDBG_WAVE_CNTL2
>> +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK__SHIFT 0x0
>> +#define SPI_GDBG_WAVE_CNTL2__MODE__SHIFT 0x10
>> +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK_MASK 0x0000FFFFL
>> +#define SPI_GDBG_WAVE_CNTL2__MODE_MASK 0x00030000L
>> +//SPI_GDBG_WAVE_CNTL3
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_PS__SHIFT 0x0
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_VS__SHIFT 0x1
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_GS__SHIFT 0x2
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_HS__SHIFT 0x3
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG__SHIFT 0x4
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0__SHIFT 0x5
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1__SHIFT 0x6
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2__SHIFT 0x7
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3__SHIFT 0x8
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4__SHIFT 0x9
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5__SHIFT 0xa
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6__SHIFT 0xb
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7__SHIFT 0xc
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION__SHIFT 0xd
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT__SHIFT 0x1c
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_PS_MASK 0x00000001L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_VS_MASK 0x00000002L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_GS_MASK 0x00000004L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_HS_MASK 0x00000008L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG_MASK 0x00000010L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0_MASK 0x00000020L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1_MASK 0x00000040L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2_MASK 0x00000080L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3_MASK 0x00000100L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4_MASK 0x00000200L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5_MASK 0x00000400L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6_MASK 0x00000800L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7_MASK 0x00001000L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION_MASK 0x0FFFE000L
>> +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT_MASK 0x10000000L
>> +//SPI_GDBG_TRAP_DATA0
>> +#define SPI_GDBG_TRAP_DATA0__DATA__SHIFT 0x0
>> +#define SPI_GDBG_TRAP_DATA0__DATA_MASK 0xFFFFFFFFL
>> +//SPI_GDBG_TRAP_DATA1
>> +#define SPI_GDBG_TRAP_DATA1__DATA__SHIFT 0x0
>> +#define SPI_GDBG_TRAP_DATA1__DATA_MASK 0xFFFFFFFFL
>>   //SPI_COMPUTE_QUEUE_RESET
>>   #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT 0x0
>>   #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK 0x01L
>> diff --git 
>> a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h 
>> b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
>> index 3973110f149c..d09f1a06f4bf 100644
>> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
>> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
>> @@ -26,6 +26,8 @@
>>   #define mmSQ_DEBUG_STS_GLOBAL_BASE_IDX 0
>>   #define mmSQ_DEBUG_STS_GLOBAL2 0x10B0
>>   #define mmSQ_DEBUG_STS_GLOBAL2_BASE_IDX 0
>> +#define mmSQ_DEBUG 0x10B1
>> +#define mmSQ_DEBUG_BASE_IDX 0
>>     // addressBlock: gc_sdma0_sdma0dec
>>   // base address: 0x4980
>> @@ -4849,10 +4851,18 @@
>>   #define mmSPI_WCL_PIPE_PERCENT_CS3_BASE_IDX 0
>>   #define mmSPI_GDBG_WAVE_CNTL 0x1f71
>>   #define mmSPI_GDBG_WAVE_CNTL_BASE_IDX 0
>> +#define mmSPI_GDBG_TRAP_CONFIG 0x1f72
>> +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX 0
>>   #define mmSPI_GDBG_TRAP_MASK 0x1f73
>>   #define mmSPI_GDBG_TRAP_MASK_BASE_IDX 0
>>   #define mmSPI_GDBG_WAVE_CNTL2 0x1f74
>>   #define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX 0
>> +#define mmSPI_GDBG_WAVE_CNTL3 0x1f75
>> +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX 0
>> +#define mmSPI_GDBG_TRAP_DATA0 0x1f78
>> +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX 0
>> +#define mmSPI_GDBG_TRAP_DATA1 0x1f79
>> +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX 0
>>   #define mmSPI_COMPUTE_QUEUE_RESET 0x1f7b
>>   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX 0
>>   #define mmSPI_RESOURCE_RESERVE_CU_0 0x1f7c
>> diff --git 
>> a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h 
>> b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
>> index d4e8ff22ecb8..fc85aee010fe 100644
>> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
>> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
>> @@ -47853,6 +47853,10 @@
>>       // addressBlock: sqind
>> +//SQ_DEBUG
>> +#define SQ_DEBUG__SINGLE_MEMOP_MASK 0x00000001L
>> +#define SQ_DEBUG__SINGLE_MEMOP__SHIFT 0x00000000
>> +
>>   //SQ_DEBUG_STS_GLOBAL
>>   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0_MASK 0x000000ffL
>>   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0__SHIFT 0x00000000


* Re: [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization
  2022-10-31 16:23 ` [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
  2022-11-22 23:38   ` Felix Kuehling
@ 2022-12-01  0:23   ` Felix Kuehling
  2022-12-02 17:42     ` Kim, Jonathan
  1 sibling, 1 reply; 63+ messages in thread
From: Felix Kuehling @ 2022-12-01  0:23 UTC (permalink / raw)
  To: Jonathan Kim, amd-gfx


On 2022-10-31 12:23, Jonathan Kim wrote:
> Add missing debug trap register references and initialize all debug
> registers on boot by clearing the hardware exception overrides and the
> wave allocation ID index.
>
> For debug devices that only support single process debugging, enable
> trap temporary setup by default.
>
> Debug devices that support multi-process debugging require trap
> temporary setup to be disabled by default in order to satisfy microbench
> performance when in non-debug mode.
>
> The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
> waves onto dispatch during compute context inspection.
> In order to correctly this up, set the special reserved CP bit by default
> whenever the MQD is initialized.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 26 +++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 30 ++++++++
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
>   .../include/asic_reg/gc/gc_10_1_0_offset.h    | 14 ++++
>   .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++++++++++++++++++
>   .../include/asic_reg/gc/gc_10_3_0_offset.h    | 10 +++
>   .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
>   8 files changed, 163 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index af94ac580d3e..d49aff0b4ba3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4904,6 +4904,29 @@ static u32 gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
>   
>   #define DEFAULT_SH_MEM_BASES	(0x6000)
>   
> +static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
> +				uint32_t first_vmid,
> +				uint32_t last_vmid)
> +{
> +	uint32_t data;
> +	uint32_t trap_config_vmid_mask = 0;
> +	int i;
> +
> +	/* Calculate trap config vmid mask */
> +	for (i = first_vmid; i < last_vmid; i++)
> +		trap_config_vmid_mask |= (1 << i);
> +
> +	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> +			VMID_SEL, trap_config_vmid_mask);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> +			TRAP_EN, 1);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> +}
> +
>   static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>   {
>   	int i;
> @@ -4935,6 +4958,9 @@ static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>   		WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
>   		WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
>   	}
> +
> +	gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
> +					AMDGPU_NUM_VMID);
>   }
>   
>   static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 0320be4a5fc6..a0e5ad342f13 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2337,6 +2337,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
>   	adev->gfx.config.num_rbs = hweight32(active_rbs);
>   }
>   
> +static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
> +				uint32_t first_vmid,
> +				uint32_t last_vmid)
> +{
> +	uint32_t data;
> +	uint32_t trap_config_vmid_mask = 0;
> +	int i;
> +
> +	/* Calculate trap config vmid mask */
> +	for (i = first_vmid; i < last_vmid; i++)
> +		trap_config_vmid_mask |= (1 << i);
> +
> +	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> +			VMID_SEL, trap_config_vmid_mask);
> +	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> +			TRAP_EN, 1);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> +}
> +
>   #define DEFAULT_SH_MEM_BASES	(0x6000)
>   static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
>   {
> @@ -4609,6 +4632,13 @@ static int gfx_v9_0_late_init(void *handle)
>   	if (r)
>   		return r;
>   
> +	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
> +		gfx_v9_4_2_debug_trap_config_init(adev,
> +			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);

Where is this function defined? I don't see it in any of your patches. 
Did you forget to git add a file?

Regards,
   Felix


> +	else
> +		gfx_v9_0_debug_trap_config_init(adev,
> +			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> index d3e2b6a599a4..cb484ace17de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> @@ -117,6 +117,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
>   			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
>   			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
>   
> +	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> +	 * DISPATCH_PTR.  This is required for the kfd debugger
> +	 */
> +	m->cp_hqd_hq_scheduler0 = 1 << 14;
> +
>   	if (q->format == KFD_QUEUE_FORMAT_AQL) {
>   		m->cp_hqd_aql_control =
>   			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> index 0778e587a2d6..86f1cf090246 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> @@ -164,6 +164,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
>   			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
>   			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
>   
> +	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> +	 * DISPATCH_PTR.  This is required for the kfd debugger
> +	 */
> +	m->cp_hqd_hq_status0 = 1 << 14;
> +
>   	if (q->format == KFD_QUEUE_FORMAT_AQL) {
>   		m->cp_hqd_aql_control =
>   			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> index 18d34bbceebe..7d384f86bd67 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> @@ -5190,6 +5190,20 @@
>   #define mmSPI_WCL_PIPE_PERCENT_CS6_BASE_IDX                                                            0
>   #define mmSPI_WCL_PIPE_PERCENT_CS7                                                                     0x1f70
>   #define mmSPI_WCL_PIPE_PERCENT_CS7_BASE_IDX                                                            0
> +#define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
> +#define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
> +#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
> +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
> +#define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
> +#define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
> +#define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
> +#define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
> +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
> +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
> +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
>   #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
>   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
>   #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> index 4127896ffcdf..08772ba845b0 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> @@ -19646,6 +19646,75 @@
>   //SPI_WCL_PIPE_PERCENT_CS7
>   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE__SHIFT                                                                0x0
>   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE_MASK                                                                  0x7FL
> +//SPI_GDBG_WAVE_CNTL
> +#define SPI_GDBG_WAVE_CNTL__STALL_RA__SHIFT                                                                   0x0
> +#define SPI_GDBG_WAVE_CNTL__STALL_VMID__SHIFT                                                                 0x1
> +#define SPI_GDBG_WAVE_CNTL__STALL_RA_MASK                                                                     0x00000001L
> +#define SPI_GDBG_WAVE_CNTL__STALL_VMID_MASK                                                                   0x0001FFFEL
> +//SPI_GDBG_TRAP_CONFIG
> +#define SPI_GDBG_TRAP_CONFIG__ME_SEL__SHIFT                                                                   0x0
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL__SHIFT                                                                 0x2
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL__SHIFT                                                                0x4
> +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH__SHIFT                                                                 0x7
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH__SHIFT                                                               0x8
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH__SHIFT                                                              0x9
> +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN__SHIFT                                                                  0xf
> +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL__SHIFT                                                                 0x10
> +#define SPI_GDBG_TRAP_CONFIG__ME_SEL_MASK                                                                     0x00000003L
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL_MASK                                                                   0x0000000CL
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL_MASK                                                                  0x00000070L
> +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH_MASK                                                                   0x00000080L
> +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH_MASK                                                                 0x00000100L
> +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH_MASK                                                                0x00000200L
> +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN_MASK                                                                    0x00008000L
> +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL_MASK                                                                   0xFFFF0000L
> +//SPI_GDBG_TRAP_MASK
> +#define SPI_GDBG_TRAP_MASK__EXCP_EN__SHIFT                                                                    0x0
> +#define SPI_GDBG_TRAP_MASK__REPLACE__SHIFT                                                                    0x9
> +#define SPI_GDBG_TRAP_MASK__EXCP_EN_MASK                                                                      0x01FFL
> +#define SPI_GDBG_TRAP_MASK__REPLACE_MASK                                                                      0x0200L
> +//SPI_GDBG_WAVE_CNTL2
> +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK__SHIFT                                                                 0x0
> +#define SPI_GDBG_WAVE_CNTL2__MODE__SHIFT                                                                      0x10
> +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK_MASK                                                                   0x0000FFFFL
> +#define SPI_GDBG_WAVE_CNTL2__MODE_MASK                                                                        0x00030000L
> +//SPI_GDBG_WAVE_CNTL3
> +#define SPI_GDBG_WAVE_CNTL3__STALL_PS__SHIFT                                                                  0x0
> +#define SPI_GDBG_WAVE_CNTL3__STALL_VS__SHIFT                                                                  0x1
> +#define SPI_GDBG_WAVE_CNTL3__STALL_GS__SHIFT                                                                  0x2
> +#define SPI_GDBG_WAVE_CNTL3__STALL_HS__SHIFT                                                                  0x3
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG__SHIFT                                                                 0x4
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0__SHIFT                                                                 0x5
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1__SHIFT                                                                 0x6
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2__SHIFT                                                                 0x7
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3__SHIFT                                                                 0x8
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4__SHIFT                                                                 0x9
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5__SHIFT                                                                 0xa
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6__SHIFT                                                                 0xb
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7__SHIFT                                                                 0xc
> +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION__SHIFT                                                            0xd
> +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT__SHIFT                                                                0x1c
> +#define SPI_GDBG_WAVE_CNTL3__STALL_PS_MASK                                                                    0x00000001L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_VS_MASK                                                                    0x00000002L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_GS_MASK                                                                    0x00000004L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_HS_MASK                                                                    0x00000008L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG_MASK                                                                   0x00000010L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0_MASK                                                                   0x00000020L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1_MASK                                                                   0x00000040L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2_MASK                                                                   0x00000080L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3_MASK                                                                   0x00000100L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4_MASK                                                                   0x00000200L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5_MASK                                                                   0x00000400L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6_MASK                                                                   0x00000800L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7_MASK                                                                   0x00001000L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION_MASK                                                              0x0FFFE000L
> +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT_MASK                                                                  0x10000000L
> +//SPI_GDBG_TRAP_DATA0
> +#define SPI_GDBG_TRAP_DATA0__DATA__SHIFT                                                                      0x0
> +#define SPI_GDBG_TRAP_DATA0__DATA_MASK                                                                        0xFFFFFFFFL
> +//SPI_GDBG_TRAP_DATA1
> +#define SPI_GDBG_TRAP_DATA1__DATA__SHIFT                                                                      0x0
> +#define SPI_GDBG_TRAP_DATA1__DATA_MASK                                                                        0xFFFFFFFFL
>   //SPI_COMPUTE_QUEUE_RESET
>   #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                                                                 0x0
>   #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                                                                   0x01L
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> index 3973110f149c..d09f1a06f4bf 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> @@ -26,6 +26,8 @@
>   #define mmSQ_DEBUG_STS_GLOBAL_BASE_IDX                                                                 0
>   #define mmSQ_DEBUG_STS_GLOBAL2                                                                         0x10B0
>   #define mmSQ_DEBUG_STS_GLOBAL2_BASE_IDX                                                                0
> +#define mmSQ_DEBUG                                                                                     0x10B1
> +#define mmSQ_DEBUG_BASE_IDX                                                                            0
>   
>   // addressBlock: gc_sdma0_sdma0dec
>   // base address: 0x4980
> @@ -4849,10 +4851,18 @@
>   #define mmSPI_WCL_PIPE_PERCENT_CS3_BASE_IDX                                                            0
>   #define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
>   #define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
> +#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
> +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
>   #define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
>   #define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
>   #define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
>   #define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
> +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
> +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
> +#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
> +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
>   #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
>   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
>   #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> index d4e8ff22ecb8..fc85aee010fe 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> @@ -47853,6 +47853,10 @@
>   
>   
>   // addressBlock: sqind
> +//SQ_DEBUG
> +#define SQ_DEBUG__SINGLE_MEMOP_MASK 0x00000001L
> +#define SQ_DEBUG__SINGLE_MEMOP__SHIFT 0x00000000
> +
>   //SQ_DEBUG_STS_GLOBAL
>   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0_MASK 0x000000ffL
>   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0__SHIFT 0x00000000

^ permalink raw reply	[flat|nested] 63+ messages in thread
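
(Aside: the __SHIFT/_MASK pairs added above are consumed by amdgpu's
REG_SET_FIELD helper; roughly, per the macros in amdgpu.h:

	#define REG_FIELD_SHIFT(reg, field) reg##__##field##__SHIFT
	#define REG_FIELD_MASK(reg, field) reg##__##field##_MASK

	#define REG_SET_FIELD(orig_val, reg, field, field_val)		\
		(((orig_val) & ~REG_FIELD_MASK(reg, field)) |		\
		 (REG_FIELD_MASK(reg, field) &				\
		  ((field_val) << REG_FIELD_SHIFT(reg, field))))

so, for example, REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG, TRAP_EN, 1)
evaluates to 0x8000.)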

* RE: [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization
  2022-12-01  0:23   ` Felix Kuehling
@ 2022-12-02 17:42     ` Kim, Jonathan
  0 siblings, 0 replies; 63+ messages in thread
From: Kim, Jonathan @ 2022-12-02 17:42 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: November 30, 2022 7:24 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization
>
>
> On 2022-10-31 12:23, Jonathan Kim wrote:
> > Add missing debug trap register references and initialize all debug
> > registers on boot by clearing the hardware exception overrides and the
> > wave allocation ID index.
> >
> > For debug devices that only support single process debugging, enable
> > trap temporary setup by default.
> >
> > Debug devices that support multi-process debugging require trap
> > temporary setup to be disabled by default in order to satisfy microbench
> > performance when in non-debug mode.
> >
> > The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
> > waves onto dispatches during compute context inspection.
> > In order to correctly set this up, set the special reserved CP bit by default
> > whenever the MQD is initialized.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 26 +++++++
> >   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 30 ++++++++
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
> >   .../include/asic_reg/gc/gc_10_1_0_offset.h    | 14 ++++
> >   .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++++++++++++++++++
> >   .../include/asic_reg/gc/gc_10_3_0_offset.h    | 10 +++
> >   .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
> >   8 files changed, 163 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index af94ac580d3e..d49aff0b4ba3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > @@ -4904,6 +4904,29 @@ static u32 gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
> >
> >   #define DEFAULT_SH_MEM_BASES      (0x6000)
> >
> > +static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
> > +                           uint32_t first_vmid,
> > +                           uint32_t last_vmid)
> > +{
> > +   uint32_t data;
> > +   uint32_t trap_config_vmid_mask = 0;
> > +   int i;
> > +
> > +   /* Calculate trap config vmid mask */
> > +   for (i = first_vmid; i < last_vmid; i++)
> > +           trap_config_vmid_mask |= (1 << i);
> > +
> > +   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> > +                   VMID_SEL, trap_config_vmid_mask);
> > +   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> > +                   TRAP_EN, 1);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> > +
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> > +}
> > +
> >   static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
> >   {
> >     int i;
> > @@ -4935,6 +4958,9 @@ static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
> >             WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
> >             WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
> >     }
> > +
> > +   gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
> > +                                   AMDGPU_NUM_VMID);
> >   }
> >
> >   static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 0320be4a5fc6..a0e5ad342f13 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -2337,6 +2337,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
> >     adev->gfx.config.num_rbs = hweight32(active_rbs);
> >   }
> >
> > +static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
> > +                           uint32_t first_vmid,
> > +                           uint32_t last_vmid)
> > +{
> > +   uint32_t data;
> > +   uint32_t trap_config_vmid_mask = 0;
> > +   int i;
> > +
> > +   /* Calculate trap config vmid mask */
> > +   for (i = first_vmid; i < last_vmid; i++)
> > +           trap_config_vmid_mask |= (1 << i);
> > +
> > +   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
> > +                   VMID_SEL, trap_config_vmid_mask);
> > +   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
> > +                   TRAP_EN, 1);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> > +
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
> > +   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
> > +}
> > +
> >   #define DEFAULT_SH_MEM_BASES      (0x6000)
> >   static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
> >   {
> > @@ -4609,6 +4632,13 @@ static int gfx_v9_0_late_init(void *handle)
> >     if (r)
> >             return r;
> >
> > +   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
> > +           gfx_v9_4_2_debug_trap_config_init(adev,
> > +                   adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
>
> Where is this function defined? I don't see it in any of your patches.
> Did you forget to git add a file?

gfx_v9_4_2_debug_trap_config_init got upstreamed a long time ago for some reason,
probably by mistake when GFX9.4.2 went public.

Thanks,

Jon
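
For reference, a worked example of the mask that init loop computes, assuming
first_kfd_vmid = 8 and AMDGPU_NUM_VMID = 16 (the actual split varies by ASIC,
so the numbers are illustrative only):

	uint32_t trap_config_vmid_mask = 0;
	int i;

	/* Bits 8..15 set -> mask = 0xFF00: SPI_GDBG_TRAP_CONFIG.VMID_SEL
	 * then targets only the compute (KFD) VMIDs, leaving graphics
	 * VMIDs 0..7 alone.
	 */
	for (i = 8; i < 16; i++)
		trap_config_vmid_mask |= (1 << i);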

>
> Regards,
>    Felix
>
>
> > +   else
> > +           gfx_v9_0_debug_trap_config_init(adev,
> > +                   adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
> > +
> >     return 0;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> > index d3e2b6a599a4..cb484ace17de 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
> > @@ -117,6 +117,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
> >                     1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
> >                     1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
> >
> > +   /* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> > +    * DISPATCH_PTR.  This is required for the kfd debugger
> > +    */
> > +   m->cp_hqd_hq_scheduler0 = 1 << 14;
> > +
> >     if (q->format == KFD_QUEUE_FORMAT_AQL) {
> >             m->cp_hqd_aql_control =
> >                     1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > index 0778e587a2d6..86f1cf090246 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > @@ -164,6 +164,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
> >                     1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
> >                     1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
> >
> > +   /* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
> > +    * DISPATCH_PTR.  This is required for the kfd debugger
> > +    */
> > +   m->cp_hqd_hq_status0 = 1 << 14;
> > +
> >     if (q->format == KFD_QUEUE_FORMAT_AQL) {
> >             m->cp_hqd_aql_control =
> >                     1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
> > diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> > index 18d34bbceebe..7d384f86bd67 100644
> > --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> > +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> > @@ -5190,6 +5190,20 @@
> >   #define mmSPI_WCL_PIPE_PERCENT_CS6_BASE_IDX                                                            0
> >   #define mmSPI_WCL_PIPE_PERCENT_CS7                                                                     0x1f70
> >   #define mmSPI_WCL_PIPE_PERCENT_CS7_BASE_IDX                                                            0
> > +#define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
> > +#define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
> > +#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
> > +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
> > +#define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
> > +#define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
> > +#define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
> > +#define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
> > +#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
> > +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
> > +#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
> > +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
> > +#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
> > +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
> >   #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
> >   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
> >   #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
> > diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> > index 4127896ffcdf..08772ba845b0 100644
> > --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> > +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
> > @@ -19646,6 +19646,75 @@
> >   //SPI_WCL_PIPE_PERCENT_CS7
> >   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE__SHIFT                                                                0x0
> >   #define SPI_WCL_PIPE_PERCENT_CS7__VALUE_MASK                                                                  0x7FL
> > +//SPI_GDBG_WAVE_CNTL
> > +#define SPI_GDBG_WAVE_CNTL__STALL_RA__SHIFT                                                                   0x0
> > +#define SPI_GDBG_WAVE_CNTL__STALL_VMID__SHIFT                                                                 0x1
> > +#define SPI_GDBG_WAVE_CNTL__STALL_RA_MASK                                                                     0x00000001L
> > +#define SPI_GDBG_WAVE_CNTL__STALL_VMID_MASK                                                                   0x0001FFFEL
> > +//SPI_GDBG_TRAP_CONFIG
> > +#define SPI_GDBG_TRAP_CONFIG__ME_SEL__SHIFT                                                                   0x0
> > +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL__SHIFT                                                                 0x2
> > +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL__SHIFT                                                                0x4
> > +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH__SHIFT                                                                 0x7
> > +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH__SHIFT                                                               0x8
> > +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH__SHIFT                                                              0x9
> > +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN__SHIFT                                                                  0xf
> > +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL__SHIFT                                                                 0x10
> > +#define SPI_GDBG_TRAP_CONFIG__ME_SEL_MASK                                                                     0x00000003L
> > +#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL_MASK                                                                   0x0000000CL
> > +#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL_MASK                                                                  0x00000070L
> > +#define SPI_GDBG_TRAP_CONFIG__ME_MATCH_MASK                                                                   0x00000080L
> > +#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH_MASK                                                                 0x00000100L
> > +#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH_MASK                                                                0x00000200L
> > +#define SPI_GDBG_TRAP_CONFIG__TRAP_EN_MASK                                                                    0x00008000L
> > +#define SPI_GDBG_TRAP_CONFIG__VMID_SEL_MASK                                                                   0xFFFF0000L
> > +//SPI_GDBG_TRAP_MASK
> > +#define SPI_GDBG_TRAP_MASK__EXCP_EN__SHIFT                                                                    0x0
> > +#define SPI_GDBG_TRAP_MASK__REPLACE__SHIFT                                                                    0x9
> > +#define SPI_GDBG_TRAP_MASK__EXCP_EN_MASK                                                                      0x01FFL
> > +#define SPI_GDBG_TRAP_MASK__REPLACE_MASK                                                                      0x0200L
> > +//SPI_GDBG_WAVE_CNTL2
> > +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK__SHIFT                                                                 0x0
> > +#define SPI_GDBG_WAVE_CNTL2__MODE__SHIFT                                                                      0x10
> > +#define SPI_GDBG_WAVE_CNTL2__VMID_MASK_MASK                                                                   0x0000FFFFL
> > +#define SPI_GDBG_WAVE_CNTL2__MODE_MASK                                                                        0x00030000L
> > +//SPI_GDBG_WAVE_CNTL3
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_PS__SHIFT                                                                  0x0
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_VS__SHIFT                                                                  0x1
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_GS__SHIFT                                                                  0x2
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_HS__SHIFT                                                                  0x3
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG__SHIFT                                                                 0x4
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0__SHIFT                                                                 0x5
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1__SHIFT                                                                 0x6
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2__SHIFT                                                                 0x7
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3__SHIFT                                                                 0x8
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4__SHIFT                                                                 0x9
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5__SHIFT                                                                 0xa
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6__SHIFT                                                                 0xb
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7__SHIFT                                                                 0xc
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION__SHIFT                                                            0xd
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT__SHIFT                                                                0x1c
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_PS_MASK                                                                    0x00000001L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_VS_MASK                                                                    0x00000002L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_GS_MASK                                                                    0x00000004L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_HS_MASK                                                                    0x00000008L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CSG_MASK                                                                   0x00000010L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS0_MASK                                                                   0x00000020L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS1_MASK                                                                   0x00000040L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS2_MASK                                                                   0x00000080L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS3_MASK                                                                   0x00000100L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS4_MASK                                                                   0x00000200L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS5_MASK                                                                   0x00000400L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS6_MASK                                                                   0x00000800L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_CS7_MASK                                                                   0x00001000L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION_MASK                                                              0x0FFFE000L
> > +#define SPI_GDBG_WAVE_CNTL3__STALL_MULT_MASK                                                                  0x10000000L
> > +//SPI_GDBG_TRAP_DATA0
> > +#define SPI_GDBG_TRAP_DATA0__DATA__SHIFT                                                                      0x0
> > +#define SPI_GDBG_TRAP_DATA0__DATA_MASK                                                                        0xFFFFFFFFL
> > +//SPI_GDBG_TRAP_DATA1
> > +#define SPI_GDBG_TRAP_DATA1__DATA__SHIFT                                                                      0x0
> > +#define SPI_GDBG_TRAP_DATA1__DATA_MASK                                                                        0xFFFFFFFFL
> >   //SPI_COMPUTE_QUEUE_RESET
> >   #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                                                                 0x0
> >   #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                                                                   0x01L
> > diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> > index 3973110f149c..d09f1a06f4bf 100644
> > --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> > +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
> > @@ -26,6 +26,8 @@
> >   #define mmSQ_DEBUG_STS_GLOBAL_BASE_IDX                                                                 0
> >   #define mmSQ_DEBUG_STS_GLOBAL2                                                                         0x10B0
> >   #define mmSQ_DEBUG_STS_GLOBAL2_BASE_IDX                                                                0
> > +#define mmSQ_DEBUG                                                                                     0x10B1
> > +#define mmSQ_DEBUG_BASE_IDX                                                                            0
> >
> >   // addressBlock: gc_sdma0_sdma0dec
> >   // base address: 0x4980
> > @@ -4849,10 +4851,18 @@
> >   #define mmSPI_WCL_PIPE_PERCENT_CS3_BASE_IDX                                                            0
> >   #define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
> >   #define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
> > +#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
> > +#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
> >   #define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
> >   #define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
> >   #define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
> >   #define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
> > +#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
> > +#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
> > +#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
> > +#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
> > +#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
> > +#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
> >   #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
> >   #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
> >   #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
> > diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> > index d4e8ff22ecb8..fc85aee010fe 100644
> > --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> > +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
> > @@ -47853,6 +47853,10 @@
> >
> >
> >   // addressBlock: sqind
> > +//SQ_DEBUG
> > +#define SQ_DEBUG__SINGLE_MEMOP_MASK 0x00000001L
> > +#define SQ_DEBUG__SINGLE_MEMOP__SHIFT 0x00000000
> > +
> >   //SQ_DEBUG_STS_GLOBAL
> >   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0_MASK 0x000000ffL
> >   #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0__SHIFT 0x00000000

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [PATCH 27/29] drm/amdkfd: add debug queue snapshot operation
  2022-11-30 23:55   ` Felix Kuehling
@ 2022-12-02 19:13     ` Kim, Jonathan
  0 siblings, 0 replies; 63+ messages in thread
From: Kim, Jonathan @ 2022-12-02 19:13 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: November 30, 2022 6:55 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 27/29] drm/amdkfd: add debug queue snapshot operation
>
>
> On 2022-10-31 12:23, Jonathan Kim wrote:
> > Allow the debugger to get a snapshot of a specified number of queues
> > containing various queue property information that is copied to the
> > debugger.
> >
> > Since the debugger doesn't know how many queues exist at any given time,
> > allow the debugger to pass the requested number of snapshots as 0 to get
> > the actual number of potential snapshots to use for a subsequent snapshot
> > request for actual information.
> >
> > To prevent future ABI breakage, pass in the requested entry_size.
> > The KFD will return its own entry_size in case the debugger still wants to
> > log the information in a core dump on sizing failure.
> >
> > Also allow the debugger to clear exceptions when doing a snapshot.
> >
> > v2: change buf_size arg to num_queues for clarity.
> > fix minimum entry size calculation.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
>
> Two nit-picks inline.
>
>
> > ---
> >   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +++
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 41 +++++++++++++++++++
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  4 ++
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +++
> >   .../amd/amdkfd/kfd_process_queue_manager.c    | 40 ++++++++++++++++++
> >   5 files changed, 96 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index 2c8f107237ee..cea393350980 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -2961,6 +2961,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
> >                             &args->query_exception_info.info_size);
> >             break;
> >     case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> > +           r = pqm_get_queue_snapshot(&target->pqm,
> > +                           args->queue_snapshot.exception_mask,
> > +                           (void __user *)args->queue_snapshot.snapshot_buf_ptr,
> > +                           &args->queue_snapshot.num_queues,
> > +                           &args->queue_snapshot.entry_size);
> > +           break;
> >     case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> >             pr_warn("Debug op %i not supported yet\n", args->op);
> >             r = -EACCES;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index 589efbefc8dc..51f8c5676c56 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -2950,6 +2950,47 @@ int suspend_queues(struct kfd_process *p,
> >     return total_suspended;
> >   }
> >
> > +static uint32_t set_queue_type_for_user(struct queue_properties *q_props)
> > +{
> > +   switch (q_props->type) {
> > +   case KFD_QUEUE_TYPE_COMPUTE:
> > +           return q_props->format == KFD_QUEUE_FORMAT_PM4
> > +                                   ? KFD_IOC_QUEUE_TYPE_COMPUTE
> > +                                   : KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
> > +   case KFD_QUEUE_TYPE_SDMA:
> > +           return KFD_IOC_QUEUE_TYPE_SDMA;
> > +   case KFD_QUEUE_TYPE_SDMA_XGMI:
> > +           return KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
> > +   default:
> > +           WARN_ONCE(true, "queue type not recognized!");
> > +           return 0xffffffff;
> > +   };
> > +}
> > +
> > +void set_queue_snapshot_entry(struct device_queue_manager *dqm,
> > +                         struct queue *q,
> > +                         uint64_t exception_clear_mask,
> > +                         struct kfd_queue_snapshot_entry *qss_entry)
>
> The dqm parameter is not needed. The function can get this from
> q->device->dqm. It's also only needed for dqm locking. I'm not sure
> that's even necessary. Aren't the event_mutex and target process mutex
> held by the caller enough to protect the exception_status and other
> queue properties?

I can't really remember why we took the device lock in the experimental phase, tbh, but I think you're right.
The process event lock should protect event status on interrupt writes.
The process lock should protect everything else (property updates/destruction etc).

Thanks,

Jon
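
As a rough sketch, dropping the dqm lock under that assumption would look
something like the following (illustrative only, not the final code):

	/* No dqm_lock(): the caller's event_mutex serializes against
	 * exception_status writes from the interrupt handler, and the
	 * process mutex protects the remaining queue properties.
	 */
	qss_entry->ring_base_address = q->properties.queue_address;
	qss_entry->exception_status = q->properties.exception_status;
	q->properties.exception_status &= ~exception_clear_mask;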

>
>
> > +{
> > +   dqm_lock(dqm);
> > +
> > +   qss_entry->ring_base_address = q->properties.queue_address;
> > +   qss_entry->write_pointer_address = (uint64_t)q->properties.write_ptr;
> > +   qss_entry->read_pointer_address = (uint64_t)q->properties.read_ptr;
> > +   qss_entry->ctx_save_restore_address =
> > +                           q->properties.ctx_save_restore_area_address;
> > +   qss_entry->ctx_save_restore_area_size =
> > +                           q->properties.ctx_save_restore_area_size;
> > +   qss_entry->exception_status = q->properties.exception_status;
> > +   qss_entry->queue_id = q->properties.queue_id;
> > +   qss_entry->gpu_id = q->device->id;
> > +   qss_entry->ring_size = (uint32_t)q->properties.queue_size;
> > +   qss_entry->queue_type = set_queue_type_for_user(&q->properties);
> > +   q->properties.exception_status &= ~exception_clear_mask;
> > +
> > +   dqm_unlock(dqm);
> > +}
> > +
> >   int debug_lock_and_unmap(struct device_queue_manager *dqm)
> >   {
> >     int r;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > index 12643528684c..094705b932fc 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > @@ -297,6 +297,10 @@ int resume_queues(struct kfd_process *p,
> >             bool resume_all_queues,
> >             uint32_t num_queues,
> >             uint32_t *usr_queue_id_array);
> > +void set_queue_snapshot_entry(struct device_queue_manager *dqm,
> > +                         struct queue *q,
> > +                         uint64_t exception_clear_mask,
> > +                         struct kfd_queue_snapshot_entry *qss_entry);
> >   int debug_lock_and_unmap(struct device_queue_manager *dqm);
> >   int debug_map_and_unlock(struct device_queue_manager *dqm);
> >   int debug_refresh_runlist(struct device_queue_manager *dqm);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index aee4fe20e676..ebd701143981 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -1302,6 +1302,11 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
> >                    void __user *ctl_stack,
> >                    u32 *ctl_stack_used_size,
> >                    u32 *save_area_used_size);
> > +int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
> > +                      uint64_t exception_clear_mask,
> > +                      struct kfd_queue_snapshot_entry __user *buf,
> > +                      int *num_qss_entries,
> > +                      uint32_t *entry_size);
> >
> >   int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
> >                           uint64_t fence_value,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > index 15db83c9a585..30df1046c30b 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > @@ -569,6 +569,46 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
> >                                                    save_area_used_size);
> >   }
> >
> > +int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
> > +                      uint64_t exception_clear_mask,
> > +                      struct kfd_queue_snapshot_entry __user *buf,
> > +                      int *num_qss_entries,
> > +                      uint32_t *entry_size)
> > +{
> > +   struct process_queue_node *pqn;
> > +   uint32_t tmp_entry_size = *entry_size, tmp_qss_entries = *num_qss_entries;
> > +   int r;
> > +
> > +   *num_qss_entries = 0;
> > +   if (!(*entry_size))
> > +           return -EINVAL;
> > +
> > +   *entry_size = min_t(size_t, *entry_size, sizeof(struct kfd_queue_snapshot_entry));
> > +   mutex_lock(&pqm->process->event_mutex);
> > +
> > +   list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
> > +           if (!pqn->q)
> > +                   continue;
> > +
> > +           if (*num_qss_entries < tmp_qss_entries) {
> > +                   struct kfd_queue_snapshot_entry src = {0};
>
> It's safer to use memset here. This initialization may not initialize
> padding, so it doesn't guarantee that no uninitialized data leaks from
> kernel mode to user mode.
>
> Regards,
>    Felix
>
>
> > +
> > +                   set_queue_snapshot_entry(pqn->q->device->dqm,
> > +                                   pqn->q, exception_clear_mask, &src);
> > +
> > +                   if (copy_to_user(buf, &src, *entry_size)) {
> > +                           r = -EFAULT;
> > +                           break;
> > +                   }
> > +                   buf += tmp_entry_size;
> > +           }
> > +           *num_qss_entries += 1;
> > +   }
> > +
> > +   mutex_unlock(&pqm->process->event_mutex);
> > +   return r;
> > +}
> > +
> >   static int get_queue_data_sizes(struct kfd_process_device *pdd,
> >                             struct queue *q,
> >                             uint32_t *mqd_size,
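
For completeness, a minimal sketch of the memset pattern suggested above.
Unlike the "= {0}" initializer, memset also zeroes struct padding, so no
uninitialized kernel stack bytes can reach the copy_to_user() (illustrative
only, not the final code):

	struct kfd_queue_snapshot_entry src;

	memset(&src, 0, sizeof(src));	/* zeroes padding bytes too */
	set_queue_snapshot_entry(pqn->q->device->dqm, pqn->q,
			exception_clear_mask, &src);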

^ permalink raw reply	[flat|nested] 63+ messages in thread
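
A sketch of the two-call sizing pattern described in the patch 27 commit
message, from the debugger side (the ioctl plumbing and exact uapi field
names are assumptions based on this series, not verified against the final
upstream headers):

	struct kfd_ioctl_dbg_trap_args args = {0};
	struct kfd_queue_snapshot_entry *entries;

	/* First call: num_queues == 0 asks the KFD for the number of
	 * available snapshot entries; nothing is copied out yet.
	 */
	args.op = KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT;
	args.queue_snapshot.num_queues = 0;
	args.queue_snapshot.entry_size = sizeof(struct kfd_queue_snapshot_entry);
	ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);

	/* Second call: allocate that many entries and fetch them.  The KFD
	 * clamps entry_size to the size it actually supports.
	 */
	entries = calloc(args.queue_snapshot.num_queues,
			 sizeof(struct kfd_queue_snapshot_entry));
	args.queue_snapshot.snapshot_buf_ptr = (uint64_t)entries;
	ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);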

* [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization
  2022-08-29 14:29 [PATCH 0/29] Introduce AMD GPU ISA Debugging for HSA Compute Jonathan Kim
@ 2022-08-29 14:30 ` Jonathan Kim
  0 siblings, 0 replies; 63+ messages in thread
From: Jonathan Kim @ 2022-08-29 14:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling, Lancelot.Six, Laurent.Morichetti

Add missing debug trap register references and initialize all debug
registers on boot by clearing the hardware exception overrides and the
wave allocation ID index.

For debug devices that only support single process debugging, enable
trap temporary setup by default.

Debug devices that support multi-process debugging require trap
temporary setup to be disabled by default in order to satisfy microbench
performance when in non-debug mode.

The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
waves onto dispatches during compute context inspection.
In order to correctly set this up, set the special reserved CP bit by default
whenever the MQD is initialized.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 26 +++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 30 ++++++++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
 .../include/asic_reg/gc/gc_10_1_0_offset.h    | 14 ++++
 .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++++++++++++++++++
 .../include/asic_reg/gc/gc_10_3_0_offset.h    | 10 +++
 .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
 8 files changed, 163 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 1a915edccb92..54765d5d7d3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5142,6 +5142,29 @@ static u32 gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
 
 #define DEFAULT_SH_MEM_BASES	(0x6000)
 
+static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
+				uint32_t first_vmid,
+				uint32_t last_vmid)
+{
+	uint32_t data;
+	uint32_t trap_config_vmid_mask = 0;
+	int i;
+
+	/* Calculate trap config vmid mask */
+	for (i = first_vmid; i < last_vmid; i++)
+		trap_config_vmid_mask |= (1 << i);
+
+	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+			VMID_SEL, trap_config_vmid_mask);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+			TRAP_EN, 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
 {
 	int i;
@@ -5173,6 +5196,9 @@ static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
 		WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
 		WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
 	}
+
+	gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
+					AMDGPU_NUM_VMID);
 }
 
 static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 7f187558220e..4e9cbbab0fef 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2515,6 +2515,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
 	adev->gfx.config.num_rbs = hweight32(active_rbs);
 }
 
+static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
+				uint32_t first_vmid,
+				uint32_t last_vmid)
+{
+	uint32_t data;
+	uint32_t trap_config_vmid_mask = 0;
+	int i;
+
+	/* Calculate trap config vmid mask */
+	for (i = first_vmid; i < last_vmid; i++)
+		trap_config_vmid_mask |= (1 << i);
+
+	data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+			VMID_SEL, trap_config_vmid_mask);
+	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+			TRAP_EN, 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 #define DEFAULT_SH_MEM_BASES	(0x6000)
 static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
 {
@@ -4786,6 +4809,13 @@ static int gfx_v9_0_late_init(void *handle)
 	if (r)
 		return r;
 
+	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+		gfx_v9_4_2_debug_trap_config_init(adev,
+			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
+	else
+		gfx_v9_0_debug_trap_config_init(adev,
+			adev->vm_manager.first_kfd_vmid, AMDGPU_NUM_VMID);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
index d3e2b6a599a4..cb484ace17de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
@@ -117,6 +117,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
 			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
 
+	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
+	 * DISPATCH_PTR.  This is required for the kfd debugger
+	 */
+	m->cp_hqd_hq_scheduler0 = 1 << 14;
+
 	if (q->format == KFD_QUEUE_FORMAT_AQL) {
 		m->cp_hqd_aql_control =
 			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 0778e587a2d6..86f1cf090246 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -164,6 +164,11 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
 			1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
 
+	/* Set cp_hqd_hq_scheduler0 bit 14 to 1 to have the CP set up the
+	 * DISPATCH_PTR.  This is required for the kfd debugger
+	 */
+	m->cp_hqd_hq_status0 = 1 << 14;
+
 	if (q->format == KFD_QUEUE_FORMAT_AQL) {
 		m->cp_hqd_aql_control =
 			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
index 18d34bbceebe..7d384f86bd67 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
@@ -5190,6 +5190,20 @@
 #define mmSPI_WCL_PIPE_PERCENT_CS6_BASE_IDX                                                            0
 #define mmSPI_WCL_PIPE_PERCENT_CS7                                                                     0x1f70
 #define mmSPI_WCL_PIPE_PERCENT_CS7_BASE_IDX                                                            0
+#define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
+#define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
+#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
+#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
+#define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
+#define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
+#define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
+#define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
+#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
+#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
+#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
+#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
 #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
 #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
 #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
index 4127896ffcdf..08772ba845b0 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_sh_mask.h
@@ -19646,6 +19646,75 @@
 //SPI_WCL_PIPE_PERCENT_CS7
 #define SPI_WCL_PIPE_PERCENT_CS7__VALUE__SHIFT                                                                0x0
 #define SPI_WCL_PIPE_PERCENT_CS7__VALUE_MASK                                                                  0x7FL
+//SPI_GDBG_WAVE_CNTL
+#define SPI_GDBG_WAVE_CNTL__STALL_RA__SHIFT                                                                   0x0
+#define SPI_GDBG_WAVE_CNTL__STALL_VMID__SHIFT                                                                 0x1
+#define SPI_GDBG_WAVE_CNTL__STALL_RA_MASK                                                                     0x00000001L
+#define SPI_GDBG_WAVE_CNTL__STALL_VMID_MASK                                                                   0x0001FFFEL
+//SPI_GDBG_TRAP_CONFIG
+#define SPI_GDBG_TRAP_CONFIG__ME_SEL__SHIFT                                                                   0x0
+#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL__SHIFT                                                                 0x2
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL__SHIFT                                                                0x4
+#define SPI_GDBG_TRAP_CONFIG__ME_MATCH__SHIFT                                                                 0x7
+#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH__SHIFT                                                               0x8
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH__SHIFT                                                              0x9
+#define SPI_GDBG_TRAP_CONFIG__TRAP_EN__SHIFT                                                                  0xf
+#define SPI_GDBG_TRAP_CONFIG__VMID_SEL__SHIFT                                                                 0x10
+#define SPI_GDBG_TRAP_CONFIG__ME_SEL_MASK                                                                     0x00000003L
+#define SPI_GDBG_TRAP_CONFIG__PIPE_SEL_MASK                                                                   0x0000000CL
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_SEL_MASK                                                                  0x00000070L
+#define SPI_GDBG_TRAP_CONFIG__ME_MATCH_MASK                                                                   0x00000080L
+#define SPI_GDBG_TRAP_CONFIG__PIPE_MATCH_MASK                                                                 0x00000100L
+#define SPI_GDBG_TRAP_CONFIG__QUEUE_MATCH_MASK                                                                0x00000200L
+#define SPI_GDBG_TRAP_CONFIG__TRAP_EN_MASK                                                                    0x00008000L
+#define SPI_GDBG_TRAP_CONFIG__VMID_SEL_MASK                                                                   0xFFFF0000L
+//SPI_GDBG_TRAP_MASK
+#define SPI_GDBG_TRAP_MASK__EXCP_EN__SHIFT                                                                    0x0
+#define SPI_GDBG_TRAP_MASK__REPLACE__SHIFT                                                                    0x9
+#define SPI_GDBG_TRAP_MASK__EXCP_EN_MASK                                                                      0x01FFL
+#define SPI_GDBG_TRAP_MASK__REPLACE_MASK                                                                      0x0200L
+//SPI_GDBG_WAVE_CNTL2
+#define SPI_GDBG_WAVE_CNTL2__VMID_MASK__SHIFT                                                                 0x0
+#define SPI_GDBG_WAVE_CNTL2__MODE__SHIFT                                                                      0x10
+#define SPI_GDBG_WAVE_CNTL2__VMID_MASK_MASK                                                                   0x0000FFFFL
+#define SPI_GDBG_WAVE_CNTL2__MODE_MASK                                                                        0x00030000L
+//SPI_GDBG_WAVE_CNTL3
+#define SPI_GDBG_WAVE_CNTL3__STALL_PS__SHIFT                                                                  0x0
+#define SPI_GDBG_WAVE_CNTL3__STALL_VS__SHIFT                                                                  0x1
+#define SPI_GDBG_WAVE_CNTL3__STALL_GS__SHIFT                                                                  0x2
+#define SPI_GDBG_WAVE_CNTL3__STALL_HS__SHIFT                                                                  0x3
+#define SPI_GDBG_WAVE_CNTL3__STALL_CSG__SHIFT                                                                 0x4
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS0__SHIFT                                                                 0x5
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS1__SHIFT                                                                 0x6
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS2__SHIFT                                                                 0x7
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS3__SHIFT                                                                 0x8
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS4__SHIFT                                                                 0x9
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS5__SHIFT                                                                 0xa
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS6__SHIFT                                                                 0xb
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS7__SHIFT                                                                 0xc
+#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION__SHIFT                                                            0xd
+#define SPI_GDBG_WAVE_CNTL3__STALL_MULT__SHIFT                                                                0x1c
+#define SPI_GDBG_WAVE_CNTL3__STALL_PS_MASK                                                                    0x00000001L
+#define SPI_GDBG_WAVE_CNTL3__STALL_VS_MASK                                                                    0x00000002L
+#define SPI_GDBG_WAVE_CNTL3__STALL_GS_MASK                                                                    0x00000004L
+#define SPI_GDBG_WAVE_CNTL3__STALL_HS_MASK                                                                    0x00000008L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CSG_MASK                                                                   0x00000010L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS0_MASK                                                                   0x00000020L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS1_MASK                                                                   0x00000040L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS2_MASK                                                                   0x00000080L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS3_MASK                                                                   0x00000100L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS4_MASK                                                                   0x00000200L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS5_MASK                                                                   0x00000400L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS6_MASK                                                                   0x00000800L
+#define SPI_GDBG_WAVE_CNTL3__STALL_CS7_MASK                                                                   0x00001000L
+#define SPI_GDBG_WAVE_CNTL3__STALL_DURATION_MASK                                                              0x0FFFE000L
+#define SPI_GDBG_WAVE_CNTL3__STALL_MULT_MASK                                                                  0x10000000L
+//SPI_GDBG_TRAP_DATA0
+#define SPI_GDBG_TRAP_DATA0__DATA__SHIFT                                                                      0x0
+#define SPI_GDBG_TRAP_DATA0__DATA_MASK                                                                        0xFFFFFFFFL
+//SPI_GDBG_TRAP_DATA1
+#define SPI_GDBG_TRAP_DATA1__DATA__SHIFT                                                                      0x0
+#define SPI_GDBG_TRAP_DATA1__DATA_MASK                                                                        0xFFFFFFFFL
 //SPI_COMPUTE_QUEUE_RESET
 #define SPI_COMPUTE_QUEUE_RESET__RESET__SHIFT                                                                 0x0
 #define SPI_COMPUTE_QUEUE_RESET__RESET_MASK                                                                   0x01L
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
index 594bffce93a9..b7c1445b0bed 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_offset.h
@@ -26,6 +26,8 @@
 #define mmSQ_DEBUG_STS_GLOBAL_BASE_IDX                                                                 0
 #define mmSQ_DEBUG_STS_GLOBAL2                                                                         0x10B0
 #define mmSQ_DEBUG_STS_GLOBAL2_BASE_IDX                                                                0
+#define mmSQ_DEBUG                                                                                     0x10B1
+#define mmSQ_DEBUG_BASE_IDX                                                                            0
 
 // addressBlock: gc_sdma0_sdma0dec
 // base address: 0x4980
@@ -4849,10 +4851,18 @@
 #define mmSPI_WCL_PIPE_PERCENT_CS3_BASE_IDX                                                            0
 #define mmSPI_GDBG_WAVE_CNTL                                                                           0x1f71
 #define mmSPI_GDBG_WAVE_CNTL_BASE_IDX                                                                  0
+#define mmSPI_GDBG_TRAP_CONFIG                                                                         0x1f72
+#define mmSPI_GDBG_TRAP_CONFIG_BASE_IDX                                                                0
 #define mmSPI_GDBG_TRAP_MASK                                                                           0x1f73
 #define mmSPI_GDBG_TRAP_MASK_BASE_IDX                                                                  0
 #define mmSPI_GDBG_WAVE_CNTL2                                                                          0x1f74
 #define mmSPI_GDBG_WAVE_CNTL2_BASE_IDX                                                                 0
+#define mmSPI_GDBG_WAVE_CNTL3                                                                          0x1f75
+#define mmSPI_GDBG_WAVE_CNTL3_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA0                                                                          0x1f78
+#define mmSPI_GDBG_TRAP_DATA0_BASE_IDX                                                                 0
+#define mmSPI_GDBG_TRAP_DATA1                                                                          0x1f79
+#define mmSPI_GDBG_TRAP_DATA1_BASE_IDX                                                                 0
 #define mmSPI_COMPUTE_QUEUE_RESET                                                                      0x1f7b
 #define mmSPI_COMPUTE_QUEUE_RESET_BASE_IDX                                                             0
 #define mmSPI_RESOURCE_RESERVE_CU_0                                                                    0x1f7c
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
index a827b0ff8905..30d67feba1a8 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_3_0_sh_mask.h
@@ -46386,6 +46386,10 @@
 
 
 // addressBlock: sqind
+//SQ_DEBUG
+#define SQ_DEBUG__SINGLE_MEMOP_MASK 0x00000001L
+#define SQ_DEBUG__SINGLE_MEMOP__SHIFT 0x00000000
+
 //SQ_DEBUG_STS_GLOBAL
 #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0_MASK 0x000000ffL
 #define SQ_DEBUG_STS_GLOBAL2__FIFO_LEVEL_GFX0__SHIFT 0x00000000
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2022-12-02 19:13 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
2022-10-31 16:23 ` [PATCH 02/29] drm/amdkfd: display debug capabilities Jonathan Kim
2022-11-22 23:08   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 03/29] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
2022-11-22 23:31   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 04/29] drm/amdgpu: add kgd hw debug mode setting interface Jonathan Kim
2022-12-01  0:08   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
2022-11-22 23:38   ` Felix Kuehling
2022-11-23 20:53     ` Kim, Jonathan
2022-12-01  0:18     ` Felix Kuehling
2022-12-01  0:23   ` Felix Kuehling
2022-12-02 17:42     ` Kim, Jonathan
2022-10-31 16:23 ` [PATCH 06/29] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
2022-11-22 23:50   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 07/29] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
2022-11-22 23:59   ` Felix Kuehling
2022-11-24 14:58     ` Kim, Jonathan
2022-11-24 16:25       ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 08/29] drm/amdgpu: add gfx10 " Jonathan Kim
2022-10-31 16:23 ` [PATCH 09/29] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
2022-10-31 16:23 ` [PATCH 10/29] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
2022-11-23  0:21   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 11/29] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
2022-10-31 16:23 ` [PATCH 12/29] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
2022-10-31 16:23 ` [PATCH 13/29] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
2022-10-31 16:23 ` [PATCH 14/29] drm/amdkfd: add raise exception event function Jonathan Kim
2022-10-31 16:23 ` [PATCH 15/29] drm/amdkfd: add send exception operation Jonathan Kim
2022-10-31 16:23 ` [PATCH 16/29] drm/amdkfd: add runtime enable operation Jonathan Kim
2022-11-23  0:52   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA Jonathan Kim
2022-11-23  0:44   ` Felix Kuehling
2022-11-24 14:51     ` Kim, Jonathan
2022-11-24 16:23       ` Felix Kuehling
2022-11-24 20:27         ` Kim, Jonathan
2022-11-25 16:53           ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 18/29] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
2022-10-31 16:23 ` [PATCH 19/29] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
2022-11-24 21:24   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 20/29] drm/amdkfd: add debug wave launch override operation Jonathan Kim
2022-11-29 22:37   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 21/29] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
2022-12-01  0:02   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 22/29] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
2022-11-29 23:55   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 23/29] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
2022-11-30  0:34   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 24/29] drm/amdkfd: add debug set flags operation Jonathan Kim
2022-11-30  0:39   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 25/29] drm/amdkfd: add debug query event operation Jonathan Kim
2022-11-30  0:44   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 26/29] drm/amdkfd: add debug query exception info operation Jonathan Kim
2022-11-30  0:50   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 27/29] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
2022-11-30 23:55   ` Felix Kuehling
2022-12-02 19:13     ` Kim, Jonathan
2022-10-31 16:23 ` [PATCH 28/29] drm/amdkfd: add debug device " Jonathan Kim
2022-12-01  0:00   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 29/29] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
2022-12-01  0:00   ` Felix Kuehling
2022-11-22 23:05 ` [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Felix Kuehling
2022-11-23 20:45   ` Kim, Jonathan
